/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif	/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving, which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named), which
 *   stops us using the number of dup acks and instead
 *   uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC 4737 and the Tail-Loss Probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling that state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
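/*
 * A minimal sketch of the time-based loss decision described above; it is
 * not part of this stack and every name in it is hypothetical. Instead of
 * declaring a segment lost after three duplicate ACKs, RACK-style detection
 * asks whether enough time has elapsed since the segment was last (re)sent,
 * roughly an RTT plus a reordering window. Kept under "#if 0" so it is
 * illustration only.
 */
#if 0
struct example_sendmap_entry {
	uint32_t last_tx_time;	/* when this segment was last (re)sent */
};

static int
example_rack_seg_is_lost(const struct example_sendmap_entry *e,
    uint32_t now, uint32_t rtt, uint32_t reorder_wnd)
{
	/* Lost once the elapsed time exceeds rtt plus the reorder window. */
	return ((now - e->last_tx_time) > (rtt + reorder_wnd));
}
#endif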
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 0;
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250;	/* 250ms */
static int32_t rack_persist_max = 2000;	/* 2 seconds */
static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
static int32_t rack_hw_tls_max_seg = 3;	/* 3 means use hw-tls single segment */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_hw_pace_adjust = 0;
/*
 * Currently regular tcp has a rto_min of 30ms
 * and the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed (30ms * (2^12 - 1) = 122,850ms).
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000;	/* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;	/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250% slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200% congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200% of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;	/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in us */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last (top of fraction) */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last (bottom of fraction) */
static uint32_t rack_min_probertt_hold = 200000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250;	/* Must move at least 250 useconds to count as a lowering */
static int32_t rack_pace_one_seg = 0;	/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;	/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to bw */
/* i.e. the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;	/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;	/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;	/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;	/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;	/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)?
*/ 292 static int32_t rack_timely_no_stopping = 0; 293 static int32_t rack_down_raise_thresh = 100; 294 static int32_t rack_req_segs = 1; 295 296 /* Weird delayed ack mode */ 297 static int32_t rack_use_imac_dack = 0; 298 /* Rack specific counters */ 299 counter_u64_t rack_badfr; 300 counter_u64_t rack_badfr_bytes; 301 counter_u64_t rack_rtm_prr_retran; 302 counter_u64_t rack_rtm_prr_newdata; 303 counter_u64_t rack_timestamp_mismatch; 304 counter_u64_t rack_reorder_seen; 305 counter_u64_t rack_paced_segments; 306 counter_u64_t rack_unpaced_segments; 307 counter_u64_t rack_calc_zero; 308 counter_u64_t rack_calc_nonzero; 309 counter_u64_t rack_saw_enobuf; 310 counter_u64_t rack_saw_enetunreach; 311 counter_u64_t rack_per_timer_hole; 312 313 /* Tail loss probe counters */ 314 counter_u64_t rack_tlp_tot; 315 counter_u64_t rack_tlp_newdata; 316 counter_u64_t rack_tlp_retran; 317 counter_u64_t rack_tlp_retran_bytes; 318 counter_u64_t rack_tlp_retran_fail; 319 counter_u64_t rack_to_tot; 320 counter_u64_t rack_to_arm_rack; 321 counter_u64_t rack_to_arm_tlp; 322 counter_u64_t rack_to_alloc; 323 counter_u64_t rack_to_alloc_hard; 324 counter_u64_t rack_to_alloc_emerg; 325 counter_u64_t rack_to_alloc_limited; 326 counter_u64_t rack_alloc_limited_conns; 327 counter_u64_t rack_split_limited; 328 329 counter_u64_t rack_sack_proc_all; 330 counter_u64_t rack_sack_proc_short; 331 counter_u64_t rack_sack_proc_restart; 332 counter_u64_t rack_sack_attacks_detected; 333 counter_u64_t rack_sack_attacks_reversed; 334 counter_u64_t rack_sack_used_next_merge; 335 counter_u64_t rack_sack_splits; 336 counter_u64_t rack_sack_used_prev_merge; 337 counter_u64_t rack_sack_skipped_acked; 338 counter_u64_t rack_ack_total; 339 counter_u64_t rack_express_sack; 340 counter_u64_t rack_sack_total; 341 counter_u64_t rack_move_none; 342 counter_u64_t rack_move_some; 343 344 counter_u64_t rack_used_tlpmethod; 345 counter_u64_t rack_used_tlpmethod2; 346 counter_u64_t rack_enter_tlp_calc; 347 counter_u64_t rack_input_idle_reduces; 348 counter_u64_t rack_collapsed_win; 349 counter_u64_t rack_tlp_does_nada; 350 counter_u64_t rack_try_scwnd; 351 352 /* Counters for HW TLS */ 353 counter_u64_t rack_tls_rwnd; 354 counter_u64_t rack_tls_cwnd; 355 counter_u64_t rack_tls_app; 356 counter_u64_t rack_tls_other; 357 counter_u64_t rack_tls_filled; 358 counter_u64_t rack_tls_rxt; 359 counter_u64_t rack_tls_tlp; 360 361 /* Temp CPU counters */ 362 counter_u64_t rack_find_high; 363 364 counter_u64_t rack_progress_drops; 365 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 366 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 367 368 static void 369 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 370 371 static int 372 rack_process_ack(struct mbuf *m, struct tcphdr *th, 373 struct socket *so, struct tcpcb *tp, struct tcpopt *to, 374 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 375 static int 376 rack_process_data(struct mbuf *m, struct tcphdr *th, 377 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 378 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 379 static void 380 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 381 struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); 382 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 383 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 384 uint8_t limit_type); 385 static struct rack_sendmap * 386 rack_check_recovery_mode(struct tcpcb 
*tp, 387 uint32_t tsused); 388 static void 389 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, 390 uint32_t type); 391 static void rack_counter_destroy(void); 392 static int 393 rack_ctloutput(struct socket *so, struct sockopt *sopt, 394 struct inpcb *inp, struct tcpcb *tp); 395 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 396 static void 397 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line); 398 static void 399 rack_do_segment(struct mbuf *m, struct tcphdr *th, 400 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 401 uint8_t iptos); 402 static void rack_dtor(void *mem, int32_t size, void *arg); 403 static void 404 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 405 uint32_t t, uint32_t cts); 406 static void 407 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 408 uint32_t flex1, uint32_t flex2, 409 uint32_t flex3, uint32_t flex4, 410 uint32_t flex5, uint32_t flex6, 411 uint16_t flex7, uint8_t mod); 412 static void 413 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 414 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm); 415 static struct rack_sendmap * 416 rack_find_high_nonack(struct tcp_rack *rack, 417 struct rack_sendmap *rsm); 418 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 419 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 420 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 421 static int 422 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 423 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 424 static void 425 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 426 tcp_seq th_ack, int line); 427 static uint32_t 428 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 429 static int32_t rack_handoff_ok(struct tcpcb *tp); 430 static int32_t rack_init(struct tcpcb *tp); 431 static void rack_init_sysctls(void); 432 static void 433 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 434 struct tcphdr *th); 435 static void 436 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 437 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 438 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts); 439 static void 440 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 441 struct rack_sendmap *rsm); 442 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 443 static int32_t rack_output(struct tcpcb *tp); 444 445 static uint32_t 446 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 447 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 448 uint32_t cts, int *moved_two); 449 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); 450 static void rack_remxt_tmr(struct tcpcb *tp); 451 static int 452 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 453 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 454 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 455 static int32_t rack_stopall(struct tcpcb *tp); 456 static void 457 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, 458 uint32_t delta); 459 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); 460 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 461 static void rack_timer_stop(struct tcpcb *tp, uint32_t 
timer_type); 462 static uint32_t 463 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 464 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); 465 static void 466 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 467 struct rack_sendmap *rsm, uint32_t ts); 468 static int 469 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 470 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 471 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 472 static int 473 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 474 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 475 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 476 static int 477 rack_do_closing(struct mbuf *m, struct tcphdr *th, 478 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 479 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 480 static int 481 rack_do_established(struct mbuf *m, struct tcphdr *th, 482 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 483 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 484 static int 485 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 486 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 487 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 488 static int 489 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 490 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 491 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 492 static int 493 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 494 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 495 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 496 static int 497 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 498 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 499 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 500 static int 501 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 502 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 503 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 504 static int 505 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 506 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 507 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 508 struct rack_sendmap * 509 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 510 uint32_t tsused); 511 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 512 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 513 static void 514 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); 515 516 int32_t rack_clear_counter=0; 517 518 519 static int 520 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 521 { 522 uint32_t stat; 523 int32_t error; 524 525 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 526 if (error || req->newptr == NULL) 527 return error; 528 529 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 530 if (error) 531 return (error); 532 if (stat == 1) { 533 #ifdef INVARIANTS 534 printf("Clearing RACK counters\n"); 535 #endif 536 counter_u64_zero(rack_badfr); 537 counter_u64_zero(rack_badfr_bytes); 538 
counter_u64_zero(rack_rtm_prr_retran); 539 counter_u64_zero(rack_rtm_prr_newdata); 540 counter_u64_zero(rack_timestamp_mismatch); 541 counter_u64_zero(rack_reorder_seen); 542 counter_u64_zero(rack_tlp_tot); 543 counter_u64_zero(rack_tlp_newdata); 544 counter_u64_zero(rack_tlp_retran); 545 counter_u64_zero(rack_tlp_retran_bytes); 546 counter_u64_zero(rack_tlp_retran_fail); 547 counter_u64_zero(rack_to_tot); 548 counter_u64_zero(rack_to_arm_rack); 549 counter_u64_zero(rack_to_arm_tlp); 550 counter_u64_zero(rack_paced_segments); 551 counter_u64_zero(rack_calc_zero); 552 counter_u64_zero(rack_calc_nonzero); 553 counter_u64_zero(rack_unpaced_segments); 554 counter_u64_zero(rack_saw_enobuf); 555 counter_u64_zero(rack_saw_enetunreach); 556 counter_u64_zero(rack_per_timer_hole); 557 counter_u64_zero(rack_to_alloc_hard); 558 counter_u64_zero(rack_to_alloc_emerg); 559 counter_u64_zero(rack_sack_proc_all); 560 counter_u64_zero(rack_sack_proc_short); 561 counter_u64_zero(rack_sack_proc_restart); 562 counter_u64_zero(rack_to_alloc); 563 counter_u64_zero(rack_to_alloc_limited); 564 counter_u64_zero(rack_alloc_limited_conns); 565 counter_u64_zero(rack_split_limited); 566 counter_u64_zero(rack_find_high); 567 counter_u64_zero(rack_tls_rwnd); 568 counter_u64_zero(rack_tls_cwnd); 569 counter_u64_zero(rack_tls_app); 570 counter_u64_zero(rack_tls_other); 571 counter_u64_zero(rack_tls_filled); 572 counter_u64_zero(rack_tls_rxt); 573 counter_u64_zero(rack_tls_tlp); 574 counter_u64_zero(rack_sack_attacks_detected); 575 counter_u64_zero(rack_sack_attacks_reversed); 576 counter_u64_zero(rack_sack_used_next_merge); 577 counter_u64_zero(rack_sack_used_prev_merge); 578 counter_u64_zero(rack_sack_splits); 579 counter_u64_zero(rack_sack_skipped_acked); 580 counter_u64_zero(rack_ack_total); 581 counter_u64_zero(rack_express_sack); 582 counter_u64_zero(rack_sack_total); 583 counter_u64_zero(rack_move_none); 584 counter_u64_zero(rack_move_some); 585 counter_u64_zero(rack_used_tlpmethod); 586 counter_u64_zero(rack_used_tlpmethod2); 587 counter_u64_zero(rack_enter_tlp_calc); 588 counter_u64_zero(rack_progress_drops); 589 counter_u64_zero(rack_tlp_does_nada); 590 counter_u64_zero(rack_try_scwnd); 591 counter_u64_zero(rack_collapsed_win); 592 593 } 594 rack_clear_counter = 0; 595 return (0); 596 } 597 598 599 600 static void 601 rack_init_sysctls(void) 602 { 603 struct sysctl_oid *rack_counters; 604 struct sysctl_oid *rack_attack; 605 struct sysctl_oid *rack_pacing; 606 struct sysctl_oid *rack_timely; 607 struct sysctl_oid *rack_timers; 608 struct sysctl_oid *rack_tlp; 609 struct sysctl_oid *rack_misc; 610 struct sysctl_oid *rack_measure; 611 struct sysctl_oid *rack_probertt; 612 613 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 614 SYSCTL_CHILDREN(rack_sysctl_root), 615 OID_AUTO, 616 "sack_attack", 617 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 618 "Rack Sack Attack Counters and Controls"); 619 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 620 SYSCTL_CHILDREN(rack_sysctl_root), 621 OID_AUTO, 622 "stats", 623 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 624 "Rack Counters"); 625 SYSCTL_ADD_S32(&rack_sysctl_ctx, 626 SYSCTL_CHILDREN(rack_sysctl_root), 627 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 628 &rack_rate_sample_method , USE_RTT_LOW, 629 "What method should we use for rate sampling 0=high, 1=low "); 630 SYSCTL_ADD_S32(&rack_sysctl_ctx, 631 SYSCTL_CHILDREN(rack_sysctl_root), 632 OID_AUTO, "hw_tlsmax", CTLFLAG_RW, 633 &rack_hw_tls_max_seg , 3, 634 "What is the maximum number of full TLS records that will be sent at once"); 635 /* Probe rtt 
related controls */ 636 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 637 SYSCTL_CHILDREN(rack_sysctl_root), 638 OID_AUTO, 639 "probertt", 640 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 641 "ProbeRTT related Controls"); 642 SYSCTL_ADD_U16(&rack_sysctl_ctx, 643 SYSCTL_CHILDREN(rack_probertt), 644 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 645 &rack_atexit_prtt_hbp, 130, 646 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 647 SYSCTL_ADD_U16(&rack_sysctl_ctx, 648 SYSCTL_CHILDREN(rack_probertt), 649 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 650 &rack_atexit_prtt, 130, 651 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 652 SYSCTL_ADD_U16(&rack_sysctl_ctx, 653 SYSCTL_CHILDREN(rack_probertt), 654 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 655 &rack_per_of_gp_probertt, 60, 656 "What percentage of goodput do we pace at in probertt"); 657 SYSCTL_ADD_U16(&rack_sysctl_ctx, 658 SYSCTL_CHILDREN(rack_probertt), 659 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 660 &rack_per_of_gp_probertt_reduce, 10, 661 "What percentage of goodput do we reduce every gp_srtt"); 662 SYSCTL_ADD_U16(&rack_sysctl_ctx, 663 SYSCTL_CHILDREN(rack_probertt), 664 OID_AUTO, "gp_per_low", CTLFLAG_RW, 665 &rack_per_of_gp_lowthresh, 40, 666 "What percentage of goodput do we allow the multiplier to fall to"); 667 SYSCTL_ADD_U32(&rack_sysctl_ctx, 668 SYSCTL_CHILDREN(rack_probertt), 669 OID_AUTO, "time_between", CTLFLAG_RW, 670 & rack_time_between_probertt, 96000000, 671 "How many useconds between the lowest rtt falling must past before we enter probertt"); 672 SYSCTL_ADD_U32(&rack_sysctl_ctx, 673 SYSCTL_CHILDREN(rack_probertt), 674 OID_AUTO, "safety", CTLFLAG_RW, 675 &rack_probe_rtt_safety_val, 2000000, 676 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 677 SYSCTL_ADD_U32(&rack_sysctl_ctx, 678 SYSCTL_CHILDREN(rack_probertt), 679 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 680 &rack_probe_rtt_sets_cwnd, 0, 681 "Do we set the cwnd too (if always_lower is on)"); 682 SYSCTL_ADD_U32(&rack_sysctl_ctx, 683 SYSCTL_CHILDREN(rack_probertt), 684 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 685 &rack_max_drain_wait, 2, 686 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 687 SYSCTL_ADD_U32(&rack_sysctl_ctx, 688 SYSCTL_CHILDREN(rack_probertt), 689 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 690 &rack_must_drain, 1, 691 "We must drain this many gp_srtt's waiting for flight to reach goal"); 692 SYSCTL_ADD_U32(&rack_sysctl_ctx, 693 SYSCTL_CHILDREN(rack_probertt), 694 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 695 &rack_probertt_use_min_rtt_entry, 1, 696 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 697 SYSCTL_ADD_U32(&rack_sysctl_ctx, 698 SYSCTL_CHILDREN(rack_probertt), 699 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 700 &rack_probertt_use_min_rtt_exit, 0, 701 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 702 SYSCTL_ADD_U32(&rack_sysctl_ctx, 703 SYSCTL_CHILDREN(rack_probertt), 704 OID_AUTO, "length_div", CTLFLAG_RW, 705 &rack_probertt_gpsrtt_cnt_div, 0, 706 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 707 SYSCTL_ADD_U32(&rack_sysctl_ctx, 708 SYSCTL_CHILDREN(rack_probertt), 709 OID_AUTO, "length_mul", CTLFLAG_RW, 710 &rack_probertt_gpsrtt_cnt_mul, 0, 711 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 712 SYSCTL_ADD_U32(&rack_sysctl_ctx, 713 
SYSCTL_CHILDREN(rack_probertt), 714 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 715 &rack_min_probertt_hold, 200000, 716 "What is the minimum time we hold probertt at target"); 717 SYSCTL_ADD_U32(&rack_sysctl_ctx, 718 SYSCTL_CHILDREN(rack_probertt), 719 OID_AUTO, "filter_life", CTLFLAG_RW, 720 &rack_probertt_filter_life, 10000000, 721 "What is the time for the filters life in useconds"); 722 SYSCTL_ADD_U32(&rack_sysctl_ctx, 723 SYSCTL_CHILDREN(rack_probertt), 724 OID_AUTO, "lower_within", CTLFLAG_RW, 725 &rack_probertt_lower_within, 10, 726 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 727 SYSCTL_ADD_U32(&rack_sysctl_ctx, 728 SYSCTL_CHILDREN(rack_probertt), 729 OID_AUTO, "must_move", CTLFLAG_RW, 730 &rack_min_rtt_movement, 250, 731 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 732 SYSCTL_ADD_U32(&rack_sysctl_ctx, 733 SYSCTL_CHILDREN(rack_probertt), 734 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 735 &rack_probertt_clear_is, 1, 736 "Do we clear I/S counts on exiting probe-rtt"); 737 SYSCTL_ADD_S32(&rack_sysctl_ctx, 738 SYSCTL_CHILDREN(rack_probertt), 739 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 740 &rack_max_drain_hbp, 1, 741 "How many extra drain gpsrtt's do we get in highly buffered paths"); 742 SYSCTL_ADD_S32(&rack_sysctl_ctx, 743 SYSCTL_CHILDREN(rack_probertt), 744 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 745 &rack_hbp_thresh, 3, 746 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 747 /* Pacing related sysctls */ 748 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 749 SYSCTL_CHILDREN(rack_sysctl_root), 750 OID_AUTO, 751 "pacing", 752 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 753 "Pacing related Controls"); 754 SYSCTL_ADD_S32(&rack_sysctl_ctx, 755 SYSCTL_CHILDREN(rack_pacing), 756 OID_AUTO, "max_pace_over", CTLFLAG_RW, 757 &rack_max_per_above, 30, 758 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 759 SYSCTL_ADD_S32(&rack_sysctl_ctx, 760 SYSCTL_CHILDREN(rack_pacing), 761 OID_AUTO, "pace_to_one", CTLFLAG_RW, 762 &rack_pace_one_seg, 0, 763 "Do we allow low b/w pacing of 1MSS instead of two"); 764 SYSCTL_ADD_S32(&rack_sysctl_ctx, 765 SYSCTL_CHILDREN(rack_pacing), 766 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 767 &rack_limit_time_with_srtt, 0, 768 "Do we limit pacing time based on srtt"); 769 SYSCTL_ADD_S32(&rack_sysctl_ctx, 770 SYSCTL_CHILDREN(rack_pacing), 771 OID_AUTO, "init_win", CTLFLAG_RW, 772 &rack_default_init_window, 0, 773 "Do we have a rack initial window 0 = system default"); 774 SYSCTL_ADD_U32(&rack_sysctl_ctx, 775 SYSCTL_CHILDREN(rack_pacing), 776 OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW, 777 &rack_hw_pace_adjust, 0, 778 "What percentage do we raise the MSS by (11 = 1.1%)"); 779 SYSCTL_ADD_U16(&rack_sysctl_ctx, 780 SYSCTL_CHILDREN(rack_pacing), 781 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 782 &rack_per_of_gp_ss, 250, 783 "If non zero, what percentage of goodput to pace at in slow start"); 784 SYSCTL_ADD_U16(&rack_sysctl_ctx, 785 SYSCTL_CHILDREN(rack_pacing), 786 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 787 &rack_per_of_gp_ca, 150, 788 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 789 SYSCTL_ADD_U16(&rack_sysctl_ctx, 790 SYSCTL_CHILDREN(rack_pacing), 791 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 792 &rack_per_of_gp_rec, 200, 793 "If non zero, what percentage of goodput to pace at in recovery"); 794 SYSCTL_ADD_S32(&rack_sysctl_ctx, 795 SYSCTL_CHILDREN(rack_pacing), 796 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 797 &rack_hptsi_segments, 40, 798 
"What size is the max for TSO segments in pacing and burst mitigation"); 799 SYSCTL_ADD_S32(&rack_sysctl_ctx, 800 SYSCTL_CHILDREN(rack_pacing), 801 OID_AUTO, "burst_reduces", CTLFLAG_RW, 802 &rack_slot_reduction, 4, 803 "When doing only burst mitigation what is the reduce divisor"); 804 SYSCTL_ADD_S32(&rack_sysctl_ctx, 805 SYSCTL_CHILDREN(rack_sysctl_root), 806 OID_AUTO, "use_pacing", CTLFLAG_RW, 807 &rack_pace_every_seg, 0, 808 "If set we use pacing, if clear we use only the original burst mitigation"); 809 810 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 811 SYSCTL_CHILDREN(rack_sysctl_root), 812 OID_AUTO, 813 "timely", 814 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 815 "Rack Timely RTT Controls"); 816 /* Timely based GP dynmics */ 817 SYSCTL_ADD_S32(&rack_sysctl_ctx, 818 SYSCTL_CHILDREN(rack_timely), 819 OID_AUTO, "upper", CTLFLAG_RW, 820 &rack_gp_per_bw_mul_up, 2, 821 "Rack timely upper range for equal b/w (in percentage)"); 822 SYSCTL_ADD_S32(&rack_sysctl_ctx, 823 SYSCTL_CHILDREN(rack_timely), 824 OID_AUTO, "lower", CTLFLAG_RW, 825 &rack_gp_per_bw_mul_down, 4, 826 "Rack timely lower range for equal b/w (in percentage)"); 827 SYSCTL_ADD_S32(&rack_sysctl_ctx, 828 SYSCTL_CHILDREN(rack_timely), 829 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 830 &rack_gp_rtt_maxmul, 3, 831 "Rack timely multipler of lowest rtt for rtt_max"); 832 SYSCTL_ADD_S32(&rack_sysctl_ctx, 833 SYSCTL_CHILDREN(rack_timely), 834 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 835 &rack_gp_rtt_mindiv, 4, 836 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 837 SYSCTL_ADD_S32(&rack_sysctl_ctx, 838 SYSCTL_CHILDREN(rack_timely), 839 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 840 &rack_gp_rtt_minmul, 1, 841 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); 842 SYSCTL_ADD_S32(&rack_sysctl_ctx, 843 SYSCTL_CHILDREN(rack_timely), 844 OID_AUTO, "decrease", CTLFLAG_RW, 845 &rack_gp_decrease_per, 20, 846 "Rack timely decrease percentage of our GP multiplication factor"); 847 SYSCTL_ADD_S32(&rack_sysctl_ctx, 848 SYSCTL_CHILDREN(rack_timely), 849 OID_AUTO, "increase", CTLFLAG_RW, 850 &rack_gp_increase_per, 2, 851 "Rack timely increase perentage of our GP multiplication factor"); 852 SYSCTL_ADD_S32(&rack_sysctl_ctx, 853 SYSCTL_CHILDREN(rack_timely), 854 OID_AUTO, "lowerbound", CTLFLAG_RW, 855 &rack_per_lower_bound, 50, 856 "Rack timely lowest percentage we allow GP multiplier to fall to"); 857 SYSCTL_ADD_S32(&rack_sysctl_ctx, 858 SYSCTL_CHILDREN(rack_timely), 859 OID_AUTO, "upperboundss", CTLFLAG_RW, 860 &rack_per_upper_bound_ss, 0, 861 "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 862 SYSCTL_ADD_S32(&rack_sysctl_ctx, 863 SYSCTL_CHILDREN(rack_timely), 864 OID_AUTO, "upperboundca", CTLFLAG_RW, 865 &rack_per_upper_bound_ca, 0, 866 "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 867 SYSCTL_ADD_S32(&rack_sysctl_ctx, 868 SYSCTL_CHILDREN(rack_timely), 869 OID_AUTO, "dynamicgp", CTLFLAG_RW, 870 &rack_do_dyn_mul, 0, 871 "Rack timely do we enable dynmaic timely goodput by default"); 872 SYSCTL_ADD_S32(&rack_sysctl_ctx, 873 SYSCTL_CHILDREN(rack_timely), 874 OID_AUTO, "no_rec_red", CTLFLAG_RW, 875 &rack_gp_no_rec_chg, 1, 876 "Rack timely do we prohibit the recovery multiplier from being lowered"); 877 SYSCTL_ADD_S32(&rack_sysctl_ctx, 878 SYSCTL_CHILDREN(rack_timely), 879 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 880 &rack_timely_dec_clear, 6, 881 "Rack timely what threshold do we count to before another boost during 
b/w decent"); 882 SYSCTL_ADD_S32(&rack_sysctl_ctx, 883 SYSCTL_CHILDREN(rack_timely), 884 OID_AUTO, "max_push_rise", CTLFLAG_RW, 885 &rack_timely_max_push_rise, 3, 886 "Rack timely how many times do we push up with b/w increase"); 887 SYSCTL_ADD_S32(&rack_sysctl_ctx, 888 SYSCTL_CHILDREN(rack_timely), 889 OID_AUTO, "max_push_drop", CTLFLAG_RW, 890 &rack_timely_max_push_drop, 3, 891 "Rack timely how many times do we push back on b/w decent"); 892 SYSCTL_ADD_S32(&rack_sysctl_ctx, 893 SYSCTL_CHILDREN(rack_timely), 894 OID_AUTO, "min_segs", CTLFLAG_RW, 895 &rack_timely_min_segs, 4, 896 "Rack timely when setting the cwnd what is the min num segments"); 897 SYSCTL_ADD_S32(&rack_sysctl_ctx, 898 SYSCTL_CHILDREN(rack_timely), 899 OID_AUTO, "noback_max", CTLFLAG_RW, 900 &rack_use_max_for_nobackoff, 0, 901 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 902 SYSCTL_ADD_S32(&rack_sysctl_ctx, 903 SYSCTL_CHILDREN(rack_timely), 904 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 905 &rack_timely_int_timely_only, 0, 906 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 907 SYSCTL_ADD_S32(&rack_sysctl_ctx, 908 SYSCTL_CHILDREN(rack_timely), 909 OID_AUTO, "nonstop", CTLFLAG_RW, 910 &rack_timely_no_stopping, 0, 911 "Rack timely don't stop increase"); 912 SYSCTL_ADD_S32(&rack_sysctl_ctx, 913 SYSCTL_CHILDREN(rack_timely), 914 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 915 &rack_down_raise_thresh, 100, 916 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 917 SYSCTL_ADD_S32(&rack_sysctl_ctx, 918 SYSCTL_CHILDREN(rack_timely), 919 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 920 &rack_req_segs, 1, 921 "Bottom dragging if not these many segments outstanding and room"); 922 923 /* TLP and Rack related parameters */ 924 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 925 SYSCTL_CHILDREN(rack_sysctl_root), 926 OID_AUTO, 927 "tlp", 928 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 929 "TLP and Rack related Controls"); 930 SYSCTL_ADD_S32(&rack_sysctl_ctx, 931 SYSCTL_CHILDREN(rack_tlp), 932 OID_AUTO, "use_rrr", CTLFLAG_RW, 933 &use_rack_rr, 1, 934 "Do we use Rack Rapid Recovery"); 935 SYSCTL_ADD_S32(&rack_sysctl_ctx, 936 SYSCTL_CHILDREN(rack_tlp), 937 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 938 &rack_non_rxt_use_cr, 0, 939 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 940 SYSCTL_ADD_S32(&rack_sysctl_ctx, 941 SYSCTL_CHILDREN(rack_tlp), 942 OID_AUTO, "tlpmethod", CTLFLAG_RW, 943 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 944 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 945 SYSCTL_ADD_S32(&rack_sysctl_ctx, 946 SYSCTL_CHILDREN(rack_tlp), 947 OID_AUTO, "limit", CTLFLAG_RW, 948 &rack_tlp_limit, 2, 949 "How many TLP's can be sent without sending new data"); 950 SYSCTL_ADD_S32(&rack_sysctl_ctx, 951 SYSCTL_CHILDREN(rack_tlp), 952 OID_AUTO, "use_greater", CTLFLAG_RW, 953 &rack_tlp_use_greater, 1, 954 "Should we use the rack_rtt time if its greater than srtt"); 955 SYSCTL_ADD_S32(&rack_sysctl_ctx, 956 SYSCTL_CHILDREN(rack_tlp), 957 OID_AUTO, "tlpminto", CTLFLAG_RW, 958 &rack_tlp_min, 10, 959 "TLP minimum timeout per the specification (10ms)"); 960 SYSCTL_ADD_S32(&rack_sysctl_ctx, 961 SYSCTL_CHILDREN(rack_tlp), 962 OID_AUTO, "send_oldest", CTLFLAG_RW, 963 &rack_always_send_oldest, 0, 964 "Should we always send the oldest TLP and RACK-TLP"); 965 SYSCTL_ADD_S32(&rack_sysctl_ctx, 966 SYSCTL_CHILDREN(rack_tlp), 967 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 968 &rack_limited_retran, 0, 969 
"How many times can a rack timeout drive out sends"); 970 SYSCTL_ADD_S32(&rack_sysctl_ctx, 971 SYSCTL_CHILDREN(rack_tlp), 972 OID_AUTO, "tlp_retry", CTLFLAG_RW, 973 &rack_tlp_max_resend, 2, 974 "How many times does TLP retry a single segment or multiple with no ACK"); 975 SYSCTL_ADD_S32(&rack_sysctl_ctx, 976 SYSCTL_CHILDREN(rack_tlp), 977 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 978 &rack_lower_cwnd_at_tlp, 0, 979 "When a TLP completes a retran should we enter recovery"); 980 SYSCTL_ADD_S32(&rack_sysctl_ctx, 981 SYSCTL_CHILDREN(rack_tlp), 982 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 983 &rack_reorder_thresh, 2, 984 "What factor for rack will be added when seeing reordering (shift right)"); 985 SYSCTL_ADD_S32(&rack_sysctl_ctx, 986 SYSCTL_CHILDREN(rack_tlp), 987 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 988 &rack_tlp_thresh, 1, 989 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 990 SYSCTL_ADD_S32(&rack_sysctl_ctx, 991 SYSCTL_CHILDREN(rack_tlp), 992 OID_AUTO, "reorder_fade", CTLFLAG_RW, 993 &rack_reorder_fade, 0, 994 "Does reorder detection fade, if so how many ms (0 means never)"); 995 SYSCTL_ADD_S32(&rack_sysctl_ctx, 996 SYSCTL_CHILDREN(rack_tlp), 997 OID_AUTO, "pktdelay", CTLFLAG_RW, 998 &rack_pkt_delay, 1, 999 "Extra RACK time (in ms) besides reordering thresh"); 1000 1001 /* Timer related controls */ 1002 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1003 SYSCTL_CHILDREN(rack_sysctl_root), 1004 OID_AUTO, 1005 "timers", 1006 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1007 "Timer related controls"); 1008 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1009 SYSCTL_CHILDREN(rack_timers), 1010 OID_AUTO, "persmin", CTLFLAG_RW, 1011 &rack_persist_min, 250, 1012 "What is the minimum time in milliseconds between persists"); 1013 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1014 SYSCTL_CHILDREN(rack_timers), 1015 OID_AUTO, "persmax", CTLFLAG_RW, 1016 &rack_persist_max, 2000, 1017 "What is the largest delay in milliseconds between persists"); 1018 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1019 SYSCTL_CHILDREN(rack_timers), 1020 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1021 &rack_delayed_ack_time, 200, 1022 "Delayed ack time (200ms)"); 1023 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1024 SYSCTL_CHILDREN(rack_timers), 1025 OID_AUTO, "minrto", CTLFLAG_RW, 1026 &rack_rto_min, 0, 1027 "Minimum RTO in ms -- set with caution below 1000 due to TLP"); 1028 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1029 SYSCTL_CHILDREN(rack_timers), 1030 OID_AUTO, "maxrto", CTLFLAG_RW, 1031 &rack_rto_max, 0, 1032 "Maxiumum RTO in ms -- should be at least as large as min_rto"); 1033 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1034 SYSCTL_CHILDREN(rack_timers), 1035 OID_AUTO, "minto", CTLFLAG_RW, 1036 &rack_min_to, 1, 1037 "Minimum rack timeout in milliseconds"); 1038 /* Measure controls */ 1039 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1040 SYSCTL_CHILDREN(rack_sysctl_root), 1041 OID_AUTO, 1042 "measure", 1043 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1044 "Measure related controls"); 1045 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1046 SYSCTL_CHILDREN(rack_measure), 1047 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1048 &rack_wma_divisor, 8, 1049 "When doing b/w calculation what is the divisor for the WMA"); 1050 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1051 SYSCTL_CHILDREN(rack_measure), 1052 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1053 &rack_cwnd_block_ends_measure, 0, 1054 "Does a cwnd just-return end the measurement window (app limited)"); 1055 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1056 SYSCTL_CHILDREN(rack_measure), 1057 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1058 &rack_rwnd_block_ends_measure, 0, 1059 "Does an rwnd 
just-return end the measurement window (app limited -- not persists)"); 1060 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1061 SYSCTL_CHILDREN(rack_measure), 1062 OID_AUTO, "min_target", CTLFLAG_RW, 1063 &rack_def_data_window, 20, 1064 "What is the minimum target window (in mss) for a GP measurements"); 1065 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1066 SYSCTL_CHILDREN(rack_measure), 1067 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1068 &rack_goal_bdp, 2, 1069 "What is the goal BDP to measure"); 1070 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1071 SYSCTL_CHILDREN(rack_measure), 1072 OID_AUTO, "min_srtts", CTLFLAG_RW, 1073 &rack_min_srtts, 1, 1074 "What is the goal BDP to measure"); 1075 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1076 SYSCTL_CHILDREN(rack_measure), 1077 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1078 &rack_min_measure_usec, 0, 1079 "What is the Minimum time time for a measurement if 0, this is off"); 1080 /* Misc rack controls */ 1081 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1082 SYSCTL_CHILDREN(rack_sysctl_root), 1083 OID_AUTO, 1084 "misc", 1085 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1086 "Misc related controls"); 1087 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1088 SYSCTL_CHILDREN(rack_misc), 1089 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1090 &rack_enable_shared_cwnd, 0, 1091 "Should RACK try to use the shared cwnd on connections where allowed"); 1092 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1093 SYSCTL_CHILDREN(rack_misc), 1094 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1095 &rack_limits_scwnd, 1, 1096 "Should RACK place low end time limits on the shared cwnd feature"); 1097 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1098 SYSCTL_CHILDREN(rack_misc), 1099 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1100 &rack_enable_mqueue_for_nonpaced, 0, 1101 "Should RACK use mbuf queuing for non-paced connections"); 1102 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1103 SYSCTL_CHILDREN(rack_misc), 1104 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1105 &rack_use_imac_dack, 0, 1106 "Should RACK try to emulate iMac delayed ack"); 1107 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1108 SYSCTL_CHILDREN(rack_misc), 1109 OID_AUTO, "no_prr", CTLFLAG_RW, 1110 &rack_disable_prr, 0, 1111 "Should RACK not use prr and only pace (must have pacing on)"); 1112 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1113 SYSCTL_CHILDREN(rack_misc), 1114 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1115 &rack_verbose_logging, 0, 1116 "Should RACK black box logging be verbose"); 1117 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1118 SYSCTL_CHILDREN(rack_misc), 1119 OID_AUTO, "data_after_close", CTLFLAG_RW, 1120 &rack_ignore_data_after_close, 1, 1121 "Do we hold off sending a RST until all pending data is ack'd"); 1122 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1123 SYSCTL_CHILDREN(rack_misc), 1124 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1125 &rack_sack_not_required, 0, 1126 "Do we allow rack to run on connections not supporting SACK"); 1127 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1128 SYSCTL_CHILDREN(rack_misc), 1129 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, 1130 &rack_use_proportional_reduce, 0, 1131 "Should we proportionaly reduce cwnd based on the number of losses "); 1132 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1133 SYSCTL_CHILDREN(rack_misc), 1134 OID_AUTO, "recovery_prop", CTLFLAG_RW, 1135 &rack_proportional_rate, 10, 1136 "What percent reduction per loss"); 1137 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1138 SYSCTL_CHILDREN(rack_misc), 1139 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1140 &rack_send_a_lot_in_prr, 1, 1141 "Send a lot in prr"); 1142 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1143 SYSCTL_CHILDREN(rack_misc), 1144 OID_AUTO, "earlyrecovery", CTLFLAG_RW, 1145 &rack_early_recovery, 1, 1146 "Do we 
do early recovery with rack"); 1147 /* Sack Attacker detection stuff */ 1148 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1149 SYSCTL_CHILDREN(rack_attack), 1150 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1151 &rack_highest_sack_thresh_seen, 0, 1152 "Highest sack to ack ratio seen"); 1153 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1154 SYSCTL_CHILDREN(rack_attack), 1155 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1156 &rack_highest_move_thresh_seen, 0, 1157 "Highest move to non-move ratio seen"); 1158 rack_ack_total = counter_u64_alloc(M_WAITOK); 1159 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1160 SYSCTL_CHILDREN(rack_attack), 1161 OID_AUTO, "acktotal", CTLFLAG_RD, 1162 &rack_ack_total, 1163 "Total number of Ack's"); 1164 rack_express_sack = counter_u64_alloc(M_WAITOK); 1165 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1166 SYSCTL_CHILDREN(rack_attack), 1167 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1168 &rack_express_sack, 1169 "Total expresss number of Sack's"); 1170 rack_sack_total = counter_u64_alloc(M_WAITOK); 1171 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1172 SYSCTL_CHILDREN(rack_attack), 1173 OID_AUTO, "sacktotal", CTLFLAG_RD, 1174 &rack_sack_total, 1175 "Total number of SACKs"); 1176 rack_move_none = counter_u64_alloc(M_WAITOK); 1177 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1178 SYSCTL_CHILDREN(rack_attack), 1179 OID_AUTO, "move_none", CTLFLAG_RD, 1180 &rack_move_none, 1181 "Total number of SACK index reuse of postions under threshold"); 1182 rack_move_some = counter_u64_alloc(M_WAITOK); 1183 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1184 SYSCTL_CHILDREN(rack_attack), 1185 OID_AUTO, "move_some", CTLFLAG_RD, 1186 &rack_move_some, 1187 "Total number of SACK index reuse of postions over threshold"); 1188 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1189 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1190 SYSCTL_CHILDREN(rack_attack), 1191 OID_AUTO, "attacks", CTLFLAG_RD, 1192 &rack_sack_attacks_detected, 1193 "Total number of SACK attackers that had sack disabled"); 1194 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1195 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1196 SYSCTL_CHILDREN(rack_attack), 1197 OID_AUTO, "reversed", CTLFLAG_RD, 1198 &rack_sack_attacks_reversed, 1199 "Total number of SACK attackers that were later determined false positive"); 1200 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1201 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1202 SYSCTL_CHILDREN(rack_attack), 1203 OID_AUTO, "nextmerge", CTLFLAG_RD, 1204 &rack_sack_used_next_merge, 1205 "Total number of times we used the next merge"); 1206 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1207 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1208 SYSCTL_CHILDREN(rack_attack), 1209 OID_AUTO, "prevmerge", CTLFLAG_RD, 1210 &rack_sack_used_prev_merge, 1211 "Total number of times we used the prev merge"); 1212 /* Counters */ 1213 rack_badfr = counter_u64_alloc(M_WAITOK); 1214 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1215 SYSCTL_CHILDREN(rack_counters), 1216 OID_AUTO, "badfr", CTLFLAG_RD, 1217 &rack_badfr, "Total number of bad FRs"); 1218 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 1219 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1220 SYSCTL_CHILDREN(rack_counters), 1221 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 1222 &rack_badfr_bytes, "Total number of bad FRs"); 1223 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 1224 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1225 SYSCTL_CHILDREN(rack_counters), 1226 OID_AUTO, "prrsndret", CTLFLAG_RD, 1227 &rack_rtm_prr_retran, 1228 "Total number of prr based retransmits"); 1229 
rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 1230 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1231 SYSCTL_CHILDREN(rack_counters), 1232 OID_AUTO, "prrsndnew", CTLFLAG_RD, 1233 &rack_rtm_prr_newdata, 1234 "Total number of prr based new transmits"); 1235 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 1236 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1237 SYSCTL_CHILDREN(rack_counters), 1238 OID_AUTO, "tsnf", CTLFLAG_RD, 1239 &rack_timestamp_mismatch, 1240 "Total number of timestamps that we could not find the reported ts"); 1241 rack_find_high = counter_u64_alloc(M_WAITOK); 1242 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1243 SYSCTL_CHILDREN(rack_counters), 1244 OID_AUTO, "findhigh", CTLFLAG_RD, 1245 &rack_find_high, 1246 "Total number of FIN causing find-high"); 1247 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 1248 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1249 SYSCTL_CHILDREN(rack_counters), 1250 OID_AUTO, "reordering", CTLFLAG_RD, 1251 &rack_reorder_seen, 1252 "Total number of times we added delay due to reordering"); 1253 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1254 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1255 SYSCTL_CHILDREN(rack_counters), 1256 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1257 &rack_tlp_tot, 1258 "Total number of tail loss probe expirations"); 1259 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1260 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1261 SYSCTL_CHILDREN(rack_counters), 1262 OID_AUTO, "tlp_new", CTLFLAG_RD, 1263 &rack_tlp_newdata, 1264 "Total number of tail loss probe sending new data"); 1265 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1266 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1267 SYSCTL_CHILDREN(rack_counters), 1268 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1269 &rack_tlp_retran, 1270 "Total number of tail loss probe sending retransmitted data"); 1271 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1272 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1273 SYSCTL_CHILDREN(rack_counters), 1274 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1275 &rack_tlp_retran_bytes, 1276 "Total bytes of tail loss probe sending retransmitted data"); 1277 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 1278 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1279 SYSCTL_CHILDREN(rack_counters), 1280 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 1281 &rack_tlp_retran_fail, 1282 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 1283 rack_to_tot = counter_u64_alloc(M_WAITOK); 1284 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1285 SYSCTL_CHILDREN(rack_counters), 1286 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1287 &rack_to_tot, 1288 "Total number of times the rack to expired"); 1289 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 1290 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1291 SYSCTL_CHILDREN(rack_counters), 1292 OID_AUTO, "arm_rack", CTLFLAG_RD, 1293 &rack_to_arm_rack, 1294 "Total number of times the rack timer armed"); 1295 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 1296 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1297 SYSCTL_CHILDREN(rack_counters), 1298 OID_AUTO, "arm_tlp", CTLFLAG_RD, 1299 &rack_to_arm_tlp, 1300 "Total number of times the tlp timer armed"); 1301 rack_calc_zero = counter_u64_alloc(M_WAITOK); 1302 rack_calc_nonzero = counter_u64_alloc(M_WAITOK); 1303 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1304 SYSCTL_CHILDREN(rack_counters), 1305 OID_AUTO, "calc_zero", CTLFLAG_RD, 1306 &rack_calc_zero, 1307 "Total number of times pacing time worked out to zero"); 1308 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1309 SYSCTL_CHILDREN(rack_counters), 1310 
OID_AUTO, "calc_nonzero", CTLFLAG_RD, 1311 &rack_calc_nonzero, 1312 "Total number of times pacing time worked out to non-zero"); 1313 rack_paced_segments = counter_u64_alloc(M_WAITOK); 1314 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1315 SYSCTL_CHILDREN(rack_counters), 1316 OID_AUTO, "paced", CTLFLAG_RD, 1317 &rack_paced_segments, 1318 "Total number of times a segment send caused hptsi"); 1319 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 1320 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1321 SYSCTL_CHILDREN(rack_counters), 1322 OID_AUTO, "unpaced", CTLFLAG_RD, 1323 &rack_unpaced_segments, 1324 "Total number of times a segment did not cause hptsi"); 1325 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1326 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1327 SYSCTL_CHILDREN(rack_counters), 1328 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1329 &rack_saw_enobuf, 1330 "Total number of times a send got ENOBUFS"); 1331 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1332 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1333 SYSCTL_CHILDREN(rack_counters), 1334 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1335 &rack_saw_enetunreach, 1336 "Total number of times a send saw an unreachable network or host"); 1337 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1338 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1339 SYSCTL_CHILDREN(rack_counters), 1340 OID_AUTO, "allocs", CTLFLAG_RD, 1341 &rack_to_alloc, 1342 "Total allocations of tracking structures"); 1343 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1344 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1345 SYSCTL_CHILDREN(rack_counters), 1346 OID_AUTO, "allochard", CTLFLAG_RD, 1347 &rack_to_alloc_hard, 1348 "Total allocations done with sleeping the hard way"); 1349 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1350 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1351 SYSCTL_CHILDREN(rack_counters), 1352 OID_AUTO, "allocemerg", CTLFLAG_RD, 1353 &rack_to_alloc_emerg, 1354 "Total allocations done from emergency cache"); 1355 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1356 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1357 SYSCTL_CHILDREN(rack_counters), 1358 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1359 &rack_to_alloc_limited, 1360 "Total allocations dropped due to limit"); 1361 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1362 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1363 SYSCTL_CHILDREN(rack_counters), 1364 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1365 &rack_alloc_limited_conns, 1366 "Connections with allocations dropped due to limit"); 1367 rack_split_limited = counter_u64_alloc(M_WAITOK); 1368 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1369 SYSCTL_CHILDREN(rack_counters), 1370 OID_AUTO, "split_limited", CTLFLAG_RD, 1371 &rack_split_limited, 1372 "Split allocations dropped due to limit"); 1373 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1374 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1375 SYSCTL_CHILDREN(rack_counters), 1376 OID_AUTO, "sack_long", CTLFLAG_RD, 1377 &rack_sack_proc_all, 1378 "Total times we had to walk whole list for sack processing"); 1379 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1380 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1381 SYSCTL_CHILDREN(rack_counters), 1382 OID_AUTO, "sack_restart", CTLFLAG_RD, 1383 &rack_sack_proc_restart, 1384 "Total times we had to walk whole list due to a restart"); 1385 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1386 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1387 SYSCTL_CHILDREN(rack_counters), 1388 OID_AUTO, "sack_short", CTLFLAG_RD, 1389 &rack_sack_proc_short, 1390 "Total times we took
shortcut for sack processing"); 1391 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 1392 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1393 SYSCTL_CHILDREN(rack_counters), 1394 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 1395 &rack_enter_tlp_calc, 1396 "Total times we called calc-tlp"); 1397 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 1398 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1399 SYSCTL_CHILDREN(rack_counters), 1400 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 1401 &rack_used_tlpmethod, 1402 "Total number of times we hit TLP method 1"); 1403 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 1404 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1405 SYSCTL_CHILDREN(rack_counters), 1406 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 1407 &rack_used_tlpmethod2, 1408 "Total number of times we hit TLP method 2"); 1409 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1410 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1411 SYSCTL_CHILDREN(rack_attack), 1412 OID_AUTO, "skipacked", CTLFLAG_RD, 1413 &rack_sack_skipped_acked, 1414 "Total number of times we skipped previously sacked"); 1415 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1416 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1417 SYSCTL_CHILDREN(rack_attack), 1418 OID_AUTO, "ofsplit", CTLFLAG_RD, 1419 &rack_sack_splits, 1420 "Total number of times we did the old fashioned tree split"); 1421 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1422 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1423 SYSCTL_CHILDREN(rack_counters), 1424 OID_AUTO, "prog_drops", CTLFLAG_RD, 1425 &rack_progress_drops, 1426 "Total number of progress drops"); 1427 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1428 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1429 SYSCTL_CHILDREN(rack_counters), 1430 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1431 &rack_input_idle_reduces, 1432 "Total number of idle reductions on input"); 1433 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1434 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1435 SYSCTL_CHILDREN(rack_counters), 1436 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1437 &rack_collapsed_win, 1438 "Total number of collapsed windows"); 1439 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1440 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_counters), 1442 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1443 &rack_tlp_does_nada, 1444 "Total number of nada tlp calls"); 1445 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1446 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1447 SYSCTL_CHILDREN(rack_counters), 1448 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1449 &rack_try_scwnd, 1450 "Total number of scwnd attempts"); 1451 1452 rack_tls_rwnd = counter_u64_alloc(M_WAITOK); 1453 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1454 SYSCTL_CHILDREN(rack_counters), 1455 OID_AUTO, "tls_rwnd", CTLFLAG_RD, 1456 &rack_tls_rwnd, 1457 "Total hdwr tls rwnd limited"); 1458 rack_tls_cwnd = counter_u64_alloc(M_WAITOK); 1459 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1460 SYSCTL_CHILDREN(rack_counters), 1461 OID_AUTO, "tls_cwnd", CTLFLAG_RD, 1462 &rack_tls_cwnd, 1463 "Total hdwr tls cwnd limited"); 1464 rack_tls_app = counter_u64_alloc(M_WAITOK); 1465 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1466 SYSCTL_CHILDREN(rack_counters), 1467 OID_AUTO, "tls_app", CTLFLAG_RD, 1468 &rack_tls_app, 1469 "Total hdwr tls app limited"); 1470 rack_tls_other = counter_u64_alloc(M_WAITOK); 1471 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1472 SYSCTL_CHILDREN(rack_counters), 1473 OID_AUTO, "tls_other", CTLFLAG_RD, 1474 &rack_tls_other, 1475 "Total hdwr tls other limited"); 1476 rack_tls_filled =
counter_u64_alloc(M_WAITOK); 1477 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1478 SYSCTL_CHILDREN(rack_counters), 1479 OID_AUTO, "tls_filled", CTLFLAG_RD, 1480 &rack_tls_filled, 1481 "Total hdwr tls filled"); 1482 rack_tls_rxt = counter_u64_alloc(M_WAITOK); 1483 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1484 SYSCTL_CHILDREN(rack_counters), 1485 OID_AUTO, "tls_rxt", CTLFLAG_RD, 1486 &rack_tls_rxt, 1487 "Total hdwr rxt"); 1488 rack_tls_tlp = counter_u64_alloc(M_WAITOK); 1489 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1490 SYSCTL_CHILDREN(rack_counters), 1491 OID_AUTO, "tls_tlp", CTLFLAG_RD, 1492 &rack_tls_tlp, 1493 "Total hdwr tls tlp"); 1494 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1495 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1496 SYSCTL_CHILDREN(rack_counters), 1497 OID_AUTO, "timer_hole", CTLFLAG_RD, 1498 &rack_per_timer_hole, 1499 "Total persists start in timer hole"); 1500 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1501 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1502 OID_AUTO, "outsize", CTLFLAG_RD, 1503 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1504 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1505 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1506 OID_AUTO, "opts", CTLFLAG_RD, 1507 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1508 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1509 SYSCTL_CHILDREN(rack_sysctl_root), 1510 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1511 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1512 } 1513 1514 static __inline int 1515 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1516 { 1517 if (SEQ_GEQ(b->r_start, a->r_start) && 1518 SEQ_LT(b->r_start, a->r_end)) { 1519 /* 1520 * The entry b is within the 1521 * block a. i.e.: 1522 * a -- |-------------| 1523 * b -- |----| 1524 * <or> 1525 * b -- |------| 1526 * <or> 1527 * b -- |-----------| 1528 */ 1529 return (0); 1530 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1531 /* 1532 * b falls as either the next 1533 * sequence block after a so a 1534 * is said to be smaller than b. 1535 * i.e: 1536 * a -- |------| 1537 * b -- |--------| 1538 * or 1539 * b -- |-----| 1540 */ 1541 return (1); 1542 } 1543 /* 1544 * Whats left is where a is 1545 * larger than b. i.e: 1546 * a -- |-------| 1547 * b -- |---| 1548 * or even possibly 1549 * b -- |--------------| 1550 */ 1551 return (-1); 1552 } 1553 1554 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1555 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1556 1557 static uint32_t 1558 rc_init_window(struct tcp_rack *rack) 1559 { 1560 uint32_t win; 1561 1562 if (rack->rc_init_win == 0) { 1563 /* 1564 * Nothing set by the user, use the system stack 1565 * default. 
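 * (tcp_compute_initwnd() below derives that default from the MSS,
 * typically on the order of 10 segments.)  When rc_init_win has been
 * set, the window is simply that many fixed-size segments; e.g.
 * (illustrative numbers only) rc_init_win = 20 with a 1448-byte
 * segment works out to roughly 28960 bytes.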
1566 */ 1567 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1568 } 1569 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1570 return(win); 1571 } 1572 1573 static uint64_t 1574 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1575 { 1576 if (IN_RECOVERY(rack->rc_tp->t_flags)) 1577 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1578 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1579 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1580 else 1581 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1582 } 1583 1584 static uint64_t 1585 rack_get_bw(struct tcp_rack *rack) 1586 { 1587 if (rack->use_fixed_rate) { 1588 /* Return the fixed pacing rate */ 1589 return (rack_get_fixed_pacing_bw(rack)); 1590 } 1591 if (rack->r_ctl.gp_bw == 0) { 1592 /* 1593 * We have yet no b/w measurement, 1594 * if we have a user set initial bw 1595 * return it. If we don't have that and 1596 * we have an srtt, use the tcp IW (10) to 1597 * calculate a fictional b/w over the SRTT 1598 * which is more or less a guess. Note 1599 * we don't use our IW from rack on purpose 1600 * so if we have like IW=30, we are not 1601 * calculating a "huge" b/w. 1602 */ 1603 uint64_t bw, srtt; 1604 if (rack->r_ctl.init_rate) 1605 return (rack->r_ctl.init_rate); 1606 1607 /* Has the user set a max peak rate? */ 1608 #ifdef NETFLIX_PEAKRATE 1609 if (rack->rc_tp->t_maxpeakrate) 1610 return (rack->rc_tp->t_maxpeakrate); 1611 #endif 1612 /* Ok lets come up with the IW guess, if we have a srtt */ 1613 if (rack->rc_tp->t_srtt == 0) { 1614 /* 1615 * Go with old pacing method 1616 * i.e. burst mitigation only. 1617 */ 1618 return (0); 1619 } 1620 /* Ok lets get the initial TCP win (not racks) */ 1621 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1622 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 1623 bw *= (uint64_t)USECS_IN_SECOND; 1624 bw /= srtt; 1625 return (bw); 1626 } else { 1627 uint64_t bw; 1628 1629 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { 1630 /* Averaging is done, we can return the value */ 1631 bw = rack->r_ctl.gp_bw; 1632 } else { 1633 /* Still doing initial average must calculate */ 1634 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; 1635 } 1636 #ifdef NETFLIX_PEAKRATE 1637 if ((rack->rc_tp->t_maxpeakrate) && 1638 (bw > rack->rc_tp->t_maxpeakrate)) { 1639 /* The user has set a peak rate to pace at 1640 * don't allow us to pace faster than that. 1641 */ 1642 return (rack->rc_tp->t_maxpeakrate); 1643 } 1644 #endif 1645 return (bw); 1646 } 1647 } 1648 1649 static uint16_t 1650 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1651 { 1652 if (rack->use_fixed_rate) { 1653 return (100); 1654 } else if (rack->in_probe_rtt && (rsm == NULL)) 1655 return(rack->r_ctl.rack_per_of_gp_probertt); 1656 else if ((IN_RECOVERY(rack->rc_tp->t_flags) && 1657 rack->r_ctl.rack_per_of_gp_rec)) { 1658 if (rsm) { 1659 /* a retransmission always use the recovery rate */ 1660 return(rack->r_ctl.rack_per_of_gp_rec); 1661 } else if (rack->rack_rec_nonrxt_use_cr) { 1662 /* Directed to use the configured rate */ 1663 goto configured_rate; 1664 } else if (rack->rack_no_prr && 1665 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1666 /* No PRR, lets just use the b/w estimate only */ 1667 return(100); 1668 } else { 1669 /* 1670 * Here we may have a non-retransmit but we 1671 * have no overrides, so just use the recovery 1672 * rate (prr is in effect). 
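 * Note that whatever percentage we return here is applied by
 * rack_get_output_bw() as bw_est = bw * gain / 100, so 100 means
 * "pace at the measured b/w" and, e.g., 150 (an illustrative value)
 * would pace at one and a half times it.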
1673 */ 1674 return(rack->r_ctl.rack_per_of_gp_rec); 1675 } 1676 } 1677 configured_rate: 1678 /* For the configured rate we look at our cwnd vs the ssthresh */ 1679 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1680 return (rack->r_ctl.rack_per_of_gp_ss); 1681 else 1682 return(rack->r_ctl.rack_per_of_gp_ca); 1683 } 1684 1685 static uint64_t 1686 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) 1687 { 1688 /* 1689 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 1690 */ 1691 uint64_t bw_est; 1692 uint64_t gain; 1693 1694 gain = (uint64_t)rack_get_output_gain(rack, rsm); 1695 bw_est = bw * gain; 1696 bw_est /= (uint64_t)100; 1697 /* Never fall below the minimum (def 64kbps) */ 1698 if (bw_est < RACK_MIN_BW) 1699 bw_est = RACK_MIN_BW; 1700 return (bw_est); 1701 } 1702 1703 static void 1704 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 1705 { 1706 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1707 union tcp_log_stackspecific log; 1708 struct timeval tv; 1709 1710 if ((mod != 1) && (rack_verbose_logging == 0)) { 1711 /* 1712 * We get 3 values currently for mod 1713 * 1 - We are retransmitting and this tells the reason. 1714 * 2 - We are clearing a dup-ack count. 1715 * 3 - We are incrementing a dup-ack count. 1716 * 1717 * The clear/increment are only logged 1718 * if you have BBverbose on. 1719 */ 1720 return; 1721 } 1722 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1723 log.u_bbr.flex1 = tsused; 1724 log.u_bbr.flex2 = thresh; 1725 log.u_bbr.flex3 = rsm->r_flags; 1726 log.u_bbr.flex4 = rsm->r_dupack; 1727 log.u_bbr.flex5 = rsm->r_start; 1728 log.u_bbr.flex6 = rsm->r_end; 1729 log.u_bbr.flex8 = mod; 1730 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1731 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1732 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1733 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1734 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1735 &rack->rc_inp->inp_socket->so_rcv, 1736 &rack->rc_inp->inp_socket->so_snd, 1737 BBR_LOG_SETTINGS_CHG, 0, 1738 0, &log, false, &tv); 1739 } 1740 } 1741 1742 1743 1744 static void 1745 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 1746 { 1747 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1748 union tcp_log_stackspecific log; 1749 struct timeval tv; 1750 1751 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1752 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 1753 log.u_bbr.flex2 = to * 1000; 1754 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1755 log.u_bbr.flex4 = slot; 1756 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 1757 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1758 log.u_bbr.flex7 = rack->rc_in_persist; 1759 log.u_bbr.flex8 = which; 1760 if (rack->rack_no_prr) 1761 log.u_bbr.pkts_out = 0; 1762 else 1763 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1764 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1765 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1766 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1767 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1768 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1769 &rack->rc_inp->inp_socket->so_rcv, 1770 &rack->rc_inp->inp_socket->so_snd, 1771 BBR_LOG_TIMERSTAR, 0, 1772 0, &log, false, &tv); 1773 } 1774 } 1775 1776 static void 1777 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 1778 { 1779 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 
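/*
 * Build a BB-log record for this timer expiration; to_num (flex8)
 * identifies which timer fired and flex3 carries the length of the
 * rsm we intend to act on, if any.
 */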
1780 union tcp_log_stackspecific log; 1781 struct timeval tv; 1782 1783 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1784 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1785 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1786 log.u_bbr.flex8 = to_num; 1787 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 1788 log.u_bbr.flex2 = rack->rc_rack_rtt; 1789 if (rsm == NULL) 1790 log.u_bbr.flex3 = 0; 1791 else 1792 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 1793 if (rack->rack_no_prr) 1794 log.u_bbr.flex5 = 0; 1795 else 1796 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1797 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1798 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1799 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1800 &rack->rc_inp->inp_socket->so_rcv, 1801 &rack->rc_inp->inp_socket->so_snd, 1802 BBR_LOG_RTO, 0, 1803 0, &log, false, &tv); 1804 } 1805 } 1806 1807 static void 1808 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 1809 struct rack_sendmap *rsm, int conf) 1810 { 1811 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1812 union tcp_log_stackspecific log; 1813 struct timeval tv; 1814 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1815 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1816 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1817 log.u_bbr.flex1 = t; 1818 log.u_bbr.flex2 = len; 1819 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; 1820 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; 1821 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; 1822 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 1823 log.u_bbr.flex7 = conf; 1824 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; 1825 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 1826 if (rack->rack_no_prr) 1827 log.u_bbr.pkts_out = 0; 1828 else 1829 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1830 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1831 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; 1832 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 1833 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1834 if (rsm) { 1835 log.u_bbr.pkt_epoch = rsm->r_start; 1836 log.u_bbr.lost = rsm->r_end; 1837 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 1838 } else { 1839 1840 /* Its a SYN */ 1841 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 1842 log.u_bbr.lost = 0; 1843 log.u_bbr.cwnd_gain = 0; 1844 } 1845 /* Write out general bits of interest rrs here */ 1846 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 1847 log.u_bbr.use_lt_bw <<= 1; 1848 log.u_bbr.use_lt_bw |= rack->forced_ack; 1849 log.u_bbr.use_lt_bw <<= 1; 1850 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 1851 log.u_bbr.use_lt_bw <<= 1; 1852 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 1853 log.u_bbr.use_lt_bw <<= 1; 1854 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 1855 log.u_bbr.use_lt_bw <<= 1; 1856 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 1857 log.u_bbr.use_lt_bw <<= 1; 1858 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 1859 log.u_bbr.use_lt_bw <<= 1; 1860 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 1861 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 1862 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 1863 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 1864 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 1865 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 1866 TCP_LOG_EVENTP(tp, NULL, 1867 &rack->rc_inp->inp_socket->so_rcv, 1868 
&rack->rc_inp->inp_socket->so_snd, 1869 BBR_LOG_BBRRTT, 0, 1870 0, &log, false, &tv); 1871 } 1872 } 1873 1874 static void 1875 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 1876 { 1877 /* 1878 * Log the rtt sample we are 1879 * applying to the srtt algorithm in 1880 * useconds. 1881 */ 1882 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1883 union tcp_log_stackspecific log; 1884 struct timeval tv; 1885 1886 /* Convert our ms to a microsecond */ 1887 memset(&log, 0, sizeof(log)); 1888 log.u_bbr.flex1 = rtt * 1000; 1889 log.u_bbr.flex2 = rack->r_ctl.ack_count; 1890 log.u_bbr.flex3 = rack->r_ctl.sack_count; 1891 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 1892 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 1893 log.u_bbr.flex8 = rack->sack_attack_disable; 1894 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1895 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1896 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1897 &rack->rc_inp->inp_socket->so_rcv, 1898 &rack->rc_inp->inp_socket->so_snd, 1899 TCP_LOG_RTT, 0, 1900 0, &log, false, &tv); 1901 } 1902 } 1903 1904 1905 static inline void 1906 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 1907 { 1908 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 1909 union tcp_log_stackspecific log; 1910 struct timeval tv; 1911 1912 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1913 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1914 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1915 log.u_bbr.flex1 = line; 1916 log.u_bbr.flex2 = tick; 1917 log.u_bbr.flex3 = tp->t_maxunacktime; 1918 log.u_bbr.flex4 = tp->t_acktime; 1919 log.u_bbr.flex8 = event; 1920 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1921 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1922 TCP_LOG_EVENTP(tp, NULL, 1923 &rack->rc_inp->inp_socket->so_rcv, 1924 &rack->rc_inp->inp_socket->so_snd, 1925 BBR_LOG_PROGRESS, 0, 1926 0, &log, false, &tv); 1927 } 1928 } 1929 1930 static void 1931 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 1932 { 1933 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1934 union tcp_log_stackspecific log; 1935 1936 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1937 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1938 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1939 log.u_bbr.flex1 = slot; 1940 if (rack->rack_no_prr) 1941 log.u_bbr.flex2 = 0; 1942 else 1943 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 1944 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 1945 log.u_bbr.flex8 = rack->rc_in_persist; 1946 log.u_bbr.timeStamp = cts; 1947 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1948 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1949 &rack->rc_inp->inp_socket->so_rcv, 1950 &rack->rc_inp->inp_socket->so_snd, 1951 BBR_LOG_BBRSND, 0, 1952 0, &log, false, tv); 1953 } 1954 } 1955 1956 static void 1957 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1958 { 1959 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1960 union tcp_log_stackspecific log; 1961 struct timeval tv; 1962 1963 memset(&log, 0, sizeof(log)); 1964 log.u_bbr.flex1 = did_out; 1965 log.u_bbr.flex2 = nxt_pkt; 1966 log.u_bbr.flex3 = way_out; 1967 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1968 if (rack->rack_no_prr) 1969 log.u_bbr.flex5 = 0; 1970 else 1971 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1972 log.u_bbr.applimited = 
rack->r_ctl.rc_pace_min_segs; 1973 log.u_bbr.flex7 = rack->r_wanted_output; 1974 log.u_bbr.flex8 = rack->rc_in_persist; 1975 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1976 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1977 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1978 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1979 &rack->rc_inp->inp_socket->so_rcv, 1980 &rack->rc_inp->inp_socket->so_snd, 1981 BBR_LOG_DOSEG_DONE, 0, 1982 0, &log, false, &tv); 1983 } 1984 } 1985 1986 static void 1987 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) 1988 { 1989 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1990 union tcp_log_stackspecific log; 1991 struct timeval tv; 1992 uint32_t cts; 1993 1994 memset(&log, 0, sizeof(log)); 1995 cts = tcp_get_usecs(&tv); 1996 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 1997 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 1998 log.u_bbr.flex4 = len; 1999 log.u_bbr.flex5 = orig_len; 2000 log.u_bbr.flex6 = rack->r_ctl.rc_sacked; 2001 log.u_bbr.flex7 = mod; 2002 log.u_bbr.flex8 = frm; 2003 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2004 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2005 TCP_LOG_EVENTP(tp, NULL, 2006 &tp->t_inpcb->inp_socket->so_rcv, 2007 &tp->t_inpcb->inp_socket->so_snd, 2008 TCP_HDWR_TLS, 0, 2009 0, &log, false, &tv); 2010 } 2011 } 2012 2013 static void 2014 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 2015 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 2016 { 2017 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2018 union tcp_log_stackspecific log; 2019 struct timeval tv; 2020 2021 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2022 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2023 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2024 log.u_bbr.flex1 = slot; 2025 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 2026 log.u_bbr.flex4 = reason; 2027 if (rack->rack_no_prr) 2028 log.u_bbr.flex5 = 0; 2029 else 2030 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2031 log.u_bbr.flex7 = hpts_calling; 2032 log.u_bbr.flex8 = rack->rc_in_persist; 2033 log.u_bbr.lt_epoch = cwnd_to_use; 2034 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2035 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2036 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2037 &rack->rc_inp->inp_socket->so_rcv, 2038 &rack->rc_inp->inp_socket->so_snd, 2039 BBR_LOG_JUSTRET, 0, 2040 tlen, &log, false, &tv); 2041 } 2042 } 2043 2044 static void 2045 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 2046 struct timeval *tv, uint32_t flags_on_entry) 2047 { 2048 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2049 union tcp_log_stackspecific log; 2050 2051 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2052 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2053 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2054 log.u_bbr.flex1 = line; 2055 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 2056 log.u_bbr.flex3 = flags_on_entry; 2057 log.u_bbr.flex4 = us_cts; 2058 if (rack->rack_no_prr) 2059 log.u_bbr.flex5 = 0; 2060 else 2061 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2062 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2063 log.u_bbr.flex7 = hpts_removed; 2064 log.u_bbr.flex8 = 1; 2065 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 2066 log.u_bbr.timeStamp = us_cts; 2067 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2068 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2069 
&rack->rc_inp->inp_socket->so_rcv, 2070 &rack->rc_inp->inp_socket->so_snd, 2071 BBR_LOG_TIMERCANC, 0, 2072 0, &log, false, tv); 2073 } 2074 } 2075 2076 static void 2077 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2078 uint32_t flex1, uint32_t flex2, 2079 uint32_t flex3, uint32_t flex4, 2080 uint32_t flex5, uint32_t flex6, 2081 uint16_t flex7, uint8_t mod) 2082 { 2083 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2084 union tcp_log_stackspecific log; 2085 struct timeval tv; 2086 2087 if (mod == 1) { 2088 /* No you can't use 1, its for the real to cancel */ 2089 return; 2090 } 2091 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2092 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2093 log.u_bbr.flex1 = flex1; 2094 log.u_bbr.flex2 = flex2; 2095 log.u_bbr.flex3 = flex3; 2096 log.u_bbr.flex4 = flex4; 2097 log.u_bbr.flex5 = flex5; 2098 log.u_bbr.flex6 = flex6; 2099 log.u_bbr.flex7 = flex7; 2100 log.u_bbr.flex8 = mod; 2101 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2102 &rack->rc_inp->inp_socket->so_rcv, 2103 &rack->rc_inp->inp_socket->so_snd, 2104 BBR_LOG_TIMERCANC, 0, 2105 0, &log, false, &tv); 2106 } 2107 } 2108 2109 static void 2110 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2111 { 2112 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2113 union tcp_log_stackspecific log; 2114 struct timeval tv; 2115 2116 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2117 log.u_bbr.flex1 = timers; 2118 log.u_bbr.flex2 = ret; 2119 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2120 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2121 log.u_bbr.flex5 = cts; 2122 if (rack->rack_no_prr) 2123 log.u_bbr.flex6 = 0; 2124 else 2125 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2126 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2127 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2128 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2129 &rack->rc_inp->inp_socket->so_rcv, 2130 &rack->rc_inp->inp_socket->so_snd, 2131 BBR_LOG_TO_PROCESS, 0, 2132 0, &log, false, &tv); 2133 } 2134 } 2135 2136 static void 2137 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2138 { 2139 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2140 union tcp_log_stackspecific log; 2141 struct timeval tv; 2142 2143 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2144 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2145 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2146 if (rack->rack_no_prr) 2147 log.u_bbr.flex3 = 0; 2148 else 2149 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2150 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2151 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2152 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2153 log.u_bbr.flex8 = frm; 2154 log.u_bbr.pkts_out = orig_cwnd; 2155 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2156 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2157 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2158 &rack->rc_inp->inp_socket->so_rcv, 2159 &rack->rc_inp->inp_socket->so_snd, 2160 BBR_LOG_BBRUPD, 0, 2161 0, &log, false, &tv); 2162 } 2163 } 2164 2165 #ifdef NETFLIX_EXP_DETECTION 2166 static void 2167 rack_log_sad(struct tcp_rack *rack, int event) 2168 { 2169 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2170 union tcp_log_stackspecific log; 2171 struct timeval tv; 2172 2173 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2174 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2175 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2176 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2177 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2178 log.u_bbr.flex5 = 
rack->r_ctl.rc_num_maps_alloced; 2179 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2180 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2181 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2182 log.u_bbr.lt_epoch |= rack->do_detection; 2183 log.u_bbr.applimited = tcp_map_minimum; 2184 log.u_bbr.flex7 = rack->sack_attack_disable; 2185 log.u_bbr.flex8 = event; 2186 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2187 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2188 log.u_bbr.delivered = tcp_sad_decay_val; 2189 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2190 &rack->rc_inp->inp_socket->so_rcv, 2191 &rack->rc_inp->inp_socket->so_snd, 2192 TCP_SAD_DETECTION, 0, 2193 0, &log, false, &tv); 2194 } 2195 } 2196 #endif 2197 2198 static void 2199 rack_counter_destroy(void) 2200 { 2201 counter_u64_free(rack_ack_total); 2202 counter_u64_free(rack_express_sack); 2203 counter_u64_free(rack_sack_total); 2204 counter_u64_free(rack_move_none); 2205 counter_u64_free(rack_move_some); 2206 counter_u64_free(rack_sack_attacks_detected); 2207 counter_u64_free(rack_sack_attacks_reversed); 2208 counter_u64_free(rack_sack_used_next_merge); 2209 counter_u64_free(rack_sack_used_prev_merge); 2210 counter_u64_free(rack_badfr); 2211 counter_u64_free(rack_badfr_bytes); 2212 counter_u64_free(rack_rtm_prr_retran); 2213 counter_u64_free(rack_rtm_prr_newdata); 2214 counter_u64_free(rack_timestamp_mismatch); 2215 counter_u64_free(rack_find_high); 2216 counter_u64_free(rack_reorder_seen); 2217 counter_u64_free(rack_tlp_tot); 2218 counter_u64_free(rack_tlp_newdata); 2219 counter_u64_free(rack_tlp_retran); 2220 counter_u64_free(rack_tlp_retran_bytes); 2221 counter_u64_free(rack_tlp_retran_fail); 2222 counter_u64_free(rack_to_tot); 2223 counter_u64_free(rack_to_arm_rack); 2224 counter_u64_free(rack_to_arm_tlp); 2225 counter_u64_free(rack_calc_zero); 2226 counter_u64_free(rack_calc_nonzero); 2227 counter_u64_free(rack_paced_segments); 2228 counter_u64_free(rack_unpaced_segments); 2229 counter_u64_free(rack_saw_enobuf); 2230 counter_u64_free(rack_saw_enetunreach); 2231 counter_u64_free(rack_to_alloc); 2232 counter_u64_free(rack_to_alloc_hard); 2233 counter_u64_free(rack_to_alloc_emerg); 2234 counter_u64_free(rack_to_alloc_limited); 2235 counter_u64_free(rack_alloc_limited_conns); 2236 counter_u64_free(rack_split_limited); 2237 counter_u64_free(rack_sack_proc_all); 2238 counter_u64_free(rack_sack_proc_restart); 2239 counter_u64_free(rack_sack_proc_short); 2240 counter_u64_free(rack_enter_tlp_calc); 2241 counter_u64_free(rack_used_tlpmethod); 2242 counter_u64_free(rack_used_tlpmethod2); 2243 counter_u64_free(rack_sack_skipped_acked); 2244 counter_u64_free(rack_sack_splits); 2245 counter_u64_free(rack_progress_drops); 2246 counter_u64_free(rack_input_idle_reduces); 2247 counter_u64_free(rack_collapsed_win); 2248 counter_u64_free(rack_tlp_does_nada); 2249 counter_u64_free(rack_try_scwnd); 2250 counter_u64_free(rack_tls_rwnd); 2251 counter_u64_free(rack_tls_cwnd); 2252 counter_u64_free(rack_tls_app); 2253 counter_u64_free(rack_tls_other); 2254 counter_u64_free(rack_tls_filled); 2255 counter_u64_free(rack_tls_rxt); 2256 counter_u64_free(rack_tls_tlp); 2257 counter_u64_free(rack_per_timer_hole); 2258 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2259 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2260 } 2261 2262 static struct rack_sendmap * 2263 rack_alloc(struct tcp_rack *rack) 2264 { 2265 struct rack_sendmap *rsm; 2266 2267 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2268 if (rsm) { 2269 
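/* Common case: the UMA zone handed us an entry; account for it. */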
rack->r_ctl.rc_num_maps_alloced++; 2270 counter_u64_add(rack_to_alloc, 1); 2271 return (rsm); 2272 } 2273 if (rack->rc_free_cnt) { 2274 counter_u64_add(rack_to_alloc_emerg, 1); 2275 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2276 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2277 rack->rc_free_cnt--; 2278 return (rsm); 2279 } 2280 return (NULL); 2281 } 2282 2283 static struct rack_sendmap * 2284 rack_alloc_full_limit(struct tcp_rack *rack) 2285 { 2286 if ((V_tcp_map_entries_limit > 0) && 2287 (rack->do_detection == 0) && 2288 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2289 counter_u64_add(rack_to_alloc_limited, 1); 2290 if (!rack->alloc_limit_reported) { 2291 rack->alloc_limit_reported = 1; 2292 counter_u64_add(rack_alloc_limited_conns, 1); 2293 } 2294 return (NULL); 2295 } 2296 return (rack_alloc(rack)); 2297 } 2298 2299 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2300 static struct rack_sendmap * 2301 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2302 { 2303 struct rack_sendmap *rsm; 2304 2305 if (limit_type) { 2306 /* currently there is only one limit type */ 2307 if (V_tcp_map_split_limit > 0 && 2308 (rack->do_detection == 0) && 2309 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2310 counter_u64_add(rack_split_limited, 1); 2311 if (!rack->alloc_limit_reported) { 2312 rack->alloc_limit_reported = 1; 2313 counter_u64_add(rack_alloc_limited_conns, 1); 2314 } 2315 return (NULL); 2316 } 2317 } 2318 2319 /* allocate and mark in the limit type, if set */ 2320 rsm = rack_alloc(rack); 2321 if (rsm != NULL && limit_type) { 2322 rsm->r_limit_type = limit_type; 2323 rack->r_ctl.rc_num_split_allocs++; 2324 } 2325 return (rsm); 2326 } 2327 2328 static void 2329 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2330 { 2331 if (rsm->r_flags & RACK_APP_LIMITED) { 2332 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2333 rack->r_ctl.rc_app_limited_cnt--; 2334 } 2335 } 2336 if (rsm->r_limit_type) { 2337 /* currently there is only one limit type */ 2338 rack->r_ctl.rc_num_split_allocs--; 2339 } 2340 if (rsm == rack->r_ctl.rc_first_appl) { 2341 if (rack->r_ctl.rc_app_limited_cnt == 0) 2342 rack->r_ctl.rc_first_appl = NULL; 2343 else { 2344 /* Follow the next one out */ 2345 struct rack_sendmap fe; 2346 2347 fe.r_start = rsm->r_nseq_appl; 2348 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2349 } 2350 } 2351 if (rsm == rack->r_ctl.rc_resend) 2352 rack->r_ctl.rc_resend = NULL; 2353 if (rsm == rack->r_ctl.rc_rsm_at_retran) 2354 rack->r_ctl.rc_rsm_at_retran = NULL; 2355 if (rsm == rack->r_ctl.rc_end_appl) 2356 rack->r_ctl.rc_end_appl = NULL; 2357 if (rack->r_ctl.rc_tlpsend == rsm) 2358 rack->r_ctl.rc_tlpsend = NULL; 2359 if (rack->r_ctl.rc_sacklast == rsm) 2360 rack->r_ctl.rc_sacklast = NULL; 2361 if (rack->rc_free_cnt < rack_free_cache) { 2362 memset(rsm, 0, sizeof(struct rack_sendmap)); 2363 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 2364 rsm->r_limit_type = 0; 2365 rack->rc_free_cnt++; 2366 return; 2367 } 2368 rack->r_ctl.rc_num_maps_alloced--; 2369 uma_zfree(rack_zone, rsm); 2370 } 2371 2372 static uint32_t 2373 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 2374 { 2375 uint64_t srtt, bw, len, tim; 2376 uint32_t segsiz, def_len, minl; 2377 2378 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2379 def_len = rack_def_data_window * segsiz; 2380 if (rack->rc_gp_filled == 0) { 2381 /* 2382 * We have no measurement (IW is in flight?) 
so 2383 * we can only guess using our data_window sysctl 2384 * value (usually 100MSS). 2385 */ 2386 return (def_len); 2387 } 2388 /* 2389 * Now we have a number of factors to consider. 2390 * 2391 * 1) We have a desired BDP which is usually 2392 * at least 2. 2393 * 2) We have a minimum number of rtt's usually 1 SRTT 2394 * but we allow it too to be more. 2395 * 3) We want to make sure a measurement last N useconds (if 2396 * we have set rack_min_measure_usec. 2397 * 2398 * We handle the first concern here by trying to create a data 2399 * window of max(rack_def_data_window, DesiredBDP). The 2400 * second concern we handle in not letting the measurement 2401 * window end normally until at least the required SRTT's 2402 * have gone by which is done further below in 2403 * rack_enough_for_measurement(). Finally the third concern 2404 * we also handle here by calculating how long that time 2405 * would take at the current BW and then return the 2406 * max of our first calculation and that length. Note 2407 * that if rack_min_measure_usec is 0, we don't deal 2408 * with concern 3. Also for both Concern 1 and 3 an 2409 * application limited period could end the measurement 2410 * earlier. 2411 * 2412 * So lets calculate the BDP with the "known" b/w using 2413 * the SRTT has our rtt and then multiply it by the 2414 * goal. 2415 */ 2416 bw = rack_get_bw(rack); 2417 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 2418 len = bw * srtt; 2419 len /= (uint64_t)HPTS_USEC_IN_SEC; 2420 len *= max(1, rack_goal_bdp); 2421 /* Now we need to round up to the nearest MSS */ 2422 len = roundup(len, segsiz); 2423 if (rack_min_measure_usec) { 2424 /* Now calculate our min length for this b/w */ 2425 tim = rack_min_measure_usec; 2426 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 2427 if (minl == 0) 2428 minl = 1; 2429 minl = roundup(minl, segsiz); 2430 if (len < minl) 2431 len = minl; 2432 } 2433 /* 2434 * Now if we have a very small window we want 2435 * to attempt to get the window that is 2436 * as small as possible. This happens on 2437 * low b/w connections and we don't want to 2438 * span huge numbers of rtt's between measurements. 2439 * 2440 * We basically include 2 over our "MIN window" so 2441 * that the measurement can be shortened (possibly) by 2442 * an ack'ed packet. 2443 */ 2444 if (len < def_len) 2445 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 2446 else 2447 return (max((uint32_t)len, def_len)); 2448 2449 } 2450 2451 static int 2452 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) 2453 { 2454 uint32_t tim, srtts, segsiz; 2455 2456 /* 2457 * Has enough time passed for the GP measurement to be valid? 2458 */ 2459 if ((tp->snd_max == tp->snd_una) || 2460 (th_ack == tp->snd_max)){ 2461 /* All is acked */ 2462 return (1); 2463 } 2464 if (SEQ_LT(th_ack, tp->gput_seq)) { 2465 /* Not enough bytes yet */ 2466 return (0); 2467 } 2468 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2469 if (SEQ_LT(th_ack, tp->gput_ack) && 2470 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 2471 /* Not enough bytes yet */ 2472 return (0); 2473 } 2474 if (rack->r_ctl.rc_first_appl && 2475 (rack->r_ctl.rc_first_appl->r_start == th_ack)) { 2476 /* 2477 * We are up to the app limited point 2478 * we have to measure irrespective of the time.. 2479 */ 2480 return (1); 2481 } 2482 /* Now what about time? 
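 * We require that at least rack_min_srtts worth of the current
 * goodput srtt (rc_gp_srtt) has elapsed since the measurement
 * started (gput_ts) before we declare it valid on time alone.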
*/ 2483 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 2484 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 2485 if (tim >= srtts) { 2486 return (1); 2487 } 2488 /* Nope not even a full SRTT has passed */ 2489 return (0); 2490 } 2491 2492 2493 static void 2494 rack_log_timely(struct tcp_rack *rack, 2495 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 2496 uint64_t up_bnd, int line, uint8_t method) 2497 { 2498 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2499 union tcp_log_stackspecific log; 2500 struct timeval tv; 2501 2502 memset(&log, 0, sizeof(log)); 2503 log.u_bbr.flex1 = logged; 2504 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 2505 log.u_bbr.flex2 <<= 4; 2506 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 2507 log.u_bbr.flex2 <<= 4; 2508 log.u_bbr.flex2 |= rack->rc_gp_incr; 2509 log.u_bbr.flex2 <<= 4; 2510 log.u_bbr.flex2 |= rack->rc_gp_bwred; 2511 log.u_bbr.flex3 = rack->rc_gp_incr; 2512 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2513 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 2514 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 2515 log.u_bbr.flex7 = rack->rc_gp_bwred; 2516 log.u_bbr.flex8 = method; 2517 log.u_bbr.cur_del_rate = cur_bw; 2518 log.u_bbr.delRate = low_bnd; 2519 log.u_bbr.bw_inuse = up_bnd; 2520 log.u_bbr.rttProp = rack_get_bw(rack); 2521 log.u_bbr.pkt_epoch = line; 2522 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2523 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2524 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2525 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2526 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2527 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 2528 log.u_bbr.cwnd_gain <<= 1; 2529 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 2530 log.u_bbr.cwnd_gain <<= 1; 2531 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 2532 log.u_bbr.cwnd_gain <<= 1; 2533 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 2534 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 2535 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2536 &rack->rc_inp->inp_socket->so_rcv, 2537 &rack->rc_inp->inp_socket->so_snd, 2538 TCP_TIMELY_WORK, 0, 2539 0, &log, false, &tv); 2540 } 2541 } 2542 2543 static int 2544 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 2545 { 2546 /* 2547 * Before we increase we need to know if 2548 * the estimate just made was less than 2549 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 2550 * 2551 * If we already are pacing at a fast enough 2552 * rate to push us faster there is no sense of 2553 * increasing. 2554 * 2555 * We first caculate our actual pacing rate (ss or ca multipler 2556 * times our cur_bw). 2557 * 2558 * Then we take the last measured rate and multipy by our 2559 * maximum pacing overage to give us a max allowable rate. 2560 * 2561 * If our act_rate is smaller than our max_allowable rate 2562 * then we should increase. Else we should hold steady. 2563 * 2564 */ 2565 uint64_t act_rate, max_allow_rate; 2566 2567 if (rack_timely_no_stopping) 2568 return (1); 2569 2570 if ((cur_bw == 0) || (last_bw_est == 0)) { 2571 /* 2572 * Initial startup case or 2573 * everything is acked case. 2574 */ 2575 rack_log_timely(rack, mult, cur_bw, 0, 0, 2576 __LINE__, 9); 2577 return (1); 2578 } 2579 if (mult <= 100) { 2580 /* 2581 * We can always pace at or slightly above our rate. 
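 * With a multiplier of 100 or less we are not trying to exceed the
 * measured rate, so there is never a reason to hold back an
 * increase here.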
2582 */ 2583 rack_log_timely(rack, mult, cur_bw, 0, 0, 2584 __LINE__, 9); 2585 return (1); 2586 } 2587 act_rate = cur_bw * (uint64_t)mult; 2588 act_rate /= 100; 2589 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 2590 max_allow_rate /= 100; 2591 if (act_rate < max_allow_rate) { 2592 /* 2593 * Here the rate we are actually pacing at 2594 * is less than 10% above our last measurement. 2595 * This means we are pacing below what we would 2596 * like to try to achieve (plus some wiggle room). 2597 */ 2598 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2599 __LINE__, 9); 2600 return (1); 2601 } else { 2602 /* 2603 * Here we are already pacing at least rack_max_per_above (default 10%) 2604 * above what we are getting back. This most likely indicates 2605 * that we are being limited (cwnd/rwnd/app) and can't 2606 * get any more b/w. There is no sense in trying to 2607 * raise the pacing rate; it is not speeding us up 2608 * and we are already pacing faster than we are getting back. 2609 */ 2610 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2611 __LINE__, 8); 2612 return (0); 2613 } 2614 } 2615 2616 static void 2617 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 2618 { 2619 /* 2620 * When we drag bottom, we want to ensure 2621 * that no multiplier is below 1.0 (100%); if one is, 2622 * we restore it to at least that. 2623 */ 2624 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 2625 /* This is unlikely; we usually do not touch the recovery multiplier */ 2626 rack->r_ctl.rack_per_of_gp_rec = 100; 2627 } 2628 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 2629 rack->r_ctl.rack_per_of_gp_ca = 100; 2630 } 2631 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 2632 rack->r_ctl.rack_per_of_gp_ss = 100; 2633 } 2634 } 2635 2636 static void 2637 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 2638 { 2639 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 2640 rack->r_ctl.rack_per_of_gp_ca = 100; 2641 } 2642 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 2643 rack->r_ctl.rack_per_of_gp_ss = 100; 2644 } 2645 } 2646 2647 static void 2648 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 2649 { 2650 int32_t calc, logged, plus; 2651 2652 logged = 0; 2653 2654 if (override) { 2655 /* 2656 * override is passed when we are 2657 * losing b/w and making one last 2658 * gasp at trying to not lose out 2659 * to a new-reno flow. 2660 */ 2661 goto extra_boost; 2662 } 2663 /* In classic timely we boost by 5x if we have 5 increases in a row, let's not */ 2664 if (rack->rc_gp_incr && 2665 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 2666 /* 2667 * Reset and get 5 strokes more before the boost. Note 2668 * that the count is 0-based so we have to add one.
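 * As a purely illustrative example: with rack_gp_increase_per = 2
 * and RACK_TIMELY_CNT_BOOST = 5, the boost path below would add 10
 * percentage points in one step instead of the usual 2.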
2669 */ 2670 extra_boost: 2671 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 2672 rack->rc_gp_timely_inc_cnt = 0; 2673 } else 2674 plus = (uint32_t)rack_gp_increase_per; 2675 /* Must be at least 1% increase for true timely increases */ 2676 if ((plus < 1) && 2677 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 2678 plus = 1; 2679 if (rack->rc_gp_saw_rec && 2680 (rack->rc_gp_no_rec_chg == 0) && 2681 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2682 rack->r_ctl.rack_per_of_gp_rec)) { 2683 /* We have been in recovery ding it too */ 2684 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 2685 if (calc > 0xffff) 2686 calc = 0xffff; 2687 logged |= 1; 2688 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 2689 if (rack_per_upper_bound_ss && 2690 (rack->rc_dragged_bottom == 0) && 2691 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 2692 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 2693 } 2694 if (rack->rc_gp_saw_ca && 2695 (rack->rc_gp_saw_ss == 0) && 2696 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2697 rack->r_ctl.rack_per_of_gp_ca)) { 2698 /* In CA */ 2699 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 2700 if (calc > 0xffff) 2701 calc = 0xffff; 2702 logged |= 2; 2703 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 2704 if (rack_per_upper_bound_ca && 2705 (rack->rc_dragged_bottom == 0) && 2706 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 2707 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 2708 } 2709 if (rack->rc_gp_saw_ss && 2710 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2711 rack->r_ctl.rack_per_of_gp_ss)) { 2712 /* In SS */ 2713 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 2714 if (calc > 0xffff) 2715 calc = 0xffff; 2716 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 2717 if (rack_per_upper_bound_ss && 2718 (rack->rc_dragged_bottom == 0) && 2719 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 2720 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 2721 logged |= 4; 2722 } 2723 if (logged && 2724 (rack->rc_gp_incr == 0)){ 2725 /* Go into increment mode */ 2726 rack->rc_gp_incr = 1; 2727 rack->rc_gp_timely_inc_cnt = 0; 2728 } 2729 if (rack->rc_gp_incr && 2730 logged && 2731 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 2732 rack->rc_gp_timely_inc_cnt++; 2733 } 2734 rack_log_timely(rack, logged, plus, 0, 0, 2735 __LINE__, 1); 2736 } 2737 2738 static uint32_t 2739 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 2740 { 2741 /* 2742 * norm_grad = rtt_diff / minrtt; 2743 * new_per = curper * (1 - B * norm_grad) 2744 * 2745 * B = rack_gp_decrease_per (default 10%) 2746 * rtt_dif = input var current rtt-diff 2747 * curper = input var current percentage 2748 * minrtt = from rack filter 2749 * 2750 */ 2751 uint64_t perf; 2752 2753 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2754 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 2755 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 2756 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 2757 (uint64_t)1000000)) / 2758 (uint64_t)1000000); 2759 if (perf > curper) { 2760 /* TSNH */ 2761 perf = curper - 1; 2762 } 2763 return ((uint32_t)perf); 2764 } 2765 2766 static uint32_t 2767 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 2768 { 2769 /* 2770 * highrttthresh 2771 * result = curper * (1 - (B * ( 1 - ------ )) 2772 * gp_srtt 2773 * 2774 * B = rack_gp_decrease_per (default 10%) 2775 * highrttthresh = filter_min * rack_gp_rtt_maxmul 2776 */ 2777 uint64_t perf; 2778 uint32_t 
highrttthresh; 2779 2780 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 2781 2782 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2783 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 2784 ((uint64_t)highrttthresh * (uint64_t)1000000) / 2785 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 2786 return (perf); 2787 } 2788 2789 2790 static void 2791 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 2792 { 2793 uint64_t logvar, logvar2, logvar3; 2794 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 2795 2796 if (rack->rc_gp_incr) { 2797 /* Turn off increment counting */ 2798 rack->rc_gp_incr = 0; 2799 rack->rc_gp_timely_inc_cnt = 0; 2800 } 2801 ss_red = ca_red = rec_red = 0; 2802 logged = 0; 2803 /* Calculate the reduction value */ 2804 if (rtt_diff < 0) { 2805 rtt_diff *= -1; 2806 } 2807 /* Must be at least 1% reduction */ 2808 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 2809 /* We have been in recovery ding it too */ 2810 if (timely_says == 2) { 2811 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 2812 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2813 if (alt < new_per) 2814 val = alt; 2815 else 2816 val = new_per; 2817 } else 2818 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2819 if (rack->r_ctl.rack_per_of_gp_rec > val) { 2820 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 2821 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 2822 } else { 2823 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2824 rec_red = 0; 2825 } 2826 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 2827 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2828 logged |= 1; 2829 } 2830 if (rack->rc_gp_saw_ss) { 2831 /* Sent in SS */ 2832 if (timely_says == 2) { 2833 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 2834 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2835 if (alt < new_per) 2836 val = alt; 2837 else 2838 val = new_per; 2839 } else 2840 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 2841 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 2842 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 2843 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 2844 } else { 2845 ss_red = new_per; 2846 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2847 logvar = new_per; 2848 logvar <<= 32; 2849 logvar |= alt; 2850 logvar2 = (uint32_t)rtt; 2851 logvar2 <<= 32; 2852 logvar2 |= (uint32_t)rtt_diff; 2853 logvar3 = rack_gp_rtt_maxmul; 2854 logvar3 <<= 32; 2855 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2856 rack_log_timely(rack, timely_says, 2857 logvar2, logvar3, 2858 logvar, __LINE__, 10); 2859 } 2860 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 2861 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2862 logged |= 4; 2863 } else if (rack->rc_gp_saw_ca) { 2864 /* Sent in CA */ 2865 if (timely_says == 2) { 2866 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 2867 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2868 if (alt < new_per) 2869 val = alt; 2870 else 2871 val = new_per; 2872 } else 2873 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 2874 if (rack->r_ctl.rack_per_of_gp_ca > val) { 2875 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 2876 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 2877 } else { 2878 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2879 ca_red = 0; 2880 logvar = new_per; 2881 logvar <<= 32; 2882 logvar |= alt; 2883 logvar2 = (uint32_t)rtt; 2884 logvar2 <<= 32; 2885 logvar2 |= (uint32_t)rtt_diff; 2886 logvar3 = rack_gp_rtt_maxmul; 2887 logvar3 <<= 32; 2888 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2889 rack_log_timely(rack, timely_says, 2890 logvar2, logvar3, 2891 logvar, __LINE__, 10); 2892 } 2893 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 2894 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2895 logged |= 2; 2896 } 2897 if (rack->rc_gp_timely_dec_cnt < 0x7) { 2898 rack->rc_gp_timely_dec_cnt++; 2899 if (rack_timely_dec_clear && 2900 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 2901 rack->rc_gp_timely_dec_cnt = 0; 2902 } 2903 logvar = ss_red; 2904 logvar <<= 32; 2905 logvar |= ca_red; 2906 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 2907 __LINE__, 2); 2908 } 2909 2910 static void 2911 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 2912 uint32_t rtt, uint32_t line, uint8_t reas) 2913 { 2914 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2915 union tcp_log_stackspecific log; 2916 struct timeval tv; 2917 2918 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2919 log.u_bbr.flex1 = line; 2920 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 2921 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 2922 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2923 log.u_bbr.flex5 = rtt; 2924 log.u_bbr.flex6 = rack->rc_highly_buffered; 2925 log.u_bbr.flex6 <<= 1; 2926 log.u_bbr.flex6 |= rack->forced_ack; 2927 log.u_bbr.flex6 <<= 1; 2928 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 2929 log.u_bbr.flex6 <<= 1; 2930 log.u_bbr.flex6 |= rack->in_probe_rtt; 2931 log.u_bbr.flex6 <<= 1; 2932 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 2933 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 2934 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 2935 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 2936 log.u_bbr.flex8 = reas; 2937 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2938 log.u_bbr.delRate = rack_get_bw(rack); 2939 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 2940 log.u_bbr.cur_del_rate <<= 32; 2941 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 2942 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 2943 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2944 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2945 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2946 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2947 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 2948 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 2949 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2950 log.u_bbr.rttProp = us_cts; 2951 log.u_bbr.rttProp <<= 32; 2952 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 2953 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2954 &rack->rc_inp->inp_socket->so_rcv, 2955 &rack->rc_inp->inp_socket->so_snd, 2956 BBR_LOG_RTT_SHRINKS, 0, 2957 0, &log, false, &rack->r_ctl.act_rcv_time); 2958 } 2959 } 2960 2961 static void 2962 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 2963 { 2964 uint64_t bwdp; 2965 2966 bwdp = rack_get_bw(rack); 2967 bwdp *= (uint64_t)rtt; 2968 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 2969 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 2970 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 2971 /* 2972 * A window protocol must be able to have 4 packets 2973 * outstanding as the floor in order to function 2974 * (especially considering delayed ack :D). 2975 */ 2976 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 2977 } 2978 } 2979 2980 static void 2981 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 2982 { 2983 /** 2984 * ProbeRTT is a bit different in rack_pacing than in 2985 * BBR. It is like BBR in that it uses the lowering of 2986 * the RTT as a signal that we saw something new and 2987 * counts from there for how long between. But it is 2988 * different in that its quite simple. It does not 2989 * play with the cwnd and wait until we get down 2990 * to N segments outstanding and hold that for 2991 * 200ms. Instead it just sets the pacing reduction 2992 * rate to a set percentage (70 by default) and hold 2993 * that for a number of recent GP Srtt's. 2994 */ 2995 uint32_t segsiz; 2996 2997 if (rack->rc_gp_dyn_mul == 0) 2998 return; 2999 3000 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 3001 /* We are idle */ 3002 return; 3003 } 3004 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3005 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3006 /* 3007 * Stop the goodput now, the idea here is 3008 * that future measurements with in_probe_rtt 3009 * won't register if they are not greater so 3010 * we want to get what info (if any) is available 3011 * now. 3012 */ 3013 rack_do_goodput_measurement(rack->rc_tp, rack, 3014 rack->rc_tp->snd_una, __LINE__); 3015 } 3016 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3017 rack->r_ctl.rc_time_probertt_entered = us_cts; 3018 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3019 rack->r_ctl.rc_pace_min_segs); 3020 rack->in_probe_rtt = 1; 3021 rack->measure_saw_probe_rtt = 1; 3022 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3023 rack->r_ctl.rc_time_probertt_starts = 0; 3024 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 3025 if (rack_probertt_use_min_rtt_entry) 3026 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3027 else 3028 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 3029 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3030 __LINE__, RACK_RTTS_ENTERPROBE); 3031 } 3032 3033 static void 3034 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 3035 { 3036 struct rack_sendmap *rsm; 3037 uint32_t segsiz; 3038 3039 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3040 rack->r_ctl.rc_pace_min_segs); 3041 rack->in_probe_rtt = 0; 3042 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3043 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3044 /* 3045 * Stop the goodput now, the idea here is 3046 * that future measurements with in_probe_rtt 3047 * won't register if they are not greater so 3048 * we want to get what info (if any) is available 3049 * now. 3050 */ 3051 rack_do_goodput_measurement(rack->rc_tp, rack, 3052 rack->rc_tp->snd_una, __LINE__); 3053 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 3054 /* 3055 * We don't have enough data to make a measurement. 3056 * So lets just stop and start here after exiting 3057 * probe-rtt. We probably are not interested in 3058 * the results anyway. 3059 */ 3060 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 3061 } 3062 /* 3063 * Measurements through the current snd_max are going 3064 * to be limited by the slower pacing rate. 3065 * 3066 * We need to mark these as app-limited so we 3067 * don't collapse the b/w. 
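 * Tagging the tail rsm below with RACK_APP_LIMITED (and linking it
 * onto the app-limited list) lets the goodput code treat samples
 * covering this data as limited by us, not by the path.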
3068 */ 3069 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3070 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 3071 if (rack->r_ctl.rc_app_limited_cnt == 0) 3072 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 3073 else { 3074 /* 3075 * Go out to the end app limited and mark 3076 * this new one as next and move the end_appl up 3077 * to this guy. 3078 */ 3079 if (rack->r_ctl.rc_end_appl) 3080 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3081 rack->r_ctl.rc_end_appl = rsm; 3082 } 3083 rsm->r_flags |= RACK_APP_LIMITED; 3084 rack->r_ctl.rc_app_limited_cnt++; 3085 } 3086 /* 3087 * Now, we need to examine our pacing rate multipliers. 3088 * If its under 100%, we need to kick it back up to 3089 * 100%. We also don't let it be over our "max" above 3090 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3091 * Note setting clamp_atexit_prtt to 0 has the effect 3092 * of setting CA/SS to 100% always at exit (which is 3093 * the default behavior). 3094 */ 3095 if (rack_probertt_clear_is) { 3096 rack->rc_gp_incr = 0; 3097 rack->rc_gp_bwred = 0; 3098 rack->rc_gp_timely_inc_cnt = 0; 3099 rack->rc_gp_timely_dec_cnt = 0; 3100 } 3101 /* Do we do any clamping at exit? */ 3102 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3103 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3104 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3105 } 3106 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3107 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3108 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3109 } 3110 /* 3111 * Lets set rtt_diff to 0, so that we will get a "boost" 3112 * after exiting. 3113 */ 3114 rack->r_ctl.rc_rtt_diff = 0; 3115 3116 /* Clear all flags so we start fresh */ 3117 rack->rc_tp->t_bytes_acked = 0; 3118 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3119 /* 3120 * If configured to, set the cwnd and ssthresh to 3121 * our targets. 3122 */ 3123 if (rack_probe_rtt_sets_cwnd) { 3124 uint64_t ebdp; 3125 uint32_t setto; 3126 3127 /* Set ssthresh so we get into CA once we hit our target */ 3128 if (rack_probertt_use_min_rtt_exit == 1) { 3129 /* Set to min rtt */ 3130 rack_set_prtt_target(rack, segsiz, 3131 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3132 } else if (rack_probertt_use_min_rtt_exit == 2) { 3133 /* Set to current gp rtt */ 3134 rack_set_prtt_target(rack, segsiz, 3135 rack->r_ctl.rc_gp_srtt); 3136 } else if (rack_probertt_use_min_rtt_exit == 3) { 3137 /* Set to entry gp rtt */ 3138 rack_set_prtt_target(rack, segsiz, 3139 rack->r_ctl.rc_entry_gp_rtt); 3140 } else { 3141 uint64_t sum; 3142 uint32_t setval; 3143 3144 sum = rack->r_ctl.rc_entry_gp_rtt; 3145 sum *= 10; 3146 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3147 if (sum >= 20) { 3148 /* 3149 * A highly buffered path needs 3150 * cwnd space for timely to work. 3151 * Lets set things up as if 3152 * we are heading back here again. 3153 */ 3154 setval = rack->r_ctl.rc_entry_gp_rtt; 3155 } else if (sum >= 15) { 3156 /* 3157 * Lets take the smaller of the 3158 * two since we are just somewhat 3159 * buffered. 3160 */ 3161 setval = rack->r_ctl.rc_gp_srtt; 3162 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3163 setval = rack->r_ctl.rc_entry_gp_rtt; 3164 } else { 3165 /* 3166 * Here we are not highly buffered 3167 * and should pick the min we can to 3168 * keep from causing loss. 
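 * As an illustration of the ratio test above (example values only):
 * the sum is (rc_entry_gp_rtt * 10) / rc_gp_srtt, so with an entry
 * rtt of 100ms a current gp_srtt of 40ms gives 25 (>= 20, highly
 * buffered: keep the entry rtt), 66ms gives 15 (somewhat buffered:
 * take the smaller of the two), and 90ms gives 11 (fall through to
 * here and use the min rtt).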
3169 */ 3170 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3171 } 3172 rack_set_prtt_target(rack, segsiz, 3173 setval); 3174 } 3175 if (rack_probe_rtt_sets_cwnd > 1) { 3176 /* There is a percentage here to boost */ 3177 ebdp = rack->r_ctl.rc_target_probertt_flight; 3178 ebdp *= rack_probe_rtt_sets_cwnd; 3179 ebdp /= 100; 3180 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3181 } else 3182 setto = rack->r_ctl.rc_target_probertt_flight; 3183 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3184 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3185 /* Enforce a min */ 3186 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3187 } 3188 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3189 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3190 } 3191 rack_log_rtt_shrinks(rack, us_cts, 3192 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3193 __LINE__, RACK_RTTS_EXITPROBE); 3194 /* Clear times last so log has all the info */ 3195 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3196 rack->r_ctl.rc_time_probertt_entered = us_cts; 3197 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3198 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3199 } 3200 3201 static void 3202 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3203 { 3204 /* Check in on probe-rtt */ 3205 if (rack->rc_gp_filled == 0) { 3206 /* We do not do p-rtt unless we have gp measurements */ 3207 return; 3208 } 3209 if (rack->in_probe_rtt) { 3210 uint64_t no_overflow; 3211 uint32_t endtime, must_stay; 3212 3213 if (rack->r_ctl.rc_went_idle_time && 3214 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3215 /* 3216 * We went idle during prtt, just exit now. 3217 */ 3218 rack_exit_probertt(rack, us_cts); 3219 } else if (rack_probe_rtt_safety_val && 3220 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3221 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3222 /* 3223 * Probe RTT safety value triggered! 3224 */ 3225 rack_log_rtt_shrinks(rack, us_cts, 3226 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3227 __LINE__, RACK_RTTS_SAFETY); 3228 rack_exit_probertt(rack, us_cts); 3229 } 3230 /* Calculate the max we will wait */ 3231 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3232 if (rack->rc_highly_buffered) 3233 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3234 /* Calculate the min we must wait */ 3235 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3236 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3237 TSTMP_LT(us_cts, endtime)) { 3238 uint32_t calc; 3239 /* Do we lower more? 
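 * (A sketch of the step-down below, in symbols rather than exact
 * defaults: calc = floor(time-in-probertt / gp_srtt), and the pacing
 * percentage becomes rack_per_of_gp_probertt - calc *
 * rack_per_of_gp_probertt_reduce, floored at rack_per_of_gp_lowthresh.
 * Each additional gp_srtt spent above the target flight therefore
 * lowers the probe-rtt pacing rate by one more reduction step.)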
*/ 3240 no_exit: 3241 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3242 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3243 else 3244 calc = 0; 3245 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3246 if (calc) { 3247 /* Maybe */ 3248 calc *= rack_per_of_gp_probertt_reduce; 3249 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3250 /* Limit it too */ 3251 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3252 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3253 } 3254 /* We must reach target or the time set */ 3255 return; 3256 } 3257 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3258 if ((TSTMP_LT(us_cts, must_stay) && 3259 rack->rc_highly_buffered) || 3260 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3261 rack->r_ctl.rc_target_probertt_flight)) { 3262 /* We are not past the must_stay time */ 3263 goto no_exit; 3264 } 3265 rack_log_rtt_shrinks(rack, us_cts, 3266 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3267 __LINE__, RACK_RTTS_REACHTARGET); 3268 rack->r_ctl.rc_time_probertt_starts = us_cts; 3269 if (rack->r_ctl.rc_time_probertt_starts == 0) 3270 rack->r_ctl.rc_time_probertt_starts = 1; 3271 /* Restore back to our rate we want to pace at in prtt */ 3272 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3273 } 3274 /* 3275 * Setup our end time, some number of gp_srtts plus 200ms. 3276 */ 3277 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3278 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3279 if (rack_probertt_gpsrtt_cnt_div) 3280 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3281 else 3282 endtime = 0; 3283 endtime += rack_min_probertt_hold; 3284 endtime += rack->r_ctl.rc_time_probertt_starts; 3285 if (TSTMP_GEQ(us_cts, endtime)) { 3286 /* yes, exit probertt */ 3287 rack_exit_probertt(rack, us_cts); 3288 } 3289 3290 } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3291 /* Go into probertt, its been too long since we went lower */ 3292 rack_enter_probertt(rack, us_cts); 3293 } 3294 } 3295 3296 static void 3297 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3298 uint32_t rtt, int32_t rtt_diff) 3299 { 3300 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3301 uint32_t losses; 3302 3303 if ((rack->rc_gp_dyn_mul == 0) || 3304 (rack->use_fixed_rate) || 3305 (rack->in_probe_rtt) || 3306 (rack->rc_always_pace == 0)) { 3307 /* No dynamic GP multipler in play */ 3308 return; 3309 } 3310 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3311 cur_bw = rack_get_bw(rack); 3312 /* Calculate our up and down range */ 3313 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3314 up_bnd /= 100; 3315 up_bnd += rack->r_ctl.last_gp_comp_bw; 3316 3317 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3318 subfr /= 100; 3319 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3320 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3321 /* 3322 * This is the case where our RTT is above 3323 * the max target and we have been configured 3324 * to just do timely no bonus up stuff in that case. 3325 * 3326 * There are two configurations, set to 1, and we 3327 * just do timely if we are over our max. If its 3328 * set above 1 then we slam the multipliers down 3329 * to 100 and then decrement per timely. 
3330 */ 3331 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3332 __LINE__, 3); 3333 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3334 rack_validate_multipliers_at_or_below_100(rack); 3335 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3336 } else if ((last_bw_est < low_bnd) && !losses) { 3337 /* 3338 * We are decreasing this is a bit complicated this 3339 * means we are loosing ground. This could be 3340 * because another flow entered and we are competing 3341 * for b/w with it. This will push the RTT up which 3342 * makes timely unusable unless we want to get shoved 3343 * into a corner and just be backed off (the age 3344 * old problem with delay based CC). 3345 * 3346 * On the other hand if it was a route change we 3347 * would like to stay somewhat contained and not 3348 * blow out the buffers. 3349 */ 3350 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3351 __LINE__, 3); 3352 rack->r_ctl.last_gp_comp_bw = cur_bw; 3353 if (rack->rc_gp_bwred == 0) { 3354 /* Go into reduction counting */ 3355 rack->rc_gp_bwred = 1; 3356 rack->rc_gp_timely_dec_cnt = 0; 3357 } 3358 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3359 (timely_says == 0)) { 3360 /* 3361 * Push another time with a faster pacing 3362 * to try to gain back (we include override to 3363 * get a full raise factor). 3364 */ 3365 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3366 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3367 (timely_says == 0) || 3368 (rack_down_raise_thresh == 0)) { 3369 /* 3370 * Do an override up in b/w if we were 3371 * below the threshold or if the threshold 3372 * is zero we always do the raise. 3373 */ 3374 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3375 } else { 3376 /* Log it stays the same */ 3377 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3378 __LINE__, 11); 3379 3380 } 3381 rack->rc_gp_timely_dec_cnt++; 3382 /* We are not incrementing really no-count */ 3383 rack->rc_gp_incr = 0; 3384 rack->rc_gp_timely_inc_cnt = 0; 3385 } else { 3386 /* 3387 * Lets just use the RTT 3388 * information and give up 3389 * pushing. 3390 */ 3391 goto use_timely; 3392 } 3393 } else if ((timely_says != 2) && 3394 !losses && 3395 (last_bw_est > up_bnd)) { 3396 /* 3397 * We are increasing b/w lets keep going, updating 3398 * our b/w and ignoring any timely input, unless 3399 * of course we are at our max raise (if there is one). 3400 */ 3401 3402 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3403 __LINE__, 3); 3404 rack->r_ctl.last_gp_comp_bw = cur_bw; 3405 if (rack->rc_gp_saw_ss && 3406 rack_per_upper_bound_ss && 3407 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3408 /* 3409 * In cases where we can't go higher 3410 * we should just use timely. 3411 */ 3412 goto use_timely; 3413 } 3414 if (rack->rc_gp_saw_ca && 3415 rack_per_upper_bound_ca && 3416 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3417 /* 3418 * In cases where we can't go higher 3419 * we should just use timely. 
3420 */
3421 goto use_timely;
3422 }
3423 rack->rc_gp_bwred = 0;
3424 rack->rc_gp_timely_dec_cnt = 0;
3425 /* You get a set number of pushes if timely is trying to reduce */
3426 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
3427 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3428 } else {
3429 /* Log it stays the same */
3430 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0,
3431 __LINE__, 12);
3432
3433 }
3434 return;
3435 } else {
3436 /*
3437 * We are staying between the lower and upper range bounds
3438 * so use timely to decide.
3439 */
3440 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
3441 __LINE__, 3);
3442 use_timely:
3443 if (timely_says) {
3444 rack->rc_gp_incr = 0;
3445 rack->rc_gp_timely_inc_cnt = 0;
3446 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
3447 !losses &&
3448 (last_bw_est < low_bnd)) {
3449 /* We are losing ground */
3450 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3451 rack->rc_gp_timely_dec_cnt++;
3452 /* We are not incrementing really no-count */
3453 rack->rc_gp_incr = 0;
3454 rack->rc_gp_timely_inc_cnt = 0;
3455 } else
3456 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
3457 } else {
3458 rack->rc_gp_bwred = 0;
3459 rack->rc_gp_timely_dec_cnt = 0;
3460 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3461 }
3462 }
3463 }
3464
3465 static int32_t
3466 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
3467 {
3468 int32_t timely_says;
3469 uint64_t log_mult, log_rtt_a_diff;
3470
3471 log_rtt_a_diff = rtt;
3472 log_rtt_a_diff <<= 32;
3473 log_rtt_a_diff |= (uint32_t)rtt_diff;
3474 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
3475 rack_gp_rtt_maxmul)) {
3476 /* Reduce the b/w multiplier */
3477 timely_says = 2;
3478 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3479 log_mult <<= 32;
3480 log_mult |= prev_rtt;
3481 rack_log_timely(rack, timely_says, log_mult,
3482 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3483 log_rtt_a_diff, __LINE__, 4);
3484 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
3485 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
3486 max(rack_gp_rtt_mindiv , 1)))) {
3487 /* Increase the b/w multiplier */
3488 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
3489 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
3490 max(rack_gp_rtt_mindiv , 1));
3491 log_mult <<= 32;
3492 log_mult |= prev_rtt;
3493 timely_says = 0;
3494 rack_log_timely(rack, timely_says, log_mult ,
3495 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3496 log_rtt_a_diff, __LINE__, 5);
3497 } else {
3498 /*
3499 * We use a gradient to decide; the timely gradient
3500 * is:
3501 * grad = rc_rtt_diff / min_rtt;
3502 *
3503 * Anything below or equal to 0 is
3504 * an increase indication. Anything above
3505 * zero is a decrease. Note we take care
3506 * of the actual gradient calculation
3507 * in the reduction (it's not needed for
3508 * the increase).
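 * For illustration (example numbers only): with a min_rtt of 20ms an
 * rc_rtt_diff of +2ms gives grad = 0.1, which is above zero and so a
 * reduce indication (timely_says 1); an rc_rtt_diff of 0 or -1ms is
 * an increase indication (timely_says 0).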
3509 */
3510 log_mult = prev_rtt;
3511 if (rtt_diff <= 0) {
3512 /*
3513 * Rttdiff is less than or equal to zero; increase the
3514 * b/w multiplier (it is 0 or negative).
3515 */
3516 timely_says = 0;
3517 rack_log_timely(rack, timely_says, log_mult,
3518 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
3519 } else {
3520 /* Reduce the b/w multiplier */
3521 timely_says = 1;
3522 rack_log_timely(rack, timely_says, log_mult,
3523 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
3524 }
3525 }
3526 return (timely_says);
3527 }
3528
3529 static void
3530 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
3531 tcp_seq th_ack, int line)
3532 {
3533 uint64_t tim, bytes_ps, ltim, stim, utim;
3534 uint32_t segsiz, bytes, reqbytes, us_cts;
3535 int32_t gput, new_rtt_diff, timely_says;
3536
3537 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time);
3538 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
3539 if (TSTMP_GEQ(us_cts, tp->gput_ts))
3540 tim = us_cts - tp->gput_ts;
3541 else
3542 tim = 0;
3543
3544 if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts))
3545 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
3546 else
3547 stim = 0;
3548 /*
3549 * Use the larger of the send time or ack time. This prevents us
3550 * from being influenced by ack artifacts and coming up with too
3551 * high a measurement. Note that since we are spanning many more
3552 * bytes in most of our measurements, hopefully that is less likely to
3553 * occur.
3554 */
3555 if (tim > stim)
3556 utim = max(tim, 1);
3557 else
3558 utim = max(stim, 1);
3559 /* Let's validate utim */
3560 ltim = max(1, (utim/HPTS_USEC_IN_MSEC));
3561 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim;
3562 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
3563 if ((tim == 0) && (stim == 0)) {
3564 /*
3565 * Invalid measurement time, maybe
3566 * all on one ack/one send?
3567 */
3568 bytes = 0;
3569 bytes_ps = 0;
3570 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3571 0, 0, 0, 10, __LINE__, NULL);
3572 goto skip_measurement;
3573 }
3574 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
3575 /* We never made a us_rtt measurement? */
3576 bytes = 0;
3577 bytes_ps = 0;
3578 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3579 0, 0, 0, 10, __LINE__, NULL);
3580 goto skip_measurement;
3581 }
3582 /*
3583 * Calculate the maximum possible b/w this connection
3584 * could have. We base our calculation on the lowest
3585 * rtt we have seen during the measurement and the
3586 * largest rwnd the client has given us in that time. This
3587 * forms a BDP that is the maximum that we could ever
3588 * get to the client. Anything larger is not valid.
3589 *
3590 * I originally had code here that rejected measurements
3591 * where the time was less than 1/2 the latest us_rtt.
3592 * But after thinking on that I realized it's wrong since,
3593 * say, you had a 150Mbps or even 1Gbps link, and you
3594 * were a long way away. For example, I am in Europe (100ms rtt)
3595 * talking to my 1Gbps link in S.C. Now measuring say 150,000
3596 * bytes, my time would be 1.2ms, and yet that check would say
3597 * the measurement was invalid because the time was < 50ms. The
3598 * same thing is true for 150Mbps (8ms of time).
3599 *
3600 * A better way I realized is to look at what the maximum
3601 * the connection could possibly do. This is gated on
3602 * the lowest RTT we have seen and the highest rwnd.
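 * For illustration (example numbers only): a highest rwnd of
 * 1,000,000 bytes and a lowest measured rtt of 10,000 usecs caps
 * last_max_bw at 1,000,000 * 1,000,000 / 10,000 = 100,000,000
 * bytes/sec (about 800 Mbps).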
3603 * We should in theory never exceed that, if we are 3604 * then something on the path is storing up packets 3605 * and then feeding them all at once to our endpoint 3606 * messing up our measurement. 3607 */ 3608 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 3609 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 3610 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 3611 if (SEQ_LT(th_ack, tp->gput_seq)) { 3612 /* No measurement can be made */ 3613 bytes = 0; 3614 bytes_ps = 0; 3615 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3616 0, 0, 0, 10, __LINE__, NULL); 3617 goto skip_measurement; 3618 } else 3619 bytes = (th_ack - tp->gput_seq); 3620 bytes_ps = (uint64_t)bytes; 3621 /* 3622 * Don't measure a b/w for pacing unless we have gotten at least 3623 * an initial windows worth of data in this measurement interval. 3624 * 3625 * Small numbers of bytes get badly influenced by delayed ack and 3626 * other artifacts. Note we take the initial window or our 3627 * defined minimum GP (defaulting to 10 which hopefully is the 3628 * IW). 3629 */ 3630 if (rack->rc_gp_filled == 0) { 3631 /* 3632 * The initial estimate is special. We 3633 * have blasted out an IW worth of packets 3634 * without a real valid ack ts results. We 3635 * then setup the app_limited_needs_set flag, 3636 * this should get the first ack in (probably 2 3637 * MSS worth) to be recorded as the timestamp. 3638 * We thus allow a smaller number of bytes i.e. 3639 * IW - 2MSS. 3640 */ 3641 reqbytes -= (2 * segsiz); 3642 /* Also lets fill previous for our first measurement to be neutral */ 3643 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3644 } 3645 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 3646 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3647 rack->r_ctl.rc_app_limited_cnt, 3648 0, 0, 10, __LINE__, NULL); 3649 goto skip_measurement; 3650 } 3651 /* 3652 * We now need to calculate the Timely like status so 3653 * we can update (possibly) the b/w multipliers. 3654 */ 3655 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 3656 if (rack->rc_gp_filled == 0) { 3657 /* No previous reading */ 3658 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 3659 } else { 3660 if (rack->measure_saw_probe_rtt == 0) { 3661 /* 3662 * We don't want a probertt to be counted 3663 * since it will be negative incorrectly. We 3664 * expect to be reducing the RTT when we 3665 * pace at a slower rate. 3666 */ 3667 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 3668 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 3669 } 3670 } 3671 timely_says = rack_make_timely_judgement(rack, 3672 rack->r_ctl.rc_gp_srtt, 3673 rack->r_ctl.rc_rtt_diff, 3674 rack->r_ctl.rc_prev_gp_srtt 3675 ); 3676 bytes_ps *= HPTS_USEC_IN_SEC; 3677 bytes_ps /= utim; 3678 if (bytes_ps > rack->r_ctl.last_max_bw) { 3679 /* 3680 * Something is on path playing 3681 * since this b/w is not possible based 3682 * on our BDP (highest rwnd and lowest rtt 3683 * we saw in the measurement window). 3684 * 3685 * Another option here would be to 3686 * instead skip the measurement. 
3687 */
3688 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
3689 bytes_ps, rack->r_ctl.last_max_bw, 0,
3690 11, __LINE__, NULL);
3691 bytes_ps = rack->r_ctl.last_max_bw;
3692 }
3693 /* We store gp for b/w in bytes per second */
3694 if (rack->rc_gp_filled == 0) {
3695 /* Initial measurement */
3696 if (bytes_ps) {
3697 rack->r_ctl.gp_bw = bytes_ps;
3698 rack->rc_gp_filled = 1;
3699 rack->r_ctl.num_avg = 1;
3700 rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
3701 } else {
3702 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3703 rack->r_ctl.rc_app_limited_cnt,
3704 0, 0, 10, __LINE__, NULL);
3705 }
3706 if (rack->rc_inp->inp_in_hpts &&
3707 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
3708 /*
3709 * Ok we can't trust the pacer in this case
3710 * where we transition from un-paced to paced.
3711 * Or for that matter when the burst mitigation
3712 * was making a wild guess and got it wrong.
3713 * Stop the pacer and clear up all the aggregate
3714 * delays etc.
3715 */
3716 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3717 rack->r_ctl.rc_hpts_flags = 0;
3718 rack->r_ctl.rc_last_output_to = 0;
3719 }
3720 } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) {
3721 /* Still a small number; run an average */
3722 rack->r_ctl.gp_bw += bytes_ps;
3723 rack->r_ctl.num_avg++;
3724 if (rack->r_ctl.num_avg >= RACK_REQ_AVG) {
3725 /* We have collected enough to move forward */
3726 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg;
3727 }
3728 } else {
3729 /*
3730 * We want to take a weighted share of the new goodput and add
3731 * it to the remainder of the old value, with the weight scaled
3732 * by the srtt. So if your measurement period is say 2 SRTT's
3733 * long you would get 1/4 as the weight; 1/2 an SRTT gives 1/16th.
3734 *
3735 * But we must be careful not to take too much i.e. if the
3736 * srtt is say 20ms and the measurement is taken over
3737 * 400ms our weight would be 400/20 i.e. 20. On the
3738 * other hand if we get a measurement over 1ms with a
3739 * 10ms rtt we only want to take a much smaller portion.
3740 */
3741 uint64_t resid_bw, subpart, addpart, srtt;
3742
3743 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
3744 if (srtt == 0) {
3745 /*
3746 * Strange, why did t_srtt go back to zero?
3747 */
3748 if (rack->r_ctl.rc_rack_min_rtt)
3749 srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC);
3750 else
3751 srtt = HPTS_USEC_IN_MSEC;
3752 }
3753 /*
3754 * XXXrrs: Note for reviewers, in playing with
3755 * dynamic pacing I discovered this GP calculation
3756 * as done originally leads to some undesired results.
3757 * Basically you can get longer measurements contributing
3758 * too much to the WMA. Thus I changed it so that if you are doing
3759 * dynamic adjustments we only do the apportioned adjustment
3760 * if we have a very small (time-wise) measurement. Longer
3761 * measurements just get their weight (defaulting to 1/8)
3762 * added to the WMA. We may want to think about changing
3763 * this to always do that for both sides i.e. dynamic
3764 * and non-dynamic... but considering lots of folks
3765 * were playing with this I did not want to change the
3766 * calculation per se without your thoughts... Lawerence?
3767 * Peter??
3768 */
3769 if (rack->rc_gp_dyn_mul == 0) {
3770 subpart = rack->r_ctl.gp_bw * utim;
3771 subpart /= (srtt * 8);
3772 if (subpart < (rack->r_ctl.gp_bw / 2)) {
3773 /*
3774 * The b/w update takes no more
3775 * away than 1/2 our running total
3776 * so factor it in.
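 * For illustration (example numbers only): with srtt = 10ms and a
 * measurement spanning utim = 20ms the weight is 20 / (10 * 8) = 1/4,
 * so the new gp_bw becomes 3/4 of the old estimate plus 1/4 of this
 * sample.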
3777 */ 3778 addpart = bytes_ps * utim; 3779 addpart /= (srtt * 8); 3780 } else { 3781 /* 3782 * Don't allow a single measurement 3783 * to account for more than 1/2 of the 3784 * WMA. This could happen on a retransmission 3785 * where utim becomes huge compared to 3786 * srtt (multiple retransmissions when using 3787 * the sending rate which factors in all the 3788 * transmissions from the first one). 3789 */ 3790 subpart = rack->r_ctl.gp_bw / 2; 3791 addpart = bytes_ps / 2; 3792 } 3793 resid_bw = rack->r_ctl.gp_bw - subpart; 3794 rack->r_ctl.gp_bw = resid_bw + addpart; 3795 } else { 3796 if ((utim / srtt) <= 1) { 3797 /* 3798 * The b/w update was over a small period 3799 * of time. The idea here is to prevent a small 3800 * measurement time period from counting 3801 * too much. So we scale it based on the 3802 * time so it attributes less than 1/rack_wma_divisor 3803 * of its measurement. 3804 */ 3805 subpart = rack->r_ctl.gp_bw * utim; 3806 subpart /= (srtt * rack_wma_divisor); 3807 addpart = bytes_ps * utim; 3808 addpart /= (srtt * rack_wma_divisor); 3809 } else { 3810 /* 3811 * The scaled measurement was long 3812 * enough so lets just add in the 3813 * portion of the measurment i.e. 1/rack_wma_divisor 3814 */ 3815 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 3816 addpart = bytes_ps / rack_wma_divisor; 3817 } 3818 if ((rack->measure_saw_probe_rtt == 0) || 3819 (bytes_ps > rack->r_ctl.gp_bw)) { 3820 /* 3821 * For probe-rtt we only add it in 3822 * if its larger, all others we just 3823 * add in. 3824 */ 3825 resid_bw = rack->r_ctl.gp_bw - subpart; 3826 rack->r_ctl.gp_bw = resid_bw + addpart; 3827 } 3828 } 3829 } 3830 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 3831 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 3832 rack_update_multiplier(rack, timely_says, bytes_ps, 3833 rack->r_ctl.rc_gp_srtt, 3834 rack->r_ctl.rc_rtt_diff); 3835 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 3836 rack_get_bw(rack), 3, line, NULL); 3837 /* reset the gp srtt and setup the new prev */ 3838 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3839 /* Record the lost count for the next measurement */ 3840 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 3841 /* 3842 * We restart our diffs based on the gpsrtt in the 3843 * measurement window. 3844 */ 3845 rack->rc_gp_rtt_set = 0; 3846 rack->rc_gp_saw_rec = 0; 3847 rack->rc_gp_saw_ca = 0; 3848 rack->rc_gp_saw_ss = 0; 3849 rack->rc_dragged_bottom = 0; 3850 skip_measurement: 3851 3852 #ifdef STATS 3853 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3854 gput); 3855 /* 3856 * XXXLAS: This is a temporary hack, and should be 3857 * chained off VOI_TCP_GPUT when stats(9) grows an 3858 * API to deal with chained VOIs. 3859 */ 3860 if (tp->t_stats_gput_prev > 0) 3861 stats_voi_update_abs_s32(tp->t_stats, 3862 VOI_TCP_GPUT_ND, 3863 ((gput - tp->t_stats_gput_prev) * 100) / 3864 tp->t_stats_gput_prev); 3865 #endif 3866 tp->t_flags &= ~TF_GPUTINPROG; 3867 tp->t_stats_gput_prev = gput; 3868 /* 3869 * Now are we app limited now and there is space from where we 3870 * were to where we want to go? 3871 * 3872 * We don't do the other case i.e. non-applimited here since 3873 * the next send will trigger us picking up the missing data. 
3874 */ 3875 if (rack->r_ctl.rc_first_appl && 3876 TCPS_HAVEESTABLISHED(tp->t_state) && 3877 rack->r_ctl.rc_app_limited_cnt && 3878 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 3879 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 3880 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3881 /* 3882 * Yep there is enough outstanding to make a measurement here. 3883 */ 3884 struct rack_sendmap *rsm, fe; 3885 3886 tp->t_flags |= TF_GPUTINPROG; 3887 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 3888 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 3889 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3890 rack->app_limited_needs_set = 0; 3891 tp->gput_seq = th_ack; 3892 if (rack->in_probe_rtt) 3893 rack->measure_saw_probe_rtt = 1; 3894 else if ((rack->measure_saw_probe_rtt) && 3895 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 3896 rack->measure_saw_probe_rtt = 0; 3897 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 3898 /* There is a full window to gain info from */ 3899 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 3900 } else { 3901 /* We can only measure up to the applimited point */ 3902 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 3903 } 3904 /* 3905 * Now we need to find the timestamp of the send at tp->gput_seq 3906 * for the send based measurement. 3907 */ 3908 fe.r_start = tp->gput_seq; 3909 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3910 if (rsm) { 3911 /* Ok send-based limit is set */ 3912 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 3913 /* 3914 * Move back to include the earlier part 3915 * so our ack time lines up right (this may 3916 * make an overlapping measurement but thats 3917 * ok). 3918 */ 3919 tp->gput_seq = rsm->r_start; 3920 } 3921 if (rsm->r_flags & RACK_ACKED) 3922 tp->gput_ts = rsm->r_ack_arrival; 3923 else 3924 rack->app_limited_needs_set = 1; 3925 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 3926 } else { 3927 /* 3928 * If we don't find the rsm due to some 3929 * send-limit set the current time, which 3930 * basically disables the send-limit. 
3931 */ 3932 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 3933 } 3934 rack_log_pacing_delay_calc(rack, 3935 tp->gput_seq, 3936 tp->gput_ack, 3937 (uint64_t)rsm, 3938 tp->gput_ts, 3939 rack->r_ctl.rc_app_limited_cnt, 3940 9, 3941 __LINE__, NULL); 3942 } 3943 } 3944 3945 /* 3946 * CC wrapper hook functions 3947 */ 3948 static void 3949 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 3950 uint16_t type, int32_t recovery) 3951 { 3952 INP_WLOCK_ASSERT(tp->t_inpcb); 3953 tp->ccv->nsegs = nsegs; 3954 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 3955 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 3956 uint32_t max; 3957 3958 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 3959 if (tp->ccv->bytes_this_ack > max) { 3960 tp->ccv->bytes_this_ack = max; 3961 } 3962 } 3963 if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) 3964 tp->ccv->flags |= CCF_CWND_LIMITED; 3965 else 3966 tp->ccv->flags &= ~CCF_CWND_LIMITED; 3967 #ifdef STATS 3968 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 3969 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 3970 #endif 3971 if ((tp->t_flags & TF_GPUTINPROG) && 3972 rack_enough_for_measurement(tp, rack, th->th_ack)) { 3973 /* Measure the Goodput */ 3974 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); 3975 #ifdef NETFLIX_PEAKRATE 3976 if ((type == CC_ACK) && 3977 (tp->t_maxpeakrate)) { 3978 /* 3979 * We update t_peakrate_thr. This gives us roughly 3980 * one update per round trip time. Note 3981 * it will only be used if pace_always is off i.e 3982 * we don't do this for paced flows. 3983 */ 3984 tcp_update_peakrate_thr(tp); 3985 } 3986 #endif 3987 } 3988 if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { 3989 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 3990 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 3991 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 3992 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 3993 tp->ccv->flags |= CCF_ABC_SENTAWND; 3994 } 3995 } else { 3996 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3997 tp->t_bytes_acked = 0; 3998 } 3999 if (CC_ALGO(tp)->ack_received != NULL) { 4000 /* XXXLAS: Find a way to live without this */ 4001 tp->ccv->curack = th->th_ack; 4002 CC_ALGO(tp)->ack_received(tp->ccv, type); 4003 } 4004 #ifdef STATS 4005 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 4006 #endif 4007 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 4008 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 4009 } 4010 #ifdef NETFLIX_PEAKRATE 4011 /* we enforce max peak rate if it is set and we are not pacing */ 4012 if ((rack->rc_always_pace == 0) && 4013 tp->t_peakrate_thr && 4014 (tp->snd_cwnd > tp->t_peakrate_thr)) { 4015 tp->snd_cwnd = tp->t_peakrate_thr; 4016 } 4017 #endif 4018 } 4019 4020 static void 4021 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 4022 { 4023 struct tcp_rack *rack; 4024 4025 rack = (struct tcp_rack *)tp->t_fb_ptr; 4026 INP_WLOCK_ASSERT(tp->t_inpcb); 4027 /* 4028 * If we are doing PRR and have enough 4029 * room to send <or> we are pacing and prr 4030 * is disabled we will want to see if we 4031 * can send data (by setting r_wanted_output to 4032 * true). 
4033 */ 4034 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 4035 rack->rack_no_prr) 4036 rack->r_wanted_output = 1; 4037 } 4038 4039 static void 4040 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 4041 { 4042 struct tcp_rack *rack; 4043 uint32_t orig_cwnd; 4044 4045 4046 orig_cwnd = tp->snd_cwnd; 4047 INP_WLOCK_ASSERT(tp->t_inpcb); 4048 rack = (struct tcp_rack *)tp->t_fb_ptr; 4049 if (rack->rc_not_backing_off == 0) { 4050 /* only alert CC if we alerted when we entered */ 4051 if (CC_ALGO(tp)->post_recovery != NULL) { 4052 tp->ccv->curack = th->th_ack; 4053 CC_ALGO(tp)->post_recovery(tp->ccv); 4054 } 4055 if (tp->snd_cwnd > tp->snd_ssthresh) { 4056 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 4057 tp->snd_cwnd = tp->snd_ssthresh; 4058 } 4059 } 4060 if ((rack->rack_no_prr == 0) && 4061 (rack->r_ctl.rc_prr_sndcnt > 0)) { 4062 /* Suck the next prr cnt back into cwnd */ 4063 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 4064 rack->r_ctl.rc_prr_sndcnt = 0; 4065 rack_log_to_prr(rack, 1, 0); 4066 } 4067 rack_log_to_prr(rack, 14, orig_cwnd); 4068 tp->snd_recover = tp->snd_una; 4069 EXIT_RECOVERY(tp->t_flags); 4070 } 4071 4072 static void 4073 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 4074 { 4075 struct tcp_rack *rack; 4076 4077 INP_WLOCK_ASSERT(tp->t_inpcb); 4078 4079 rack = (struct tcp_rack *)tp->t_fb_ptr; 4080 switch (type) { 4081 case CC_NDUPACK: 4082 tp->t_flags &= ~TF_WASFRECOVERY; 4083 tp->t_flags &= ~TF_WASCRECOVERY; 4084 if (!IN_FASTRECOVERY(tp->t_flags)) { 4085 rack->r_ctl.rc_prr_delivered = 0; 4086 rack->r_ctl.rc_prr_out = 0; 4087 if (rack->rack_no_prr == 0) { 4088 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4089 rack_log_to_prr(rack, 2, 0); 4090 } 4091 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4092 tp->snd_recover = tp->snd_max; 4093 if (tp->t_flags2 & TF2_ECN_PERMIT) 4094 tp->t_flags2 |= TF2_ECN_SND_CWR; 4095 } 4096 break; 4097 case CC_ECN: 4098 if (!IN_CONGRECOVERY(tp->t_flags) || 4099 /* 4100 * Allow ECN reaction on ACK to CWR, if 4101 * that data segment was also CE marked. 4102 */ 4103 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 4104 EXIT_CONGRECOVERY(tp->t_flags); 4105 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4106 tp->snd_recover = tp->snd_max + 1; 4107 if (tp->t_flags2 & TF2_ECN_PERMIT) 4108 tp->t_flags2 |= TF2_ECN_SND_CWR; 4109 } 4110 break; 4111 case CC_RTO: 4112 tp->t_dupacks = 0; 4113 tp->t_bytes_acked = 0; 4114 EXIT_RECOVERY(tp->t_flags); 4115 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4116 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4117 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4118 if (tp->t_flags2 & TF2_ECN_PERMIT) 4119 tp->t_flags2 |= TF2_ECN_SND_CWR; 4120 break; 4121 case CC_RTO_ERR: 4122 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4123 /* RTO was unnecessary, so reset everything. */ 4124 tp->snd_cwnd = tp->snd_cwnd_prev; 4125 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4126 tp->snd_recover = tp->snd_recover_prev; 4127 if (tp->t_flags & TF_WASFRECOVERY) { 4128 ENTER_FASTRECOVERY(tp->t_flags); 4129 tp->t_flags &= ~TF_WASFRECOVERY; 4130 } 4131 if (tp->t_flags & TF_WASCRECOVERY) { 4132 ENTER_CONGRECOVERY(tp->t_flags); 4133 tp->t_flags &= ~TF_WASCRECOVERY; 4134 } 4135 tp->snd_nxt = tp->snd_max; 4136 tp->t_badrxtwin = 0; 4137 break; 4138 } 4139 /* 4140 * If we are below our max rtt, don't 4141 * signal the CC control to change things. 4142 * instead set it up so that we are in 4143 * recovery but not going to back off. 
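 * Restating the check below: when the path is classified highly
 * buffered, the cutoff works out to rc_lowest_us_rtt *
 * rack_gp_rtt_maxmul if rack_use_max_for_nobackoff is set, otherwise
 * rc_lowest_us_rtt + (rc_lowest_us_rtt * rack_gp_rtt_minmul) /
 * rack_gp_rtt_mindiv. While rc_gp_srtt stays at or below that cutoff
 * we enter recovery but set rc_not_backing_off, so the CC module is
 * not asked to reduce.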
4144 */ 4145 4146 if (rack->rc_highly_buffered) { 4147 /* 4148 * Do we use the higher rtt for 4149 * our threshold to not backoff (like CDG)? 4150 */ 4151 uint32_t rtt_mul, rtt_div; 4152 4153 if (rack_use_max_for_nobackoff) { 4154 rtt_mul = (rack_gp_rtt_maxmul - 1); 4155 rtt_div = 1; 4156 } else { 4157 rtt_mul = rack_gp_rtt_minmul; 4158 rtt_div = max(rack_gp_rtt_mindiv , 1); 4159 } 4160 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + 4161 ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / 4162 rtt_div))) { 4163 /* below our min threshold */ 4164 rack->rc_not_backing_off = 1; 4165 ENTER_RECOVERY(rack->rc_tp->t_flags); 4166 rack_log_rtt_shrinks(rack, 0, 4167 rtt_mul, 4168 rtt_div, 4169 RACK_RTTS_NOBACKOFF); 4170 return; 4171 } 4172 } 4173 rack->rc_not_backing_off = 0; 4174 if (CC_ALGO(tp)->cong_signal != NULL) { 4175 if (th != NULL) 4176 tp->ccv->curack = th->th_ack; 4177 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4178 } 4179 } 4180 4181 4182 4183 static inline void 4184 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4185 { 4186 uint32_t i_cwnd; 4187 4188 INP_WLOCK_ASSERT(tp->t_inpcb); 4189 4190 #ifdef NETFLIX_STATS 4191 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4192 if (tp->t_state == TCPS_ESTABLISHED) 4193 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4194 #endif 4195 if (CC_ALGO(tp)->after_idle != NULL) 4196 CC_ALGO(tp)->after_idle(tp->ccv); 4197 4198 if (tp->snd_cwnd == 1) 4199 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4200 else 4201 i_cwnd = rc_init_window(rack); 4202 4203 /* 4204 * Being idle is no differnt than the initial window. If the cc 4205 * clamps it down below the initial window raise it to the initial 4206 * window. 4207 */ 4208 if (tp->snd_cwnd < i_cwnd) { 4209 tp->snd_cwnd = i_cwnd; 4210 } 4211 } 4212 4213 4214 /* 4215 * Indicate whether this ack should be delayed. We can delay the ack if 4216 * following conditions are met: 4217 * - There is no delayed ack timer in progress. 4218 * - Our last ack wasn't a 0-sized window. We never want to delay 4219 * the ack that opens up a 0-sized window. 4220 * - LRO wasn't used for this segment. We make sure by checking that the 4221 * segment size is not larger than the MSS. 4222 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4223 * connection. 4224 */ 4225 #define DELAY_ACK(tp, tlen) \ 4226 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4227 ((tp->t_flags & TF_DELACK) == 0) && \ 4228 (tlen <= tp->t_maxseg) && \ 4229 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4230 4231 static struct rack_sendmap * 4232 rack_find_lowest_rsm(struct tcp_rack *rack) 4233 { 4234 struct rack_sendmap *rsm; 4235 4236 /* 4237 * Walk the time-order transmitted list looking for an rsm that is 4238 * not acked. This will be the one that was sent the longest time 4239 * ago that is still outstanding. 4240 */ 4241 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4242 if (rsm->r_flags & RACK_ACKED) { 4243 continue; 4244 } 4245 goto finish; 4246 } 4247 finish: 4248 return (rsm); 4249 } 4250 4251 static struct rack_sendmap * 4252 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 4253 { 4254 struct rack_sendmap *prsm; 4255 4256 /* 4257 * Walk the sequence order list backward until we hit and arrive at 4258 * the highest seq not acked. In theory when this is called it 4259 * should be the last segment (which it was not). 
4260 */
4261 counter_u64_add(rack_find_high, 1);
4262 prsm = rsm;
4263 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
4264 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
4265 continue;
4266 }
4267 return (prsm);
4268 }
4269 return (NULL);
4270 }
4271
4272
4273 static uint32_t
4274 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
4275 {
4276 int32_t lro;
4277 uint32_t thresh;
4278
4279 /*
4280 * lro is the flag we use to determine if we have seen reordering.
4281 * If it gets set we have seen reordering. The reorder logic
4282 * works in one of two ways:
4283 *
4284 * If reorder-fade is configured, then we track the last time we saw
4285 * re-ordering occur. If we reach the point where enough time has
4286 * passed, we no longer consider reordering to be occurring.
4287 *
4288 * Or if reorder-fade is 0, then once we see reordering we consider
4289 * the connection to always be subject to reordering and just set lro
4290 * to 1.
4291 *
4292 * In the end if lro is non-zero we add the extra time for
4293 * reordering in.
4294 */
4295 if (srtt == 0)
4296 srtt = 1;
4297 if (rack->r_ctl.rc_reorder_ts) {
4298 if (rack->r_ctl.rc_reorder_fade) {
4299 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
4300 lro = cts - rack->r_ctl.rc_reorder_ts;
4301 if (lro == 0) {
4302 /*
4303 * No time has passed since the last
4304 * reorder, mark it as reordering.
4305 */
4306 lro = 1;
4307 }
4308 } else {
4309 /* Negative time? */
4310 lro = 0;
4311 }
4312 if (lro > rack->r_ctl.rc_reorder_fade) {
4313 /* Turn off reordering seen too */
4314 rack->r_ctl.rc_reorder_ts = 0;
4315 lro = 0;
4316 }
4317 } else {
4318 /* Reordering does not fade */
4319 lro = 1;
4320 }
4321 } else {
4322 lro = 0;
4323 }
4324 thresh = srtt + rack->r_ctl.rc_pkt_delay;
4325 if (lro) {
4326 /* If the reorder shift is not set you get 1/4 rtt */
4327 if (rack->r_ctl.rc_reorder_shift)
4328 thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
4329 else
4330 thresh += (srtt >> 2);
4331 } else {
4332 thresh += 1;
4333 }
4334 /* We don't let the rack timeout be above an RTO */
4335 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
4336 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
4337 }
4338 /* And we don't want it above the RTO max either */
4339 if (thresh > rack_rto_max) {
4340 thresh = rack_rto_max;
4341 }
4342 return (thresh);
4343 }
4344
4345 static uint32_t
4346 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
4347 struct rack_sendmap *rsm, uint32_t srtt)
4348 {
4349 struct rack_sendmap *prsm;
4350 uint32_t thresh, len;
4351 int segsiz;
4352
4353 if (srtt == 0)
4354 srtt = 1;
4355 if (rack->r_ctl.rc_tlp_threshold)
4356 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
4357 else
4358 thresh = (srtt * 2);
4359
4360 /* Get the previous sent packet, if any */
4361 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4362 counter_u64_add(rack_enter_tlp_calc, 1);
4363 len = rsm->r_end - rsm->r_start;
4364 if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
4365 /* Exactly like the ID */
4366 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
4367 uint32_t alt_thresh;
4368 /*
4369 * Compensate for delayed-ack with the d-ack time.
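 * For illustration (example values only, rack_delayed_ack_time is a
 * tunable): with srtt = 40ms and a delayed-ack allowance of 200ms the
 * alternate threshold is 40 + 20 + 200 = 260ms, and it replaces the
 * base TLP threshold only if it is larger.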
4370 */ 4371 counter_u64_add(rack_used_tlpmethod, 1); 4372 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4373 if (alt_thresh > thresh) 4374 thresh = alt_thresh; 4375 } 4376 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 4377 /* 2.1 behavior */ 4378 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 4379 if (prsm && (len <= segsiz)) { 4380 /* 4381 * Two packets outstanding, thresh should be (2*srtt) + 4382 * possible inter-packet delay (if any). 4383 */ 4384 uint32_t inter_gap = 0; 4385 int idx, nidx; 4386 4387 counter_u64_add(rack_used_tlpmethod, 1); 4388 idx = rsm->r_rtr_cnt - 1; 4389 nidx = prsm->r_rtr_cnt - 1; 4390 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4391 /* Yes it was sent later (or at the same time) */ 4392 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4393 } 4394 thresh += inter_gap; 4395 } else if (len <= segsiz) { 4396 /* 4397 * Possibly compensate for delayed-ack. 4398 */ 4399 uint32_t alt_thresh; 4400 4401 counter_u64_add(rack_used_tlpmethod2, 1); 4402 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4403 if (alt_thresh > thresh) 4404 thresh = alt_thresh; 4405 } 4406 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 4407 /* 2.2 behavior */ 4408 if (len <= segsiz) { 4409 uint32_t alt_thresh; 4410 /* 4411 * Compensate for delayed-ack with the d-ack time. 4412 */ 4413 counter_u64_add(rack_used_tlpmethod, 1); 4414 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4415 if (alt_thresh > thresh) 4416 thresh = alt_thresh; 4417 } 4418 } 4419 /* Not above an RTO */ 4420 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 4421 thresh = TICKS_2_MSEC(tp->t_rxtcur); 4422 } 4423 /* Not above a RTO max */ 4424 if (thresh > rack_rto_max) { 4425 thresh = rack_rto_max; 4426 } 4427 /* Apply user supplied min TLP */ 4428 if (thresh < rack_tlp_min) { 4429 thresh = rack_tlp_min; 4430 } 4431 return (thresh); 4432 } 4433 4434 static uint32_t 4435 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 4436 { 4437 /* 4438 * We want the rack_rtt which is the 4439 * last rtt we measured. However if that 4440 * does not exist we fallback to the srtt (which 4441 * we probably will never do) and then as a last 4442 * resort we use RACK_INITIAL_RTO if no srtt is 4443 * yet set. 4444 */ 4445 if (rack->rc_rack_rtt) 4446 return(rack->rc_rack_rtt); 4447 else if (tp->t_srtt == 0) 4448 return(RACK_INITIAL_RTO); 4449 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 4450 } 4451 4452 static struct rack_sendmap * 4453 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 4454 { 4455 /* 4456 * Check to see that we don't need to fall into recovery. We will 4457 * need to do so if our oldest transmit is past the time we should 4458 * have had an ack. 
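 * Concretely, the check below takes the oldest outstanding rsm and
 * compares the time since its last (re)transmission against the rack
 * threshold (srtt plus the configured packet delay and any reorder
 * allowance); only when that has been exceeded do we signal
 * CC_NDUPACK and hand the rsm back for retransmission.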
4459 */ 4460 struct tcp_rack *rack; 4461 struct rack_sendmap *rsm; 4462 int32_t idx; 4463 uint32_t srtt, thresh; 4464 4465 rack = (struct tcp_rack *)tp->t_fb_ptr; 4466 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 4467 return (NULL); 4468 } 4469 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4470 if (rsm == NULL) 4471 return (NULL); 4472 4473 if (rsm->r_flags & RACK_ACKED) { 4474 rsm = rack_find_lowest_rsm(rack); 4475 if (rsm == NULL) 4476 return (NULL); 4477 } 4478 idx = rsm->r_rtr_cnt - 1; 4479 srtt = rack_grab_rtt(tp, rack); 4480 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 4481 if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { 4482 return (NULL); 4483 } 4484 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 4485 return (NULL); 4486 } 4487 /* Ok if we reach here we are over-due and this guy can be sent */ 4488 if (IN_RECOVERY(tp->t_flags) == 0) { 4489 /* 4490 * For the one that enters us into recovery record undo 4491 * info. 4492 */ 4493 rack->r_ctl.rc_rsm_start = rsm->r_start; 4494 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4495 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4496 } 4497 rack_cong_signal(tp, NULL, CC_NDUPACK); 4498 return (rsm); 4499 } 4500 4501 static uint32_t 4502 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 4503 { 4504 int32_t t; 4505 int32_t tt; 4506 uint32_t ret_val; 4507 4508 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 4509 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 4510 rack_persist_min, rack_persist_max); 4511 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 4512 tp->t_rxtshift++; 4513 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 4514 ret_val = (uint32_t)tt; 4515 return (ret_val); 4516 } 4517 4518 static uint32_t 4519 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 4520 { 4521 /* 4522 * Start the FR timer, we do this based on getting the first one in 4523 * the rc_tmap. Note that if its NULL we must stop the timer. in all 4524 * events we need to stop the running timer (if its running) before 4525 * starting the new one. 
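 * Roughly, the selection below is: with nothing on the send map (or
 * when sup_rack is set) arm the RXT timer; if the oldest outstanding
 * segment has been SACK-passed or has enough dup-acks, arm the RACK
 * timer; otherwise arm a TLP, falling back to RXT when a TLP is not
 * appropriate (FIN-only outstanding, too many TLPs in a row, or a TLP
 * time beyond the max RTO). Persist state is handled before any of
 * this.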
4526 */ 4527 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 4528 uint32_t srtt_cur; 4529 int32_t idx; 4530 int32_t is_tlp_timer = 0; 4531 struct rack_sendmap *rsm; 4532 4533 if (rack->t_timers_stopped) { 4534 /* All timers have been stopped none are to run */ 4535 return (0); 4536 } 4537 if (rack->rc_in_persist) { 4538 /* We can't start any timer in persists */ 4539 return (rack_get_persists_timer_val(tp, rack)); 4540 } 4541 rack->rc_on_min_to = 0; 4542 if ((tp->t_state < TCPS_ESTABLISHED) || 4543 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 4544 goto activate_rxt; 4545 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4546 if ((rsm == NULL) || sup_rack) { 4547 /* Nothing on the send map */ 4548 activate_rxt: 4549 time_since_sent = 0; 4550 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4551 if (rsm) { 4552 idx = rsm->r_rtr_cnt - 1; 4553 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4554 tstmp_touse = rsm->r_tim_lastsent[idx]; 4555 else 4556 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4557 if (TSTMP_GT(cts, tstmp_touse)) 4558 time_since_sent = cts - tstmp_touse; 4559 } 4560 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4561 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 4562 to = TICKS_2_MSEC(tp->t_rxtcur); 4563 if (to > time_since_sent) 4564 to -= time_since_sent; 4565 else 4566 to = rack->r_ctl.rc_min_to; 4567 if (to == 0) 4568 to = 1; 4569 return (to); 4570 } 4571 return (0); 4572 } 4573 if (rsm->r_flags & RACK_ACKED) { 4574 rsm = rack_find_lowest_rsm(rack); 4575 if (rsm == NULL) { 4576 /* No lowest? */ 4577 goto activate_rxt; 4578 } 4579 } 4580 if (rack->sack_attack_disable) { 4581 /* 4582 * We don't want to do 4583 * any TLP's if you are an attacker. 4584 * Though if you are doing what 4585 * is expected you may still have 4586 * SACK-PASSED marks. 4587 */ 4588 goto activate_rxt; 4589 } 4590 /* Convert from ms to usecs */ 4591 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 4592 if ((tp->t_flags & TF_SENTFIN) && 4593 ((tp->snd_max - tp->snd_una) == 1) && 4594 (rsm->r_flags & RACK_HAS_FIN)) { 4595 /* 4596 * We don't start a rack timer if all we have is a 4597 * FIN outstanding. 4598 */ 4599 goto activate_rxt; 4600 } 4601 if ((rack->use_rack_rr == 0) && 4602 (IN_RECOVERY(tp->t_flags)) && 4603 (rack->rack_no_prr == 0) && 4604 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 4605 /* 4606 * We are not cheating, in recovery and 4607 * not enough ack's to yet get our next 4608 * retransmission out. 4609 * 4610 * Note that classified attackers do not 4611 * get to use the rack-cheat. 4612 */ 4613 goto activate_tlp; 4614 } 4615 srtt = rack_grab_rtt(tp, rack); 4616 thresh = rack_calc_thresh_rack(rack, srtt, cts); 4617 idx = rsm->r_rtr_cnt - 1; 4618 exp = rsm->r_tim_lastsent[idx] + thresh; 4619 if (SEQ_GEQ(exp, cts)) { 4620 to = exp - cts; 4621 if (to < rack->r_ctl.rc_min_to) { 4622 to = rack->r_ctl.rc_min_to; 4623 if (rack->r_rr_config == 3) 4624 rack->rc_on_min_to = 1; 4625 } 4626 } else { 4627 to = rack->r_ctl.rc_min_to; 4628 if (rack->r_rr_config == 3) 4629 rack->rc_on_min_to = 1; 4630 } 4631 } else { 4632 /* Ok we need to do a TLP not RACK */ 4633 activate_tlp: 4634 if ((rack->rc_tlp_in_progress != 0) && 4635 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 4636 /* 4637 * The previous send was a TLP and we have sent 4638 * N TLP's without sending new data. 
4639 */ 4640 goto activate_rxt; 4641 } 4642 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 4643 if (rsm == NULL) { 4644 /* We found no rsm to TLP with. */ 4645 goto activate_rxt; 4646 } 4647 if (rsm->r_flags & RACK_HAS_FIN) { 4648 /* If its a FIN we dont do TLP */ 4649 rsm = NULL; 4650 goto activate_rxt; 4651 } 4652 idx = rsm->r_rtr_cnt - 1; 4653 time_since_sent = 0; 4654 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4655 tstmp_touse = rsm->r_tim_lastsent[idx]; 4656 else 4657 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4658 if (TSTMP_GT(cts, tstmp_touse)) 4659 time_since_sent = cts - tstmp_touse; 4660 is_tlp_timer = 1; 4661 if (tp->t_srtt) { 4662 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 4663 srtt = TICKS_2_MSEC(srtt_cur); 4664 } else 4665 srtt = RACK_INITIAL_RTO; 4666 /* 4667 * If the SRTT is not keeping up and the 4668 * rack RTT has spiked we want to use 4669 * the last RTT not the smoothed one. 4670 */ 4671 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) 4672 srtt = rack_grab_rtt(tp, rack); 4673 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 4674 if (thresh > time_since_sent) 4675 to = thresh - time_since_sent; 4676 else { 4677 to = rack->r_ctl.rc_min_to; 4678 rack_log_alt_to_to_cancel(rack, 4679 thresh, /* flex1 */ 4680 time_since_sent, /* flex2 */ 4681 tstmp_touse, /* flex3 */ 4682 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 4683 rsm->r_tim_lastsent[idx], 4684 srtt, 4685 idx, 99); 4686 } 4687 if (to > TCPTV_REXMTMAX) { 4688 /* 4689 * If the TLP time works out to larger than the max 4690 * RTO lets not do TLP.. just RTO. 4691 */ 4692 goto activate_rxt; 4693 } 4694 } 4695 if (is_tlp_timer == 0) { 4696 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 4697 } else { 4698 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 4699 } 4700 if (to == 0) 4701 to = 1; 4702 return (to); 4703 } 4704 4705 static void 4706 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4707 { 4708 if (rack->rc_in_persist == 0) { 4709 if (tp->t_flags & TF_GPUTINPROG) { 4710 /* 4711 * Stop the goodput now, the calling of the 4712 * measurement function clears the flag. 4713 */ 4714 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 4715 } 4716 #ifdef NETFLIX_SHARED_CWND 4717 if (rack->r_ctl.rc_scw) { 4718 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4719 rack->rack_scwnd_is_idle = 1; 4720 } 4721 #endif 4722 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 4723 if (rack->r_ctl.rc_went_idle_time == 0) 4724 rack->r_ctl.rc_went_idle_time = 1; 4725 rack_timer_cancel(tp, rack, cts, __LINE__); 4726 tp->t_rxtshift = 0; 4727 rack->rc_in_persist = 1; 4728 } 4729 } 4730 4731 static void 4732 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4733 { 4734 if (rack->rc_inp->inp_in_hpts) { 4735 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4736 rack->r_ctl.rc_hpts_flags = 0; 4737 } 4738 #ifdef NETFLIX_SHARED_CWND 4739 if (rack->r_ctl.rc_scw) { 4740 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4741 rack->rack_scwnd_is_idle = 0; 4742 } 4743 #endif 4744 if (rack->rc_gp_dyn_mul && 4745 (rack->use_fixed_rate == 0) && 4746 (rack->rc_always_pace)) { 4747 /* 4748 * Do we count this as if a probe-rtt just 4749 * finished? 
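 * The test below: we treat the idle period like a completed probe-rtt
 * if it lasted at least rack_min_probertt_hold plus the usual
 * gp_srtt-scaled hold (gp_srtt * rack_probertt_gpsrtt_cnt_mul /
 * rack_probertt_gpsrtt_cnt_div); if so we either refresh the
 * probe-rtt bookkeeping or, if we were actually in probe-rtt, exit it
 * now.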
4750 */ 4751 uint32_t time_idle, idle_min; 4752 4753 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 4754 idle_min = rack_min_probertt_hold; 4755 if (rack_probertt_gpsrtt_cnt_div) { 4756 uint64_t extra; 4757 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 4758 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 4759 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 4760 idle_min += (uint32_t)extra; 4761 } 4762 if (time_idle >= idle_min) { 4763 /* Yes, we count it as a probe-rtt. */ 4764 uint32_t us_cts; 4765 4766 us_cts = tcp_get_usecs(NULL); 4767 if (rack->in_probe_rtt == 0) { 4768 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4769 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 4770 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 4771 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 4772 } else { 4773 rack_exit_probertt(rack, us_cts); 4774 } 4775 } 4776 4777 } 4778 rack->rc_in_persist = 0; 4779 rack->r_ctl.rc_went_idle_time = 0; 4780 tp->t_rxtshift = 0; 4781 rack->r_ctl.rc_agg_delayed = 0; 4782 rack->r_early = 0; 4783 rack->r_late = 0; 4784 rack->r_ctl.rc_agg_early = 0; 4785 } 4786 4787 static void 4788 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 4789 struct hpts_diag *diag, struct timeval *tv) 4790 { 4791 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 4792 union tcp_log_stackspecific log; 4793 4794 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4795 log.u_bbr.flex1 = diag->p_nxt_slot; 4796 log.u_bbr.flex2 = diag->p_cur_slot; 4797 log.u_bbr.flex3 = diag->slot_req; 4798 log.u_bbr.flex4 = diag->inp_hptsslot; 4799 log.u_bbr.flex5 = diag->slot_remaining; 4800 log.u_bbr.flex6 = diag->need_new_to; 4801 log.u_bbr.flex7 = diag->p_hpts_active; 4802 log.u_bbr.flex8 = diag->p_on_min_sleep; 4803 /* Hijack other fields as needed */ 4804 log.u_bbr.epoch = diag->have_slept; 4805 log.u_bbr.lt_epoch = diag->yet_to_sleep; 4806 log.u_bbr.pkts_out = diag->co_ret; 4807 log.u_bbr.applimited = diag->hpts_sleep_time; 4808 log.u_bbr.delivered = diag->p_prev_slot; 4809 log.u_bbr.inflight = diag->p_runningtick; 4810 log.u_bbr.bw_inuse = diag->wheel_tick; 4811 log.u_bbr.rttProp = diag->wheel_cts; 4812 log.u_bbr.timeStamp = cts; 4813 log.u_bbr.delRate = diag->maxticks; 4814 log.u_bbr.cur_del_rate = diag->p_curtick; 4815 log.u_bbr.cur_del_rate <<= 32; 4816 log.u_bbr.cur_del_rate |= diag->p_lasttick; 4817 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4818 &rack->rc_inp->inp_socket->so_rcv, 4819 &rack->rc_inp->inp_socket->so_snd, 4820 BBR_LOG_HPTSDIAG, 0, 4821 0, &log, false, tv); 4822 } 4823 4824 } 4825 4826 static void 4827 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 4828 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 4829 { 4830 struct hpts_diag diag; 4831 struct inpcb *inp; 4832 struct timeval tv; 4833 uint32_t delayed_ack = 0; 4834 uint32_t hpts_timeout; 4835 uint8_t stopped; 4836 uint32_t left = 0; 4837 uint32_t us_cts; 4838 4839 inp = tp->t_inpcb; 4840 if ((tp->t_state == TCPS_CLOSED) || 4841 (tp->t_state == TCPS_LISTEN)) { 4842 return; 4843 } 4844 if (inp->inp_in_hpts) { 4845 /* Already on the pacer */ 4846 return; 4847 } 4848 stopped = rack->rc_tmr_stopped; 4849 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 4850 left = rack->r_ctl.rc_timer_exp - cts; 4851 } 4852 rack->r_ctl.rc_timer_exp = 0; 4853 rack->r_ctl.rc_hpts_flags = 0; 4854 us_cts = tcp_get_usecs(&tv); 4855 /* Now early/late accounting */ 4856 if (rack->r_early) { 4857 /* 4858 * We have a early carry over set, 
4859 * we can always add more time so we 4860 * can always make this compensation. 4861 */ 4862 slot += rack->r_ctl.rc_agg_early; 4863 rack->r_early = 0; 4864 rack->r_ctl.rc_agg_early = 0; 4865 } 4866 if (rack->r_late) { 4867 /* 4868 * This is harder, we can 4869 * compensate some but it 4870 * really depends on what 4871 * the current pacing time is. 4872 */ 4873 if (rack->r_ctl.rc_agg_delayed >= slot) { 4874 /* 4875 * We can't compensate for it all. 4876 * And we have to have some time 4877 * on the clock. We always have a min 4878 * 10 slots (10 x 10 i.e. 100 usecs). 4879 */ 4880 if (slot <= HPTS_TICKS_PER_USEC) { 4881 /* We gain delay */ 4882 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 4883 slot = HPTS_TICKS_PER_USEC; 4884 } else { 4885 /* We take off some */ 4886 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 4887 slot = HPTS_TICKS_PER_USEC; 4888 } 4889 } else { 4890 4891 slot -= rack->r_ctl.rc_agg_delayed; 4892 rack->r_ctl.rc_agg_delayed = 0; 4893 /* Make sure we have 100 useconds at minimum */ 4894 if (slot < HPTS_TICKS_PER_USEC) { 4895 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 4896 slot = HPTS_TICKS_PER_USEC; 4897 } 4898 if (rack->r_ctl.rc_agg_delayed == 0) 4899 rack->r_late = 0; 4900 } 4901 } 4902 if (slot) { 4903 /* We are pacing too */ 4904 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 4905 } 4906 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 4907 #ifdef NETFLIX_EXP_DETECTION 4908 if (rack->sack_attack_disable && 4909 (slot < tcp_sad_pacing_interval)) { 4910 /* 4911 * We have a potential attacker on 4912 * the line. We have possibly some 4913 * (or now) pacing time set. We want to 4914 * slow down the processing of sacks by some 4915 * amount (if it is an attacker). Set the default 4916 * slot for attackers in place (unless the orginal 4917 * interval is longer). Its stored in 4918 * micro-seconds, so lets convert to msecs. 4919 */ 4920 slot = tcp_sad_pacing_interval; 4921 } 4922 #endif 4923 if (tp->t_flags & TF_DELACK) { 4924 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 4925 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 4926 } 4927 if (delayed_ack && ((hpts_timeout == 0) || 4928 (delayed_ack < hpts_timeout))) 4929 hpts_timeout = delayed_ack; 4930 else 4931 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4932 /* 4933 * If no timers are going to run and we will fall off the hptsi 4934 * wheel, we resort to a keep-alive timer if its configured. 4935 */ 4936 if ((hpts_timeout == 0) && 4937 (slot == 0)) { 4938 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4939 (tp->t_state <= TCPS_CLOSING)) { 4940 /* 4941 * Ok we have no timer (persists, rack, tlp, rxt or 4942 * del-ack), we don't have segments being paced. So 4943 * all that is left is the keepalive timer. 4944 */ 4945 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 4946 /* Get the established keep-alive time */ 4947 hpts_timeout = TP_KEEPIDLE(tp); 4948 } else { 4949 /* Get the initial setup keep-alive time */ 4950 hpts_timeout = TP_KEEPINIT(tp); 4951 } 4952 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 4953 if (rack->in_probe_rtt) { 4954 /* 4955 * We want to instead not wake up a long time from 4956 * now but to wake up about the time we would 4957 * exit probe-rtt and initiate a keep-alive ack. 4958 * This will get us out of probe-rtt and update 4959 * our min-rtt. 
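 * The value used below is simply
 * rack_min_probertt_hold / HPTS_USEC_IN_MSEC, i.e. the
 * probe-rtt hold time scaled down to the coarser units
 * used for hpts_timeout here.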
4960 */ 4961 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); 4962 } 4963 } 4964 } 4965 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 4966 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 4967 /* 4968 * RACK, TLP, persists and RXT timers all are restartable 4969 * based on actions input .. i.e we received a packet (ack 4970 * or sack) and that changes things (rw, or snd_una etc). 4971 * Thus we can restart them with a new value. For 4972 * keep-alive, delayed_ack we keep track of what was left 4973 * and restart the timer with a smaller value. 4974 */ 4975 if (left < hpts_timeout) 4976 hpts_timeout = left; 4977 } 4978 if (hpts_timeout) { 4979 /* 4980 * Hack alert for now we can't time-out over 2,147,483 4981 * seconds (a bit more than 596 hours), which is probably ok 4982 * :). 4983 */ 4984 if (hpts_timeout > 0x7ffffffe) 4985 hpts_timeout = 0x7ffffffe; 4986 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 4987 } 4988 if ((rack->rc_gp_filled == 0) && 4989 (hpts_timeout < slot) && 4990 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 4991 /* 4992 * We have no good estimate yet for the 4993 * old clunky burst mitigation or the 4994 * real pacing. And the tlp or rxt is smaller 4995 * than the pacing calculation. Lets not 4996 * pace that long since we know the calculation 4997 * so far is not accurate. 4998 */ 4999 slot = hpts_timeout; 5000 } 5001 rack->r_ctl.last_pacing_time = slot; 5002 if (slot) { 5003 rack->r_ctl.rc_last_output_to = us_cts + slot; 5004 if (rack->rc_always_pace || rack->r_mbuf_queue) { 5005 if ((rack->rc_gp_filled == 0) || 5006 rack->pacing_longer_than_rtt) { 5007 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5008 } else { 5009 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5010 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 5011 (rack->r_rr_config != 3)) 5012 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5013 else 5014 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 5015 } 5016 } 5017 if ((rack->use_rack_rr) && 5018 (rack->r_rr_config < 2) && 5019 ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { 5020 /* 5021 * Arrange for the hpts to kick back in after the 5022 * t-o if the t-o does not cause a send. 
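 * Note: in this branch the wheel slot is derived from the
 * timer value (HPTS_MS_TO_SLOTS(hpts_timeout)) rather than
 * from the pacing slot (HPTS_USEC_TO_SLOTS(slot)) used in
 * the else branch below.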
5023 */ 5024 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 5025 __LINE__, &diag); 5026 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5027 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5028 } else { 5029 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 5030 __LINE__, &diag); 5031 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5032 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 5033 } 5034 } else if (hpts_timeout) { 5035 if (rack->rc_always_pace || rack->r_mbuf_queue) { 5036 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 5037 /* For a rack timer, don't wake us */ 5038 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5039 if (rack->r_rr_config != 3) 5040 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5041 else 5042 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 5043 } else { 5044 /* All other timers wake us up */ 5045 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 5046 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 5047 } 5048 } 5049 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 5050 __LINE__, &diag); 5051 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5052 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5053 } else { 5054 /* No timer starting */ 5055 #ifdef INVARIANTS 5056 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 5057 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 5058 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 5059 } 5060 #endif 5061 } 5062 rack->rc_tmr_stopped = 0; 5063 if (slot) 5064 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 5065 } 5066 5067 /* 5068 * RACK Timer, here we simply do logging and house keeping. 5069 * the normal rack_output() function will call the 5070 * appropriate thing to check if we need to do a RACK retransmit. 5071 * We return 1, saying don't proceed with rack_output only 5072 * when all timers have been stopped (destroyed PCB?). 5073 */ 5074 static int 5075 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5076 { 5077 /* 5078 * This timer simply provides an internal trigger to send out data. 5079 * The check_recovery_mode call will see if there are needed 5080 * retransmissions, if so we will enter fast-recovery. The output 5081 * call may or may not do the same thing depending on sysctl 5082 * settings. 5083 */ 5084 struct rack_sendmap *rsm; 5085 int32_t recovery; 5086 5087 if (tp->t_timers->tt_flags & TT_STOPPED) { 5088 return (1); 5089 } 5090 recovery = IN_RECOVERY(tp->t_flags); 5091 counter_u64_add(rack_to_tot, 1); 5092 if (rack->r_state && (rack->r_state != tp->t_state)) 5093 rack_set_state(tp, rack); 5094 rack->rc_on_min_to = 0; 5095 rsm = rack_check_recovery_mode(tp, cts); 5096 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5097 if (rsm) { 5098 uint32_t rtt; 5099 5100 rack->r_ctl.rc_resend = rsm; 5101 if (rack->use_rack_rr) { 5102 /* 5103 * Don't accumulate extra pacing delay 5104 * we are allowing the rack timer to 5105 * over-ride pacing i.e. rrr takes precedence 5106 * if the pacing interval is longer than the rrr 5107 * time (in other words we get the min pacing 5108 * time versus rrr pacing time). 
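 * Setting r_timer_override and clearing PACE_PKT_OUTPUT
 * below means rack_output() will not sit out the pending
 * pacing slot before sending the retransmission.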
5109 */ 5110 rack->r_timer_override = 1; 5111 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5112 } 5113 rtt = rack->rc_rack_rtt; 5114 if (rtt == 0) 5115 rtt = 1; 5116 if (rack->rack_no_prr == 0) { 5117 if ((recovery == 0) && 5118 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5119 /* 5120 * The rack-timeout that enter's us into recovery 5121 * will force out one MSS and set us up so that we 5122 * can do one more send in 2*rtt (transitioning the 5123 * rack timeout into a rack-tlp). 5124 */ 5125 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5126 rack->r_timer_override = 1; 5127 rack_log_to_prr(rack, 3, 0); 5128 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 5129 rack->use_rack_rr) { 5130 /* 5131 * When a rack timer goes, if the rack rr is 5132 * on, arrange it so we can send a full segment 5133 * overriding prr (though we pay a price for this 5134 * for future new sends). 5135 */ 5136 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5137 rack_log_to_prr(rack, 4, 0); 5138 } 5139 } 5140 } 5141 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5142 if (rsm == NULL) { 5143 /* restart a timer and return 1 */ 5144 rack_start_hpts_timer(rack, tp, cts, 5145 0, 0, 0); 5146 return (1); 5147 } 5148 return (0); 5149 } 5150 5151 static __inline void 5152 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5153 struct rack_sendmap *rsm, uint32_t start) 5154 { 5155 int idx; 5156 5157 nrsm->r_start = start; 5158 nrsm->r_end = rsm->r_end; 5159 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5160 nrsm->r_flags = rsm->r_flags; 5161 nrsm->r_dupack = rsm->r_dupack; 5162 nrsm->usec_orig_send = rsm->usec_orig_send; 5163 nrsm->r_rtr_bytes = 0; 5164 rsm->r_end = nrsm->r_start; 5165 nrsm->r_just_ret = rsm->r_just_ret; 5166 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 5167 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 5168 } 5169 } 5170 5171 static struct rack_sendmap * 5172 rack_merge_rsm(struct tcp_rack *rack, 5173 struct rack_sendmap *l_rsm, 5174 struct rack_sendmap *r_rsm) 5175 { 5176 /* 5177 * We are merging two ack'd RSM's, 5178 * the l_rsm is on the left (lower seq 5179 * values) and the r_rsm is on the right 5180 * (higher seq value). The simplest way 5181 * to merge these is to move the right 5182 * one into the left. I don't think there 5183 * is any reason we need to try to find 5184 * the oldest (or last oldest retransmitted). 5185 */ 5186 struct rack_sendmap *rm; 5187 5188 l_rsm->r_end = r_rsm->r_end; 5189 if (l_rsm->r_dupack < r_rsm->r_dupack) 5190 l_rsm->r_dupack = r_rsm->r_dupack; 5191 if (r_rsm->r_rtr_bytes) 5192 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 5193 if (r_rsm->r_in_tmap) { 5194 /* This really should not happen */ 5195 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 5196 r_rsm->r_in_tmap = 0; 5197 } 5198 5199 /* Now the flags */ 5200 if (r_rsm->r_flags & RACK_HAS_FIN) 5201 l_rsm->r_flags |= RACK_HAS_FIN; 5202 if (r_rsm->r_flags & RACK_TLP) 5203 l_rsm->r_flags |= RACK_TLP; 5204 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 5205 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 5206 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 5207 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 5208 /* 5209 * If both are app-limited then let the 5210 * free lower the count. If right is app 5211 * limited and left is not, transfer. 
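 * rc_first_appl is also re-pointed at the surviving left
 * map below, so the app-limited bookkeeping never
 * references the entry we are about to free.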
5212 */ 5213 l_rsm->r_flags |= RACK_APP_LIMITED; 5214 r_rsm->r_flags &= ~RACK_APP_LIMITED; 5215 if (r_rsm == rack->r_ctl.rc_first_appl) 5216 rack->r_ctl.rc_first_appl = l_rsm; 5217 } 5218 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 5219 #ifdef INVARIANTS 5220 if (rm != r_rsm) { 5221 panic("removing head in rack:%p rsm:%p rm:%p", 5222 rack, r_rsm, rm); 5223 } 5224 #endif 5225 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 5226 /* Transfer the split limit to the map we free */ 5227 r_rsm->r_limit_type = l_rsm->r_limit_type; 5228 l_rsm->r_limit_type = 0; 5229 } 5230 rack_free(rack, r_rsm); 5231 return(l_rsm); 5232 } 5233 5234 /* 5235 * TLP Timer, here we simply setup what segment we want to 5236 * have the TLP expire on, the normal rack_output() will then 5237 * send it out. 5238 * 5239 * We return 1, saying don't proceed with rack_output only 5240 * when all timers have been stopped (destroyed PCB?). 5241 */ 5242 static int 5243 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5244 { 5245 /* 5246 * Tail Loss Probe. 5247 */ 5248 struct rack_sendmap *rsm = NULL; 5249 struct rack_sendmap *insret; 5250 struct socket *so; 5251 uint32_t amm, old_prr_snd = 0; 5252 uint32_t out, avail; 5253 int collapsed_win = 0; 5254 5255 if (tp->t_timers->tt_flags & TT_STOPPED) { 5256 return (1); 5257 } 5258 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5259 /* Its not time yet */ 5260 return (0); 5261 } 5262 if (ctf_progress_timeout_check(tp, true)) { 5263 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5264 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5265 return (1); 5266 } 5267 /* 5268 * A TLP timer has expired. We have been idle for 2 rtts. So we now 5269 * need to figure out how to force a full MSS segment out. 5270 */ 5271 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 5272 counter_u64_add(rack_tlp_tot, 1); 5273 if (rack->r_state && (rack->r_state != tp->t_state)) 5274 rack_set_state(tp, rack); 5275 so = tp->t_inpcb->inp_socket; 5276 #ifdef KERN_TLS 5277 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 5278 /* 5279 * For hardware TLS we do *not* want to send 5280 * new data, lets instead just do a retransmission. 5281 */ 5282 goto need_retran; 5283 } 5284 #endif 5285 avail = sbavail(&so->so_snd); 5286 out = tp->snd_max - tp->snd_una; 5287 if (out > tp->snd_wnd) { 5288 /* special case, we need a retransmission */ 5289 collapsed_win = 1; 5290 goto need_retran; 5291 } 5292 /* 5293 * Check our send oldest always settings, and if 5294 * there is an oldest to send jump to the need_retran. 
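 * (I.e. when rack_always_send_oldest is set and the
 * transmit map is non-empty, prefer retransmitting the
 * oldest outstanding data over probing with new data.)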
5295 */ 5296 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 5297 goto need_retran; 5298 5299 if (avail > out) { 5300 /* New data is available */ 5301 amm = avail - out; 5302 if (amm > ctf_fixed_maxseg(tp)) { 5303 amm = ctf_fixed_maxseg(tp); 5304 if ((amm + out) > tp->snd_wnd) { 5305 /* We are rwnd limited */ 5306 goto need_retran; 5307 } 5308 } else if (amm < ctf_fixed_maxseg(tp)) { 5309 /* not enough to fill a MTU */ 5310 goto need_retran; 5311 } 5312 if (IN_RECOVERY(tp->t_flags)) { 5313 /* Unlikely */ 5314 if (rack->rack_no_prr == 0) { 5315 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 5316 if (out + amm <= tp->snd_wnd) { 5317 rack->r_ctl.rc_prr_sndcnt = amm; 5318 rack_log_to_prr(rack, 4, 0); 5319 } 5320 } else 5321 goto need_retran; 5322 } else { 5323 /* Set the send-new override */ 5324 if (out + amm <= tp->snd_wnd) 5325 rack->r_ctl.rc_tlp_new_data = amm; 5326 else 5327 goto need_retran; 5328 } 5329 rack->r_ctl.rc_tlpsend = NULL; 5330 counter_u64_add(rack_tlp_newdata, 1); 5331 goto send; 5332 } 5333 need_retran: 5334 /* 5335 * Ok we need to arrange the last un-acked segment to be re-sent, or 5336 * optionally the first un-acked segment. 5337 */ 5338 if (collapsed_win == 0) { 5339 if (rack_always_send_oldest) 5340 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5341 else { 5342 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5343 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 5344 rsm = rack_find_high_nonack(rack, rsm); 5345 } 5346 } 5347 if (rsm == NULL) { 5348 counter_u64_add(rack_tlp_does_nada, 1); 5349 #ifdef TCP_BLACKBOX 5350 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5351 #endif 5352 goto out; 5353 } 5354 } else { 5355 /* 5356 * We must find the last segment 5357 * that was acceptable by the client. 5358 */ 5359 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5360 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 5361 /* Found one */ 5362 break; 5363 } 5364 } 5365 if (rsm == NULL) { 5366 /* None? if so send the first */ 5367 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5368 if (rsm == NULL) { 5369 counter_u64_add(rack_tlp_does_nada, 1); 5370 #ifdef TCP_BLACKBOX 5371 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5372 #endif 5373 goto out; 5374 } 5375 } 5376 } 5377 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 5378 /* 5379 * We need to split this the last segment in two. 5380 */ 5381 struct rack_sendmap *nrsm; 5382 5383 5384 nrsm = rack_alloc_full_limit(rack); 5385 if (nrsm == NULL) { 5386 /* 5387 * No memory to split, we will just exit and punt 5388 * off to the RXT timer. 
5389 */ 5390 counter_u64_add(rack_tlp_does_nada, 1); 5391 goto out; 5392 } 5393 rack_clone_rsm(rack, nrsm, rsm, 5394 (rsm->r_end - ctf_fixed_maxseg(tp))); 5395 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5396 #ifdef INVARIANTS 5397 if (insret != NULL) { 5398 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5399 nrsm, insret, rack, rsm); 5400 } 5401 #endif 5402 if (rsm->r_in_tmap) { 5403 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5404 nrsm->r_in_tmap = 1; 5405 } 5406 rsm->r_flags &= (~RACK_HAS_FIN); 5407 rsm = nrsm; 5408 } 5409 rack->r_ctl.rc_tlpsend = rsm; 5410 send: 5411 rack->r_timer_override = 1; 5412 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5413 return (0); 5414 out: 5415 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5416 return (0); 5417 } 5418 5419 /* 5420 * Delayed ack Timer, here we simply need to setup the 5421 * ACK_NOW flag and remove the DELACK flag. From there 5422 * the output routine will send the ack out. 5423 * 5424 * We only return 1, saying don't proceed, if all timers 5425 * are stopped (destroyed PCB?). 5426 */ 5427 static int 5428 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5429 { 5430 if (tp->t_timers->tt_flags & TT_STOPPED) { 5431 return (1); 5432 } 5433 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 5434 tp->t_flags &= ~TF_DELACK; 5435 tp->t_flags |= TF_ACKNOW; 5436 KMOD_TCPSTAT_INC(tcps_delack); 5437 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5438 return (0); 5439 } 5440 5441 /* 5442 * Persists timer, here we simply send the 5443 * same thing as a keepalive will. 5444 * the one byte send. 5445 * 5446 * We only return 1, saying don't proceed, if all timers 5447 * are stopped (destroyed PCB?). 5448 */ 5449 static int 5450 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5451 { 5452 struct tcptemp *t_template; 5453 struct inpcb *inp; 5454 int32_t retval = 1; 5455 5456 inp = tp->t_inpcb; 5457 5458 if (tp->t_timers->tt_flags & TT_STOPPED) { 5459 return (1); 5460 } 5461 if (rack->rc_in_persist == 0) 5462 return (0); 5463 if (ctf_progress_timeout_check(tp, false)) { 5464 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5465 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5466 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5467 return (1); 5468 } 5469 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 5470 /* 5471 * Persistence timer into zero window. Force a byte to be output, if 5472 * possible. 5473 */ 5474 KMOD_TCPSTAT_INC(tcps_persisttimeo); 5475 /* 5476 * Hack: if the peer is dead/unreachable, we do not time out if the 5477 * window is closed. After a full backoff, drop the connection if 5478 * the idle time (no responses to probes) reaches the maximum 5479 * backoff that we would use if retransmitting. 5480 */ 5481 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 5482 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 5483 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 5484 KMOD_TCPSTAT_INC(tcps_persistdrop); 5485 retval = 1; 5486 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5487 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5488 goto out; 5489 } 5490 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 5491 tp->snd_una == tp->snd_max) 5492 rack_exit_persist(tp, rack, cts); 5493 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 5494 /* 5495 * If the user has closed the socket then drop a persisting 5496 * connection after a much reduced timeout. 
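 * "Much reduced" here means TCPTV_PERSMAX of idle time
 * (measured against t_rcvtime) once the state is past
 * CLOSE_WAIT, instead of the full persist backoff above.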
5497 */ 5498 if (tp->t_state > TCPS_CLOSE_WAIT && 5499 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 5500 retval = 1; 5501 KMOD_TCPSTAT_INC(tcps_persistdrop); 5502 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5503 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5504 goto out; 5505 } 5506 t_template = tcpip_maketemplate(rack->rc_inp); 5507 if (t_template) { 5508 /* only set it if we were answered */ 5509 if (rack->forced_ack == 0) { 5510 rack->forced_ack = 1; 5511 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5512 } 5513 tcp_respond(tp, t_template->tt_ipgen, 5514 &t_template->tt_t, (struct mbuf *)NULL, 5515 tp->rcv_nxt, tp->snd_una - 1, 0); 5516 /* This sends an ack */ 5517 if (tp->t_flags & TF_DELACK) 5518 tp->t_flags &= ~TF_DELACK; 5519 free(t_template, M_TEMP); 5520 } 5521 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5522 tp->t_rxtshift++; 5523 out: 5524 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 5525 rack_start_hpts_timer(rack, tp, cts, 5526 0, 0, 0); 5527 return (retval); 5528 } 5529 5530 /* 5531 * If a keepalive goes off, we had no other timers 5532 * happening. We always return 1 here since this 5533 * routine either drops the connection or sends 5534 * out a segment with respond. 5535 */ 5536 static int 5537 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5538 { 5539 struct tcptemp *t_template; 5540 struct inpcb *inp; 5541 5542 if (tp->t_timers->tt_flags & TT_STOPPED) { 5543 return (1); 5544 } 5545 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 5546 inp = tp->t_inpcb; 5547 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 5548 /* 5549 * Keep-alive timer went off; send something or drop connection if 5550 * idle for too long. 5551 */ 5552 KMOD_TCPSTAT_INC(tcps_keeptimeo); 5553 if (tp->t_state < TCPS_ESTABLISHED) 5554 goto dropit; 5555 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5556 tp->t_state <= TCPS_CLOSING) { 5557 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 5558 goto dropit; 5559 /* 5560 * Send a packet designed to force a response if the peer is 5561 * up and reachable: either an ACK if the connection is 5562 * still alive, or an RST if the peer has closed the 5563 * connection due to timeout or reboot. Using sequence 5564 * number tp->snd_una-1 causes the transmitted zero-length 5565 * segment to lie outside the receive window; by the 5566 * protocol spec, this requires the correspondent TCP to 5567 * respond. 5568 */ 5569 KMOD_TCPSTAT_INC(tcps_keepprobe); 5570 t_template = tcpip_maketemplate(inp); 5571 if (t_template) { 5572 if (rack->forced_ack == 0) { 5573 rack->forced_ack = 1; 5574 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5575 } 5576 tcp_respond(tp, t_template->tt_ipgen, 5577 &t_template->tt_t, (struct mbuf *)NULL, 5578 tp->rcv_nxt, tp->snd_una - 1, 0); 5579 free(t_template, M_TEMP); 5580 } 5581 } 5582 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 5583 return (1); 5584 dropit: 5585 KMOD_TCPSTAT_INC(tcps_keepdrops); 5586 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 5587 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5588 return (1); 5589 } 5590 5591 /* 5592 * Retransmit helper function, clear up all the ack 5593 * flags and take care of important book keeping. 5594 */ 5595 static void 5596 rack_remxt_tmr(struct tcpcb *tp) 5597 { 5598 /* 5599 * The retransmit timer went off, all sack'd blocks must be 5600 * un-acked. 
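 * Concretely: every RACK_ACKED entry is put back on the
 * transmit list, its ACKED/SACK-passed state is cleared
 * (RACK_WAS_ACKED preserves the history), and rc_sacked
 * is zeroed since nothing is considered SACK'd anymore.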
5601 */ 5602 struct rack_sendmap *rsm, *trsm = NULL; 5603 struct tcp_rack *rack; 5604 int32_t cnt = 0; 5605 5606 rack = (struct tcp_rack *)tp->t_fb_ptr; 5607 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 5608 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 5609 if (rack->r_state && (rack->r_state != tp->t_state)) 5610 rack_set_state(tp, rack); 5611 /* 5612 * Ideally we would like to be able to 5613 * mark SACK-PASS on anything not acked here. 5614 * However, if we do that we would burst out 5615 * all that data 1ms apart. This would be unwise, 5616 * so for now we will just let the normal rxt timer 5617 * and tlp timer take care of it. 5618 */ 5619 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5620 if (rsm->r_flags & RACK_ACKED) { 5621 cnt++; 5622 rsm->r_dupack = 0; 5623 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5624 if (rsm->r_in_tmap == 0) { 5625 /* We must re-add it back to the tlist */ 5626 if (trsm == NULL) { 5627 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5628 } else { 5629 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 5630 } 5631 rsm->r_in_tmap = 1; 5632 } 5633 } 5634 trsm = rsm; 5635 if (rsm->r_flags & RACK_ACKED) 5636 rsm->r_flags |= RACK_WAS_ACKED; 5637 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 5638 } 5639 /* Clear the count (we just un-acked them) */ 5640 rack->r_ctl.rc_sacked = 0; 5641 rack->r_ctl.rc_agg_delayed = 0; 5642 rack->r_early = 0; 5643 rack->r_ctl.rc_agg_early = 0; 5644 rack->r_late = 0; 5645 /* Clear the tlp rtx mark */ 5646 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5647 rack->r_ctl.rc_prr_sndcnt = 0; 5648 rack_log_to_prr(rack, 6, 0); 5649 rack->r_timer_override = 1; 5650 } 5651 5652 static void 5653 rack_cc_conn_init(struct tcpcb *tp) 5654 { 5655 struct tcp_rack *rack; 5656 5657 5658 rack = (struct tcp_rack *)tp->t_fb_ptr; 5659 cc_conn_init(tp); 5660 /* 5661 * We want a chance to stay in slowstart as 5662 * we create a connection. TCP spec says that 5663 * initially ssthresh is infinite. For our 5664 * purposes that is the snd_wnd. 5665 */ 5666 if (tp->snd_ssthresh < tp->snd_wnd) { 5667 tp->snd_ssthresh = tp->snd_wnd; 5668 } 5669 /* 5670 * We also want to assure a IW worth of 5671 * data can get inflight. 5672 */ 5673 if (rc_init_window(rack) < tp->snd_cwnd) 5674 tp->snd_cwnd = rc_init_window(rack); 5675 } 5676 5677 /* 5678 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 5679 * we will setup to retransmit the lowest seq number outstanding. 5680 */ 5681 static int 5682 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5683 { 5684 int32_t rexmt; 5685 struct inpcb *inp; 5686 int32_t retval = 0; 5687 bool isipv6; 5688 5689 inp = tp->t_inpcb; 5690 if (tp->t_timers->tt_flags & TT_STOPPED) { 5691 return (1); 5692 } 5693 if (ctf_progress_timeout_check(tp, false)) { 5694 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5695 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5696 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5697 return (1); 5698 } 5699 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 5700 if (TCPS_HAVEESTABLISHED(tp->t_state) && 5701 (tp->snd_una == tp->snd_max)) { 5702 /* Nothing outstanding .. nothing to do */ 5703 return (0); 5704 } 5705 /* 5706 * Retransmission timer went off. Message has not been acked within 5707 * retransmit interval. Back off to a longer retransmit interval 5708 * and retransmit one segment. 
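 * The new interval below is the base RTO (RACK_INITIAL_RTO
 * during the handshake, TCP_REXMTVAL otherwise) scaled by
 * tcp_backoff[t_rxtshift] and bounded by rack_rto_min and
 * rack_rto_max; note the shift is not charged when the
 * receive window collapsed on the segment being resent.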
5709 */ 5710 rack_remxt_tmr(tp); 5711 if ((rack->r_ctl.rc_resend == NULL) || 5712 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 5713 /* 5714 * If the rwnd collapsed on 5715 * the one we are retransmitting 5716 * it does not count against the 5717 * rxt count. 5718 */ 5719 tp->t_rxtshift++; 5720 } 5721 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5722 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5723 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5724 retval = 1; 5725 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5726 tcp_set_inp_to_drop(rack->rc_inp, 5727 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 5728 goto out; 5729 } 5730 if (tp->t_state == TCPS_SYN_SENT) { 5731 /* 5732 * If the SYN was retransmitted, indicate CWND to be limited 5733 * to 1 segment in cc_conn_init(). 5734 */ 5735 tp->snd_cwnd = 1; 5736 } else if (tp->t_rxtshift == 1) { 5737 /* 5738 * first retransmit; record ssthresh and cwnd so they can be 5739 * recovered if this turns out to be a "bad" retransmit. A 5740 * retransmit is considered "bad" if an ACK for this segment 5741 * is received within RTT/2 interval; the assumption here is 5742 * that the ACK was already in flight. See "On Estimating 5743 * End-to-End Network Path Properties" by Allman and Paxson 5744 * for more details. 5745 */ 5746 tp->snd_cwnd_prev = tp->snd_cwnd; 5747 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5748 tp->snd_recover_prev = tp->snd_recover; 5749 if (IN_FASTRECOVERY(tp->t_flags)) 5750 tp->t_flags |= TF_WASFRECOVERY; 5751 else 5752 tp->t_flags &= ~TF_WASFRECOVERY; 5753 if (IN_CONGRECOVERY(tp->t_flags)) 5754 tp->t_flags |= TF_WASCRECOVERY; 5755 else 5756 tp->t_flags &= ~TF_WASCRECOVERY; 5757 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5758 tp->t_flags |= TF_PREVVALID; 5759 } else 5760 tp->t_flags &= ~TF_PREVVALID; 5761 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5762 if ((tp->t_state == TCPS_SYN_SENT) || 5763 (tp->t_state == TCPS_SYN_RECEIVED)) 5764 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 5765 else 5766 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5767 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5768 max(MSEC_2_TICKS(rack_rto_min), rexmt), 5769 MSEC_2_TICKS(rack_rto_max)); 5770 /* 5771 * We enter the path for PLMTUD if connection is established or, if 5772 * connection is FIN_WAIT_1 status, reason for the last is that if 5773 * amount of data we send is very small, we could send it in couple 5774 * of packets and process straight to FIN. In that case we won't 5775 * catch ESTABLISHED state. 5776 */ 5777 #ifdef INET6 5778 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5779 #else 5780 isipv6 = false; 5781 #endif 5782 if (((V_tcp_pmtud_blackhole_detect == 1) || 5783 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5784 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5785 ((tp->t_state == TCPS_ESTABLISHED) || 5786 (tp->t_state == TCPS_FIN_WAIT_1))) { 5787 5788 /* 5789 * Idea here is that at each stage of mtu probe (usually, 5790 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5791 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5792 * should take care of that. 5793 */ 5794 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5795 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5796 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5797 tp->t_rxtshift % 2 == 0)) { 5798 /* 5799 * Enter Path MTU Black-hole Detection mechanism: - 5800 * Disable Path MTU Discovery (IP "DF" bit). - 5801 * Reduce MTU to lower value than what we negotiated 5802 * with peer. 
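 * The current t_maxseg is saved in t_pmtud_saved_maxseg
 * below so it can be restored if the clamped MTU does not
 * help (the t_rxtshift >= 6 case further down).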
5803 */ 5804 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5805 /* Record that we may have found a black hole. */ 5806 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5807 /* Keep track of previous MSS. */ 5808 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5809 } 5810 5811 /* 5812 * Reduce the MSS to blackhole value or to the 5813 * default in an attempt to retransmit. 5814 */ 5815 #ifdef INET6 5816 if (isipv6 && 5817 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5818 /* Use the sysctl tuneable blackhole MSS. */ 5819 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5820 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5821 } else if (isipv6) { 5822 /* Use the default MSS. */ 5823 tp->t_maxseg = V_tcp_v6mssdflt; 5824 /* 5825 * Disable Path MTU Discovery when we switch 5826 * to minmss. 5827 */ 5828 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5829 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5830 } 5831 #endif 5832 #if defined(INET6) && defined(INET) 5833 else 5834 #endif 5835 #ifdef INET 5836 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5837 /* Use the sysctl tuneable blackhole MSS. */ 5838 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5839 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5840 } else { 5841 /* Use the default MSS. */ 5842 tp->t_maxseg = V_tcp_mssdflt; 5843 /* 5844 * Disable Path MTU Discovery when we switch 5845 * to minmss. 5846 */ 5847 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5848 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5849 } 5850 #endif 5851 } else { 5852 /* 5853 * If further retransmissions are still unsuccessful 5854 * with a lowered MTU, maybe this isn't a blackhole 5855 * and we restore the previous MSS and blackhole 5856 * detection flags. The limit '6' is determined by 5857 * giving each probe stage (1448, 1188, 524) 2 5858 * chances to recover. 5859 */ 5860 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5861 (tp->t_rxtshift >= 6)) { 5862 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5863 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5864 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5865 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5866 } 5867 } 5868 } 5869 /* 5870 * If we backed off this far, our srtt estimate is probably bogus. 5871 * Clobber it so we'll take the next rtt measurement as our srtt; 5872 * move the current srtt into rttvar to keep the current retransmit 5873 * times until then. 5874 */ 5875 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5876 #ifdef INET6 5877 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 5878 in6_losing(tp->t_inpcb); 5879 else 5880 #endif 5881 in_losing(tp->t_inpcb); 5882 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5883 tp->t_srtt = 0; 5884 } 5885 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5886 tp->snd_recover = tp->snd_max; 5887 tp->t_flags |= TF_ACKNOW; 5888 tp->t_rtttime = 0; 5889 rack_cong_signal(tp, NULL, CC_RTO); 5890 out: 5891 return (retval); 5892 } 5893 5894 static int 5895 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 5896 { 5897 int32_t ret = 0; 5898 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5899 5900 if (timers == 0) { 5901 return (0); 5902 } 5903 if (tp->t_state == TCPS_LISTEN) { 5904 /* no timers on listen sockets */ 5905 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5906 return (0); 5907 return (1); 5908 } 5909 if ((timers & PACE_TMR_RACK) && 5910 rack->rc_on_min_to) { 5911 /* 5912 * For the rack timer when we 5913 * are on a min-timeout (which means rrr_conf = 3) 5914 * we don't want to check the timer. 
It may 5915 * be going off for a pace and thats ok we 5916 * want to send the retransmit (if its ready). 5917 * 5918 * If its on a normal rack timer (non-min) then 5919 * we will check if its expired. 5920 */ 5921 goto skip_time_check; 5922 } 5923 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5924 uint32_t left; 5925 5926 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5927 ret = -1; 5928 rack_log_to_processing(rack, cts, ret, 0); 5929 return (0); 5930 } 5931 if (hpts_calling == 0) { 5932 /* 5933 * A user send or queued mbuf (sack) has called us? We 5934 * return 0 and let the pacing guards 5935 * deal with it if they should or 5936 * should not cause a send. 5937 */ 5938 ret = -2; 5939 rack_log_to_processing(rack, cts, ret, 0); 5940 return (0); 5941 } 5942 /* 5943 * Ok our timer went off early and we are not paced false 5944 * alarm, go back to sleep. 5945 */ 5946 ret = -3; 5947 left = rack->r_ctl.rc_timer_exp - cts; 5948 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 5949 rack_log_to_processing(rack, cts, ret, left); 5950 return (1); 5951 } 5952 skip_time_check: 5953 rack->rc_tmr_stopped = 0; 5954 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5955 if (timers & PACE_TMR_DELACK) { 5956 ret = rack_timeout_delack(tp, rack, cts); 5957 } else if (timers & PACE_TMR_RACK) { 5958 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5959 ret = rack_timeout_rack(tp, rack, cts); 5960 } else if (timers & PACE_TMR_TLP) { 5961 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5962 ret = rack_timeout_tlp(tp, rack, cts); 5963 } else if (timers & PACE_TMR_RXT) { 5964 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5965 ret = rack_timeout_rxt(tp, rack, cts); 5966 } else if (timers & PACE_TMR_PERSIT) { 5967 ret = rack_timeout_persist(tp, rack, cts); 5968 } else if (timers & PACE_TMR_KEEP) { 5969 ret = rack_timeout_keepalive(tp, rack, cts); 5970 } 5971 rack_log_to_processing(rack, cts, ret, timers); 5972 return (ret); 5973 } 5974 5975 static void 5976 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 5977 { 5978 struct timeval tv; 5979 uint32_t us_cts, flags_on_entry; 5980 uint8_t hpts_removed = 0; 5981 5982 5983 flags_on_entry = rack->r_ctl.rc_hpts_flags; 5984 us_cts = tcp_get_usecs(&tv); 5985 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 5986 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 5987 ((tp->snd_max - tp->snd_una) == 0))) { 5988 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5989 hpts_removed = 1; 5990 /* If we were not delayed cancel out the flag. */ 5991 if ((tp->snd_max - tp->snd_una) == 0) 5992 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5993 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5994 } 5995 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5996 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5997 if (rack->rc_inp->inp_in_hpts && 5998 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 5999 /* 6000 * Canceling timer's when we have no output being 6001 * paced. We also must remove ourselves from the 6002 * hpts. 
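 * The pending timer flags were saved in rc_tmr_stopped
 * just above, so a later rack_start_hpts_timer() can take
 * them into account (e.g. restarting keep-alive/delayed-ack
 * with only the time that was left).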
6003 */ 6004 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6005 hpts_removed = 1; 6006 } 6007 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 6008 } 6009 if (hpts_removed == 0) 6010 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 6011 } 6012 6013 static void 6014 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 6015 { 6016 return; 6017 } 6018 6019 static int 6020 rack_stopall(struct tcpcb *tp) 6021 { 6022 struct tcp_rack *rack; 6023 rack = (struct tcp_rack *)tp->t_fb_ptr; 6024 rack->t_timers_stopped = 1; 6025 return (0); 6026 } 6027 6028 static void 6029 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 6030 { 6031 return; 6032 } 6033 6034 static int 6035 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 6036 { 6037 return (0); 6038 } 6039 6040 static void 6041 rack_stop_all_timers(struct tcpcb *tp) 6042 { 6043 struct tcp_rack *rack; 6044 6045 /* 6046 * Assure no timers are running. 6047 */ 6048 if (tcp_timer_active(tp, TT_PERSIST)) { 6049 /* We enter in persists, set the flag appropriately */ 6050 rack = (struct tcp_rack *)tp->t_fb_ptr; 6051 rack->rc_in_persist = 1; 6052 } 6053 tcp_timer_suspend(tp, TT_PERSIST); 6054 tcp_timer_suspend(tp, TT_REXMT); 6055 tcp_timer_suspend(tp, TT_KEEP); 6056 tcp_timer_suspend(tp, TT_DELACK); 6057 } 6058 6059 static void 6060 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 6061 struct rack_sendmap *rsm, uint32_t ts) 6062 { 6063 int32_t idx; 6064 6065 rsm->r_rtr_cnt++; 6066 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6067 rsm->r_dupack = 0; 6068 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 6069 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 6070 rsm->r_flags |= RACK_OVERMAX; 6071 } 6072 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 6073 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 6074 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 6075 } 6076 idx = rsm->r_rtr_cnt - 1; 6077 rsm->r_tim_lastsent[idx] = ts; 6078 if (rsm->r_flags & RACK_ACKED) { 6079 /* Problably MTU discovery messing with us */ 6080 rsm->r_flags &= ~RACK_ACKED; 6081 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 6082 } 6083 if (rsm->r_in_tmap) { 6084 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6085 rsm->r_in_tmap = 0; 6086 } 6087 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6088 rsm->r_in_tmap = 1; 6089 if (rsm->r_flags & RACK_SACK_PASSED) { 6090 /* We have retransmitted due to the SACK pass */ 6091 rsm->r_flags &= ~RACK_SACK_PASSED; 6092 rsm->r_flags |= RACK_WAS_SACKPASS; 6093 } 6094 } 6095 6096 6097 static uint32_t 6098 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 6099 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) 6100 { 6101 /* 6102 * We (re-)transmitted starting at rsm->r_start for some length 6103 * (possibly less than r_end. 6104 */ 6105 struct rack_sendmap *nrsm, *insret; 6106 uint32_t c_end; 6107 int32_t len; 6108 6109 len = *lenp; 6110 c_end = rsm->r_start + len; 6111 if (SEQ_GEQ(c_end, rsm->r_end)) { 6112 /* 6113 * We retransmitted the whole piece or more than the whole 6114 * slopping into the next rsm. 6115 */ 6116 rack_update_rsm(tp, rack, rsm, ts); 6117 if (c_end == rsm->r_end) { 6118 *lenp = 0; 6119 return (0); 6120 } else { 6121 int32_t act_len; 6122 6123 /* Hangs over the end return whats left */ 6124 act_len = rsm->r_end - rsm->r_start; 6125 *lenp = (len - act_len); 6126 return (rsm->r_end); 6127 } 6128 /* We don't get out of this block. 
*/ 6129 } 6130 /* 6131 * Here we retransmitted less than the whole thing which means we 6132 * have to split this into what was transmitted and what was not. 6133 */ 6134 nrsm = rack_alloc_full_limit(rack); 6135 if (nrsm == NULL) { 6136 /* 6137 * We can't get memory, so lets not proceed. 6138 */ 6139 *lenp = 0; 6140 return (0); 6141 } 6142 /* 6143 * So here we are going to take the original rsm and make it what we 6144 * retransmitted. nrsm will be the tail portion we did not 6145 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 6146 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 6147 * 1, 6 and the new piece will be 6, 11. 6148 */ 6149 rack_clone_rsm(rack, nrsm, rsm, c_end); 6150 nrsm->r_dupack = 0; 6151 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 6152 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6153 #ifdef INVARIANTS 6154 if (insret != NULL) { 6155 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6156 nrsm, insret, rack, rsm); 6157 } 6158 #endif 6159 if (rsm->r_in_tmap) { 6160 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6161 nrsm->r_in_tmap = 1; 6162 } 6163 rsm->r_flags &= (~RACK_HAS_FIN); 6164 rack_update_rsm(tp, rack, rsm, ts); 6165 *lenp = 0; 6166 return (0); 6167 } 6168 6169 6170 static void 6171 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 6172 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 6173 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) 6174 { 6175 struct tcp_rack *rack; 6176 struct rack_sendmap *rsm, *nrsm, *insret, fe; 6177 register uint32_t snd_max, snd_una; 6178 6179 /* 6180 * Add to the RACK log of packets in flight or retransmitted. If 6181 * there is a TS option we will use the TS echoed, if not we will 6182 * grab a TS. 6183 * 6184 * Retransmissions will increment the count and move the ts to its 6185 * proper place. Note that if options do not include TS's then we 6186 * won't be able to effectively use the ACK for an RTT on a retran. 6187 * 6188 * Notes about r_start and r_end. Lets consider a send starting at 6189 * sequence 1 for 10 bytes. In such an example the r_start would be 6190 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 6191 * This means that r_end is actually the first sequence for the next 6192 * slot (11). 6193 * 6194 */ 6195 /* 6196 * If err is set what do we do XXXrrs? should we not add the thing? 6197 * -- i.e. return if err != 0 or should we pretend we sent it? -- 6198 * i.e. proceed with add ** do this for now. 6199 */ 6200 INP_WLOCK_ASSERT(tp->t_inpcb); 6201 if (err) 6202 /* 6203 * We don't log errors -- we could but snd_max does not 6204 * advance in this case either. 6205 */ 6206 return; 6207 6208 if (th_flags & TH_RST) { 6209 /* 6210 * We don't log resets and we return immediately from 6211 * sending 6212 */ 6213 return; 6214 } 6215 rack = (struct tcp_rack *)tp->t_fb_ptr; 6216 snd_una = tp->snd_una; 6217 if (SEQ_LEQ((seq_out + len), snd_una)) { 6218 /* Are sending an old segment to induce an ack (keep-alive)? */ 6219 return; 6220 } 6221 if (SEQ_LT(seq_out, snd_una)) { 6222 /* huh? should we panic? */ 6223 uint32_t end; 6224 6225 end = seq_out + len; 6226 seq_out = snd_una; 6227 if (SEQ_GEQ(end, seq_out)) 6228 len = end - seq_out; 6229 else 6230 len = 0; 6231 } 6232 snd_max = tp->snd_max; 6233 if (th_flags & (TH_SYN | TH_FIN)) { 6234 /* 6235 * The call to rack_log_output is made before bumping 6236 * snd_max. 
This means we can record one extra byte on a SYN 6237 * or FIN if seq_out is adding more on and a FIN is present 6238 * (and we are not resending). 6239 */ 6240 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 6241 len++; 6242 if (th_flags & TH_FIN) 6243 len++; 6244 if (SEQ_LT(snd_max, tp->snd_nxt)) { 6245 /* 6246 * The add/update as not been done for the FIN/SYN 6247 * yet. 6248 */ 6249 snd_max = tp->snd_nxt; 6250 } 6251 } 6252 if (len == 0) { 6253 /* We don't log zero window probes */ 6254 return; 6255 } 6256 rack->r_ctl.rc_time_last_sent = ts; 6257 if (IN_RECOVERY(tp->t_flags)) { 6258 rack->r_ctl.rc_prr_out += len; 6259 } 6260 /* First question is it a retransmission or new? */ 6261 if (seq_out == snd_max) { 6262 /* Its new */ 6263 again: 6264 rsm = rack_alloc(rack); 6265 if (rsm == NULL) { 6266 /* 6267 * Hmm out of memory and the tcb got destroyed while 6268 * we tried to wait. 6269 */ 6270 return; 6271 } 6272 if (th_flags & TH_FIN) { 6273 rsm->r_flags = RACK_HAS_FIN; 6274 } else { 6275 rsm->r_flags = 0; 6276 } 6277 rsm->r_tim_lastsent[0] = ts; 6278 rsm->r_rtr_cnt = 1; 6279 rsm->r_rtr_bytes = 0; 6280 rsm->usec_orig_send = us_cts; 6281 if (th_flags & TH_SYN) { 6282 /* The data space is one beyond snd_una */ 6283 rsm->r_flags |= RACK_HAS_SIN; 6284 rsm->r_start = seq_out + 1; 6285 rsm->r_end = rsm->r_start + (len - 1); 6286 } else { 6287 /* Normal case */ 6288 rsm->r_start = seq_out; 6289 rsm->r_end = rsm->r_start + len; 6290 } 6291 rsm->r_dupack = 0; 6292 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6293 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6294 #ifdef INVARIANTS 6295 if (insret != NULL) { 6296 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6297 nrsm, insret, rack, rsm); 6298 } 6299 #endif 6300 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6301 rsm->r_in_tmap = 1; 6302 /* 6303 * Special case detection, is there just a single 6304 * packet outstanding when we are not in recovery? 6305 * 6306 * If this is true mark it so. 6307 */ 6308 if ((IN_RECOVERY(tp->t_flags) == 0) && 6309 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 6310 struct rack_sendmap *prsm; 6311 6312 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6313 if (prsm) 6314 prsm->r_one_out_nr = 1; 6315 } 6316 return; 6317 } 6318 /* 6319 * If we reach here its a retransmission and we need to find it. 6320 */ 6321 memset(&fe, 0, sizeof(fe)); 6322 more: 6323 if (hintrsm && (hintrsm->r_start == seq_out)) { 6324 rsm = hintrsm; 6325 hintrsm = NULL; 6326 } else { 6327 /* No hints sorry */ 6328 rsm = NULL; 6329 } 6330 if ((rsm) && (rsm->r_start == seq_out)) { 6331 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6332 if (len == 0) { 6333 return; 6334 } else { 6335 goto more; 6336 } 6337 } 6338 /* Ok it was not the last pointer go through it the hard way. 
*/ 6339 refind: 6340 fe.r_start = seq_out; 6341 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6342 if (rsm) { 6343 if (rsm->r_start == seq_out) { 6344 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6345 if (len == 0) { 6346 return; 6347 } else { 6348 goto refind; 6349 } 6350 } 6351 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6352 /* Transmitted within this piece */ 6353 /* 6354 * Ok we must split off the front and then let the 6355 * update do the rest 6356 */ 6357 nrsm = rack_alloc_full_limit(rack); 6358 if (nrsm == NULL) { 6359 rack_update_rsm(tp, rack, rsm, ts); 6360 return; 6361 } 6362 /* 6363 * copy rsm to nrsm and then trim the front of rsm 6364 * to not include this part. 6365 */ 6366 rack_clone_rsm(rack, nrsm, rsm, seq_out); 6367 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6368 #ifdef INVARIANTS 6369 if (insret != NULL) { 6370 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6371 nrsm, insret, rack, rsm); 6372 } 6373 #endif 6374 if (rsm->r_in_tmap) { 6375 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6376 nrsm->r_in_tmap = 1; 6377 } 6378 rsm->r_flags &= (~RACK_HAS_FIN); 6379 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 6380 if (len == 0) { 6381 return; 6382 } else if (len > 0) 6383 goto refind; 6384 } 6385 } 6386 /* 6387 * Hmm not found in map did they retransmit both old and on into the 6388 * new? 6389 */ 6390 if (seq_out == tp->snd_max) { 6391 goto again; 6392 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6393 #ifdef INVARIANTS 6394 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6395 seq_out, len, tp->snd_una, tp->snd_max); 6396 printf("Starting Dump of all rack entries\n"); 6397 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6398 printf("rsm:%p start:%u end:%u\n", 6399 rsm, rsm->r_start, rsm->r_end); 6400 } 6401 printf("Dump complete\n"); 6402 panic("seq_out not found rack:%p tp:%p", 6403 rack, tp); 6404 #endif 6405 } else { 6406 #ifdef INVARIANTS 6407 /* 6408 * Hmm beyond sndmax? (only if we are using the new rtt-pack 6409 * flag) 6410 */ 6411 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6412 seq_out, len, tp->snd_max, tp); 6413 #endif 6414 } 6415 } 6416 6417 /* 6418 * Record one of the RTT updates from an ack into 6419 * our sample structure. 6420 */ 6421 6422 static void 6423 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 6424 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 6425 { 6426 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6427 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 6428 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 6429 } 6430 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6431 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 6432 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 6433 } 6434 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 6435 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 6436 rack->r_ctl.rc_gp_lowrtt = us_rtt; 6437 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 6438 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 6439 } 6440 if ((confidence == 1) && 6441 ((rsm == NULL) || 6442 (rsm->r_just_ret) || 6443 (rsm->r_one_out_nr && 6444 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 6445 /* 6446 * If the rsm had a just return 6447 * hit it then we can't trust the 6448 * rtt measurement for buffer deterimination 6449 * Note that a confidence of 2, indicates 6450 * SACK'd which overrides the r_just_ret or 6451 * the r_one_out_nr. 
If it was a CUM-ACK and 6452 * we had only two outstanding, but get an 6453 * ack for only 1. Then that also lowers our 6454 * confidence. 6455 */ 6456 confidence = 0; 6457 } 6458 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6459 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 6460 if (rack->r_ctl.rack_rs.confidence == 0) { 6461 /* 6462 * We take anything with no current confidence 6463 * saved. 6464 */ 6465 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6466 rack->r_ctl.rack_rs.confidence = confidence; 6467 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6468 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 6469 /* 6470 * Once we have a confident number, 6471 * we can update it with a smaller 6472 * value since this confident number 6473 * may include the DSACK time until 6474 * the next segment (the second one) arrived. 6475 */ 6476 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6477 rack->r_ctl.rack_rs.confidence = confidence; 6478 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6479 } 6480 6481 } 6482 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 6483 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 6484 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 6485 rack->r_ctl.rack_rs.rs_rtt_cnt++; 6486 } 6487 6488 /* 6489 * Collect new round-trip time estimate 6490 * and update averages and current timeout. 6491 */ 6492 static void 6493 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 6494 { 6495 int32_t delta; 6496 uint32_t o_srtt, o_var; 6497 int32_t hrtt_up = 0; 6498 int32_t rtt; 6499 6500 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 6501 /* No valid sample */ 6502 return; 6503 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 6504 /* We are to use the lowest RTT seen in a single ack */ 6505 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 6506 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 6507 /* We are to use the highest RTT seen in a single ack */ 6508 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 6509 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 6510 /* We are to use the average RTT seen in a single ack */ 6511 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 6512 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 6513 } else { 6514 #ifdef INVARIANTS 6515 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 6516 #endif 6517 return; 6518 } 6519 if (rtt == 0) 6520 rtt = 1; 6521 if (rack->rc_gp_rtt_set == 0) { 6522 /* 6523 * With no RTT we have to accept 6524 * even one we are not confident of. 6525 */ 6526 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 6527 rack->rc_gp_rtt_set = 1; 6528 } else if (rack->r_ctl.rack_rs.confidence) { 6529 /* update the running gp srtt */ 6530 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 6531 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 6532 } 6533 if (rack->r_ctl.rack_rs.confidence) { 6534 /* 6535 * record the low and high for highly buffered path computation, 6536 * we only do this if we are confident (not a retransmission). 6537 */ 6538 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 6539 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6540 hrtt_up = 1; 6541 } 6542 if (rack->rc_highly_buffered == 0) { 6543 /* 6544 * Currently once we declare a path has 6545 * highly buffered there is no going 6546 * back, which may be a problem... 
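 * The declaration itself is just the ratio of the highest
 * to the lowest us_rtt seen exceeding rack_hbp_thresh (the
 * check directly below).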
6547 */ 6548 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 6549 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 6550 rack->r_ctl.rc_highest_us_rtt, 6551 rack->r_ctl.rc_lowest_us_rtt, 6552 RACK_RTTS_SEEHBP); 6553 rack->rc_highly_buffered = 1; 6554 } 6555 } 6556 } 6557 if ((rack->r_ctl.rack_rs.confidence) || 6558 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 6559 /* 6560 * If we are highly confident of it <or> it was 6561 * never retransmitted we accept it as the last us_rtt. 6562 */ 6563 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6564 /* The lowest rtt can be set if its was not retransmited */ 6565 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 6566 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6567 if (rack->r_ctl.rc_lowest_us_rtt == 0) 6568 rack->r_ctl.rc_lowest_us_rtt = 1; 6569 } 6570 } 6571 rack_log_rtt_sample(rack, rtt); 6572 o_srtt = tp->t_srtt; 6573 o_var = tp->t_rttvar; 6574 rack = (struct tcp_rack *)tp->t_fb_ptr; 6575 if (tp->t_srtt != 0) { 6576 /* 6577 * srtt is stored as fixed point with 5 bits after the 6578 * binary point (i.e., scaled by 8). The following magic is 6579 * equivalent to the smoothing algorithm in rfc793 with an 6580 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 6581 * Adjust rtt to origin 0. 6582 */ 6583 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 6584 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 6585 6586 tp->t_srtt += delta; 6587 if (tp->t_srtt <= 0) 6588 tp->t_srtt = 1; 6589 6590 /* 6591 * We accumulate a smoothed rtt variance (actually, a 6592 * smoothed mean difference), then set the retransmit timer 6593 * to smoothed rtt + 4 times the smoothed variance. rttvar 6594 * is stored as fixed point with 4 bits after the binary 6595 * point (scaled by 16). The following is equivalent to 6596 * rfc793 smoothing with an alpha of .75 (rttvar = 6597 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 6598 * wired-in beta. 6599 */ 6600 if (delta < 0) 6601 delta = -delta; 6602 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 6603 tp->t_rttvar += delta; 6604 if (tp->t_rttvar <= 0) 6605 tp->t_rttvar = 1; 6606 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 6607 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6608 } else { 6609 /* 6610 * No rtt measurement yet - use the unsmoothed rtt. Set the 6611 * variance to half the rtt (so our first retransmit happens 6612 * at 3*rtt). 6613 */ 6614 tp->t_srtt = rtt << TCP_RTT_SHIFT; 6615 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 6616 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6617 } 6618 KMOD_TCPSTAT_INC(tcps_rttupdated); 6619 tp->t_rttupdated++; 6620 #ifdef STATS 6621 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 6622 #endif 6623 tp->t_rxtshift = 0; 6624 6625 /* 6626 * the retransmit should happen at rtt + 4 * rttvar. Because of the 6627 * way we do the smoothing, srtt and rttvar will each average +1/2 6628 * tick of bias. When we compute the retransmit timer, we want 1/2 6629 * tick of rounding and 1 extra tick because of +-1/2 tick 6630 * uncertainty in the firing of the timer. The bias will give us 6631 * exactly the 1.5 tick we need. But, because the bias is 6632 * statistical, we have to test that we don't drop below the minimum 6633 * feasible timer (which is 2 ticks). 
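 * Here the lower bound additionally honors rack_rto_min
 * (whichever of rack_rto_min and rtt + 2 is larger) and
 * the upper bound is rack_rto_max.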
6634 */ 6635 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 6636 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 6637 tp->t_softerror = 0; 6638 } 6639 6640 static void 6641 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 6642 uint32_t t, uint32_t cts) 6643 { 6644 /* 6645 * For this RSM, we acknowledged the data from a previous 6646 * transmission, not the last one we made. This means we did a false 6647 * retransmit. 6648 */ 6649 struct tcp_rack *rack; 6650 6651 if (rsm->r_flags & RACK_HAS_FIN) { 6652 /* 6653 * The sending of the FIN often is multiple sent when we 6654 * have everything outstanding ack'd. We ignore this case 6655 * since its over now. 6656 */ 6657 return; 6658 } 6659 if (rsm->r_flags & RACK_TLP) { 6660 /* 6661 * We expect TLP's to have this occur. 6662 */ 6663 return; 6664 } 6665 rack = (struct tcp_rack *)tp->t_fb_ptr; 6666 /* should we undo cc changes and exit recovery? */ 6667 if (IN_RECOVERY(tp->t_flags)) { 6668 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 6669 /* 6670 * Undo what we ratched down and exit recovery if 6671 * possible 6672 */ 6673 EXIT_RECOVERY(tp->t_flags); 6674 tp->snd_recover = tp->snd_una; 6675 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 6676 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 6677 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 6678 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 6679 } 6680 } 6681 if (rsm->r_flags & RACK_WAS_SACKPASS) { 6682 /* 6683 * We retransmitted based on a sack and the earlier 6684 * retransmission ack'd it - re-ordering is occuring. 6685 */ 6686 counter_u64_add(rack_reorder_seen, 1); 6687 rack->r_ctl.rc_reorder_ts = cts; 6688 } 6689 counter_u64_add(rack_badfr, 1); 6690 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 6691 } 6692 6693 static void 6694 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 6695 { 6696 /* 6697 * Apply to filter the inbound us-rtt at us_cts. 6698 */ 6699 uint32_t old_rtt; 6700 6701 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 6702 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 6703 us_rtt, us_cts); 6704 if (rack->r_ctl.last_pacing_time && 6705 rack->rc_gp_dyn_mul && 6706 (rack->r_ctl.last_pacing_time > us_rtt)) 6707 rack->pacing_longer_than_rtt = 1; 6708 else 6709 rack->pacing_longer_than_rtt = 0; 6710 if (old_rtt > us_rtt) { 6711 /* We just hit a new lower rtt time */ 6712 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 6713 __LINE__, RACK_RTTS_NEWRTT); 6714 /* 6715 * Only count it if its lower than what we saw within our 6716 * calculated range. 6717 */ 6718 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 6719 if (rack_probertt_lower_within && 6720 rack->rc_gp_dyn_mul && 6721 (rack->use_fixed_rate == 0) && 6722 (rack->rc_always_pace)) { 6723 /* 6724 * We are seeing a new lower rtt very close 6725 * to the time that we would have entered probe-rtt. 6726 * This is probably due to the fact that a peer flow 6727 * has entered probe-rtt. Lets go in now too. 
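 * The calculation below treats rack_probertt_lower_within as a
 * percentage of rack_time_between_probertt. Purely as an
 * illustration: with a 30 second gap between probe-rtts and a
 * setting of 10 (percent), val becomes 3 seconds, so a new lower
 * rtt seen in the last 3 seconds of the gap pulls probe-rtt
 * entry forward instead of waiting out the remainder.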
6728 */ 6729 uint32_t val; 6730 6731 val = rack_probertt_lower_within * rack_time_between_probertt; 6732 val /= 100; 6733 if ((rack->in_probe_rtt == 0) && 6734 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 6735 rack_enter_probertt(rack, us_cts); 6736 } 6737 } 6738 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6739 } 6740 } 6741 } 6742 6743 static int 6744 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 6745 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 6746 { 6747 int32_t i; 6748 uint32_t t, len_acked; 6749 6750 if ((rsm->r_flags & RACK_ACKED) || 6751 (rsm->r_flags & RACK_WAS_ACKED)) 6752 /* Already done */ 6753 return (0); 6754 6755 if (ack_type == CUM_ACKED) { 6756 if (SEQ_GT(th_ack, rsm->r_end)) 6757 len_acked = rsm->r_end - rsm->r_start; 6758 else 6759 len_acked = th_ack - rsm->r_start; 6760 } else 6761 len_acked = rsm->r_end - rsm->r_start; 6762 if (rsm->r_rtr_cnt == 1) { 6763 uint32_t us_rtt; 6764 6765 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6766 if ((int)t <= 0) 6767 t = 1; 6768 if (!tp->t_rttlow || tp->t_rttlow > t) 6769 tp->t_rttlow = t; 6770 if (!rack->r_ctl.rc_rack_min_rtt || 6771 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6772 rack->r_ctl.rc_rack_min_rtt = t; 6773 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6774 rack->r_ctl.rc_rack_min_rtt = 1; 6775 } 6776 } 6777 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; 6778 if (us_rtt == 0) 6779 us_rtt = 1; 6780 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 6781 if (ack_type == SACKED) 6782 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 6783 else { 6784 /* 6785 * For cum-ack we are only confident if what 6786 * is being acked is included in a measurement. 6787 * Otherwise it could be an idle period that 6788 * includes Delayed-ack time. 6789 */ 6790 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 6791 (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); 6792 } 6793 if ((rsm->r_flags & RACK_TLP) && 6794 (!IN_RECOVERY(tp->t_flags))) { 6795 /* Segment was a TLP and our retrans matched */ 6796 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 6797 rack->r_ctl.rc_rsm_start = tp->snd_max; 6798 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 6799 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 6800 rack_cong_signal(tp, NULL, CC_NDUPACK); 6801 /* 6802 * When we enter recovery we need to assure 6803 * we send one packet. 6804 */ 6805 if (rack->rack_no_prr == 0) { 6806 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6807 rack_log_to_prr(rack, 7, 0); 6808 } 6809 } 6810 } 6811 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6812 /* New more recent rack_tmit_time */ 6813 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6814 rack->rc_rack_rtt = t; 6815 } 6816 return (1); 6817 } 6818 /* 6819 * We clear the soft/rxtshift since we got an ack. 6820 * There is no assurance we will call the commit() function 6821 * so we need to clear these to avoid incorrect handling. 6822 */ 6823 tp->t_rxtshift = 0; 6824 tp->t_softerror = 0; 6825 if ((to->to_flags & TOF_TS) && 6826 (ack_type == CUM_ACKED) && 6827 (to->to_tsecr) && 6828 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 6829 /* 6830 * Now which timestamp does it match? In this block the ACK 6831 * must be coming from a previous transmission. 
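 * As a sketch of the match below: suppose the segment was first
 * sent at tick 100 and retransmitted at tick 200, and an ACK
 * arrives at tick 250 echoing a TSecr of 100. That matches the
 * first (not the last) transmission, so t is taken as 250 - 100
 * and rack_earlier_retran() is told the retransmit was
 * unnecessary.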
6832 */ 6833 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6834 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 6835 t = cts - rsm->r_tim_lastsent[i]; 6836 if ((int)t <= 0) 6837 t = 1; 6838 if ((i + 1) < rsm->r_rtr_cnt) { 6839 /* Likely */ 6840 rack_earlier_retran(tp, rsm, t, cts); 6841 } 6842 if (!tp->t_rttlow || tp->t_rttlow > t) 6843 tp->t_rttlow = t; 6844 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6845 rack->r_ctl.rc_rack_min_rtt = t; 6846 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6847 rack->r_ctl.rc_rack_min_rtt = 1; 6848 } 6849 } 6850 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 6851 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6852 /* New more recent rack_tmit_time */ 6853 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6854 rack->rc_rack_rtt = t; 6855 } 6856 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, 6857 rsm->r_rtr_cnt); 6858 return (1); 6859 } 6860 } 6861 goto ts_not_found; 6862 } else { 6863 /* 6864 * Ok its a SACK block that we retransmitted. or a windows 6865 * machine without timestamps. We can tell nothing from the 6866 * time-stamp since its not there or the time the peer last 6867 * recieved a segment that moved forward its cum-ack point. 6868 */ 6869 ts_not_found: 6870 i = rsm->r_rtr_cnt - 1; 6871 t = cts - rsm->r_tim_lastsent[i]; 6872 if ((int)t <= 0) 6873 t = 1; 6874 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6875 /* 6876 * We retransmitted and the ack came back in less 6877 * than the smallest rtt we have observed. We most 6878 * likey did an improper retransmit as outlined in 6879 * 4.2 Step 3 point 2 in the rack-draft. 6880 */ 6881 i = rsm->r_rtr_cnt - 2; 6882 t = cts - rsm->r_tim_lastsent[i]; 6883 rack_earlier_retran(tp, rsm, t, cts); 6884 } else if (rack->r_ctl.rc_rack_min_rtt) { 6885 /* 6886 * We retransmitted it and the retransmit did the 6887 * job. 6888 */ 6889 if (!rack->r_ctl.rc_rack_min_rtt || 6890 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6891 rack->r_ctl.rc_rack_min_rtt = t; 6892 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6893 rack->r_ctl.rc_rack_min_rtt = 1; 6894 } 6895 } 6896 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 6897 /* New more recent rack_tmit_time */ 6898 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 6899 rack->rc_rack_rtt = t; 6900 } 6901 return (1); 6902 } 6903 } 6904 return (0); 6905 } 6906 6907 /* 6908 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 6909 */ 6910 static void 6911 rack_log_sack_passed(struct tcpcb *tp, 6912 struct tcp_rack *rack, struct rack_sendmap *rsm) 6913 { 6914 struct rack_sendmap *nrsm; 6915 6916 nrsm = rsm; 6917 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 6918 rack_head, r_tnext) { 6919 if (nrsm == rsm) { 6920 /* Skip orginal segment he is acked */ 6921 continue; 6922 } 6923 if (nrsm->r_flags & RACK_ACKED) { 6924 /* 6925 * Skip ack'd segments, though we 6926 * should not see these, since tmap 6927 * should not have ack'd segments. 6928 */ 6929 continue; 6930 } 6931 if (nrsm->r_flags & RACK_SACK_PASSED) { 6932 /* 6933 * We found one that is already marked 6934 * passed, we have been here before and 6935 * so all others below this are marked. 
6936 */ 6937 break; 6938 } 6939 nrsm->r_flags |= RACK_SACK_PASSED; 6940 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 6941 } 6942 } 6943 6944 static void 6945 rack_need_set_test(struct tcpcb *tp, 6946 struct tcp_rack *rack, 6947 struct rack_sendmap *rsm, 6948 tcp_seq th_ack, 6949 int line, 6950 int use_which) 6951 { 6952 6953 if ((tp->t_flags & TF_GPUTINPROG) && 6954 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6955 /* 6956 * We were app limited, and this ack 6957 * butts up or goes beyond the point where we want 6958 * to start our next measurement. We need 6959 * to record the new gput_ts as here and 6960 * possibly update the start sequence. 6961 */ 6962 uint32_t seq, ts; 6963 6964 if (rsm->r_rtr_cnt > 1) { 6965 /* 6966 * This is a retransmit, can we 6967 * really make any assessment at this 6968 * point? We are not really sure of 6969 * the timestamp, is it this or the 6970 * previous transmission? 6971 * 6972 * Lets wait for something better that 6973 * is not retransmitted. 6974 */ 6975 return; 6976 } 6977 seq = tp->gput_seq; 6978 ts = tp->gput_ts; 6979 rack->app_limited_needs_set = 0; 6980 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 6981 /* Do we start at a new end? */ 6982 if ((use_which == RACK_USE_BEG) && 6983 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 6984 /* 6985 * When we get an ACK that just eats 6986 * up some of the rsm, we set RACK_USE_BEG 6987 * since whats at r_start (i.e. th_ack) 6988 * is left unacked and thats where the 6989 * measurement not starts. 6990 */ 6991 tp->gput_seq = rsm->r_start; 6992 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6993 } 6994 if ((use_which == RACK_USE_END) && 6995 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6996 /* 6997 * We use the end when the cumack 6998 * is moving forward and completely 6999 * deleting the rsm passed so basically 7000 * r_end holds th_ack. 7001 * 7002 * For SACK's we also want to use the end 7003 * since this piece just got sacked and 7004 * we want to target anything after that 7005 * in our measurement. 7006 */ 7007 tp->gput_seq = rsm->r_end; 7008 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 7009 } 7010 if (use_which == RACK_USE_END_OR_THACK) { 7011 /* 7012 * special case for ack moving forward, 7013 * not a sack, we need to move all the 7014 * way up to where this ack cum-ack moves 7015 * to. 7016 */ 7017 if (SEQ_GT(th_ack, rsm->r_end)) 7018 tp->gput_seq = th_ack; 7019 else 7020 tp->gput_seq = rsm->r_end; 7021 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 7022 } 7023 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 7024 /* 7025 * We moved beyond this guy's range, re-calculate 7026 * the new end point. 7027 */ 7028 if (rack->rc_gp_filled == 0) { 7029 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 7030 } else { 7031 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 7032 } 7033 } 7034 /* 7035 * We are moving the goal post, we may be able to clear the 7036 * measure_saw_probe_rtt flag. 7037 */ 7038 if ((rack->in_probe_rtt == 0) && 7039 (rack->measure_saw_probe_rtt) && 7040 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 7041 rack->measure_saw_probe_rtt = 0; 7042 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 7043 seq, tp->gput_seq, 0, 5, line, NULL); 7044 if (rack->rc_gp_filled && 7045 ((tp->gput_ack - tp->gput_seq) < 7046 max(rc_init_window(rack), (MIN_GP_WIN * 7047 ctf_fixed_maxseg(tp))))) { 7048 /* 7049 * There is no sense of continuing this measurement 7050 * because its too small to gain us anything we 7051 * trust. 
Skip it and that way we can start a new 7052 * measurement quicker. 7053 */ 7054 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 7055 0, 0, 0, 6, __LINE__, NULL); 7056 tp->t_flags &= ~TF_GPUTINPROG; 7057 } 7058 } 7059 } 7060 7061 static uint32_t 7062 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 7063 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 7064 { 7065 uint32_t start, end, changed = 0; 7066 struct rack_sendmap stack_map; 7067 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 7068 int32_t used_ref = 1; 7069 int moved = 0; 7070 7071 start = sack->start; 7072 end = sack->end; 7073 rsm = *prsm; 7074 memset(&fe, 0, sizeof(fe)); 7075 do_rest_ofb: 7076 if ((rsm == NULL) || 7077 (SEQ_LT(end, rsm->r_start)) || 7078 (SEQ_GEQ(start, rsm->r_end)) || 7079 (SEQ_LT(start, rsm->r_start))) { 7080 /* 7081 * We are not in the right spot, 7082 * find the correct spot in the tree. 7083 */ 7084 used_ref = 0; 7085 fe.r_start = start; 7086 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7087 moved++; 7088 } 7089 if (rsm == NULL) { 7090 /* TSNH */ 7091 goto out; 7092 } 7093 /* Ok we have an ACK for some piece of this rsm */ 7094 if (rsm->r_start != start) { 7095 if ((rsm->r_flags & RACK_ACKED) == 0) { 7096 /** 7097 * Need to split this in two pieces the before and after, 7098 * the before remains in the map, the after must be 7099 * added. In other words we have: 7100 * rsm |--------------| 7101 * sackblk |-------> 7102 * rsm will become 7103 * rsm |---| 7104 * and nrsm will be the sacked piece 7105 * nrsm |----------| 7106 * 7107 * But before we start down that path lets 7108 * see if the sack spans over on top of 7109 * the next guy and it is already sacked. 7110 */ 7111 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7112 if (next && (next->r_flags & RACK_ACKED) && 7113 SEQ_GEQ(end, next->r_start)) { 7114 /** 7115 * So the next one is already acked, and 7116 * we can thus by hookery use our stack_map 7117 * to reflect the piece being sacked and 7118 * then adjust the two tree entries moving 7119 * the start and ends around. So we start like: 7120 * rsm |------------| (not-acked) 7121 * next |-----------| (acked) 7122 * sackblk |--------> 7123 * We want to end like so: 7124 * rsm |------| (not-acked) 7125 * next |-----------------| (acked) 7126 * nrsm |-----| 7127 * Where nrsm is a temporary stack piece we 7128 * use to update all the gizmos. 7129 */ 7130 /* Copy up our fudge block */ 7131 nrsm = &stack_map; 7132 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7133 /* Now adjust our tree blocks */ 7134 rsm->r_end = start; 7135 next->r_start = start; 7136 /* Clear out the dup ack count of the remainder */ 7137 rsm->r_dupack = 0; 7138 rsm->r_just_ret = 0; 7139 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7140 /* Now lets make sure our fudge block is right */ 7141 nrsm->r_start = start; 7142 /* Now lets update all the stats and such */ 7143 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7144 if (rack->app_limited_needs_set) 7145 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7146 changed += (nrsm->r_end - nrsm->r_start); 7147 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7148 if (nrsm->r_flags & RACK_SACK_PASSED) { 7149 counter_u64_add(rack_reorder_seen, 1); 7150 rack->r_ctl.rc_reorder_ts = cts; 7151 } 7152 /* 7153 * Now we want to go up from rsm (the 7154 * one left un-acked) to the next one 7155 * in the tmap. 
We do this so when 7156 * we walk backwards we include marking 7157 * sack-passed on rsm (The one passed in 7158 * is skipped since it is generally called 7159 * on something sacked before removing it 7160 * from the tmap). 7161 */ 7162 if (rsm->r_in_tmap) { 7163 nrsm = TAILQ_NEXT(rsm, r_tnext); 7164 /* 7165 * Now that we have the next 7166 * one walk backwards from there. 7167 */ 7168 if (nrsm && nrsm->r_in_tmap) 7169 rack_log_sack_passed(tp, rack, nrsm); 7170 } 7171 /* Now are we done? */ 7172 if (SEQ_LT(end, next->r_end) || 7173 (end == next->r_end)) { 7174 /* Done with block */ 7175 goto out; 7176 } 7177 counter_u64_add(rack_sack_used_next_merge, 1); 7178 /* Postion for the next block */ 7179 start = next->r_end; 7180 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 7181 if (rsm == NULL) 7182 goto out; 7183 } else { 7184 /** 7185 * We can't use any hookery here, so we 7186 * need to split the map. We enter like 7187 * so: 7188 * rsm |--------| 7189 * sackblk |-----> 7190 * We will add the new block nrsm and 7191 * that will be the new portion, and then 7192 * fall through after reseting rsm. So we 7193 * split and look like this: 7194 * rsm |----| 7195 * sackblk |-----> 7196 * nrsm |---| 7197 * We then fall through reseting 7198 * rsm to nrsm, so the next block 7199 * picks it up. 7200 */ 7201 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7202 if (nrsm == NULL) { 7203 /* 7204 * failed XXXrrs what can we do but loose the sack 7205 * info? 7206 */ 7207 goto out; 7208 } 7209 counter_u64_add(rack_sack_splits, 1); 7210 rack_clone_rsm(rack, nrsm, rsm, start); 7211 rsm->r_just_ret = 0; 7212 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7213 #ifdef INVARIANTS 7214 if (insret != NULL) { 7215 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7216 nrsm, insret, rack, rsm); 7217 } 7218 #endif 7219 if (rsm->r_in_tmap) { 7220 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7221 nrsm->r_in_tmap = 1; 7222 } 7223 rsm->r_flags &= (~RACK_HAS_FIN); 7224 /* Position us to point to the new nrsm that starts the sack blk */ 7225 rsm = nrsm; 7226 } 7227 } else { 7228 /* Already sacked this piece */ 7229 counter_u64_add(rack_sack_skipped_acked, 1); 7230 moved++; 7231 if (end == rsm->r_end) { 7232 /* Done with block */ 7233 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7234 goto out; 7235 } else if (SEQ_LT(end, rsm->r_end)) { 7236 /* A partial sack to a already sacked block */ 7237 moved++; 7238 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7239 goto out; 7240 } else { 7241 /* 7242 * The end goes beyond this guy 7243 * repostion the start to the 7244 * next block. 7245 */ 7246 start = rsm->r_end; 7247 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7248 if (rsm == NULL) 7249 goto out; 7250 } 7251 } 7252 } 7253 if (SEQ_GEQ(end, rsm->r_end)) { 7254 /** 7255 * The end of this block is either beyond this guy or right 7256 * at this guy. I.e.: 7257 * rsm --- |-----| 7258 * end |-----| 7259 * <or> 7260 * end |---------| 7261 */ 7262 if ((rsm->r_flags & RACK_ACKED) == 0) { 7263 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7264 changed += (rsm->r_end - rsm->r_start); 7265 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7266 if (rsm->r_in_tmap) /* should be true */ 7267 rack_log_sack_passed(tp, rack, rsm); 7268 /* Is Reordering occuring? 
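 * If this block was previously passed over (RACK_SACK_PASSED) by
 * a SACK for later data and is only now being SACKed itself, the
 * peer received the later data first, i.e. reordering. For
 * example: A (1000-2000) and B (2000-3000) are sent in order, a
 * SACK for B arrives first (marking A passed), then a SACK for A
 * arrives here.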
*/ 7269 if (rsm->r_flags & RACK_SACK_PASSED) { 7270 rsm->r_flags &= ~RACK_SACK_PASSED; 7271 counter_u64_add(rack_reorder_seen, 1); 7272 rack->r_ctl.rc_reorder_ts = cts; 7273 } 7274 if (rack->app_limited_needs_set) 7275 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7276 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7277 rsm->r_flags |= RACK_ACKED; 7278 rsm->r_flags &= ~RACK_TLP; 7279 if (rsm->r_in_tmap) { 7280 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7281 rsm->r_in_tmap = 0; 7282 } 7283 } else { 7284 counter_u64_add(rack_sack_skipped_acked, 1); 7285 moved++; 7286 } 7287 if (end == rsm->r_end) { 7288 /* This block only - done, setup for next */ 7289 goto out; 7290 } 7291 /* 7292 * There is more not coverend by this rsm move on 7293 * to the next block in the RB tree. 7294 */ 7295 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7296 start = rsm->r_end; 7297 rsm = nrsm; 7298 if (rsm == NULL) 7299 goto out; 7300 goto do_rest_ofb; 7301 } 7302 /** 7303 * The end of this sack block is smaller than 7304 * our rsm i.e.: 7305 * rsm --- |-----| 7306 * end |--| 7307 */ 7308 if ((rsm->r_flags & RACK_ACKED) == 0) { 7309 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7310 if (prev && (prev->r_flags & RACK_ACKED)) { 7311 /** 7312 * Goal, we want the right remainder of rsm to shrink 7313 * in place and span from (rsm->r_start = end) to rsm->r_end. 7314 * We want to expand prev to go all the way 7315 * to prev->r_end <- end. 7316 * so in the tree we have before: 7317 * prev |--------| (acked) 7318 * rsm |-------| (non-acked) 7319 * sackblk |-| 7320 * We churn it so we end up with 7321 * prev |----------| (acked) 7322 * rsm |-----| (non-acked) 7323 * nrsm |-| (temporary) 7324 */ 7325 nrsm = &stack_map; 7326 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7327 prev->r_end = end; 7328 rsm->r_start = end; 7329 /* Now adjust nrsm (stack copy) to be 7330 * the one that is the small 7331 * piece that was "sacked". 7332 */ 7333 nrsm->r_end = end; 7334 rsm->r_dupack = 0; 7335 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7336 /* 7337 * Now nrsm is our new little piece 7338 * that is acked (which was merged 7339 * to prev). Update the rtt and changed 7340 * based on that. Also check for reordering. 7341 */ 7342 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7343 if (rack->app_limited_needs_set) 7344 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7345 changed += (nrsm->r_end - nrsm->r_start); 7346 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7347 if (nrsm->r_flags & RACK_SACK_PASSED) { 7348 counter_u64_add(rack_reorder_seen, 1); 7349 rack->r_ctl.rc_reorder_ts = cts; 7350 } 7351 rsm = prev; 7352 counter_u64_add(rack_sack_used_prev_merge, 1); 7353 } else { 7354 /** 7355 * This is the case where our previous 7356 * block is not acked either, so we must 7357 * split the block in two. 7358 */ 7359 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7360 if (nrsm == NULL) { 7361 /* failed rrs what can we do but loose the sack info? */ 7362 goto out; 7363 } 7364 /** 7365 * In this case nrsm becomes 7366 * nrsm->r_start = end; 7367 * nrsm->r_end = rsm->r_end; 7368 * which is un-acked. 7369 * <and> 7370 * rsm->r_end = nrsm->r_start; 7371 * i.e. the remaining un-acked 7372 * piece is left on the left 7373 * hand side. 
7374 * 7375 * So we start like this 7376 * rsm |----------| (not acked) 7377 * sackblk |---| 7378 * build it so we have 7379 * rsm |---| (acked) 7380 * nrsm |------| (not acked) 7381 */ 7382 counter_u64_add(rack_sack_splits, 1); 7383 rack_clone_rsm(rack, nrsm, rsm, end); 7384 rsm->r_flags &= (~RACK_HAS_FIN); 7385 rsm->r_just_ret = 0; 7386 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7387 #ifdef INVARIANTS 7388 if (insret != NULL) { 7389 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7390 nrsm, insret, rack, rsm); 7391 } 7392 #endif 7393 if (rsm->r_in_tmap) { 7394 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7395 nrsm->r_in_tmap = 1; 7396 } 7397 nrsm->r_dupack = 0; 7398 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7399 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7400 changed += (rsm->r_end - rsm->r_start); 7401 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7402 if (rsm->r_in_tmap) /* should be true */ 7403 rack_log_sack_passed(tp, rack, rsm); 7404 /* Is Reordering occuring? */ 7405 if (rsm->r_flags & RACK_SACK_PASSED) { 7406 rsm->r_flags &= ~RACK_SACK_PASSED; 7407 counter_u64_add(rack_reorder_seen, 1); 7408 rack->r_ctl.rc_reorder_ts = cts; 7409 } 7410 if (rack->app_limited_needs_set) 7411 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7412 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7413 rsm->r_flags |= RACK_ACKED; 7414 rsm->r_flags &= ~RACK_TLP; 7415 if (rsm->r_in_tmap) { 7416 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7417 rsm->r_in_tmap = 0; 7418 } 7419 } 7420 } else if (start != end){ 7421 /* 7422 * The block was already acked. 7423 */ 7424 counter_u64_add(rack_sack_skipped_acked, 1); 7425 moved++; 7426 } 7427 out: 7428 if (rsm && (rsm->r_flags & RACK_ACKED)) { 7429 /* 7430 * Now can we merge where we worked 7431 * with either the previous or 7432 * next block? 7433 */ 7434 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7435 while (next) { 7436 if (next->r_flags & RACK_ACKED) { 7437 /* yep this and next can be merged */ 7438 rsm = rack_merge_rsm(rack, rsm, next); 7439 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7440 } else 7441 break; 7442 } 7443 /* Now what about the previous? */ 7444 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7445 while (prev) { 7446 if (prev->r_flags & RACK_ACKED) { 7447 /* yep the previous and this can be merged */ 7448 rsm = rack_merge_rsm(rack, prev, rsm); 7449 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7450 } else 7451 break; 7452 } 7453 } 7454 if (used_ref == 0) { 7455 counter_u64_add(rack_sack_proc_all, 1); 7456 } else { 7457 counter_u64_add(rack_sack_proc_short, 1); 7458 } 7459 /* Save off the next one for quick reference. */ 7460 if (rsm) 7461 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7462 else 7463 nrsm = NULL; 7464 *prsm = rack->r_ctl.rc_sacklast = nrsm; 7465 /* Pass back the moved. 
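 * (moved counts how many times we had to skip over or reposition
 * past already-SACKed map entries without newly SACKing anything;
 * the caller folds it into the sack_moved_extra / sack_count
 * bookkeeping used by the SACK-attack heuristics.)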
*/ 7466 *moved_two = moved; 7467 return (changed); 7468 } 7469 7470 static void inline 7471 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 7472 { 7473 struct rack_sendmap *tmap; 7474 7475 tmap = NULL; 7476 while (rsm && (rsm->r_flags & RACK_ACKED)) { 7477 /* Its no longer sacked, mark it so */ 7478 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7479 #ifdef INVARIANTS 7480 if (rsm->r_in_tmap) { 7481 panic("rack:%p rsm:%p flags:0x%x in tmap?", 7482 rack, rsm, rsm->r_flags); 7483 } 7484 #endif 7485 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 7486 /* Rebuild it into our tmap */ 7487 if (tmap == NULL) { 7488 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7489 tmap = rsm; 7490 } else { 7491 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7492 tmap = rsm; 7493 } 7494 tmap->r_in_tmap = 1; 7495 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7496 } 7497 /* 7498 * Now lets possibly clear the sack filter so we start 7499 * recognizing sacks that cover this area. 7500 */ 7501 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 7502 7503 } 7504 7505 static void 7506 rack_do_decay(struct tcp_rack *rack) 7507 { 7508 struct timeval res; 7509 7510 #define timersub(tvp, uvp, vvp) \ 7511 do { \ 7512 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 7513 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 7514 if ((vvp)->tv_usec < 0) { \ 7515 (vvp)->tv_sec--; \ 7516 (vvp)->tv_usec += 1000000; \ 7517 } \ 7518 } while (0) 7519 7520 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 7521 #undef timersub 7522 7523 rack->r_ctl.input_pkt++; 7524 if ((rack->rc_in_persist) || 7525 (res.tv_sec >= 1) || 7526 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 7527 /* 7528 * Check for decay of non-SAD, 7529 * we want all SAD detection metrics to 7530 * decay 1/4 per second (or more) passed. 7531 */ 7532 uint32_t pkt_delta; 7533 7534 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 7535 /* Update our saved tracking values */ 7536 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 7537 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 7538 /* Now do we escape without decay? */ 7539 #ifdef NETFLIX_EXP_DETECTION 7540 if (rack->rc_in_persist || 7541 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 7542 (pkt_delta < tcp_sad_low_pps)){ 7543 /* 7544 * We don't decay idle connections 7545 * or ones that have a low input pps. 
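 * (Connections that do fall through to the decay below lose
 * roughly a quarter of each counter per interval, per the intent
 * noted above: e.g. a sack_count of 1000 drops to about 750,
 * with ack_count and the move counters shrinking in step so the
 * ratios the detection logic uses stay meaningful.)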
7546 */ 7547 return; 7548 } 7549 /* Decay the counters */ 7550 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 7551 tcp_sad_decay_val); 7552 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 7553 tcp_sad_decay_val); 7554 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 7555 tcp_sad_decay_val); 7556 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 7557 tcp_sad_decay_val); 7558 #endif 7559 } 7560 } 7561 7562 static void 7563 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 7564 { 7565 uint32_t changed, entered_recovery = 0; 7566 struct tcp_rack *rack; 7567 struct rack_sendmap *rsm, *rm; 7568 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7569 register uint32_t th_ack; 7570 int32_t i, j, k, num_sack_blks = 0; 7571 uint32_t cts, acked, ack_point, sack_changed = 0; 7572 int loop_start = 0, moved_two = 0; 7573 uint32_t tsused; 7574 7575 7576 INP_WLOCK_ASSERT(tp->t_inpcb); 7577 if (th->th_flags & TH_RST) { 7578 /* We don't log resets */ 7579 return; 7580 } 7581 rack = (struct tcp_rack *)tp->t_fb_ptr; 7582 cts = tcp_ts_getticks(); 7583 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7584 changed = 0; 7585 th_ack = th->th_ack; 7586 if (rack->sack_attack_disable == 0) 7587 rack_do_decay(rack); 7588 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 7589 /* 7590 * You only get credit for 7591 * MSS and greater (and you get extra 7592 * credit for larger cum-ack moves). 7593 */ 7594 int ac; 7595 7596 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 7597 rack->r_ctl.ack_count += ac; 7598 counter_u64_add(rack_ack_total, ac); 7599 } 7600 if (rack->r_ctl.ack_count > 0xfff00000) { 7601 /* 7602 * reduce the number to keep us under 7603 * a uint32_t. 7604 */ 7605 rack->r_ctl.ack_count /= 2; 7606 rack->r_ctl.sack_count /= 2; 7607 } 7608 if (SEQ_GT(th_ack, tp->snd_una)) { 7609 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 7610 tp->t_acktime = ticks; 7611 } 7612 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 7613 changed = th_ack - rsm->r_start; 7614 if (changed) { 7615 /* 7616 * The ACK point is advancing to th_ack, we must drop off 7617 * the packets in the rack log and calculate any eligble 7618 * RTT's. 7619 */ 7620 rack->r_wanted_output = 1; 7621 more: 7622 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7623 if (rsm == NULL) { 7624 if ((th_ack - 1) == tp->iss) { 7625 /* 7626 * For the SYN incoming case we will not 7627 * have called tcp_output for the sending of 7628 * the SYN, so there will be no map. All 7629 * other cases should probably be a panic. 7630 */ 7631 goto proc_sack; 7632 } 7633 if (tp->t_flags & TF_SENTFIN) { 7634 /* if we send a FIN we will not hav a map */ 7635 goto proc_sack; 7636 } 7637 #ifdef INVARIANTS 7638 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 7639 tp, 7640 th, tp->t_state, rack, 7641 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 7642 #endif 7643 goto proc_sack; 7644 } 7645 if (SEQ_LT(th_ack, rsm->r_start)) { 7646 /* Huh map is missing this */ 7647 #ifdef INVARIANTS 7648 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 7649 rsm->r_start, 7650 th_ack, tp->t_state, rack->r_state); 7651 #endif 7652 goto proc_sack; 7653 } 7654 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 7655 /* Now do we consume the whole thing? */ 7656 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7657 /* Its all consumed. 
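 * For example, if this rsm covers 1000-2000 and th_ack is 2500,
 * the whole entry is freed and left becomes 500, so we loop back
 * (goto more) to consume the next entry as well.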
*/ 7658 uint32_t left; 7659 uint8_t newly_acked; 7660 7661 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7662 rsm->r_rtr_bytes = 0; 7663 /* Record the time of highest cumack sent */ 7664 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7665 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7666 #ifdef INVARIANTS 7667 if (rm != rsm) { 7668 panic("removing head in rack:%p rsm:%p rm:%p", 7669 rack, rsm, rm); 7670 } 7671 #endif 7672 if (rsm->r_in_tmap) { 7673 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7674 rsm->r_in_tmap = 0; 7675 } 7676 newly_acked = 1; 7677 if (rsm->r_flags & RACK_ACKED) { 7678 /* 7679 * It was acked on the scoreboard -- remove 7680 * it from total 7681 */ 7682 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7683 newly_acked = 0; 7684 } else if (rsm->r_flags & RACK_SACK_PASSED) { 7685 /* 7686 * There are segments ACKED on the 7687 * scoreboard further up. We are seeing 7688 * reordering. 7689 */ 7690 rsm->r_flags &= ~RACK_SACK_PASSED; 7691 counter_u64_add(rack_reorder_seen, 1); 7692 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7693 rsm->r_flags |= RACK_ACKED; 7694 rack->r_ctl.rc_reorder_ts = cts; 7695 } 7696 left = th_ack - rsm->r_end; 7697 if (rack->app_limited_needs_set && newly_acked) 7698 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 7699 /* Free back to zone */ 7700 rack_free(rack, rsm); 7701 if (left) { 7702 goto more; 7703 } 7704 goto proc_sack; 7705 } 7706 if (rsm->r_flags & RACK_ACKED) { 7707 /* 7708 * It was acked on the scoreboard -- remove it from 7709 * total for the part being cum-acked. 7710 */ 7711 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7712 } 7713 /* 7714 * Clear the dup ack count for 7715 * the piece that remains. 7716 */ 7717 rsm->r_dupack = 0; 7718 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7719 if (rsm->r_rtr_bytes) { 7720 /* 7721 * It was retransmitted adjust the 7722 * sack holes for what was acked. 7723 */ 7724 int ack_am; 7725 7726 ack_am = (th_ack - rsm->r_start); 7727 if (ack_am >= rsm->r_rtr_bytes) { 7728 rack->r_ctl.rc_holes_rxt -= ack_am; 7729 rsm->r_rtr_bytes -= ack_am; 7730 } 7731 } 7732 /* 7733 * Update where the piece starts and record 7734 * the time of send of highest cumack sent. 7735 */ 7736 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7737 rsm->r_start = th_ack; 7738 if (rack->app_limited_needs_set) 7739 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 7740 7741 } 7742 proc_sack: 7743 /* Check for reneging */ 7744 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7745 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 7746 /* 7747 * The peer has moved snd_una up to 7748 * the edge of this send, i.e. one 7749 * that it had previously acked. The only 7750 * way that can be true if the peer threw 7751 * away data (space issues) that it had 7752 * previously sacked (else it would have 7753 * given us snd_una up to (rsm->r_end). 7754 * We need to undo the acked markings here. 7755 * 7756 * Note we have to look to make sure th_ack is 7757 * our rsm->r_start in case we get an old ack 7758 * where th_ack is behind snd_una. 
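 * For example: the map head covers 5000-6000 and was marked
 * RACK_ACKED by an earlier SACK, and now an ACK arrives with
 * th_ack == 5000. The peer's cumulative point stopping exactly
 * at data it previously SACKed means it discarded that data, so
 * rack_peer_reneges() below clears the ACKED markings and puts
 * the entries back on the tmap so they can be retransmitted.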
7759 */ 7760 rack_peer_reneges(rack, rsm, th->th_ack); 7761 } 7762 if ((to->to_flags & TOF_SACK) == 0) { 7763 /* We are done nothing left */ 7764 goto out; 7765 } 7766 /* Sack block processing */ 7767 if (SEQ_GT(th_ack, tp->snd_una)) 7768 ack_point = th_ack; 7769 else 7770 ack_point = tp->snd_una; 7771 for (i = 0; i < to->to_nsacks; i++) { 7772 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7773 &sack, sizeof(sack)); 7774 sack.start = ntohl(sack.start); 7775 sack.end = ntohl(sack.end); 7776 if (SEQ_GT(sack.end, sack.start) && 7777 SEQ_GT(sack.start, ack_point) && 7778 SEQ_LT(sack.start, tp->snd_max) && 7779 SEQ_GT(sack.end, ack_point) && 7780 SEQ_LEQ(sack.end, tp->snd_max)) { 7781 sack_blocks[num_sack_blks] = sack; 7782 num_sack_blks++; 7783 #ifdef NETFLIX_STATS 7784 } else if (SEQ_LEQ(sack.start, th_ack) && 7785 SEQ_LEQ(sack.end, th_ack)) { 7786 /* 7787 * Its a D-SACK block. 7788 */ 7789 tcp_record_dsack(sack.start, sack.end); 7790 #endif 7791 } 7792 7793 } 7794 /* 7795 * Sort the SACK blocks so we can update the rack scoreboard with 7796 * just one pass. 7797 */ 7798 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 7799 num_sack_blks, th->th_ack); 7800 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 7801 if (num_sack_blks == 0) { 7802 /* Nothing to sack (DSACKs?) */ 7803 goto out_with_totals; 7804 } 7805 if (num_sack_blks < 2) { 7806 /* Only one, we don't need to sort */ 7807 goto do_sack_work; 7808 } 7809 /* Sort the sacks */ 7810 for (i = 0; i < num_sack_blks; i++) { 7811 for (j = i + 1; j < num_sack_blks; j++) { 7812 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7813 sack = sack_blocks[i]; 7814 sack_blocks[i] = sack_blocks[j]; 7815 sack_blocks[j] = sack; 7816 } 7817 } 7818 } 7819 /* 7820 * Now are any of the sack block ends the same (yes some 7821 * implementations send these)? 7822 */ 7823 again: 7824 if (num_sack_blks == 0) 7825 goto out_with_totals; 7826 if (num_sack_blks > 1) { 7827 for (i = 0; i < num_sack_blks; i++) { 7828 for (j = i + 1; j < num_sack_blks; j++) { 7829 if (sack_blocks[i].end == sack_blocks[j].end) { 7830 /* 7831 * Ok these two have the same end we 7832 * want the smallest end and then 7833 * throw away the larger and start 7834 * again. 7835 */ 7836 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7837 /* 7838 * The second block covers 7839 * more area use that 7840 */ 7841 sack_blocks[i].start = sack_blocks[j].start; 7842 } 7843 /* 7844 * Now collapse out the dup-sack and 7845 * lower the count 7846 */ 7847 for (k = (j + 1); k < num_sack_blks; k++) { 7848 sack_blocks[j].start = sack_blocks[k].start; 7849 sack_blocks[j].end = sack_blocks[k].end; 7850 j++; 7851 } 7852 num_sack_blks--; 7853 goto again; 7854 } 7855 } 7856 } 7857 } 7858 do_sack_work: 7859 /* 7860 * First lets look to see if 7861 * we have retransmitted and 7862 * can use the transmit next? 7863 */ 7864 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7865 if (rsm && 7866 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 7867 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 7868 /* 7869 * We probably did the FR and the next 7870 * SACK in continues as we would expect. 
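 * (Taking this shortcut lets rack_proc_sack_blk() start from the
 * tmap head instead of doing an RB-tree lookup; it is what the
 * rack_sack_proc_short counter records, versus
 * rack_sack_proc_all when the tree had to be searched.)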
7871 */ 7872 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 7873 if (acked) { 7874 rack->r_wanted_output = 1; 7875 changed += acked; 7876 sack_changed += acked; 7877 } 7878 if (num_sack_blks == 1) { 7879 /* 7880 * This is what we would expect from 7881 * a normal implementation to happen 7882 * after we have retransmitted the FR, 7883 * i.e the sack-filter pushes down 7884 * to 1 block and the next to be retransmitted 7885 * is the sequence in the sack block (has more 7886 * are acked). Count this as ACK'd data to boost 7887 * up the chances of recovering any false positives. 7888 */ 7889 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 7890 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 7891 counter_u64_add(rack_express_sack, 1); 7892 if (rack->r_ctl.ack_count > 0xfff00000) { 7893 /* 7894 * reduce the number to keep us under 7895 * a uint32_t. 7896 */ 7897 rack->r_ctl.ack_count /= 2; 7898 rack->r_ctl.sack_count /= 2; 7899 } 7900 goto out_with_totals; 7901 } else { 7902 /* 7903 * Start the loop through the 7904 * rest of blocks, past the first block. 7905 */ 7906 moved_two = 0; 7907 loop_start = 1; 7908 } 7909 } 7910 /* Its a sack of some sort */ 7911 rack->r_ctl.sack_count++; 7912 if (rack->r_ctl.sack_count > 0xfff00000) { 7913 /* 7914 * reduce the number to keep us under 7915 * a uint32_t. 7916 */ 7917 rack->r_ctl.ack_count /= 2; 7918 rack->r_ctl.sack_count /= 2; 7919 } 7920 counter_u64_add(rack_sack_total, 1); 7921 if (rack->sack_attack_disable) { 7922 /* An attacker disablement is in place */ 7923 if (num_sack_blks > 1) { 7924 rack->r_ctl.sack_count += (num_sack_blks - 1); 7925 rack->r_ctl.sack_moved_extra++; 7926 counter_u64_add(rack_move_some, 1); 7927 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 7928 rack->r_ctl.sack_moved_extra /= 2; 7929 rack->r_ctl.sack_noextra_move /= 2; 7930 } 7931 } 7932 goto out; 7933 } 7934 rsm = rack->r_ctl.rc_sacklast; 7935 for (i = loop_start; i < num_sack_blks; i++) { 7936 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 7937 if (acked) { 7938 rack->r_wanted_output = 1; 7939 changed += acked; 7940 sack_changed += acked; 7941 } 7942 if (moved_two) { 7943 /* 7944 * If we did not get a SACK for at least a MSS and 7945 * had to move at all, or if we moved more than our 7946 * threshold, it counts against the "extra" move. 7947 */ 7948 rack->r_ctl.sack_moved_extra += moved_two; 7949 counter_u64_add(rack_move_some, 1); 7950 } else { 7951 /* 7952 * else we did not have to move 7953 * any more than we would expect. 7954 */ 7955 rack->r_ctl.sack_noextra_move++; 7956 counter_u64_add(rack_move_none, 1); 7957 } 7958 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 7959 /* 7960 * If the SACK was not a full MSS then 7961 * we add to sack_count the number of 7962 * MSS's (or possibly more than 7963 * a MSS if its a TSO send) we had to skip by. 7964 */ 7965 rack->r_ctl.sack_count += moved_two; 7966 counter_u64_add(rack_sack_total, moved_two); 7967 } 7968 /* 7969 * Now we need to setup for the next 7970 * round. First we make sure we won't 7971 * exceed the size of our uint32_t on 7972 * the various counts, and then clear out 7973 * moved_two. 
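 * (Halving the paired counters together is safe because the
 * detection logic below only consumes them as ratios; e.g.
 * halving a sack_count of 4,000,000 and an ack_count of 500,000
 * together leaves the 8:1 ratio unchanged.)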
7974 */ 7975 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 7976 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 7977 rack->r_ctl.sack_moved_extra /= 2; 7978 rack->r_ctl.sack_noextra_move /= 2; 7979 } 7980 if (rack->r_ctl.sack_count > 0xfff00000) { 7981 rack->r_ctl.ack_count /= 2; 7982 rack->r_ctl.sack_count /= 2; 7983 } 7984 moved_two = 0; 7985 } 7986 out_with_totals: 7987 if (num_sack_blks > 1) { 7988 /* 7989 * You get an extra stroke if 7990 * you have more than one sack-blk, this 7991 * could be where we are skipping forward 7992 * and the sack-filter is still working, or 7993 * it could be an attacker constantly 7994 * moving us. 7995 */ 7996 rack->r_ctl.sack_moved_extra++; 7997 counter_u64_add(rack_move_some, 1); 7998 } 7999 out: 8000 #ifdef NETFLIX_EXP_DETECTION 8001 if ((rack->do_detection || tcp_force_detection) && 8002 tcp_sack_to_ack_thresh && 8003 tcp_sack_to_move_thresh && 8004 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 8005 /* 8006 * We have thresholds set to find 8007 * possible attackers and disable sack. 8008 * Check them. 8009 */ 8010 uint64_t ackratio, moveratio, movetotal; 8011 8012 /* Log detecting */ 8013 rack_log_sad(rack, 1); 8014 ackratio = (uint64_t)(rack->r_ctl.sack_count); 8015 ackratio *= (uint64_t)(1000); 8016 if (rack->r_ctl.ack_count) 8017 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 8018 else { 8019 /* We really should not hit here */ 8020 ackratio = 1000; 8021 } 8022 if ((rack->sack_attack_disable == 0) && 8023 (ackratio > rack_highest_sack_thresh_seen)) 8024 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 8025 movetotal = rack->r_ctl.sack_moved_extra; 8026 movetotal += rack->r_ctl.sack_noextra_move; 8027 moveratio = rack->r_ctl.sack_moved_extra; 8028 moveratio *= (uint64_t)1000; 8029 if (movetotal) 8030 moveratio /= movetotal; 8031 else { 8032 /* No moves, thats pretty good */ 8033 moveratio = 0; 8034 } 8035 if ((rack->sack_attack_disable == 0) && 8036 (moveratio > rack_highest_move_thresh_seen)) 8037 rack_highest_move_thresh_seen = (uint32_t)moveratio; 8038 if (rack->sack_attack_disable == 0) { 8039 if ((ackratio > tcp_sack_to_ack_thresh) && 8040 (moveratio > tcp_sack_to_move_thresh)) { 8041 /* Disable sack processing */ 8042 rack->sack_attack_disable = 1; 8043 if (rack->r_rep_attack == 0) { 8044 rack->r_rep_attack = 1; 8045 counter_u64_add(rack_sack_attacks_detected, 1); 8046 } 8047 if (tcp_attack_on_turns_on_logging) { 8048 /* 8049 * Turn on logging, used for debugging 8050 * false positives. 
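 * When reading those logs note that the ratios computed above
 * are scaled by 1000: e.g. 900 counted SACKs against 100 counted
 * ACKs gives an ackratio of 9000, and 700 "extra move" SACKs
 * against 300 no-move SACKs gives a moveratio of 700. Sack
 * processing is only disabled when both exceed their thresholds.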
8051 */ 8052 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 8053 } 8054 /* Clamp the cwnd at flight size */ 8055 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 8056 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 8057 rack_log_sad(rack, 2); 8058 } 8059 } else { 8060 /* We are sack-disabled check for false positives */ 8061 if ((ackratio <= tcp_restoral_thresh) || 8062 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 8063 rack->sack_attack_disable = 0; 8064 rack_log_sad(rack, 3); 8065 /* Restart counting */ 8066 rack->r_ctl.sack_count = 0; 8067 rack->r_ctl.sack_moved_extra = 0; 8068 rack->r_ctl.sack_noextra_move = 1; 8069 rack->r_ctl.ack_count = max(1, 8070 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 8071 8072 if (rack->r_rep_reverse == 0) { 8073 rack->r_rep_reverse = 1; 8074 counter_u64_add(rack_sack_attacks_reversed, 1); 8075 } 8076 /* Restore the cwnd */ 8077 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 8078 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 8079 } 8080 } 8081 } 8082 #endif 8083 if (changed) { 8084 /* Something changed cancel the rack timer */ 8085 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8086 } 8087 tsused = tcp_ts_getticks(); 8088 rsm = tcp_rack_output(tp, rack, tsused); 8089 if ((!IN_RECOVERY(tp->t_flags)) && 8090 rsm) { 8091 /* Enter recovery */ 8092 rack->r_ctl.rc_rsm_start = rsm->r_start; 8093 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 8094 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 8095 entered_recovery = 1; 8096 rack_cong_signal(tp, NULL, CC_NDUPACK); 8097 /* 8098 * When we enter recovery we need to assure we send 8099 * one packet. 8100 */ 8101 if (rack->rack_no_prr == 0) { 8102 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 8103 rack_log_to_prr(rack, 8, 0); 8104 } 8105 rack->r_timer_override = 1; 8106 rack->r_early = 0; 8107 rack->r_ctl.rc_agg_early = 0; 8108 } else if (IN_RECOVERY(tp->t_flags) && 8109 rsm && 8110 (rack->r_rr_config == 3)) { 8111 /* 8112 * Assure we can output and we get no 8113 * remembered pace time except the retransmit. 
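 * The PRR accounting further below then meters how much new data
 * this ack releases, roughly sndcnt = prr_delivered * ssthresh /
 * recovery_fs - prr_out while pipe exceeds ssthresh. As a purely
 * illustrative run: ssthresh 64000, recovery_fs 128000, a
 * prr_delivered of 14480 bytes and 5792 bytes already sent
 * during recovery gives 14480 * 64000 / 128000 + 1 - 5792 =
 * 1449 bytes, about one more segment.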
8114 */ 8115 rack->r_timer_override = 1; 8116 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8117 rack->r_ctl.rc_resend = rsm; 8118 } 8119 if (IN_RECOVERY(tp->t_flags) && 8120 (rack->rack_no_prr == 0) && 8121 (entered_recovery == 0)) { 8122 /* Deal with PRR here (in recovery only) */ 8123 uint32_t pipe, snd_una; 8124 8125 rack->r_ctl.rc_prr_delivered += changed; 8126 /* Compute prr_sndcnt */ 8127 if (SEQ_GT(tp->snd_una, th_ack)) { 8128 snd_una = tp->snd_una; 8129 } else { 8130 snd_una = th_ack; 8131 } 8132 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 8133 if (pipe > tp->snd_ssthresh) { 8134 long sndcnt; 8135 8136 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 8137 if (rack->r_ctl.rc_prr_recovery_fs > 0) 8138 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 8139 else { 8140 rack->r_ctl.rc_prr_sndcnt = 0; 8141 rack_log_to_prr(rack, 9, 0); 8142 sndcnt = 0; 8143 } 8144 sndcnt++; 8145 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 8146 sndcnt -= rack->r_ctl.rc_prr_out; 8147 else 8148 sndcnt = 0; 8149 rack->r_ctl.rc_prr_sndcnt = sndcnt; 8150 rack_log_to_prr(rack, 10, 0); 8151 } else { 8152 uint32_t limit; 8153 8154 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 8155 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 8156 else 8157 limit = 0; 8158 if (changed > limit) 8159 limit = changed; 8160 limit += ctf_fixed_maxseg(tp); 8161 if (tp->snd_ssthresh > pipe) { 8162 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 8163 rack_log_to_prr(rack, 11, 0); 8164 } else { 8165 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 8166 rack_log_to_prr(rack, 12, 0); 8167 } 8168 } 8169 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 8170 ((rack->rc_inp->inp_in_hpts == 0) && 8171 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 8172 /* 8173 * If you are pacing output you don't want 8174 * to override. 8175 */ 8176 rack->r_early = 0; 8177 rack->r_ctl.rc_agg_early = 0; 8178 rack->r_timer_override = 1; 8179 } 8180 } 8181 } 8182 8183 static void 8184 rack_strike_dupack(struct tcp_rack *rack) 8185 { 8186 struct rack_sendmap *rsm; 8187 8188 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8189 if (rsm && (rsm->r_dupack < 0xff)) { 8190 rsm->r_dupack++; 8191 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 8192 rack->r_wanted_output = 1; 8193 rack->r_timer_override = 1; 8194 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 8195 } else { 8196 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 8197 } 8198 } 8199 } 8200 8201 static void 8202 rack_check_bottom_drag(struct tcpcb *tp, 8203 struct tcp_rack *rack, 8204 struct socket *so, int32_t acked) 8205 { 8206 uint32_t segsiz, minseg; 8207 8208 segsiz = ctf_fixed_maxseg(tp); 8209 if (so->so_snd.sb_flags & SB_TLS_IFNET) { 8210 minseg = rack->r_ctl.rc_pace_min_segs; 8211 } else { 8212 minseg = segsiz; 8213 } 8214 if (tp->snd_max == tp->snd_una) { 8215 /* 8216 * We are doing dynamic pacing and we are way 8217 * under. Basically everything got acked while 8218 * we were still waiting on the pacer to expire. 8219 * 8220 * This means we need to boost the b/w in 8221 * addition to any earlier boosting of 8222 * the multipler. 8223 */ 8224 rack->rc_dragged_bottom = 1; 8225 rack_validate_multipliers_at_or_above100(rack); 8226 /* 8227 * Lets use the segment bytes acked plus 8228 * the lowest RTT seen as the basis to 8229 * form a b/w estimate. This will be off 8230 * due to the fact that the true estimate 8231 * should be around 1/2 the time of the RTT 8232 * but we can settle for that. 
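 * As a sketch with made-up numbers: 14480 bytes acked against a
 * 10000 usec rs_us_rtt gives 14480 * 1000000 / 10000, i.e. about
 * 1.45 MB/s, and per the note above that is if anything
 * conservative since the data really arrived in closer to half
 * an rtt. With no prior measurement the result is further capped
 * at ONE_POINT_TWO_MEG before being taken as gp_bw.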
8233 */ 8234 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 8235 acked) { 8236 uint64_t bw, calc_bw, rtt; 8237 8238 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8239 bw = acked; 8240 calc_bw = bw * 1000000; 8241 calc_bw /= rtt; 8242 if (rack->r_ctl.last_max_bw && 8243 (rack->r_ctl.last_max_bw < calc_bw)) { 8244 /* 8245 * If we have a last calculated max bw 8246 * enforce it. 8247 */ 8248 calc_bw = rack->r_ctl.last_max_bw; 8249 } 8250 /* now plop it in */ 8251 if (rack->rc_gp_filled == 0) { 8252 if (calc_bw > ONE_POINT_TWO_MEG) { 8253 /* 8254 * If we have no measurement 8255 * don't let us set in more than 8256 * 1.2Mbps. If we are still too 8257 * low after pacing with this we 8258 * will hopefully have a max b/w 8259 * available to sanity check things. 8260 */ 8261 calc_bw = ONE_POINT_TWO_MEG; 8262 } 8263 rack->r_ctl.rc_rtt_diff = 0; 8264 rack->r_ctl.gp_bw = calc_bw; 8265 rack->rc_gp_filled = 1; 8266 rack->r_ctl.num_avg = RACK_REQ_AVG; 8267 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8268 } else if (calc_bw > rack->r_ctl.gp_bw) { 8269 rack->r_ctl.rc_rtt_diff = 0; 8270 rack->r_ctl.num_avg = RACK_REQ_AVG; 8271 rack->r_ctl.gp_bw = calc_bw; 8272 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8273 } else 8274 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8275 /* 8276 * For acks over 1mss we do a extra boost to simulate 8277 * where we would get 2 acks (we want 110 for the mul). 8278 */ 8279 if (acked > segsiz) 8280 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8281 } else { 8282 /* 8283 * Huh, this should not be, settle 8284 * for just an old increase. 8285 */ 8286 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8287 } 8288 } else if ((IN_RECOVERY(tp->t_flags) == 0) && 8289 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 8290 minseg)) && 8291 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 8292 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 8293 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 8294 (segsiz * rack_req_segs))) { 8295 /* 8296 * We are doing dynamic GP pacing and 8297 * we have everything except 1MSS or less 8298 * bytes left out. We are still pacing away. 8299 * And there is data that could be sent, This 8300 * means we are inserting delayed ack time in 8301 * our measurements because we are pacing too slow. 8302 */ 8303 rack_validate_multipliers_at_or_above100(rack); 8304 rack->rc_dragged_bottom = 1; 8305 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8306 } 8307 } 8308 8309 /* 8310 * Return value of 1, we do not need to call rack_process_data(). 8311 * return value of 0, rack_process_data can be called. 8312 * For ret_val if its 0 the TCP is locked, if its non-zero 8313 * its unlocked and probably unsafe to touch the TCB. 
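 * A caller is therefore expected to use it along these lines
 * (illustrative sketch only):
 *
 *	if (rack_process_ack(m, th, so, tp, &to, tiwin, tlen,
 *	    &ourfinisacked, thflags, &ret_val))
 *		return (ret_val);
 *
 * i.e. a non-zero return means skip rack_process_data(), and
 * ret_val then says whether the TCB is still locked (0) or
 * already unlocked and unsafe to touch (non-zero).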
8314 */ 8315 static int 8316 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8317 struct tcpcb *tp, struct tcpopt *to, 8318 uint32_t tiwin, int32_t tlen, 8319 int32_t * ofia, int32_t thflags, int32_t * ret_val) 8320 { 8321 int32_t ourfinisacked = 0; 8322 int32_t nsegs, acked_amount; 8323 int32_t acked; 8324 struct mbuf *mfree; 8325 struct tcp_rack *rack; 8326 int32_t under_pacing = 0; 8327 int32_t recovery = 0; 8328 8329 rack = (struct tcp_rack *)tp->t_fb_ptr; 8330 if (SEQ_GT(th->th_ack, tp->snd_max)) { 8331 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 8332 rack->r_wanted_output = 1; 8333 return (1); 8334 } 8335 if (rack->rc_gp_filled && 8336 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 8337 under_pacing = 1; 8338 } 8339 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 8340 if (rack->rc_in_persist) 8341 tp->t_rxtshift = 0; 8342 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 8343 rack_strike_dupack(rack); 8344 rack_log_ack(tp, to, th); 8345 } 8346 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8347 /* 8348 * Old ack, behind (or duplicate to) the last one rcv'd 8349 * Note: Should mark reordering is occuring! We should also 8350 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 8351 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 8352 * retran and> ack 3 8353 */ 8354 return (0); 8355 } 8356 /* 8357 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 8358 * something we sent. 8359 */ 8360 if (tp->t_flags & TF_NEEDSYN) { 8361 /* 8362 * T/TCP: Connection was half-synchronized, and our SYN has 8363 * been ACK'd (so connection is now fully synchronized). Go 8364 * to non-starred state, increment snd_una for ACK of SYN, 8365 * and check if we can do window scaling. 8366 */ 8367 tp->t_flags &= ~TF_NEEDSYN; 8368 tp->snd_una++; 8369 /* Do window scaling? */ 8370 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8371 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8372 tp->rcv_scale = tp->request_r_scale; 8373 /* Send window already scaled. */ 8374 } 8375 } 8376 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8377 INP_WLOCK_ASSERT(tp->t_inpcb); 8378 8379 acked = BYTES_THIS_ACK(tp, th); 8380 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 8381 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8382 /* 8383 * If we just performed our first retransmit, and the ACK arrives 8384 * within our recovery window, then it was a mistake to do the 8385 * retransmit in the first place. Recover our original cwnd and 8386 * ssthresh, and proceed to transmit where we left off. 8387 */ 8388 if (tp->t_flags & TF_PREVVALID) { 8389 tp->t_flags &= ~TF_PREVVALID; 8390 if (tp->t_rxtshift == 1 && 8391 (int)(ticks - tp->t_badrxtwin) < 0) 8392 rack_cong_signal(tp, th, CC_RTO_ERR); 8393 } 8394 if (acked) { 8395 /* assure we are not backed off */ 8396 tp->t_rxtshift = 0; 8397 rack->rc_tlp_in_progress = 0; 8398 rack->r_ctl.rc_tlp_cnt_out = 0; 8399 /* 8400 * If it is the RXT timer we want to 8401 * stop it, so we can restart a TLP. 8402 */ 8403 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 8404 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8405 #ifdef NETFLIX_HTTP_LOGGING 8406 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 8407 #endif 8408 } 8409 /* 8410 * If we have a timestamp reply, update smoothed round trip time. If 8411 * no timestamp is present but transmit timer is running and timed 8412 * sequence number was acked, update smoothed round trip time. 
Since 8413 * we now have an rtt measurement, cancel the timer backoff (cf., 8414 * Phil Karn's retransmit alg.). Recompute the initial retransmit 8415 * timer. 8416 * 8417 * Some boxes send broken timestamp replies during the SYN+ACK 8418 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8419 * and blow up the retransmit timer. 8420 */ 8421 /* 8422 * If all outstanding data is acked, stop retransmit timer and 8423 * remember to restart (more output or persist). If there is more 8424 * data to be acked, restart retransmit timer, using current 8425 * (possibly backed-off) value. 8426 */ 8427 if (acked == 0) { 8428 if (ofia) 8429 *ofia = ourfinisacked; 8430 return (0); 8431 } 8432 if (rack->r_ctl.rc_early_recovery) { 8433 if (IN_RECOVERY(tp->t_flags)) { 8434 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8435 (SEQ_LT(th->th_ack, tp->snd_max))) { 8436 tcp_rack_partialack(tp, th); 8437 } else { 8438 rack_post_recovery(tp, th); 8439 recovery = 1; 8440 } 8441 } 8442 } 8443 /* 8444 * Let the congestion control algorithm update congestion control 8445 * related information. This typically means increasing the 8446 * congestion window. 8447 */ 8448 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 8449 SOCKBUF_LOCK(&so->so_snd); 8450 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 8451 tp->snd_wnd -= acked_amount; 8452 mfree = sbcut_locked(&so->so_snd, acked_amount); 8453 if ((sbused(&so->so_snd) == 0) && 8454 (acked > acked_amount) && 8455 (tp->t_state >= TCPS_FIN_WAIT_1) && 8456 (tp->t_flags & TF_SENTFIN)) { 8457 /* 8458 * We must be sure our fin 8459 * was sent and acked (we can be 8460 * in FIN_WAIT_1 without having 8461 * sent the fin). 8462 */ 8463 ourfinisacked = 1; 8464 } 8465 /* NB: sowwakeup_locked() does an implicit unlock. */ 8466 sowwakeup_locked(so); 8467 m_freem(mfree); 8468 if (rack->r_ctl.rc_early_recovery == 0) { 8469 if (IN_RECOVERY(tp->t_flags)) { 8470 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8471 (SEQ_LT(th->th_ack, tp->snd_max))) { 8472 tcp_rack_partialack(tp, th); 8473 } else { 8474 rack_post_recovery(tp, th); 8475 } 8476 } 8477 } 8478 tp->snd_una = th->th_ack; 8479 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 8480 tp->snd_recover = tp->snd_una; 8481 8482 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 8483 tp->snd_nxt = tp->snd_una; 8484 } 8485 if (under_pacing && 8486 (rack->use_fixed_rate == 0) && 8487 (rack->in_probe_rtt == 0) && 8488 rack->rc_gp_dyn_mul && 8489 rack->rc_always_pace) { 8490 /* Check if we are dragging bottom */ 8491 rack_check_bottom_drag(tp, rack, so, acked); 8492 } 8493 if (tp->snd_una == tp->snd_max) { 8494 /* Nothing left outstanding */ 8495 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 8496 if (rack->r_ctl.rc_went_idle_time == 0) 8497 rack->r_ctl.rc_went_idle_time = 1; 8498 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 8499 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8500 tp->t_acktime = 0; 8501 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8502 /* Set need output so persist might get set */ 8503 rack->r_wanted_output = 1; 8504 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8505 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8506 (sbavail(&so->so_snd) == 0) && 8507 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 8508 /* 8509 * The socket was gone and the 8510 * peer sent data, time to 8511 * reset him. 
8512 */ 8513 *ret_val = 1; 8514 /* tcp_close will kill the inp pre-log the Reset */ 8515 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 8516 tp = tcp_close(tp); 8517 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 8518 return (1); 8519 8520 } 8521 } 8522 if (ofia) 8523 *ofia = ourfinisacked; 8524 return (0); 8525 } 8526 8527 static void 8528 rack_collapsed_window(struct tcp_rack *rack) 8529 { 8530 /* 8531 * Now we must walk the 8532 * send map and divide the 8533 * ones left stranded. These 8534 * segments cannot cause us to abort 8535 * the connection and are really 8536 * "unsent". However, if a buggy 8537 * client actually did keep some 8538 * of the data (i.e. it collapsed the window, 8539 * refused to ack, and then reopened 8540 * the window and acked that data), we would 8541 * get into an ack war, so the simpler 8542 * approach of just pretending we 8543 * did not send those segments 8544 * would not work. 8545 */ 8546 struct rack_sendmap *rsm, *nrsm, fe, *insret; 8547 tcp_seq max_seq; 8548 8549 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 8550 memset(&fe, 0, sizeof(fe)); 8551 fe.r_start = max_seq; 8552 /* Find the first seq past or at maxseq */ 8553 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8554 if (rsm == NULL) { 8555 /* Nothing to do, strange */ 8556 rack->rc_has_collapsed = 0; 8557 return; 8558 } 8559 /* 8560 * Now do we need to split at 8561 * the collapse point? 8562 */ 8563 if (SEQ_GT(max_seq, rsm->r_start)) { 8564 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8565 if (nrsm == NULL) { 8566 /* We can't get an rsm, mark all? */ 8567 nrsm = rsm; 8568 goto no_split; 8569 } 8570 /* Clone it */ 8571 rack_clone_rsm(rack, nrsm, rsm, max_seq); 8572 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8573 #ifdef INVARIANTS 8574 if (insret != NULL) { 8575 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8576 nrsm, insret, rack, rsm); 8577 } 8578 #endif 8579 if (rsm->r_in_tmap) { 8580 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8581 nrsm->r_in_tmap = 1; 8582 } 8583 /* 8584 * Set the new RSM as the 8585 * collapsed starting point 8586 */ 8587 rsm = nrsm; 8588 } 8589 no_split: 8590 counter_u64_add(rack_collapsed_win, 1); 8591 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 8592 nrsm->r_flags |= RACK_RWND_COLLAPSED; 8593 rack->rc_has_collapsed = 1; 8594 } 8595 } 8596 8597 static void 8598 rack_un_collapse_window(struct tcp_rack *rack) 8599 { 8600 struct rack_sendmap *rsm; 8601 8602 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 8603 if (rsm->r_flags & RACK_RWND_COLLAPSED) 8604 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8605 else 8606 break; 8607 } 8608 rack->rc_has_collapsed = 0; 8609 } 8610 8611 static void 8612 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 8613 int32_t tlen, int32_t tfo_syn) 8614 { 8615 if (DELAY_ACK(tp, tlen) || tfo_syn) { 8616 if (rack->rc_dack_mode && 8617 (tlen > 500) && 8618 (rack->rc_dack_toggle == 1)) { 8619 goto no_delayed_ack; 8620 } 8621 rack_timer_cancel(tp, rack, 8622 rack->r_ctl.rc_rcvtime, __LINE__); 8623 tp->t_flags |= TF_DELACK; 8624 } else { 8625 no_delayed_ack: 8626 rack->r_wanted_output = 1; 8627 tp->t_flags |= TF_ACKNOW; 8628 if (rack->rc_dack_mode) { 8629 if (tp->t_flags & TF_DELACK) 8630 rack->rc_dack_toggle = 1; 8631 else 8632 rack->rc_dack_toggle = 0; 8633 } 8634 } 8635 } 8636 /* 8637 * Return value of 1, the TCB is unlocked and most 8638 * likely gone, return value of 0, the TCP is still 8639 * locked.
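 * rack_process_data() below is shared by the state handlers that still have
 * data or FIN processing to do: it applies window updates, enters or exits
 * persist as the peer's window changes, appends in-order data (or hands
 * out-of-order data to tcp_reass()) and finally processes an inbound FIN.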
8640 */ 8641 static int 8642 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8643 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8644 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8645 { 8646 /* 8647 * Update window information. Don't look at window if no ACK: TAC's 8648 * send garbage on first SYN. 8649 */ 8650 int32_t nsegs; 8651 int32_t tfo_syn; 8652 struct tcp_rack *rack; 8653 8654 rack = (struct tcp_rack *)tp->t_fb_ptr; 8655 INP_WLOCK_ASSERT(tp->t_inpcb); 8656 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8657 if ((thflags & TH_ACK) && 8658 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8659 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8660 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8661 /* keep track of pure window updates */ 8662 if (tlen == 0 && 8663 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8664 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8665 tp->snd_wnd = tiwin; 8666 tp->snd_wl1 = th->th_seq; 8667 tp->snd_wl2 = th->th_ack; 8668 if (tp->snd_wnd > tp->max_sndwnd) 8669 tp->max_sndwnd = tp->snd_wnd; 8670 rack->r_wanted_output = 1; 8671 } else if (thflags & TH_ACK) { 8672 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8673 tp->snd_wnd = tiwin; 8674 tp->snd_wl1 = th->th_seq; 8675 tp->snd_wl2 = th->th_ack; 8676 } 8677 } 8678 if (tp->snd_wnd < ctf_outstanding(tp)) 8679 /* The peer collapsed the window */ 8680 rack_collapsed_window(rack); 8681 else if (rack->rc_has_collapsed) 8682 rack_un_collapse_window(rack); 8683 /* Was persist timer active and now we have window space? */ 8684 if ((rack->rc_in_persist != 0) && 8685 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8686 rack->r_ctl.rc_pace_min_segs))) { 8687 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8688 tp->snd_nxt = tp->snd_max; 8689 /* Make sure we output to start the timer */ 8690 rack->r_wanted_output = 1; 8691 } 8692 /* Do we enter persists? */ 8693 if ((rack->rc_in_persist == 0) && 8694 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8695 TCPS_HAVEESTABLISHED(tp->t_state) && 8696 (tp->snd_max == tp->snd_una) && 8697 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8698 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8699 /* 8700 * Here the rwnd is less than 8701 * the pacing size, we are established, 8702 * nothing is outstanding, and there is 8703 * data to send. Enter persists. 8704 */ 8705 tp->snd_nxt = tp->snd_una; 8706 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8707 } 8708 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8709 m_freem(m); 8710 return (0); 8711 } 8712 /* 8713 * don't process the URG bit, ignore them drag 8714 * along the up. 8715 */ 8716 tp->rcv_up = tp->rcv_nxt; 8717 INP_WLOCK_ASSERT(tp->t_inpcb); 8718 8719 /* 8720 * Process the segment text, merging it into the TCP sequencing 8721 * queue, and arranging for acknowledgment of receipt if necessary. 8722 * This process logically involves adjusting tp->rcv_wnd as data is 8723 * presented to the user (this happens in tcp_usrreq.c, case 8724 * PRU_RCVD). If a FIN has already been received on this connection 8725 * then we just ignore the text. 
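 * In this path the in-order case appends straight to the receive buffer and
 * (possibly) delays the ACK via rack_handle_delayed_ack(), while anything
 * out of order goes through tcp_reass() and forces an immediate ACK.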
8726 */ 8727 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8728 IS_FASTOPEN(tp->t_flags)); 8729 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8730 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8731 tcp_seq save_start = th->th_seq; 8732 tcp_seq save_rnxt = tp->rcv_nxt; 8733 int save_tlen = tlen; 8734 8735 m_adj(m, drop_hdrlen); /* delayed header drop */ 8736 /* 8737 * Insert segment which includes th into TCP reassembly 8738 * queue with control block tp. Set thflags to whether 8739 * reassembly now includes a segment with FIN. This handles 8740 * the common case inline (segment is the next to be 8741 * received on an established connection, and the queue is 8742 * empty), avoiding linkage into and removal from the queue 8743 * and repetition of various conversions. Set DELACK for 8744 * segments received in order, but ack immediately when 8745 * segments are out of order (so fast retransmit can work). 8746 */ 8747 if (th->th_seq == tp->rcv_nxt && 8748 SEGQ_EMPTY(tp) && 8749 (TCPS_HAVEESTABLISHED(tp->t_state) || 8750 tfo_syn)) { 8751 #ifdef NETFLIX_SB_LIMITS 8752 u_int mcnt, appended; 8753 8754 if (so->so_rcv.sb_shlim) { 8755 mcnt = m_memcnt(m); 8756 appended = 0; 8757 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8758 CFO_NOSLEEP, NULL) == false) { 8759 counter_u64_add(tcp_sb_shlim_fails, 1); 8760 m_freem(m); 8761 return (0); 8762 } 8763 } 8764 #endif 8765 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 8766 tp->rcv_nxt += tlen; 8767 if (tlen && 8768 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8769 (tp->t_fbyte_in == 0)) { 8770 tp->t_fbyte_in = ticks; 8771 if (tp->t_fbyte_in == 0) 8772 tp->t_fbyte_in = 1; 8773 if (tp->t_fbyte_out && tp->t_fbyte_in) 8774 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8775 } 8776 thflags = th->th_flags & TH_FIN; 8777 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8778 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8779 SOCKBUF_LOCK(&so->so_rcv); 8780 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8781 m_freem(m); 8782 } else 8783 #ifdef NETFLIX_SB_LIMITS 8784 appended = 8785 #endif 8786 sbappendstream_locked(&so->so_rcv, m, 0); 8787 /* NB: sorwakeup_locked() does an implicit unlock. */ 8788 sorwakeup_locked(so); 8789 #ifdef NETFLIX_SB_LIMITS 8790 if (so->so_rcv.sb_shlim && appended != mcnt) 8791 counter_fo_release(so->so_rcv.sb_shlim, 8792 mcnt - appended); 8793 #endif 8794 } else { 8795 /* 8796 * XXX: Due to the header drop above "th" is 8797 * theoretically invalid by now. Fortunately 8798 * m_adj() doesn't actually frees any mbufs when 8799 * trimming from the head. 8800 */ 8801 tcp_seq temp = save_start; 8802 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8803 tp->t_flags |= TF_ACKNOW; 8804 } 8805 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 8806 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8807 /* 8808 * DSACK actually handled in the fastpath 8809 * above. 8810 */ 8811 RACK_OPTS_INC(tcp_sack_path_1); 8812 tcp_update_sack_list(tp, save_start, 8813 save_start + save_tlen); 8814 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8815 if ((tp->rcv_numsacks >= 1) && 8816 (tp->sackblks[0].end == save_start)) { 8817 /* 8818 * Partial overlap, recorded at todrop 8819 * above. 8820 */ 8821 RACK_OPTS_INC(tcp_sack_path_2a); 8822 tcp_update_sack_list(tp, 8823 tp->sackblks[0].start, 8824 tp->sackblks[0].end); 8825 } else { 8826 RACK_OPTS_INC(tcp_sack_path_2b); 8827 tcp_update_dsack_list(tp, save_start, 8828 save_start + save_tlen); 8829 } 8830 } else if (tlen >= save_tlen) { 8831 /* Update of sackblks. 
*/ 8832 RACK_OPTS_INC(tcp_sack_path_3); 8833 tcp_update_dsack_list(tp, save_start, 8834 save_start + save_tlen); 8835 } else if (tlen > 0) { 8836 RACK_OPTS_INC(tcp_sack_path_4); 8837 tcp_update_dsack_list(tp, save_start, 8838 save_start + tlen); 8839 } 8840 } 8841 } else { 8842 m_freem(m); 8843 thflags &= ~TH_FIN; 8844 } 8845 8846 /* 8847 * If FIN is received ACK the FIN and let the user know that the 8848 * connection is closing. 8849 */ 8850 if (thflags & TH_FIN) { 8851 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8852 socantrcvmore(so); 8853 /* 8854 * If connection is half-synchronized (ie NEEDSYN 8855 * flag on) then delay ACK, so it may be piggybacked 8856 * when SYN is sent. Otherwise, since we received a 8857 * FIN then no more input can be expected, send ACK 8858 * now. 8859 */ 8860 if (tp->t_flags & TF_NEEDSYN) { 8861 rack_timer_cancel(tp, rack, 8862 rack->r_ctl.rc_rcvtime, __LINE__); 8863 tp->t_flags |= TF_DELACK; 8864 } else { 8865 tp->t_flags |= TF_ACKNOW; 8866 } 8867 tp->rcv_nxt++; 8868 } 8869 switch (tp->t_state) { 8870 8871 /* 8872 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8873 * CLOSE_WAIT state. 8874 */ 8875 case TCPS_SYN_RECEIVED: 8876 tp->t_starttime = ticks; 8877 /* FALLTHROUGH */ 8878 case TCPS_ESTABLISHED: 8879 rack_timer_cancel(tp, rack, 8880 rack->r_ctl.rc_rcvtime, __LINE__); 8881 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8882 break; 8883 8884 /* 8885 * If still in FIN_WAIT_1 STATE FIN has not been 8886 * acked so enter the CLOSING state. 8887 */ 8888 case TCPS_FIN_WAIT_1: 8889 rack_timer_cancel(tp, rack, 8890 rack->r_ctl.rc_rcvtime, __LINE__); 8891 tcp_state_change(tp, TCPS_CLOSING); 8892 break; 8893 8894 /* 8895 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8896 * starting the time-wait timer, turning off the 8897 * other standard timers. 8898 */ 8899 case TCPS_FIN_WAIT_2: 8900 rack_timer_cancel(tp, rack, 8901 rack->r_ctl.rc_rcvtime, __LINE__); 8902 tcp_twstart(tp); 8903 return (1); 8904 } 8905 } 8906 /* 8907 * Return any desired output. 8908 */ 8909 if ((tp->t_flags & TF_ACKNOW) || 8910 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 8911 rack->r_wanted_output = 1; 8912 } 8913 INP_WLOCK_ASSERT(tp->t_inpcb); 8914 return (0); 8915 } 8916 8917 /* 8918 * Here nothing is really faster, its just that we 8919 * have broken out the fast-data path also just like 8920 * the fast-ack. 8921 */ 8922 static int 8923 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8924 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8925 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8926 { 8927 int32_t nsegs; 8928 int32_t newsize = 0; /* automatic sockbuf scaling */ 8929 struct tcp_rack *rack; 8930 #ifdef NETFLIX_SB_LIMITS 8931 u_int mcnt, appended; 8932 #endif 8933 #ifdef TCPDEBUG 8934 /* 8935 * The size of tcp_saveipgen must be the size of the max ip header, 8936 * now IPv6. 8937 */ 8938 u_char tcp_saveipgen[IP6_HDR_LEN]; 8939 struct tcphdr tcp_savetcp; 8940 short ostate = 0; 8941 8942 #endif 8943 /* 8944 * If last ACK falls within this segment's sequence numbers, record 8945 * the timestamp. NOTE that the test is modified according to the 8946 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
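 * Before that, the checks below reject anything that is not a pure,
 * in-sequence data segment: a wrong sequence number, a retransmit in
 * progress, a window change, pending SYN/FIN flags, an old timestamp, an
 * ACK that is not exactly snd_una, or insufficient receive buffer space.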
8947 */ 8948 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 8949 return (0); 8950 } 8951 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8952 return (0); 8953 } 8954 if (tiwin && tiwin != tp->snd_wnd) { 8955 return (0); 8956 } 8957 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8958 return (0); 8959 } 8960 if (__predict_false((to->to_flags & TOF_TS) && 8961 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8962 return (0); 8963 } 8964 if (__predict_false((th->th_ack != tp->snd_una))) { 8965 return (0); 8966 } 8967 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8968 return (0); 8969 } 8970 if ((to->to_flags & TOF_TS) != 0 && 8971 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8972 tp->ts_recent_age = tcp_ts_getticks(); 8973 tp->ts_recent = to->to_tsval; 8974 } 8975 rack = (struct tcp_rack *)tp->t_fb_ptr; 8976 /* 8977 * This is a pure, in-sequence data packet with nothing on the 8978 * reassembly queue and we have enough buffer space to take it. 8979 */ 8980 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8981 8982 #ifdef NETFLIX_SB_LIMITS 8983 if (so->so_rcv.sb_shlim) { 8984 mcnt = m_memcnt(m); 8985 appended = 0; 8986 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8987 CFO_NOSLEEP, NULL) == false) { 8988 counter_u64_add(tcp_sb_shlim_fails, 1); 8989 m_freem(m); 8990 return (1); 8991 } 8992 } 8993 #endif 8994 /* Clean receiver SACK report if present */ 8995 if (tp->rcv_numsacks) 8996 tcp_clean_sackreport(tp); 8997 KMOD_TCPSTAT_INC(tcps_preddat); 8998 tp->rcv_nxt += tlen; 8999 if (tlen && 9000 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 9001 (tp->t_fbyte_in == 0)) { 9002 tp->t_fbyte_in = ticks; 9003 if (tp->t_fbyte_in == 0) 9004 tp->t_fbyte_in = 1; 9005 if (tp->t_fbyte_out && tp->t_fbyte_in) 9006 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 9007 } 9008 /* 9009 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 9010 */ 9011 tp->snd_wl1 = th->th_seq; 9012 /* 9013 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 9014 */ 9015 tp->rcv_up = tp->rcv_nxt; 9016 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 9017 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 9018 #ifdef TCPDEBUG 9019 if (so->so_options & SO_DEBUG) 9020 tcp_trace(TA_INPUT, ostate, tp, 9021 (void *)tcp_saveipgen, &tcp_savetcp, 0); 9022 #endif 9023 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 9024 9025 /* Add data to socket buffer. */ 9026 SOCKBUF_LOCK(&so->so_rcv); 9027 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9028 m_freem(m); 9029 } else { 9030 /* 9031 * Set new socket buffer size. Give up when limit is 9032 * reached. 9033 */ 9034 if (newsize) 9035 if (!sbreserve_locked(&so->so_rcv, 9036 newsize, so, NULL)) 9037 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 9038 m_adj(m, drop_hdrlen); /* delayed header drop */ 9039 #ifdef NETFLIX_SB_LIMITS 9040 appended = 9041 #endif 9042 sbappendstream_locked(&so->so_rcv, m, 0); 9043 ctf_calc_rwin(so, tp); 9044 } 9045 /* NB: sorwakeup_locked() does an implicit unlock. */ 9046 sorwakeup_locked(so); 9047 #ifdef NETFLIX_SB_LIMITS 9048 if (so->so_rcv.sb_shlim && mcnt != appended) 9049 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 9050 #endif 9051 rack_handle_delayed_ack(tp, rack, tlen, 0); 9052 if (tp->snd_una == tp->snd_max) 9053 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 9054 return (1); 9055 } 9056 9057 /* 9058 * This subfunction is used to try to highly optimize the 9059 * fast path. We again allow window updates that are 9060 * in sequence to remain in the fast-path. We also add 9061 * in the __predict's to attempt to help the compiler. 
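 * (The fast path also gives up when there is SACK'd data on the scoreboard
 * or the connection is still in recovery, so no loss-recovery state has to
 * be touched here.)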
9062 * Note that if we return a 0, then we can *not* process 9063 * it and the caller should push the packet into the 9064 * slow-path. 9065 */ 9066 static int 9067 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 9068 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9069 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 9070 { 9071 int32_t acked; 9072 int32_t nsegs; 9073 #ifdef TCPDEBUG 9074 /* 9075 * The size of tcp_saveipgen must be the size of the max ip header, 9076 * now IPv6. 9077 */ 9078 u_char tcp_saveipgen[IP6_HDR_LEN]; 9079 struct tcphdr tcp_savetcp; 9080 short ostate = 0; 9081 #endif 9082 int32_t under_pacing = 0; 9083 struct tcp_rack *rack; 9084 9085 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 9086 /* Old ack, behind (or duplicate to) the last one rcv'd */ 9087 return (0); 9088 } 9089 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 9090 /* Above what we have sent? */ 9091 return (0); 9092 } 9093 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 9094 /* We are retransmitting */ 9095 return (0); 9096 } 9097 if (__predict_false(tiwin == 0)) { 9098 /* zero window */ 9099 return (0); 9100 } 9101 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 9102 /* We need a SYN or a FIN, unlikely.. */ 9103 return (0); 9104 } 9105 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 9106 /* Timestamp is behind .. old ack with seq wrap? */ 9107 return (0); 9108 } 9109 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 9110 /* Still recovering */ 9111 return (0); 9112 } 9113 rack = (struct tcp_rack *)tp->t_fb_ptr; 9114 if (rack->r_ctl.rc_sacked) { 9115 /* We have sack holes on our scoreboard */ 9116 return (0); 9117 } 9118 /* Ok if we reach here, we can process a fast-ack */ 9119 if (rack->rc_gp_filled && 9120 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9121 under_pacing = 1; 9122 } 9123 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9124 rack_log_ack(tp, to, th); 9125 /* Did the window get updated? */ 9126 if (tiwin != tp->snd_wnd) { 9127 tp->snd_wnd = tiwin; 9128 tp->snd_wl1 = th->th_seq; 9129 if (tp->snd_wnd > tp->max_sndwnd) 9130 tp->max_sndwnd = tp->snd_wnd; 9131 } 9132 /* Do we exit persists? */ 9133 if ((rack->rc_in_persist != 0) && 9134 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 9135 rack->r_ctl.rc_pace_min_segs))) { 9136 rack_exit_persist(tp, rack, cts); 9137 } 9138 /* Do we enter persists? */ 9139 if ((rack->rc_in_persist == 0) && 9140 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 9141 TCPS_HAVEESTABLISHED(tp->t_state) && 9142 (tp->snd_max == tp->snd_una) && 9143 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 9144 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 9145 /* 9146 * Here the rwnd is less than 9147 * the pacing size, we are established, 9148 * nothing is outstanding, and there is 9149 * data to send. Enter persists. 9150 */ 9151 tp->snd_nxt = tp->snd_una; 9152 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 9153 } 9154 /* 9155 * If last ACK falls within this segment's sequence numbers, record 9156 * the timestamp. NOTE that the test is modified according to the 9157 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 9158 */ 9159 if ((to->to_flags & TOF_TS) != 0 && 9160 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 9161 tp->ts_recent_age = tcp_ts_getticks(); 9162 tp->ts_recent = to->to_tsval; 9163 } 9164 /* 9165 * This is a pure ack for outstanding data. 
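 * The checks above guarantee a simple cumulative ACK (nothing SACK'd, not
 * in recovery, non-zero window), so it is counted as tcps_predack and
 * handled without the full rack_process_ack() path.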
9166 */ 9167 KMOD_TCPSTAT_INC(tcps_predack); 9168 9169 /* 9170 * "bad retransmit" recovery. 9171 */ 9172 if (tp->t_flags & TF_PREVVALID) { 9173 tp->t_flags &= ~TF_PREVVALID; 9174 if (tp->t_rxtshift == 1 && 9175 (int)(ticks - tp->t_badrxtwin) < 0) 9176 rack_cong_signal(tp, th, CC_RTO_ERR); 9177 } 9178 /* 9179 * Recalculate the transmit timer / rtt. 9180 * 9181 * Some boxes send broken timestamp replies during the SYN+ACK 9182 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9183 * and blow up the retransmit timer. 9184 */ 9185 acked = BYTES_THIS_ACK(tp, th); 9186 9187 #ifdef TCP_HHOOK 9188 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 9189 hhook_run_tcp_est_in(tp, th, to); 9190 #endif 9191 9192 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9193 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9194 sbdrop(&so->so_snd, acked); 9195 if (acked) { 9196 /* assure we are not backed off */ 9197 tp->t_rxtshift = 0; 9198 rack->rc_tlp_in_progress = 0; 9199 rack->r_ctl.rc_tlp_cnt_out = 0; 9200 /* 9201 * If it is the RXT timer we want to 9202 * stop it, so we can restart a TLP. 9203 */ 9204 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9205 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9206 #ifdef NETFLIX_HTTP_LOGGING 9207 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9208 #endif 9209 } 9210 /* 9211 * Let the congestion control algorithm update congestion control 9212 * related information. This typically means increasing the 9213 * congestion window. 9214 */ 9215 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 9216 9217 tp->snd_una = th->th_ack; 9218 if (tp->snd_wnd < ctf_outstanding(tp)) { 9219 /* The peer collapsed the window */ 9220 rack_collapsed_window(rack); 9221 } else if (rack->rc_has_collapsed) 9222 rack_un_collapse_window(rack); 9223 9224 /* 9225 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 9226 */ 9227 tp->snd_wl2 = th->th_ack; 9228 tp->t_dupacks = 0; 9229 m_freem(m); 9230 /* ND6_HINT(tp); *//* Some progress has been made. */ 9231 9232 /* 9233 * If all outstanding data are acked, stop retransmit timer, 9234 * otherwise restart timer using current (possibly backed-off) 9235 * value. If process is waiting for space, wakeup/selwakeup/signal. 9236 * If data are ready to send, let tcp_output decide between more 9237 * output or persist. 9238 */ 9239 #ifdef TCPDEBUG 9240 if (so->so_options & SO_DEBUG) 9241 tcp_trace(TA_INPUT, ostate, tp, 9242 (void *)tcp_saveipgen, 9243 &tcp_savetcp, 0); 9244 #endif 9245 if (under_pacing && 9246 (rack->use_fixed_rate == 0) && 9247 (rack->in_probe_rtt == 0) && 9248 rack->rc_gp_dyn_mul && 9249 rack->rc_always_pace) { 9250 /* Check if we are dragging bottom */ 9251 rack_check_bottom_drag(tp, rack, so, acked); 9252 } 9253 if (tp->snd_una == tp->snd_max) { 9254 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9255 if (rack->r_ctl.rc_went_idle_time == 0) 9256 rack->r_ctl.rc_went_idle_time = 1; 9257 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9258 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9259 tp->t_acktime = 0; 9260 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9261 } 9262 /* Wake up the socket if we have room to write more */ 9263 sowwakeup(so); 9264 if (sbavail(&so->so_snd)) { 9265 rack->r_wanted_output = 1; 9266 } 9267 return (1); 9268 } 9269 9270 /* 9271 * Return value of 1, the TCB is unlocked and most 9272 * likely gone, return value of 0, the TCP is still 9273 * locked. 
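 * rack_do_syn_sent() below validates the peer's response to our SYN,
 * completes the handshake (including the TFO partial-ack case) and then
 * falls through to the common rack_process_ack()/rack_process_data() path.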
9274 */ 9275 static int 9276 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 9277 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9278 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9279 { 9280 int32_t ret_val = 0; 9281 int32_t todrop; 9282 int32_t ourfinisacked = 0; 9283 struct tcp_rack *rack; 9284 9285 ctf_calc_rwin(so, tp); 9286 /* 9287 * If the state is SYN_SENT: if seg contains an ACK, but not for our 9288 * SYN, drop the input. if seg contains a RST, then drop the 9289 * connection. if seg does not contain SYN, then drop it. Otherwise 9290 * this is an acceptable SYN segment initialize tp->rcv_nxt and 9291 * tp->irs if seg contains ack then advance tp->snd_una if seg 9292 * contains an ECE and ECN support is enabled, the stream is ECN 9293 * capable. if SYN has been acked change to ESTABLISHED else 9294 * SYN_RCVD state arrange for segment to be acked (eventually) 9295 * continue processing rest of data/controls. 9296 */ 9297 if ((thflags & TH_ACK) && 9298 (SEQ_LEQ(th->th_ack, tp->iss) || 9299 SEQ_GT(th->th_ack, tp->snd_max))) { 9300 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9301 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9302 return (1); 9303 } 9304 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 9305 TCP_PROBE5(connect__refused, NULL, tp, 9306 mtod(m, const char *), tp, th); 9307 tp = tcp_drop(tp, ECONNREFUSED); 9308 ctf_do_drop(m, tp); 9309 return (1); 9310 } 9311 if (thflags & TH_RST) { 9312 ctf_do_drop(m, tp); 9313 return (1); 9314 } 9315 if (!(thflags & TH_SYN)) { 9316 ctf_do_drop(m, tp); 9317 return (1); 9318 } 9319 tp->irs = th->th_seq; 9320 tcp_rcvseqinit(tp); 9321 rack = (struct tcp_rack *)tp->t_fb_ptr; 9322 if (thflags & TH_ACK) { 9323 int tfo_partial = 0; 9324 9325 KMOD_TCPSTAT_INC(tcps_connects); 9326 soisconnected(so); 9327 #ifdef MAC 9328 mac_socketpeer_set_from_mbuf(m, so); 9329 #endif 9330 /* Do window scaling on this connection? */ 9331 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9332 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9333 tp->rcv_scale = tp->request_r_scale; 9334 } 9335 tp->rcv_adv += min(tp->rcv_wnd, 9336 TCP_MAXWIN << tp->rcv_scale); 9337 /* 9338 * If not all the data that was sent in the TFO SYN 9339 * has been acked, resend the remainder right away. 9340 */ 9341 if (IS_FASTOPEN(tp->t_flags) && 9342 (tp->snd_una != tp->snd_max)) { 9343 tp->snd_nxt = th->th_ack; 9344 tfo_partial = 1; 9345 } 9346 /* 9347 * If there's data, delay ACK; if there's also a FIN ACKNOW 9348 * will be turned on later. 9349 */ 9350 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 9351 rack_timer_cancel(tp, rack, 9352 rack->r_ctl.rc_rcvtime, __LINE__); 9353 tp->t_flags |= TF_DELACK; 9354 } else { 9355 rack->r_wanted_output = 1; 9356 tp->t_flags |= TF_ACKNOW; 9357 rack->rc_dack_toggle = 0; 9358 } 9359 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 9360 (V_tcp_do_ecn == 1)) { 9361 tp->t_flags2 |= TF2_ECN_PERMIT; 9362 KMOD_TCPSTAT_INC(tcps_ecn_shs); 9363 } 9364 if (SEQ_GT(th->th_ack, tp->snd_una)) { 9365 /* 9366 * We advance snd_una for the 9367 * fast open case. If th_ack is 9368 * acknowledging data beyond 9369 * snd_una we can't just call 9370 * ack-processing since the 9371 * data stream in our send-map 9372 * will start at snd_una + 1 (one 9373 * beyond the SYN). If its just 9374 * equal we don't need to do that 9375 * and there is no send_map. 9376 */ 9377 tp->snd_una++; 9378 } 9379 /* 9380 * Received <SYN,ACK> in SYN_SENT[*] state. 
Transitions: 9381 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 9382 */ 9383 tp->t_starttime = ticks; 9384 if (tp->t_flags & TF_NEEDFIN) { 9385 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9386 tp->t_flags &= ~TF_NEEDFIN; 9387 thflags &= ~TH_SYN; 9388 } else { 9389 tcp_state_change(tp, TCPS_ESTABLISHED); 9390 TCP_PROBE5(connect__established, NULL, tp, 9391 mtod(m, const char *), tp, th); 9392 rack_cc_conn_init(tp); 9393 } 9394 } else { 9395 /* 9396 * Received initial SYN in SYN-SENT[*] state => simultaneous 9397 * open. If segment contains CC option and there is a 9398 * cached CC, apply TAO test. If it succeeds, connection is * 9399 * half-synchronized. Otherwise, do 3-way handshake: 9400 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 9401 * there was no CC option, clear cached CC value. 9402 */ 9403 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 9404 tcp_state_change(tp, TCPS_SYN_RECEIVED); 9405 } 9406 INP_WLOCK_ASSERT(tp->t_inpcb); 9407 /* 9408 * Advance th->th_seq to correspond to first data byte. If data, 9409 * trim to stay within window, dropping FIN if necessary. 9410 */ 9411 th->th_seq++; 9412 if (tlen > tp->rcv_wnd) { 9413 todrop = tlen - tp->rcv_wnd; 9414 m_adj(m, -todrop); 9415 tlen = tp->rcv_wnd; 9416 thflags &= ~TH_FIN; 9417 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 9418 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 9419 } 9420 tp->snd_wl1 = th->th_seq - 1; 9421 tp->rcv_up = th->th_seq; 9422 /* 9423 * Client side of transaction: already sent SYN and data. If the 9424 * remote host used T/TCP to validate the SYN, our data will be 9425 * ACK'd; if so, enter normal data segment processing in the middle 9426 * of step 5, ack processing. Otherwise, goto step 6. 9427 */ 9428 if (thflags & TH_ACK) { 9429 /* For syn-sent we need to possibly update the rtt */ 9430 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9431 uint32_t t; 9432 9433 t = tcp_ts_getticks() - to->to_tsecr; 9434 if (!tp->t_rttlow || tp->t_rttlow > t) 9435 tp->t_rttlow = t; 9436 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9437 tcp_rack_xmit_timer_commit(rack, tp); 9438 } 9439 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 9440 return (ret_val); 9441 /* We may have changed to FIN_WAIT_1 above */ 9442 if (tp->t_state == TCPS_FIN_WAIT_1) { 9443 /* 9444 * In FIN_WAIT_1 STATE in addition to the processing 9445 * for the ESTABLISHED state if our FIN is now 9446 * acknowledged then enter FIN_WAIT_2. 9447 */ 9448 if (ourfinisacked) { 9449 /* 9450 * If we can't receive any more data, then 9451 * closing user can proceed. Starting the 9452 * timer is contrary to the specification, 9453 * but if we don't get a FIN we'll hang 9454 * forever. 9455 * 9456 * XXXjl: we should release the tp also, and 9457 * use a compressed state. 9458 */ 9459 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9460 soisdisconnected(so); 9461 tcp_timer_activate(tp, TT_2MSL, 9462 (tcp_fast_finwait2_recycle ? 9463 tcp_finwait2_timeout : 9464 TP_MAXIDLE(tp))); 9465 } 9466 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9467 } 9468 } 9469 } 9470 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9471 tiwin, thflags, nxt_pkt)); 9472 } 9473 9474 /* 9475 * Return value of 1, the TCB is unlocked and most 9476 * likely gone, return value of 0, the TCP is still 9477 * locked. 
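 * rack_do_syn_recv() below finishes the passive (or simultaneous) open: it
 * applies the TFO-specific validation, completes the handshake and likewise
 * ends in rack_process_data().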
9478 */ 9479 static int 9480 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 9481 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9482 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9483 { 9484 struct tcp_rack *rack; 9485 int32_t ret_val = 0; 9486 int32_t ourfinisacked = 0; 9487 9488 ctf_calc_rwin(so, tp); 9489 if ((thflags & TH_ACK) && 9490 (SEQ_LEQ(th->th_ack, tp->snd_una) || 9491 SEQ_GT(th->th_ack, tp->snd_max))) { 9492 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9493 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9494 return (1); 9495 } 9496 rack = (struct tcp_rack *)tp->t_fb_ptr; 9497 if (IS_FASTOPEN(tp->t_flags)) { 9498 /* 9499 * When a TFO connection is in SYN_RECEIVED, the 9500 * only valid packets are the initial SYN, a 9501 * retransmit/copy of the initial SYN (possibly with 9502 * a subset of the original data), a valid ACK, a 9503 * FIN, or a RST. 9504 */ 9505 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9506 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9507 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9508 return (1); 9509 } else if (thflags & TH_SYN) { 9510 /* non-initial SYN is ignored */ 9511 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9512 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9513 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9514 ctf_do_drop(m, NULL); 9515 return (0); 9516 } 9517 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9518 ctf_do_drop(m, NULL); 9519 return (0); 9520 } 9521 } 9522 if ((thflags & TH_RST) || 9523 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9524 return (ctf_process_rst(m, th, so, tp)); 9525 /* 9526 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9527 * it's less than ts_recent, drop it. 9528 */ 9529 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9530 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9531 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9532 return (ret_val); 9533 } 9534 /* 9535 * In the SYN-RECEIVED state, validate that the packet belongs to 9536 * this connection before trimming the data to fit the receive 9537 * window. Check the sequence number versus IRS since we know the 9538 * sequence numbers haven't wrapped. This is a partial fix for the 9539 * "LAND" DoS attack. 9540 */ 9541 if (SEQ_LT(th->th_seq, tp->irs)) { 9542 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9543 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9544 return (1); 9545 } 9546 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9547 return (ret_val); 9548 } 9549 /* 9550 * If last ACK falls within this segment's sequence numbers, record 9551 * its timestamp. NOTE: 1) That the test incorporates suggestions 9552 * from the latest proposal of the tcplw@cray.com list (Braden 9553 * 1993/04/26). 2) That updating only on newer timestamps interferes 9554 * with our earlier PAWS tests, so this check should be solely 9555 * predicated on the sequence space of this segment. 3) That we 9556 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9557 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9558 * SEG.Len, This modified check allows us to overcome RFC1323's 9559 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9560 * p.869. In such cases, we can still calculate the RTT correctly 9561 * when RCV.NXT == Last.ACK.Sent. 
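 * In code form the modified boundary check is the second of the two
 * SEQ_LEQ() tests below: Last.ACK.Sent <= SEG.SEQ + SEG.LEN, where a SYN or
 * FIN counts as one extra octet.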
9562 */ 9563 if ((to->to_flags & TOF_TS) != 0 && 9564 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9565 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9566 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9567 tp->ts_recent_age = tcp_ts_getticks(); 9568 tp->ts_recent = to->to_tsval; 9569 } 9570 tp->snd_wnd = tiwin; 9571 /* 9572 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9573 * is on (half-synchronized state), then queue data for later 9574 * processing; else drop segment and return. 9575 */ 9576 if ((thflags & TH_ACK) == 0) { 9577 if (IS_FASTOPEN(tp->t_flags)) { 9578 rack_cc_conn_init(tp); 9579 } 9580 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9581 tiwin, thflags, nxt_pkt)); 9582 } 9583 KMOD_TCPSTAT_INC(tcps_connects); 9584 soisconnected(so); 9585 /* Do window scaling? */ 9586 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9587 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9588 tp->rcv_scale = tp->request_r_scale; 9589 } 9590 /* 9591 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9592 * FIN-WAIT-1 9593 */ 9594 tp->t_starttime = ticks; 9595 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9596 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9597 tp->t_tfo_pending = NULL; 9598 } 9599 if (tp->t_flags & TF_NEEDFIN) { 9600 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9601 tp->t_flags &= ~TF_NEEDFIN; 9602 } else { 9603 tcp_state_change(tp, TCPS_ESTABLISHED); 9604 TCP_PROBE5(accept__established, NULL, tp, 9605 mtod(m, const char *), tp, th); 9606 /* 9607 * TFO connections call cc_conn_init() during SYN 9608 * processing. Calling it again here for such connections 9609 * is not harmless as it would undo the snd_cwnd reduction 9610 * that occurs when a TFO SYN|ACK is retransmitted. 9611 */ 9612 if (!IS_FASTOPEN(tp->t_flags)) 9613 rack_cc_conn_init(tp); 9614 } 9615 /* 9616 * Account for the ACK of our SYN prior to 9617 * regular ACK processing below, except for 9618 * simultaneous SYN, which is handled later. 9619 */ 9620 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9621 tp->snd_una++; 9622 /* 9623 * If segment contains data or ACK, will call tcp_reass() later; if 9624 * not, do so now to pass queued data to user. 9625 */ 9626 if (tlen == 0 && (thflags & TH_FIN) == 0) 9627 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9628 (struct mbuf *)0); 9629 tp->snd_wl1 = th->th_seq - 1; 9630 /* For syn-recv we need to possibly update the rtt */ 9631 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9632 uint32_t t; 9633 9634 t = tcp_ts_getticks() - to->to_tsecr; 9635 if (!tp->t_rttlow || tp->t_rttlow > t) 9636 tp->t_rttlow = t; 9637 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9638 tcp_rack_xmit_timer_commit(rack, tp); 9639 } 9640 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9641 return (ret_val); 9642 } 9643 if (tp->t_state == TCPS_FIN_WAIT_1) { 9644 /* We could have went to FIN_WAIT_1 (or EST) above */ 9645 /* 9646 * In FIN_WAIT_1 STATE in addition to the processing for the 9647 * ESTABLISHED state if our FIN is now acknowledged then 9648 * enter FIN_WAIT_2. 9649 */ 9650 if (ourfinisacked) { 9651 /* 9652 * If we can't receive any more data, then closing 9653 * user can proceed. Starting the timer is contrary 9654 * to the specification, but if we don't get a FIN 9655 * we'll hang forever. 9656 * 9657 * XXXjl: we should release the tp also, and use a 9658 * compressed state. 
9659 */ 9660 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9661 soisdisconnected(so); 9662 tcp_timer_activate(tp, TT_2MSL, 9663 (tcp_fast_finwait2_recycle ? 9664 tcp_finwait2_timeout : 9665 TP_MAXIDLE(tp))); 9666 } 9667 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9668 } 9669 } 9670 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9671 tiwin, thflags, nxt_pkt)); 9672 } 9673 9674 /* 9675 * Return value of 1, the TCB is unlocked and most 9676 * likely gone, return value of 0, the TCP is still 9677 * locked. 9678 */ 9679 static int 9680 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9681 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9682 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9683 { 9684 int32_t ret_val = 0; 9685 struct tcp_rack *rack; 9686 9687 /* 9688 * Header prediction: check for the two common cases of a 9689 * uni-directional data xfer. If the packet has no control flags, 9690 * is in-sequence, the window didn't change and we're not 9691 * retransmitting, it's a candidate. If the length is zero and the 9692 * ack moved forward, we're the sender side of the xfer. Just free 9693 * the data acked & wake any higher level process that was blocked 9694 * waiting for space. If the length is non-zero and the ack didn't 9695 * move, we're the receiver side. If we're getting packets in-order 9696 * (the reassembly queue is empty), add the data toc The socket 9697 * buffer and note that we need a delayed ack. Make sure that the 9698 * hidden state-flags are also off. Since we check for 9699 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 9700 */ 9701 rack = (struct tcp_rack *)tp->t_fb_ptr; 9702 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9703 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 9704 __predict_true(SEGQ_EMPTY(tp)) && 9705 __predict_true(th->th_seq == tp->rcv_nxt)) { 9706 if (tlen == 0) { 9707 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9708 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 9709 return (0); 9710 } 9711 } else { 9712 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9713 tiwin, nxt_pkt, iptos)) { 9714 return (0); 9715 } 9716 } 9717 } 9718 ctf_calc_rwin(so, tp); 9719 9720 if ((thflags & TH_RST) || 9721 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9722 return (ctf_process_rst(m, th, so, tp)); 9723 9724 /* 9725 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9726 * synchronized state. 9727 */ 9728 if (thflags & TH_SYN) { 9729 ctf_challenge_ack(m, th, tp, &ret_val); 9730 return (ret_val); 9731 } 9732 /* 9733 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9734 * it's less than ts_recent, drop it. 9735 */ 9736 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9737 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9738 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9739 return (ret_val); 9740 } 9741 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9742 return (ret_val); 9743 } 9744 /* 9745 * If last ACK falls within this segment's sequence numbers, record 9746 * its timestamp. NOTE: 1) That the test incorporates suggestions 9747 * from the latest proposal of the tcplw@cray.com list (Braden 9748 * 1993/04/26). 2) That updating only on newer timestamps interferes 9749 * with our earlier PAWS tests, so this check should be solely 9750 * predicated on the sequence space of this segment. 
3) That we 9751 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9752 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9753 * SEG.Len, This modified check allows us to overcome RFC1323's 9754 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9755 * p.869. In such cases, we can still calculate the RTT correctly 9756 * when RCV.NXT == Last.ACK.Sent. 9757 */ 9758 if ((to->to_flags & TOF_TS) != 0 && 9759 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9760 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9761 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9762 tp->ts_recent_age = tcp_ts_getticks(); 9763 tp->ts_recent = to->to_tsval; 9764 } 9765 /* 9766 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9767 * is on (half-synchronized state), then queue data for later 9768 * processing; else drop segment and return. 9769 */ 9770 if ((thflags & TH_ACK) == 0) { 9771 if (tp->t_flags & TF_NEEDSYN) { 9772 9773 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9774 tiwin, thflags, nxt_pkt)); 9775 9776 } else if (tp->t_flags & TF_ACKNOW) { 9777 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9778 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 9779 return (ret_val); 9780 } else { 9781 ctf_do_drop(m, NULL); 9782 return (0); 9783 } 9784 } 9785 /* 9786 * Ack processing. 9787 */ 9788 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9789 return (ret_val); 9790 } 9791 if (sbavail(&so->so_snd)) { 9792 if (ctf_progress_timeout_check(tp, true)) { 9793 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 9794 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9795 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9796 return (1); 9797 } 9798 } 9799 /* State changes only happen in rack_process_data() */ 9800 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9801 tiwin, thflags, nxt_pkt)); 9802 } 9803 9804 /* 9805 * Return value of 1, the TCB is unlocked and most 9806 * likely gone, return value of 0, the TCP is still 9807 * locked. 9808 */ 9809 static int 9810 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9811 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9812 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9813 { 9814 int32_t ret_val = 0; 9815 9816 ctf_calc_rwin(so, tp); 9817 if ((thflags & TH_RST) || 9818 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9819 return (ctf_process_rst(m, th, so, tp)); 9820 /* 9821 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9822 * synchronized state. 9823 */ 9824 if (thflags & TH_SYN) { 9825 ctf_challenge_ack(m, th, tp, &ret_val); 9826 return (ret_val); 9827 } 9828 /* 9829 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9830 * it's less than ts_recent, drop it. 9831 */ 9832 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9833 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9834 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9835 return (ret_val); 9836 } 9837 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9838 return (ret_val); 9839 } 9840 /* 9841 * If last ACK falls within this segment's sequence numbers, record 9842 * its timestamp. NOTE: 1) That the test incorporates suggestions 9843 * from the latest proposal of the tcplw@cray.com list (Braden 9844 * 1993/04/26). 
2) That updating only on newer timestamps interferes 9845 * with our earlier PAWS tests, so this check should be solely 9846 * predicated on the sequence space of this segment. 3) That we 9847 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9848 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9849 * SEG.Len, This modified check allows us to overcome RFC1323's 9850 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9851 * p.869. In such cases, we can still calculate the RTT correctly 9852 * when RCV.NXT == Last.ACK.Sent. 9853 */ 9854 if ((to->to_flags & TOF_TS) != 0 && 9855 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9856 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9857 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9858 tp->ts_recent_age = tcp_ts_getticks(); 9859 tp->ts_recent = to->to_tsval; 9860 } 9861 /* 9862 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9863 * is on (half-synchronized state), then queue data for later 9864 * processing; else drop segment and return. 9865 */ 9866 if ((thflags & TH_ACK) == 0) { 9867 if (tp->t_flags & TF_NEEDSYN) { 9868 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9869 tiwin, thflags, nxt_pkt)); 9870 9871 } else if (tp->t_flags & TF_ACKNOW) { 9872 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9873 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9874 return (ret_val); 9875 } else { 9876 ctf_do_drop(m, NULL); 9877 return (0); 9878 } 9879 } 9880 /* 9881 * Ack processing. 9882 */ 9883 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9884 return (ret_val); 9885 } 9886 if (sbavail(&so->so_snd)) { 9887 if (ctf_progress_timeout_check(tp, true)) { 9888 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9889 tp, tick, PROGRESS_DROP, __LINE__); 9890 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9891 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9892 return (1); 9893 } 9894 } 9895 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9896 tiwin, thflags, nxt_pkt)); 9897 } 9898 9899 static int 9900 rack_check_data_after_close(struct mbuf *m, 9901 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 9902 { 9903 struct tcp_rack *rack; 9904 9905 rack = (struct tcp_rack *)tp->t_fb_ptr; 9906 if (rack->rc_allow_data_af_clo == 0) { 9907 close_now: 9908 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9909 /* tcp_close will kill the inp pre-log the Reset */ 9910 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9911 tp = tcp_close(tp); 9912 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9913 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9914 return (1); 9915 } 9916 if (sbavail(&so->so_snd) == 0) 9917 goto close_now; 9918 /* Ok we allow data that is ignored and a followup reset */ 9919 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9920 tp->rcv_nxt = th->th_seq + *tlen; 9921 tp->t_flags2 |= TF2_DROP_AF_DATA; 9922 rack->r_wanted_output = 1; 9923 *tlen = 0; 9924 return (0); 9925 } 9926 9927 /* 9928 * Return value of 1, the TCB is unlocked and most 9929 * likely gone, return value of 0, the TCP is still 9930 * locked. 
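 * rack_do_fin_wait_1() below mirrors the ESTABLISHED handler, but once our
 * FIN has been acknowledged it moves the connection to FIN_WAIT_2, and it
 * may reset a peer that keeps sending data after the local user has closed.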
9931 */ 9932 static int 9933 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9934 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9935 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9936 { 9937 int32_t ret_val = 0; 9938 int32_t ourfinisacked = 0; 9939 9940 ctf_calc_rwin(so, tp); 9941 9942 if ((thflags & TH_RST) || 9943 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9944 return (ctf_process_rst(m, th, so, tp)); 9945 /* 9946 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9947 * synchronized state. 9948 */ 9949 if (thflags & TH_SYN) { 9950 ctf_challenge_ack(m, th, tp, &ret_val); 9951 return (ret_val); 9952 } 9953 /* 9954 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9955 * it's less than ts_recent, drop it. 9956 */ 9957 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9958 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9959 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9960 return (ret_val); 9961 } 9962 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9963 return (ret_val); 9964 } 9965 /* 9966 * If new data are received on a connection after the user processes 9967 * are gone, then RST the other end. 9968 */ 9969 if ((so->so_state & SS_NOFDREF) && tlen) { 9970 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9971 return (1); 9972 } 9973 /* 9974 * If last ACK falls within this segment's sequence numbers, record 9975 * its timestamp. NOTE: 1) That the test incorporates suggestions 9976 * from the latest proposal of the tcplw@cray.com list (Braden 9977 * 1993/04/26). 2) That updating only on newer timestamps interferes 9978 * with our earlier PAWS tests, so this check should be solely 9979 * predicated on the sequence space of this segment. 3) That we 9980 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9981 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9982 * SEG.Len, This modified check allows us to overcome RFC1323's 9983 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9984 * p.869. In such cases, we can still calculate the RTT correctly 9985 * when RCV.NXT == Last.ACK.Sent. 9986 */ 9987 if ((to->to_flags & TOF_TS) != 0 && 9988 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9989 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9990 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9991 tp->ts_recent_age = tcp_ts_getticks(); 9992 tp->ts_recent = to->to_tsval; 9993 } 9994 /* 9995 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9996 * is on (half-synchronized state), then queue data for later 9997 * processing; else drop segment and return. 9998 */ 9999 if ((thflags & TH_ACK) == 0) { 10000 if (tp->t_flags & TF_NEEDSYN) { 10001 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10002 tiwin, thflags, nxt_pkt)); 10003 } else if (tp->t_flags & TF_ACKNOW) { 10004 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10005 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10006 return (ret_val); 10007 } else { 10008 ctf_do_drop(m, NULL); 10009 return (0); 10010 } 10011 } 10012 /* 10013 * Ack processing. 10014 */ 10015 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10016 return (ret_val); 10017 } 10018 if (ourfinisacked) { 10019 /* 10020 * If we can't receive any more data, then closing user can 10021 * proceed. Starting the timer is contrary to the 10022 * specification, but if we don't get a FIN we'll hang 10023 * forever. 
10024 * 10025 * XXXjl: we should release the tp also, and use a 10026 * compressed state. 10027 */ 10028 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10029 soisdisconnected(so); 10030 tcp_timer_activate(tp, TT_2MSL, 10031 (tcp_fast_finwait2_recycle ? 10032 tcp_finwait2_timeout : 10033 TP_MAXIDLE(tp))); 10034 } 10035 tcp_state_change(tp, TCPS_FIN_WAIT_2); 10036 } 10037 if (sbavail(&so->so_snd)) { 10038 if (ctf_progress_timeout_check(tp, true)) { 10039 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10040 tp, tick, PROGRESS_DROP, __LINE__); 10041 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10042 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10043 return (1); 10044 } 10045 } 10046 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10047 tiwin, thflags, nxt_pkt)); 10048 } 10049 10050 /* 10051 * Return value of 1, the TCB is unlocked and most 10052 * likely gone, return value of 0, the TCP is still 10053 * locked. 10054 */ 10055 static int 10056 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 10057 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10058 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10059 { 10060 int32_t ret_val = 0; 10061 int32_t ourfinisacked = 0; 10062 10063 ctf_calc_rwin(so, tp); 10064 10065 if ((thflags & TH_RST) || 10066 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10067 return (ctf_process_rst(m, th, so, tp)); 10068 /* 10069 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10070 * synchronized state. 10071 */ 10072 if (thflags & TH_SYN) { 10073 ctf_challenge_ack(m, th, tp, &ret_val); 10074 return (ret_val); 10075 } 10076 /* 10077 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10078 * it's less than ts_recent, drop it. 10079 */ 10080 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10081 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10082 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10083 return (ret_val); 10084 } 10085 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10086 return (ret_val); 10087 } 10088 /* 10089 * If new data are received on a connection after the user processes 10090 * are gone, then RST the other end. 10091 */ 10092 if ((so->so_state & SS_NOFDREF) && tlen) { 10093 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10094 return (1); 10095 } 10096 /* 10097 * If last ACK falls within this segment's sequence numbers, record 10098 * its timestamp. NOTE: 1) That the test incorporates suggestions 10099 * from the latest proposal of the tcplw@cray.com list (Braden 10100 * 1993/04/26). 2) That updating only on newer timestamps interferes 10101 * with our earlier PAWS tests, so this check should be solely 10102 * predicated on the sequence space of this segment. 3) That we 10103 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10104 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10105 * SEG.Len, This modified check allows us to overcome RFC1323's 10106 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10107 * p.869. In such cases, we can still calculate the RTT correctly 10108 * when RCV.NXT == Last.ACK.Sent. 
10109 */ 10110 if ((to->to_flags & TOF_TS) != 0 && 10111 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10112 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10113 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10114 tp->ts_recent_age = tcp_ts_getticks(); 10115 tp->ts_recent = to->to_tsval; 10116 } 10117 /* 10118 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10119 * is on (half-synchronized state), then queue data for later 10120 * processing; else drop segment and return. 10121 */ 10122 if ((thflags & TH_ACK) == 0) { 10123 if (tp->t_flags & TF_NEEDSYN) { 10124 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10125 tiwin, thflags, nxt_pkt)); 10126 } else if (tp->t_flags & TF_ACKNOW) { 10127 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10128 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 10129 return (ret_val); 10130 } else { 10131 ctf_do_drop(m, NULL); 10132 return (0); 10133 } 10134 } 10135 /* 10136 * Ack processing. 10137 */ 10138 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10139 return (ret_val); 10140 } 10141 if (ourfinisacked) { 10142 tcp_twstart(tp); 10143 m_freem(m); 10144 return (1); 10145 } 10146 if (sbavail(&so->so_snd)) { 10147 if (ctf_progress_timeout_check(tp, true)) { 10148 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10149 tp, tick, PROGRESS_DROP, __LINE__); 10150 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10151 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10152 return (1); 10153 } 10154 } 10155 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10156 tiwin, thflags, nxt_pkt)); 10157 } 10158 10159 /* 10160 * Return value of 1, the TCB is unlocked and most 10161 * likely gone, return value of 0, the TCP is still 10162 * locked. 10163 */ 10164 static int 10165 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10166 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10167 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10168 { 10169 int32_t ret_val = 0; 10170 int32_t ourfinisacked = 0; 10171 10172 ctf_calc_rwin(so, tp); 10173 10174 if ((thflags & TH_RST) || 10175 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10176 return (ctf_process_rst(m, th, so, tp)); 10177 /* 10178 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10179 * synchronized state. 10180 */ 10181 if (thflags & TH_SYN) { 10182 ctf_challenge_ack(m, th, tp, &ret_val); 10183 return (ret_val); 10184 } 10185 /* 10186 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10187 * it's less than ts_recent, drop it. 10188 */ 10189 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10190 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10191 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10192 return (ret_val); 10193 } 10194 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10195 return (ret_val); 10196 } 10197 /* 10198 * If new data are received on a connection after the user processes 10199 * are gone, then RST the other end. 10200 */ 10201 if ((so->so_state & SS_NOFDREF) && tlen) { 10202 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10203 return (1); 10204 } 10205 /* 10206 * If last ACK falls within this segment's sequence numbers, record 10207 * its timestamp. NOTE: 1) That the test incorporates suggestions 10208 * from the latest proposal of the tcplw@cray.com list (Braden 10209 * 1993/04/26). 
2) That updating only on newer timestamps interferes 10210 * with our earlier PAWS tests, so this check should be solely 10211 * predicated on the sequence space of this segment. 3) That we 10212 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10213 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10214 * SEG.Len, This modified check allows us to overcome RFC1323's 10215 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10216 * p.869. In such cases, we can still calculate the RTT correctly 10217 * when RCV.NXT == Last.ACK.Sent. 10218 */ 10219 if ((to->to_flags & TOF_TS) != 0 && 10220 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10221 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10222 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10223 tp->ts_recent_age = tcp_ts_getticks(); 10224 tp->ts_recent = to->to_tsval; 10225 } 10226 /* 10227 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10228 * is on (half-synchronized state), then queue data for later 10229 * processing; else drop segment and return. 10230 */ 10231 if ((thflags & TH_ACK) == 0) { 10232 if (tp->t_flags & TF_NEEDSYN) { 10233 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10234 tiwin, thflags, nxt_pkt)); 10235 } else if (tp->t_flags & TF_ACKNOW) { 10236 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10237 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10238 return (ret_val); 10239 } else { 10240 ctf_do_drop(m, NULL); 10241 return (0); 10242 } 10243 } 10244 /* 10245 * case TCPS_LAST_ACK: Ack processing. 10246 */ 10247 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10248 return (ret_val); 10249 } 10250 if (ourfinisacked) { 10251 tp = tcp_close(tp); 10252 ctf_do_drop(m, tp); 10253 return (1); 10254 } 10255 if (sbavail(&so->so_snd)) { 10256 if (ctf_progress_timeout_check(tp, true)) { 10257 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10258 tp, tick, PROGRESS_DROP, __LINE__); 10259 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10260 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10261 return (1); 10262 } 10263 } 10264 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10265 tiwin, thflags, nxt_pkt)); 10266 } 10267 10268 10269 /* 10270 * Return value of 1, the TCB is unlocked and most 10271 * likely gone, return value of 0, the TCP is still 10272 * locked. 10273 */ 10274 static int 10275 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 10276 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10277 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10278 { 10279 int32_t ret_val = 0; 10280 int32_t ourfinisacked = 0; 10281 10282 ctf_calc_rwin(so, tp); 10283 10284 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 10285 if ((thflags & TH_RST) || 10286 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10287 return (ctf_process_rst(m, th, so, tp)); 10288 /* 10289 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10290 * synchronized state. 10291 */ 10292 if (thflags & TH_SYN) { 10293 ctf_challenge_ack(m, th, tp, &ret_val); 10294 return (ret_val); 10295 } 10296 /* 10297 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10298 * it's less than ts_recent, drop it. 
10299 */ 10300 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10301 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10302 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10303 return (ret_val); 10304 } 10305 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10306 return (ret_val); 10307 } 10308 /* 10309 * If new data are received on a connection after the user processes 10310 * are gone, then RST the other end. 10311 */ 10312 if ((so->so_state & SS_NOFDREF) && 10313 tlen) { 10314 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10315 return (1); 10316 } 10317 /* 10318 * If last ACK falls within this segment's sequence numbers, record 10319 * its timestamp. NOTE: 1) That the test incorporates suggestions 10320 * from the latest proposal of the tcplw@cray.com list (Braden 10321 * 1993/04/26). 2) That updating only on newer timestamps interferes 10322 * with our earlier PAWS tests, so this check should be solely 10323 * predicated on the sequence space of this segment. 3) That we 10324 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10325 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10326 * SEG.Len, This modified check allows us to overcome RFC1323's 10327 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10328 * p.869. In such cases, we can still calculate the RTT correctly 10329 * when RCV.NXT == Last.ACK.Sent. 10330 */ 10331 if ((to->to_flags & TOF_TS) != 0 && 10332 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10333 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10334 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10335 tp->ts_recent_age = tcp_ts_getticks(); 10336 tp->ts_recent = to->to_tsval; 10337 } 10338 /* 10339 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10340 * is on (half-synchronized state), then queue data for later 10341 * processing; else drop segment and return. 10342 */ 10343 if ((thflags & TH_ACK) == 0) { 10344 if (tp->t_flags & TF_NEEDSYN) { 10345 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10346 tiwin, thflags, nxt_pkt)); 10347 } else if (tp->t_flags & TF_ACKNOW) { 10348 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10349 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10350 return (ret_val); 10351 } else { 10352 ctf_do_drop(m, NULL); 10353 return (0); 10354 } 10355 } 10356 /* 10357 * Ack processing. 
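 * Note that unlike the CLOSING and LAST_ACK handlers there is no
 * ourfinisacked check here; by the time we reach FIN_WAIT_2 our
 * FIN has already been acknowledged.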
10358 */ 10359 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10360 return (ret_val); 10361 } 10362 if (sbavail(&so->so_snd)) { 10363 if (ctf_progress_timeout_check(tp, true)) { 10364 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10365 tp, tick, PROGRESS_DROP, __LINE__); 10366 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10367 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10368 return (1); 10369 } 10370 } 10371 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10372 tiwin, thflags, nxt_pkt)); 10373 } 10374 10375 static void inline 10376 rack_clear_rate_sample(struct tcp_rack *rack) 10377 { 10378 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 10379 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 10380 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 10381 } 10382 10383 static void 10384 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) 10385 { 10386 uint64_t bw_est, rate_wanted; 10387 uint32_t tls_seg = 0; 10388 int chged = 0; 10389 uint32_t user_max; 10390 10391 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 10392 #ifdef KERN_TLS 10393 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 10394 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd); 10395 if (tls_seg != rack->r_ctl.rc_pace_min_segs) 10396 chged = 1; 10397 rack->r_ctl.rc_pace_min_segs = tls_seg; 10398 } else 10399 #endif 10400 { 10401 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 10402 chged = 1; 10403 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 10404 } 10405 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 10406 if (user_max != rack->r_ctl.rc_pace_max_segs) 10407 chged = 1; 10408 } 10409 if (rack->rc_force_max_seg) { 10410 rack->r_ctl.rc_pace_max_segs = user_max; 10411 } else if (rack->use_fixed_rate) { 10412 bw_est = rack_get_bw(rack); 10413 if ((rack->r_ctl.crte == NULL) || 10414 (bw_est != rack->r_ctl.crte->rate)) { 10415 rack->r_ctl.rc_pace_max_segs = user_max; 10416 } else { 10417 /* We are pacing right at the hardware rate */ 10418 uint32_t segsiz; 10419 10420 segsiz = min(ctf_fixed_maxseg(tp), 10421 rack->r_ctl.rc_pace_min_segs); 10422 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 10423 bw_est, segsiz, 0, 10424 rack->r_ctl.crte, NULL); 10425 } 10426 } else if (rack->rc_always_pace) { 10427 if (rack->r_ctl.gp_bw || 10428 #ifdef NETFLIX_PEAKRATE 10429 rack->rc_tp->t_maxpeakrate || 10430 #endif 10431 rack->r_ctl.init_rate) { 10432 /* We have a rate of some sort set */ 10433 uint32_t orig; 10434 10435 bw_est = rack_get_bw(rack); 10436 orig = rack->r_ctl.rc_pace_max_segs; 10437 rate_wanted = rack_get_output_bw(rack, bw_est, NULL); 10438 if (rate_wanted) { 10439 /* We have something */ 10440 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 10441 rate_wanted, 10442 ctf_fixed_maxseg(rack->rc_tp)); 10443 } else 10444 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 10445 if (orig != rack->r_ctl.rc_pace_max_segs) 10446 chged = 1; 10447 } else if ((rack->r_ctl.gp_bw == 0) && 10448 (rack->r_ctl.rc_pace_max_segs == 0)) { 10449 /* 10450 * If we have nothing limit us to bursting 10451 * out IW sized pieces. 
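 * That is, with no goodput measurement and no initial rate configured
 * we cap each pacing burst at the initial window returned by
 * rc_init_window().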
*/
10453 chged = 1;
10454 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
10455 }
10456 }
10457 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
10458 chged = 1;
10459 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
10460 }
10461 #ifdef KERN_TLS
10462 uint32_t orig;
10463
10464 if (tls_seg != 0) {
10465 orig = rack->r_ctl.rc_pace_max_segs;
10466 if (rack_hw_tls_max_seg > 1) {
10467 rack->r_ctl.rc_pace_max_segs /= tls_seg;
10468 if (rack_hw_tls_max_seg > rack->r_ctl.rc_pace_max_segs)
10469 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
10470 } else {
10471 rack->r_ctl.rc_pace_max_segs = 1;
10472 }
10473 if (rack->r_ctl.rc_pace_max_segs == 0)
10474 rack->r_ctl.rc_pace_max_segs = 1;
10475 rack->r_ctl.rc_pace_max_segs *= tls_seg;
10476 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
10477 /* We can't go over the max bytes (usually 64k) */
10478 rack->r_ctl.rc_pace_max_segs = ((PACE_MAX_IP_BYTES / tls_seg) * tls_seg);
10479 }
10480 if (orig != rack->r_ctl.rc_pace_max_segs)
10481 chged = 1;
10482 }
10483 #endif
10484 if (chged)
10485 rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2);
10486 }
10487
10488 static int
10489 rack_init(struct tcpcb *tp)
10490 {
10491 struct tcp_rack *rack = NULL;
10492 struct rack_sendmap *insret;
10493 uint32_t iwin, snt, us_cts;
10494
10495 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
10496 if (tp->t_fb_ptr == NULL) {
10497 /*
10498 * We need to allocate memory but can't. The INP and INP_INFO
10499 * locks are held and they are recursive (this happens during
10500 * setup), so a scheme to drop the locks fails :(
10501 *
10502 */
10503 return (ENOMEM);
10504 }
10505 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
10506
10507 rack = (struct tcp_rack *)tp->t_fb_ptr;
10508 RB_INIT(&rack->r_ctl.rc_mtree);
10509 TAILQ_INIT(&rack->r_ctl.rc_free);
10510 TAILQ_INIT(&rack->r_ctl.rc_tmap);
10511 rack->rc_tp = tp;
10512 if (tp->t_inpcb) {
10513 rack->rc_inp = tp->t_inpcb;
10514 }
10515 /* Probably not needed but lets be sure */
10516 rack_clear_rate_sample(rack);
10517 rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
10518 rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
10519 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
10520 if (use_rack_rr)
10521 rack->use_rack_rr = 1;
10522 if (V_tcp_delack_enabled)
10523 tp->t_delayed_ack = 1;
10524 else
10525 tp->t_delayed_ack = 0;
10526 if (rack_enable_shared_cwnd)
10527 rack->rack_enable_scwnd = 1;
10528 rack->rc_user_set_max_segs = rack_hptsi_segments;
10529 rack->rc_force_max_seg = 0;
10530 if (rack_use_imac_dack)
10531 rack->rc_dack_mode = 1;
10532 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
10533 rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
10534 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
10535 rack->r_ctl.rc_prop_rate = rack_proportional_rate;
10536 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
10537 rack->r_ctl.rc_early_recovery = rack_early_recovery;
10538 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
10539 rack->r_ctl.rc_highest_us_rtt = 0;
10540 if (rack_disable_prr)
10541 rack->rack_no_prr = 1;
10542 if (rack_gp_no_rec_chg)
10543 rack->rc_gp_no_rec_chg = 1;
10544 rack->rc_always_pace = rack_pace_every_seg;
10545 if (rack_enable_mqueue_for_nonpaced)
10546 rack->r_mbuf_queue = 1;
10547 else
10548 rack->r_mbuf_queue = 0;
10549 if (rack->r_mbuf_queue || rack->rc_always_pace)
10550 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
10551 else
10552 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
10553
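/* Establish the initial pacing segment bounds (rc_pace_min_segs and rc_pace_max_segs). */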
rack_set_pace_segments(tp, rack, __LINE__); 10554 if (rack_limits_scwnd) 10555 rack->r_limit_scw = 1; 10556 else 10557 rack->r_limit_scw = 0; 10558 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 10559 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 10560 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 10561 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 10562 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 10563 rack->r_ctl.rc_min_to = rack_min_to; 10564 microuptime(&rack->r_ctl.act_rcv_time); 10565 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10566 rack->r_running_late = 0; 10567 rack->r_running_early = 0; 10568 rack->rc_init_win = rack_default_init_window; 10569 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 10570 if (rack_do_dyn_mul) { 10571 /* When dynamic adjustment is on CA needs to start at 100% */ 10572 rack->rc_gp_dyn_mul = 1; 10573 if (rack_do_dyn_mul >= 100) 10574 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 10575 } else 10576 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 10577 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 10578 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 10579 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 10580 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 10581 rack_probertt_filter_life); 10582 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10583 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 10584 rack->r_ctl.rc_time_of_last_probertt = us_cts; 10585 rack->r_ctl.rc_time_probertt_starts = 0; 10586 /* Do we force on detection? */ 10587 #ifdef NETFLIX_EXP_DETECTION 10588 if (tcp_force_detection) 10589 rack->do_detection = 1; 10590 else 10591 #endif 10592 rack->do_detection = 0; 10593 if (rack_non_rxt_use_cr) 10594 rack->rack_rec_nonrxt_use_cr = 1; 10595 if (tp->snd_una != tp->snd_max) { 10596 /* Create a send map for the current outstanding data */ 10597 struct rack_sendmap *rsm; 10598 10599 rsm = rack_alloc(rack); 10600 if (rsm == NULL) { 10601 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10602 tp->t_fb_ptr = NULL; 10603 return (ENOMEM); 10604 } 10605 rsm->r_flags = RACK_OVERMAX; 10606 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 10607 rsm->r_rtr_cnt = 1; 10608 rsm->r_rtr_bytes = 0; 10609 rsm->r_start = tp->snd_una; 10610 rsm->r_end = tp->snd_max; 10611 rsm->usec_orig_send = us_cts; 10612 rsm->r_dupack = 0; 10613 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10614 #ifdef INVARIANTS 10615 if (insret != NULL) { 10616 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 10617 insret, rack, rsm); 10618 } 10619 #endif 10620 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10621 rsm->r_in_tmap = 1; 10622 } 10623 /* Cancel the GP measurement in progress */ 10624 tp->t_flags &= ~TF_GPUTINPROG; 10625 if (SEQ_GT(tp->snd_max, tp->iss)) 10626 snt = tp->snd_max - tp->iss; 10627 else 10628 snt = 0; 10629 iwin = rc_init_window(rack); 10630 if (snt < iwin) { 10631 /* We are not past the initial window 10632 * so we need to make sure cwnd is 10633 * correct. 10634 */ 10635 if (tp->snd_cwnd < iwin) 10636 tp->snd_cwnd = iwin; 10637 /* 10638 * If we are within the initial window 10639 * we want ssthresh to be unlimited. Setting 10640 * it to the rwnd (which the default stack does 10641 * and older racks) is not really a good idea 10642 * since we want to be in SS and grow both the 10643 * cwnd and the rwnd (via dynamic rwnd growth). 
If 10644 * we set it to the rwnd then as the peer grows its 10645 * rwnd we will be stuck in CA and never hit SS. 10646 * 10647 * Its far better to raise it up high (this takes the 10648 * risk that there as been a loss already, probably 10649 * we should have an indicator in all stacks of loss 10650 * but we don't), but considering the normal use this 10651 * is a risk worth taking. The consequences of not 10652 * hitting SS are far worse than going one more time 10653 * into it early on (before we have sent even a IW). 10654 * It is highly unlikely that we will have had a loss 10655 * before getting the IW out. 10656 */ 10657 tp->snd_ssthresh = 0xffffffff; 10658 } 10659 rack_stop_all_timers(tp); 10660 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10661 rack_log_rtt_shrinks(rack, us_cts, 0, 10662 __LINE__, RACK_RTTS_INIT); 10663 return (0); 10664 } 10665 10666 static int 10667 rack_handoff_ok(struct tcpcb *tp) 10668 { 10669 if ((tp->t_state == TCPS_CLOSED) || 10670 (tp->t_state == TCPS_LISTEN)) { 10671 /* Sure no problem though it may not stick */ 10672 return (0); 10673 } 10674 if ((tp->t_state == TCPS_SYN_SENT) || 10675 (tp->t_state == TCPS_SYN_RECEIVED)) { 10676 /* 10677 * We really don't know you have to get to ESTAB or beyond 10678 * to tell. 10679 */ 10680 return (EAGAIN); 10681 } 10682 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 10683 return (0); 10684 } 10685 /* 10686 * If we reach here we don't do SACK on this connection so we can 10687 * never do rack. 10688 */ 10689 return (EINVAL); 10690 } 10691 10692 static void 10693 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10694 { 10695 if (tp->t_fb_ptr) { 10696 struct tcp_rack *rack; 10697 struct rack_sendmap *rsm, *nrsm, *rm; 10698 10699 rack = (struct tcp_rack *)tp->t_fb_ptr; 10700 #ifdef NETFLIX_SHARED_CWND 10701 if (rack->r_ctl.rc_scw) { 10702 uint32_t limit; 10703 10704 if (rack->r_limit_scw) 10705 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 10706 else 10707 limit = 0; 10708 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 10709 rack->r_ctl.rc_scw_index, 10710 limit); 10711 rack->r_ctl.rc_scw = NULL; 10712 } 10713 #endif 10714 /* rack does not use force data but other stacks may clear it */ 10715 tp->t_flags &= ~TF_FORCEDATA; 10716 if (tp->t_inpcb) { 10717 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10718 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10719 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 10720 } 10721 #ifdef TCP_BLACKBOX 10722 tcp_log_flowend(tp); 10723 #endif 10724 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 10725 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10726 #ifdef INVARIANTS 10727 if (rm != rsm) { 10728 panic("At fini, rack:%p rsm:%p rm:%p", 10729 rack, rsm, rm); 10730 } 10731 #endif 10732 uma_zfree(rack_zone, rsm); 10733 } 10734 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10735 while (rsm) { 10736 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 10737 uma_zfree(rack_zone, rsm); 10738 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10739 } 10740 rack->rc_free_cnt = 0; 10741 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10742 tp->t_fb_ptr = NULL; 10743 } 10744 /* Cancel the GP measurement in progress */ 10745 tp->t_flags &= ~TF_GPUTINPROG; 10746 /* Make sure snd_nxt is correctly set */ 10747 tp->snd_nxt = tp->snd_max; 10748 } 10749 10750 10751 static void 10752 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 10753 { 10754 switch (tp->t_state) { 10755 case TCPS_SYN_SENT: 10756 rack->r_state = TCPS_SYN_SENT; 10757 
rack->r_substate = rack_do_syn_sent; 10758 break; 10759 case TCPS_SYN_RECEIVED: 10760 rack->r_state = TCPS_SYN_RECEIVED; 10761 rack->r_substate = rack_do_syn_recv; 10762 break; 10763 case TCPS_ESTABLISHED: 10764 rack_set_pace_segments(tp, rack, __LINE__); 10765 rack->r_state = TCPS_ESTABLISHED; 10766 rack->r_substate = rack_do_established; 10767 break; 10768 case TCPS_CLOSE_WAIT: 10769 rack->r_state = TCPS_CLOSE_WAIT; 10770 rack->r_substate = rack_do_close_wait; 10771 break; 10772 case TCPS_FIN_WAIT_1: 10773 rack->r_state = TCPS_FIN_WAIT_1; 10774 rack->r_substate = rack_do_fin_wait_1; 10775 break; 10776 case TCPS_CLOSING: 10777 rack->r_state = TCPS_CLOSING; 10778 rack->r_substate = rack_do_closing; 10779 break; 10780 case TCPS_LAST_ACK: 10781 rack->r_state = TCPS_LAST_ACK; 10782 rack->r_substate = rack_do_lastack; 10783 break; 10784 case TCPS_FIN_WAIT_2: 10785 rack->r_state = TCPS_FIN_WAIT_2; 10786 rack->r_substate = rack_do_fin_wait_2; 10787 break; 10788 case TCPS_LISTEN: 10789 case TCPS_CLOSED: 10790 case TCPS_TIME_WAIT: 10791 default: 10792 break; 10793 }; 10794 } 10795 10796 10797 static void 10798 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 10799 { 10800 /* 10801 * We received an ack, and then did not 10802 * call send or were bounced out due to the 10803 * hpts was running. Now a timer is up as well, is 10804 * it the right timer? 10805 */ 10806 struct rack_sendmap *rsm; 10807 int tmr_up; 10808 10809 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 10810 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 10811 return; 10812 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10813 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 10814 (tmr_up == PACE_TMR_RXT)) { 10815 /* Should be an RXT */ 10816 return; 10817 } 10818 if (rsm == NULL) { 10819 /* Nothing outstanding? */ 10820 if (tp->t_flags & TF_DELACK) { 10821 if (tmr_up == PACE_TMR_DELACK) 10822 /* We are supposed to have delayed ack up and we do */ 10823 return; 10824 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 10825 /* 10826 * if we hit enobufs then we would expect the possiblity 10827 * of nothing outstanding and the RXT up (and the hptsi timer). 10828 */ 10829 return; 10830 } else if (((V_tcp_always_keepalive || 10831 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 10832 (tp->t_state <= TCPS_CLOSING)) && 10833 (tmr_up == PACE_TMR_KEEP) && 10834 (tp->snd_max == tp->snd_una)) { 10835 /* We should have keep alive up and we do */ 10836 return; 10837 } 10838 } 10839 if (SEQ_GT(tp->snd_max, tp->snd_una) && 10840 ((tmr_up == PACE_TMR_TLP) || 10841 (tmr_up == PACE_TMR_RACK) || 10842 (tmr_up == PACE_TMR_RXT))) { 10843 /* 10844 * Either a Rack, TLP or RXT is fine if we 10845 * have outstanding data. 10846 */ 10847 return; 10848 } else if (tmr_up == PACE_TMR_DELACK) { 10849 /* 10850 * If the delayed ack was going to go off 10851 * before the rtx/tlp/rack timer were going to 10852 * expire, then that would be the timer in control. 10853 * Note we don't check the time here trusting the 10854 * code is correct. 10855 */ 10856 return; 10857 } 10858 /* 10859 * Ok the timer originally started is not what we want now. 10860 * We will force the hpts to be stopped if any, and restart 10861 * with the slot set to what was in the saved slot. 
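 * If a packet-output deadline was pending and is still in the future
 * we record how early we are in rc_agg_early before pulling the
 * connection off the hpts and restarting the timer.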
10862 */ 10863 if (rack->rc_inp->inp_in_hpts) { 10864 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 10865 uint32_t us_cts; 10866 10867 us_cts = tcp_get_usecs(NULL); 10868 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 10869 rack->r_early = 1; 10870 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 10871 } 10872 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 10873 } 10874 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 10875 } 10876 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10877 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10878 } 10879 10880 static int 10881 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 10882 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 10883 int32_t nxt_pkt, struct timeval *tv) 10884 { 10885 int32_t thflags, retval, did_out = 0; 10886 int32_t way_out = 0; 10887 uint32_t cts; 10888 uint32_t tiwin; 10889 struct timespec ts; 10890 struct tcpopt to; 10891 struct tcp_rack *rack; 10892 struct rack_sendmap *rsm; 10893 int32_t prev_state = 0; 10894 uint32_t us_cts; 10895 /* 10896 * tv passed from common code is from either M_TSTMP_LRO or 10897 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The 10898 * rack_pacing stack assumes tv always refers to 'now', so we overwrite 10899 * tv here to guarantee that. 10900 */ 10901 if (m->m_flags & M_TSTMP_LRO) 10902 tcp_get_usecs(tv); 10903 10904 cts = tcp_tv_to_mssectick(tv); 10905 rack = (struct tcp_rack *)tp->t_fb_ptr; 10906 10907 if ((m->m_flags & M_TSTMP) || 10908 (m->m_flags & M_TSTMP_LRO)) { 10909 mbuf_tstmp2timespec(m, &ts); 10910 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 10911 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 10912 } else 10913 rack->r_ctl.act_rcv_time = *tv; 10914 kern_prefetch(rack, &prev_state); 10915 prev_state = 0; 10916 thflags = th->th_flags; 10917 10918 NET_EPOCH_ASSERT(); 10919 INP_WLOCK_ASSERT(tp->t_inpcb); 10920 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 10921 __func__)); 10922 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 10923 __func__)); 10924 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 10925 union tcp_log_stackspecific log; 10926 struct timeval ltv; 10927 #ifdef NETFLIX_HTTP_LOGGING 10928 struct http_sendfile_track *http_req; 10929 10930 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10931 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 10932 } else { 10933 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 10934 } 10935 #endif 10936 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 10937 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 10938 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 10939 if (rack->rack_no_prr == 0) 10940 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 10941 else 10942 log.u_bbr.flex1 = 0; 10943 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 10944 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10945 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 10946 log.u_bbr.flex3 = m->m_flags; 10947 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 10948 if (m->m_flags & M_TSTMP) { 10949 /* Record the hardware timestamp if present */ 10950 mbuf_tstmp2timespec(m, &ts); 10951 ltv.tv_sec = ts.tv_sec; 10952 ltv.tv_usec = ts.tv_nsec / 1000; 10953 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 10954 } else if (m->m_flags & M_TSTMP_LRO) { 10955 /* Record the LRO the arrival timestamp */ 10956 mbuf_tstmp2timespec(m, &ts); 10957 ltv.tv_sec = ts.tv_sec; 10958 ltv.tv_usec = ts.tv_nsec / 1000; 10959 log.u_bbr.flex5 = 
tcp_tv_to_usectick(<v); 10960 } 10961 log.u_bbr.timeStamp = tcp_get_usecs(<v); 10962 /* Log the rcv time */ 10963 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 10964 #ifdef NETFLIX_HTTP_LOGGING 10965 log.u_bbr.applimited = tp->t_http_closed; 10966 log.u_bbr.applimited <<= 8; 10967 log.u_bbr.applimited |= tp->t_http_open; 10968 log.u_bbr.applimited <<= 8; 10969 log.u_bbr.applimited |= tp->t_http_req; 10970 if (http_req) { 10971 /* Copy out any client req info */ 10972 /* seconds */ 10973 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 10974 /* useconds */ 10975 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 10976 log.u_bbr.rttProp = http_req->timestamp; 10977 log.u_bbr.cur_del_rate = http_req->start; 10978 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 10979 log.u_bbr.flex8 |= 1; 10980 } else { 10981 log.u_bbr.flex8 |= 2; 10982 log.u_bbr.bw_inuse = http_req->end; 10983 } 10984 log.u_bbr.flex6 = http_req->start_seq; 10985 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 10986 log.u_bbr.flex8 |= 4; 10987 log.u_bbr.epoch = http_req->end_seq; 10988 } 10989 } 10990 #endif 10991 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 10992 tlen, &log, true, <v); 10993 } 10994 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 10995 way_out = 4; 10996 retval = 0; 10997 goto done_with_input; 10998 } 10999 /* 11000 * If a segment with the ACK-bit set arrives in the SYN-SENT state 11001 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 11002 */ 11003 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 11004 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 11005 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11006 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11007 return(1); 11008 } 11009 /* 11010 * Segment received on connection. Reset idle time and keep-alive 11011 * timer. XXX: This should be done after segment validation to 11012 * ignore broken/spoofed segs. 11013 */ 11014 if (tp->t_idle_reduce && 11015 (tp->snd_max == tp->snd_una) && 11016 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 11017 counter_u64_add(rack_input_idle_reduces, 1); 11018 rack_cc_after_idle(rack, tp); 11019 } 11020 tp->t_rcvtime = ticks; 11021 /* 11022 * Unscale the window into a 32-bit value. For the SYN_SENT state 11023 * the scale is zero. 11024 */ 11025 tiwin = th->th_win << tp->snd_scale; 11026 #ifdef STATS 11027 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 11028 #endif 11029 if (tiwin > rack->r_ctl.rc_high_rwnd) 11030 rack->r_ctl.rc_high_rwnd = tiwin; 11031 /* 11032 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 11033 * this to occur after we've validated the segment. 11034 */ 11035 if (tp->t_flags2 & TF2_ECN_PERMIT) { 11036 if (thflags & TH_CWR) { 11037 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 11038 tp->t_flags |= TF_ACKNOW; 11039 } 11040 switch (iptos & IPTOS_ECN_MASK) { 11041 case IPTOS_ECN_CE: 11042 tp->t_flags2 |= TF2_ECN_SND_ECE; 11043 KMOD_TCPSTAT_INC(tcps_ecn_ce); 11044 break; 11045 case IPTOS_ECN_ECT0: 11046 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 11047 break; 11048 case IPTOS_ECN_ECT1: 11049 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 11050 break; 11051 } 11052 11053 /* Process a packet differently from RFC3168. */ 11054 cc_ecnpkt_handler(tp, th, iptos); 11055 11056 /* Congestion experienced. */ 11057 if (thflags & TH_ECE) { 11058 rack_cong_signal(tp, th, CC_ECN); 11059 } 11060 } 11061 /* 11062 * Parse options on any incoming segment. 
11063 */ 11064 tcp_dooptions(&to, (u_char *)(th + 1), 11065 (th->th_off << 2) - sizeof(struct tcphdr), 11066 (thflags & TH_SYN) ? TO_SYN : 0); 11067 11068 /* 11069 * If echoed timestamp is later than the current time, fall back to 11070 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 11071 * were used when this connection was established. 11072 */ 11073 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 11074 to.to_tsecr -= tp->ts_offset; 11075 if (TSTMP_GT(to.to_tsecr, cts)) 11076 to.to_tsecr = 0; 11077 } 11078 11079 /* 11080 * If its the first time in we need to take care of options and 11081 * verify we can do SACK for rack! 11082 */ 11083 if (rack->r_state == 0) { 11084 /* Should be init'd by rack_init() */ 11085 KASSERT(rack->rc_inp != NULL, 11086 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 11087 if (rack->rc_inp == NULL) { 11088 rack->rc_inp = tp->t_inpcb; 11089 } 11090 11091 /* 11092 * Process options only when we get SYN/ACK back. The SYN 11093 * case for incoming connections is handled in tcp_syncache. 11094 * According to RFC1323 the window field in a SYN (i.e., a 11095 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 11096 * this is traditional behavior, may need to be cleaned up. 11097 */ 11098 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 11099 /* Handle parallel SYN for ECN */ 11100 if (!(thflags & TH_ACK) && 11101 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 11102 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 11103 tp->t_flags2 |= TF2_ECN_PERMIT; 11104 tp->t_flags2 |= TF2_ECN_SND_ECE; 11105 TCPSTAT_INC(tcps_ecn_shs); 11106 } 11107 if ((to.to_flags & TOF_SCALE) && 11108 (tp->t_flags & TF_REQ_SCALE)) { 11109 tp->t_flags |= TF_RCVD_SCALE; 11110 tp->snd_scale = to.to_wscale; 11111 } else 11112 tp->t_flags &= ~TF_REQ_SCALE; 11113 /* 11114 * Initial send window. It will be updated with the 11115 * next incoming segment to the scaled value. 11116 */ 11117 tp->snd_wnd = th->th_win; 11118 if ((to.to_flags & TOF_TS) && 11119 (tp->t_flags & TF_REQ_TSTMP)) { 11120 tp->t_flags |= TF_RCVD_TSTMP; 11121 tp->ts_recent = to.to_tsval; 11122 tp->ts_recent_age = cts; 11123 } else 11124 tp->t_flags &= ~TF_REQ_TSTMP; 11125 if (to.to_flags & TOF_MSS) 11126 tcp_mss(tp, to.to_mss); 11127 if ((tp->t_flags & TF_SACK_PERMIT) && 11128 (to.to_flags & TOF_SACKPERM) == 0) 11129 tp->t_flags &= ~TF_SACK_PERMIT; 11130 if (IS_FASTOPEN(tp->t_flags)) { 11131 if (to.to_flags & TOF_FASTOPEN) { 11132 uint16_t mss; 11133 11134 if (to.to_flags & TOF_MSS) 11135 mss = to.to_mss; 11136 else 11137 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 11138 mss = TCP6_MSS; 11139 else 11140 mss = TCP_MSS; 11141 tcp_fastopen_update_cache(tp, mss, 11142 to.to_tfo_len, to.to_tfo_cookie); 11143 } else 11144 tcp_fastopen_disable_path(tp); 11145 } 11146 } 11147 /* 11148 * At this point we are at the initial call. Here we decide 11149 * if we are doing RACK or not. We do this by seeing if 11150 * TF_SACK_PERMIT is set and the sack-not-required is clear. 11151 * The code now does do dup-ack counting so if you don't 11152 * switch back you won't get rack & TLP, but you will still 11153 * get this stack. 
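 * If SACK was not negotiated (and rack_sack_not_required is not set)
 * we hand the connection back to the default stack below.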
11154 */ 11155 11156 if ((rack_sack_not_required == 0) && 11157 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 11158 tcp_switch_back_to_default(tp); 11159 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11160 tlen, iptos); 11161 return (1); 11162 } 11163 /* Set the flag */ 11164 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11165 tcp_set_hpts(tp->t_inpcb); 11166 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 11167 } 11168 if (thflags & TH_FIN) 11169 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11170 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11171 if ((rack->rc_gp_dyn_mul) && 11172 (rack->use_fixed_rate == 0) && 11173 (rack->rc_always_pace)) { 11174 /* Check in on probertt */ 11175 rack_check_probe_rtt(rack, us_cts); 11176 } 11177 if (rack->forced_ack) { 11178 uint32_t us_rtt; 11179 11180 /* 11181 * A persist or keep-alive was forced out, update our 11182 * min rtt time. Note we do not worry about lost 11183 * retransmissions since KEEP-ALIVES and persists 11184 * are usually way long on times of sending (though 11185 * if we were really paranoid or worried we could 11186 * at least use timestamps if available to validate). 11187 */ 11188 rack->forced_ack = 0; 11189 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 11190 if (us_rtt == 0) 11191 us_rtt = 1; 11192 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 11193 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 11194 } 11195 /* 11196 * This is the one exception case where we set the rack state 11197 * always. All other times (timers etc) we must have a rack-state 11198 * set (so we assure we have done the checks above for SACK). 11199 */ 11200 rack->r_ctl.rc_rcvtime = cts; 11201 if (rack->r_state != tp->t_state) 11202 rack_set_state(tp, rack); 11203 if (SEQ_GT(th->th_ack, tp->snd_una) && 11204 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 11205 kern_prefetch(rsm, &prev_state); 11206 prev_state = rack->r_state; 11207 rack_clear_rate_sample(rack); 11208 retval = (*rack->r_substate) (m, th, so, 11209 tp, &to, drop_hdrlen, 11210 tlen, tiwin, thflags, nxt_pkt, iptos); 11211 #ifdef INVARIANTS 11212 if ((retval == 0) && 11213 (tp->t_inpcb == NULL)) { 11214 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11215 retval, tp, prev_state); 11216 } 11217 #endif 11218 if (retval == 0) { 11219 /* 11220 * If retval is 1 the tcb is unlocked and most likely the tp 11221 * is gone. 11222 */ 11223 INP_WLOCK_ASSERT(tp->t_inpcb); 11224 if ((rack->rc_gp_dyn_mul) && 11225 (rack->rc_always_pace) && 11226 (rack->use_fixed_rate == 0) && 11227 rack->in_probe_rtt && 11228 (rack->r_ctl.rc_time_probertt_starts == 0)) { 11229 /* 11230 * If we are going for target, lets recheck before 11231 * we output. 11232 */ 11233 rack_check_probe_rtt(rack, us_cts); 11234 } 11235 if (rack->set_pacing_done_a_iw == 0) { 11236 /* How much has been acked? 
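 * Once roughly ten full-size segments beyond the ISS have been acked
 * we consider the initial window sent and recompute the pacing
 * segment sizes.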
*/ 11237 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 11238 /* We have enough to set in the pacing segment size */ 11239 rack->set_pacing_done_a_iw = 1; 11240 rack_set_pace_segments(tp, rack, __LINE__); 11241 } 11242 } 11243 tcp_rack_xmit_timer_commit(rack, tp); 11244 if (nxt_pkt == 0) { 11245 if (rack->r_wanted_output != 0) { 11246 do_output_now: 11247 did_out = 1; 11248 (void)tp->t_fb->tfb_tcp_output(tp); 11249 } 11250 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 11251 } 11252 if ((nxt_pkt == 0) && 11253 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11254 (SEQ_GT(tp->snd_max, tp->snd_una) || 11255 (tp->t_flags & TF_DELACK) || 11256 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11257 (tp->t_state <= TCPS_CLOSING)))) { 11258 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 11259 if ((tp->snd_max == tp->snd_una) && 11260 ((tp->t_flags & TF_DELACK) == 0) && 11261 (rack->rc_inp->inp_in_hpts) && 11262 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11263 /* keep alive not needed if we are hptsi output yet */ 11264 ; 11265 } else { 11266 int late = 0; 11267 if (rack->rc_inp->inp_in_hpts) { 11268 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 11269 us_cts = tcp_get_usecs(NULL); 11270 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11271 rack->r_early = 1; 11272 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 11273 } else 11274 late = 1; 11275 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11276 } 11277 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11278 } 11279 if (late && (did_out == 0)) { 11280 /* 11281 * We are late in the sending 11282 * and we did not call the output 11283 * (this probably should not happen). 11284 */ 11285 goto do_output_now; 11286 } 11287 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 11288 } 11289 way_out = 1; 11290 } else if (nxt_pkt == 0) { 11291 /* Do we have the correct timer running? */ 11292 rack_timer_audit(tp, rack, &so->so_snd); 11293 way_out = 2; 11294 } 11295 done_with_input: 11296 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 11297 if (did_out) 11298 rack->r_wanted_output = 0; 11299 #ifdef INVARIANTS 11300 if (tp->t_inpcb == NULL) { 11301 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11302 did_out, 11303 retval, tp, prev_state); 11304 } 11305 #endif 11306 } 11307 return (retval); 11308 } 11309 11310 void 11311 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11312 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11313 { 11314 struct timeval tv; 11315 11316 /* First lets see if we have old packets */ 11317 if (tp->t_in_pkt) { 11318 if (ctf_do_queued_segments(so, tp, 1)) { 11319 m_freem(m); 11320 return; 11321 } 11322 } 11323 if (m->m_flags & M_TSTMP_LRO) { 11324 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 11325 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 11326 } else { 11327 /* Should not be should we kassert instead? 
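 * If no LRO arrival timestamp is present we simply fall back to the
 * current time.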
*/ 11328 tcp_get_usecs(&tv); 11329 } 11330 if(rack_do_segment_nounlock(m, th, so, tp, 11331 drop_hdrlen, tlen, iptos, 0, &tv) == 0) 11332 INP_WUNLOCK(tp->t_inpcb); 11333 } 11334 11335 struct rack_sendmap * 11336 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 11337 { 11338 struct rack_sendmap *rsm = NULL; 11339 int32_t idx; 11340 uint32_t srtt = 0, thresh = 0, ts_low = 0; 11341 11342 /* Return the next guy to be re-transmitted */ 11343 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 11344 return (NULL); 11345 } 11346 if (tp->t_flags & TF_SENTFIN) { 11347 /* retran the end FIN? */ 11348 return (NULL); 11349 } 11350 /* ok lets look at this one */ 11351 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11352 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 11353 goto check_it; 11354 } 11355 rsm = rack_find_lowest_rsm(rack); 11356 if (rsm == NULL) { 11357 return (NULL); 11358 } 11359 check_it: 11360 if (rsm->r_flags & RACK_ACKED) { 11361 return (NULL); 11362 } 11363 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 11364 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 11365 /* Its not yet ready */ 11366 return (NULL); 11367 } 11368 srtt = rack_grab_rtt(tp, rack); 11369 idx = rsm->r_rtr_cnt - 1; 11370 ts_low = rsm->r_tim_lastsent[idx]; 11371 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 11372 if ((tsused == ts_low) || 11373 (TSTMP_LT(tsused, ts_low))) { 11374 /* No time since sending */ 11375 return (NULL); 11376 } 11377 if ((tsused - ts_low) < thresh) { 11378 /* It has not been long enough yet */ 11379 return (NULL); 11380 } 11381 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11382 ((rsm->r_flags & RACK_SACK_PASSED) && 11383 (rack->sack_attack_disable == 0))) { 11384 /* 11385 * We have passed the dup-ack threshold <or> 11386 * a SACK has indicated this is missing. 11387 * Note that if you are a declared attacker 11388 * it is only the dup-ack threshold that 11389 * will cause retransmits. 
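 * (sack_attack_disable is non-zero once this peer has been flagged as
 * a possible SACK attacker, which shuts off the SACK_PASSED path.)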
11390 */ 11391 /* log retransmit reason */ 11392 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 11393 return (rsm); 11394 } 11395 return (NULL); 11396 } 11397 11398 static void 11399 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 11400 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 11401 int line, struct rack_sendmap *rsm) 11402 { 11403 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11404 union tcp_log_stackspecific log; 11405 struct timeval tv; 11406 11407 memset(&log, 0, sizeof(log)); 11408 log.u_bbr.flex1 = slot; 11409 log.u_bbr.flex2 = len; 11410 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 11411 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 11412 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 11413 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 11414 log.u_bbr.use_lt_bw = rack->app_limited_needs_set; 11415 log.u_bbr.use_lt_bw <<= 1; 11416 log.u_bbr.use_lt_bw = rack->rc_gp_filled; 11417 log.u_bbr.use_lt_bw <<= 1; 11418 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 11419 log.u_bbr.use_lt_bw <<= 1; 11420 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 11421 log.u_bbr.pkt_epoch = line; 11422 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 11423 log.u_bbr.bw_inuse = bw_est; 11424 log.u_bbr.delRate = bw; 11425 if (rack->r_ctl.gp_bw == 0) 11426 log.u_bbr.cur_del_rate = 0; 11427 else 11428 log.u_bbr.cur_del_rate = rack_get_bw(rack); 11429 log.u_bbr.rttProp = len_time; 11430 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 11431 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 11432 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 11433 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 11434 /* We are in slow start */ 11435 log.u_bbr.flex7 = 1; 11436 } else { 11437 /* we are on congestion avoidance */ 11438 log.u_bbr.flex7 = 0; 11439 } 11440 log.u_bbr.flex8 = method; 11441 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11442 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11443 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 11444 log.u_bbr.cwnd_gain <<= 1; 11445 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 11446 log.u_bbr.cwnd_gain <<= 1; 11447 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 11448 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11449 &rack->rc_inp->inp_socket->so_rcv, 11450 &rack->rc_inp->inp_socket->so_snd, 11451 BBR_LOG_HPTSI_CALC, 0, 11452 0, &log, false, &tv); 11453 } 11454 } 11455 11456 static uint32_t 11457 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 11458 { 11459 uint32_t new_tso, user_max; 11460 11461 user_max = rack->rc_user_set_max_segs * mss; 11462 if (rack->rc_force_max_seg) { 11463 return (user_max); 11464 } 11465 if (rack->use_fixed_rate && 11466 ((rack->r_ctl.crte == NULL) || 11467 (bw != rack->r_ctl.crte->rate))) { 11468 /* Use the user mss since we are not exactly matched */ 11469 return (user_max); 11470 } 11471 new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 11472 if (new_tso > user_max) 11473 new_tso = user_max; 11474 return(new_tso); 11475 } 11476 11477 static void 11478 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, 11479 uint64_t rate, uint64_t hw_rate, int line, 11480 int error) 11481 { 11482 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11483 union tcp_log_stackspecific log; 11484 struct timeval tv; 11485 11486 memset(&log, 0, sizeof(log)); 11487 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 11488 log.u_bbr.flex2 = (hw_rate & 
0x00000000ffffffff); 11489 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 11490 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 11491 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11492 log.u_bbr.bw_inuse = rate; 11493 log.u_bbr.flex5 = line; 11494 log.u_bbr.flex6 = error; 11495 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 11496 log.u_bbr.flex8 = rack->use_fixed_rate; 11497 log.u_bbr.flex8 <<= 1; 11498 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 11499 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 11500 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11501 &rack->rc_inp->inp_socket->so_rcv, 11502 &rack->rc_inp->inp_socket->so_snd, 11503 BBR_LOG_HDWR_PACE, 0, 11504 0, &log, false, &tv); 11505 } 11506 } 11507 11508 static int32_t 11509 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) 11510 { 11511 uint64_t lentim, fill_bw; 11512 11513 /* Lets first see if we are full, if so continue with normal rate */ 11514 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 11515 return (slot); 11516 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 11517 return (slot); 11518 if (rack->r_ctl.rc_last_us_rtt == 0) 11519 return (slot); 11520 if (rack->rc_pace_fill_if_rttin_range && 11521 (rack->r_ctl.rc_last_us_rtt >= 11522 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 11523 /* The rtt is huge, N * smallest, lets not fill */ 11524 return (slot); 11525 } 11526 /* 11527 * first lets calculate the b/w based on the last us-rtt 11528 * and the sndwnd. 11529 */ 11530 fill_bw = rack->r_ctl.cwnd_to_use; 11531 /* Take the rwnd if its smaller */ 11532 if (fill_bw > rack->rc_tp->snd_wnd) 11533 fill_bw = rack->rc_tp->snd_wnd; 11534 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 11535 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 11536 /* We are below the min b/w */ 11537 if (fill_bw < RACK_MIN_BW) 11538 return (slot); 11539 /* 11540 * Ok fill_bw holds our mythical b/w to fill the cwnd 11541 * in a rtt, what does that time wise equate too? 11542 */ 11543 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 11544 lentim /= fill_bw; 11545 if (lentim < slot) { 11546 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 11547 0, lentim, 12, __LINE__, NULL); 11548 return ((int32_t)lentim); 11549 } else 11550 return (slot); 11551 } 11552 11553 static int32_t 11554 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 11555 { 11556 struct rack_sendmap *lrsm; 11557 int32_t slot = 0; 11558 int err; 11559 11560 if (rack->rc_always_pace == 0) { 11561 /* 11562 * We use the most optimistic possible cwnd/srtt for 11563 * sending calculations. This will make our 11564 * calculation anticipate getting more through 11565 * quicker then possible. But thats ok we don't want 11566 * the peer to have a gap in data sending. 11567 */ 11568 uint32_t srtt, cwnd, tr_perms = 0; 11569 int32_t reduce = 0; 11570 11571 old_method: 11572 /* 11573 * We keep no precise pacing with the old method 11574 * instead we use the pacer to mitigate bursts. 
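 * The slot below is just len divided by an approximate per-ms rate
 * (cwnd/srtt), optionally trimmed by rack_slot_reduction so the pipe
 * does not run dry, and then converted to microseconds.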
11575 */ 11576 rack->r_ctl.rc_agg_delayed = 0; 11577 rack->r_early = 0; 11578 rack->r_late = 0; 11579 rack->r_ctl.rc_agg_early = 0; 11580 if (rack->r_ctl.rc_rack_min_rtt) 11581 srtt = rack->r_ctl.rc_rack_min_rtt; 11582 else 11583 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 11584 if (rack->r_ctl.rc_rack_largest_cwnd) 11585 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 11586 else 11587 cwnd = rack->r_ctl.cwnd_to_use; 11588 tr_perms = cwnd / srtt; 11589 if (tr_perms == 0) { 11590 tr_perms = ctf_fixed_maxseg(tp); 11591 } 11592 /* 11593 * Calculate how long this will take to drain, if 11594 * the calculation comes out to zero, thats ok we 11595 * will use send_a_lot to possibly spin around for 11596 * more increasing tot_len_this_send to the point 11597 * that its going to require a pace, or we hit the 11598 * cwnd. Which in that case we are just waiting for 11599 * a ACK. 11600 */ 11601 slot = len / tr_perms; 11602 /* Now do we reduce the time so we don't run dry? */ 11603 if (slot && rack_slot_reduction) { 11604 reduce = (slot / rack_slot_reduction); 11605 if (reduce < slot) { 11606 slot -= reduce; 11607 } else 11608 slot = 0; 11609 } 11610 slot *= HPTS_USEC_IN_MSEC; 11611 if (rsm == NULL) { 11612 /* 11613 * We always consider ourselves app limited with old style 11614 * that are not retransmits. This could be the initial 11615 * measurement, but thats ok its all setup and specially 11616 * handled. If another send leaks out, then that too will 11617 * be mark app-limited. 11618 */ 11619 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11620 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { 11621 rack->r_ctl.rc_first_appl = lrsm; 11622 lrsm->r_flags |= RACK_APP_LIMITED; 11623 rack->r_ctl.rc_app_limited_cnt++; 11624 } 11625 } 11626 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); 11627 } else { 11628 uint64_t bw_est, res, lentim, rate_wanted; 11629 uint32_t orig_val, srtt, segs, oh; 11630 11631 if ((rack->r_rr_config == 1) && rsm) { 11632 return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); 11633 } 11634 if (rack->use_fixed_rate) { 11635 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 11636 } else if ((rack->r_ctl.init_rate == 0) && 11637 #ifdef NETFLIX_PEAKRATE 11638 (rack->rc_tp->t_maxpeakrate == 0) && 11639 #endif 11640 (rack->r_ctl.gp_bw == 0)) { 11641 /* no way to yet do an estimate */ 11642 bw_est = rate_wanted = 0; 11643 } else { 11644 bw_est = rack_get_bw(rack); 11645 rate_wanted = rack_get_output_bw(rack, bw_est, rsm); 11646 } 11647 if ((bw_est == 0) || (rate_wanted == 0)) { 11648 /* 11649 * No way yet to make a b/w estimate or 11650 * our raise is set incorrectly. 11651 */ 11652 goto old_method; 11653 } 11654 /* We need to account for all the overheads */ 11655 segs = (len + segsiz - 1) / segsiz; 11656 /* 11657 * We need the diff between 1514 bytes (e-mtu with e-hdr) 11658 * and how much data we put in each packet. Yes this 11659 * means we may be off if we are larger than 1500 bytes 11660 * or smaller. But this just makes us more conservative. 
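 * For example, with a 1448 byte payload per segment the overhead
 * charged below would be 1514 - 1448 = 66 bytes per segment.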
11661 */ 11662 if (ETHERNET_SEGMENT_SIZE > segsiz) 11663 oh = ETHERNET_SEGMENT_SIZE - segsiz; 11664 else 11665 oh = 0; 11666 segs *= oh; 11667 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 11668 res = lentim / rate_wanted; 11669 slot = (uint32_t)res; 11670 orig_val = rack->r_ctl.rc_pace_max_segs; 11671 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11672 #ifdef KERN_TLS 11673 /* For TLS we need to override this, possibly */ 11674 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 11675 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11676 } 11677 #endif 11678 /* Did we change the TSO size, if so log it */ 11679 if (rack->r_ctl.rc_pace_max_segs != orig_val) 11680 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 11681 if ((rack->rc_pace_to_cwnd) && 11682 (rack->in_probe_rtt == 0) && 11683 (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 11684 /* 11685 * We want to pace at our rate *or* faster to 11686 * fill the cwnd to the max if its not full. 11687 */ 11688 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); 11689 } 11690 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 11691 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 11692 if ((rack->rack_hdw_pace_ena) && 11693 (rack->rack_hdrw_pacing == 0) && 11694 (rack->rack_attempt_hdwr_pace == 0)) { 11695 /* 11696 * Lets attempt to turn on hardware pacing 11697 * if we can. 11698 */ 11699 rack->rack_attempt_hdwr_pace = 1; 11700 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 11701 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11702 rate_wanted, 11703 RS_PACING_GEQ, 11704 &err); 11705 if (rack->r_ctl.crte) { 11706 rack->rack_hdrw_pacing = 1; 11707 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz, 11708 0, rack->r_ctl.crte, 11709 NULL); 11710 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11711 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11712 err); 11713 } 11714 } else if (rack->rack_hdrw_pacing && 11715 (rack->r_ctl.crte->rate != rate_wanted)) { 11716 /* Do we need to adjust our rate? */ 11717 const struct tcp_hwrate_limit_table *nrte; 11718 11719 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 11720 rack->rc_tp, 11721 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11722 rate_wanted, 11723 RS_PACING_GEQ, 11724 &err); 11725 if (nrte == NULL) { 11726 /* Lost the rate */ 11727 rack->rack_hdrw_pacing = 0; 11728 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11729 } else if (nrte != rack->r_ctl.crte) { 11730 rack->r_ctl.crte = nrte; 11731 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, 11732 segsiz, 0, 11733 rack->r_ctl.crte, 11734 NULL); 11735 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11736 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11737 err); 11738 } 11739 11740 } 11741 } 11742 if (rack_limit_time_with_srtt && 11743 (rack->use_fixed_rate == 0) && 11744 #ifdef NETFLIX_PEAKRATE 11745 (rack->rc_tp->t_maxpeakrate == 0) && 11746 #endif 11747 (rack->rack_hdrw_pacing == 0)) { 11748 /* 11749 * Sanity check, we do not allow the pacing delay 11750 * to be longer than the SRTT of the path. If it is 11751 * a slow path, then adding a packet should increase 11752 * the RTT and compensate for this i.e. the srtt will 11753 * be greater so the allowed pacing time will be greater. 11754 * 11755 * Note this restriction is not for where a peak rate 11756 * is set, we are doing fixed pacing or hardware pacing. 
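 * In short, the pacing delay below is clamped to at most one SRTT
 * (or the initial RTO if we have no SRTT sample yet).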
11757 */ 11758 if (rack->rc_tp->t_srtt) 11759 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 11760 else 11761 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 11762 if (srtt < slot) { 11763 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 11764 slot = srtt; 11765 } 11766 } 11767 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 11768 } 11769 if (slot) 11770 counter_u64_add(rack_calc_nonzero, 1); 11771 else 11772 counter_u64_add(rack_calc_zero, 1); 11773 return (slot); 11774 } 11775 11776 static void 11777 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 11778 tcp_seq startseq, uint32_t sb_offset) 11779 { 11780 struct rack_sendmap *my_rsm = NULL; 11781 struct rack_sendmap fe; 11782 11783 if (tp->t_state < TCPS_ESTABLISHED) { 11784 /* 11785 * We don't start any measurements if we are 11786 * not at least established. 11787 */ 11788 return; 11789 } 11790 tp->t_flags |= TF_GPUTINPROG; 11791 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 11792 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 11793 tp->gput_seq = startseq; 11794 rack->app_limited_needs_set = 0; 11795 if (rack->in_probe_rtt) 11796 rack->measure_saw_probe_rtt = 1; 11797 else if ((rack->measure_saw_probe_rtt) && 11798 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 11799 rack->measure_saw_probe_rtt = 0; 11800 if (rack->rc_gp_filled) 11801 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11802 else { 11803 /* Special case initial measurement */ 11804 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); 11805 } 11806 /* 11807 * We take a guess out into the future, 11808 * if we have no measurement and no 11809 * initial rate, we measure the first 11810 * initial-windows worth of data to 11811 * speed up getting some GP measurement and 11812 * thus start pacing. 11813 */ 11814 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 11815 rack->app_limited_needs_set = 1; 11816 tp->gput_ack = startseq + max(rc_init_window(rack), 11817 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 11818 rack_log_pacing_delay_calc(rack, 11819 tp->gput_seq, 11820 tp->gput_ack, 11821 0, 11822 tp->gput_ts, 11823 rack->r_ctl.rc_app_limited_cnt, 11824 9, 11825 __LINE__, NULL); 11826 return; 11827 } 11828 if (sb_offset) { 11829 /* 11830 * We are out somewhere in the sb 11831 * can we use the already outstanding data? 11832 */ 11833 11834 if (rack->r_ctl.rc_app_limited_cnt == 0) { 11835 /* 11836 * Yes first one is good and in this case 11837 * the tp->gput_ts is correctly set based on 11838 * the last ack that arrived (no need to 11839 * set things up when an ack comes in). 11840 */ 11841 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11842 if ((my_rsm == NULL) || 11843 (my_rsm->r_rtr_cnt != 1)) { 11844 /* retransmission? */ 11845 goto use_latest; 11846 } 11847 } else { 11848 if (rack->r_ctl.rc_first_appl == NULL) { 11849 /* 11850 * If rc_first_appl is NULL 11851 * then the cnt should be 0. 11852 * This is probably an error, maybe 11853 * a KASSERT would be approprate. 11854 */ 11855 goto use_latest; 11856 } 11857 /* 11858 * If we have a marker pointer to the last one that is 11859 * app limited we can use that, but we need to set 11860 * things up so that when it gets ack'ed we record 11861 * the ack time (if its not already acked). 11862 */ 11863 rack->app_limited_needs_set = 1; 11864 /* 11865 * We want to get to the rsm that is either 11866 * next with space i.e. 
over 1 MSS or the one 11867 * after that (after the app-limited). 11868 */ 11869 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11870 rack->r_ctl.rc_first_appl); 11871 if (my_rsm) { 11872 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 11873 /* Have to use the next one */ 11874 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11875 my_rsm); 11876 else { 11877 /* Use after the first MSS of it is acked */ 11878 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 11879 goto start_set; 11880 } 11881 } 11882 if ((my_rsm == NULL) || 11883 (my_rsm->r_rtr_cnt != 1)) { 11884 /* 11885 * Either its a retransmit or 11886 * the last is the app-limited one. 11887 */ 11888 goto use_latest; 11889 } 11890 } 11891 tp->gput_seq = my_rsm->r_start; 11892 start_set: 11893 if (my_rsm->r_flags & RACK_ACKED) { 11894 /* 11895 * This one has been acked use the arrival ack time 11896 */ 11897 tp->gput_ts = my_rsm->r_ack_arrival; 11898 rack->app_limited_needs_set = 0; 11899 } 11900 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11901 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 11902 rack_log_pacing_delay_calc(rack, 11903 tp->gput_seq, 11904 tp->gput_ack, 11905 (uint64_t)my_rsm, 11906 tp->gput_ts, 11907 rack->r_ctl.rc_app_limited_cnt, 11908 9, 11909 __LINE__, NULL); 11910 return; 11911 } 11912 11913 use_latest: 11914 /* 11915 * We don't know how long we may have been 11916 * idle or if this is the first-send. Lets 11917 * setup the flag so we will trim off 11918 * the first ack'd data so we get a true 11919 * measurement. 11920 */ 11921 rack->app_limited_needs_set = 1; 11922 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 11923 /* Find this guy so we can pull the send time */ 11924 fe.r_start = startseq; 11925 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 11926 if (my_rsm) { 11927 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11928 if (my_rsm->r_flags & RACK_ACKED) { 11929 /* 11930 * Unlikely since its probably what was 11931 * just transmitted (but I am paranoid). 11932 */ 11933 tp->gput_ts = my_rsm->r_ack_arrival; 11934 rack->app_limited_needs_set = 0; 11935 } 11936 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 11937 /* This also is unlikely */ 11938 tp->gput_seq = my_rsm->r_start; 11939 } 11940 } else { 11941 /* 11942 * TSNH unless we have some send-map limit, 11943 * and even at that it should not be hitting 11944 * that limit (we should have stopped sending). 11945 */ 11946 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 11947 } 11948 rack_log_pacing_delay_calc(rack, 11949 tp->gput_seq, 11950 tp->gput_ack, 11951 (uint64_t)my_rsm, 11952 tp->gput_ts, 11953 rack->r_ctl.rc_app_limited_cnt, 11954 9, __LINE__, NULL); 11955 } 11956 11957 static inline uint32_t 11958 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 11959 uint32_t avail, int32_t sb_offset) 11960 { 11961 uint32_t len; 11962 uint32_t sendwin; 11963 11964 if (tp->snd_wnd > cwnd_to_use) 11965 sendwin = cwnd_to_use; 11966 else 11967 sendwin = tp->snd_wnd; 11968 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11969 /* We never want to go over our peers rcv-window */ 11970 len = 0; 11971 } else { 11972 uint32_t flight; 11973 11974 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 11975 if (flight >= sendwin) { 11976 /* 11977 * We have in flight what we are allowed by cwnd (if 11978 * it was rwnd blocking it would have hit above out 11979 * >= tp->snd_wnd). 
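			 *
			 * In other words we are cwnd (or shared-cwnd)
			 * limited here, so report that nothing can be
			 * sent right now.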
11980 */ 11981 return (0); 11982 } 11983 len = sendwin - flight; 11984 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11985 /* We would send too much (beyond the rwnd) */ 11986 len = tp->snd_wnd - ctf_outstanding(tp); 11987 } 11988 if ((len + sb_offset) > avail) { 11989 /* 11990 * We don't have that much in the SB, how much is 11991 * there? 11992 */ 11993 len = avail - sb_offset; 11994 } 11995 } 11996 return (len); 11997 } 11998 11999 static int 12000 rack_output(struct tcpcb *tp) 12001 { 12002 struct socket *so; 12003 uint32_t recwin; 12004 uint32_t sb_offset; 12005 int32_t len, flags, error = 0; 12006 struct mbuf *m; 12007 struct mbuf *mb; 12008 uint32_t if_hw_tsomaxsegcount = 0; 12009 uint32_t if_hw_tsomaxsegsize; 12010 int32_t segsiz, minseg; 12011 long tot_len_this_send = 0; 12012 struct ip *ip = NULL; 12013 #ifdef TCPDEBUG 12014 struct ipovly *ipov = NULL; 12015 #endif 12016 struct udphdr *udp = NULL; 12017 struct tcp_rack *rack; 12018 struct tcphdr *th; 12019 uint8_t pass = 0; 12020 uint8_t mark = 0; 12021 uint8_t wanted_cookie = 0; 12022 u_char opt[TCP_MAXOLEN]; 12023 unsigned ipoptlen, optlen, hdrlen, ulen=0; 12024 uint32_t rack_seq; 12025 12026 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12027 unsigned ipsec_optlen = 0; 12028 12029 #endif 12030 int32_t idle, sendalot; 12031 int32_t sub_from_prr = 0; 12032 volatile int32_t sack_rxmit; 12033 struct rack_sendmap *rsm = NULL; 12034 int32_t tso, mtu; 12035 struct tcpopt to; 12036 int32_t slot = 0; 12037 int32_t sup_rack = 0; 12038 uint32_t cts, us_cts, delayed, early; 12039 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 12040 uint32_t cwnd_to_use; 12041 int32_t do_a_prefetch; 12042 int32_t prefetch_rsm = 0; 12043 int force_tso = 0; 12044 int32_t orig_len; 12045 struct timeval tv; 12046 int32_t prefetch_so_done = 0; 12047 struct tcp_log_buffer *lgb = NULL; 12048 struct inpcb *inp; 12049 struct sockbuf *sb; 12050 #ifdef INET6 12051 struct ip6_hdr *ip6 = NULL; 12052 int32_t isipv6; 12053 #endif 12054 uint8_t filled_all = 0; 12055 bool hw_tls = false; 12056 12057 /* setup and take the cache hits here */ 12058 rack = (struct tcp_rack *)tp->t_fb_ptr; 12059 inp = rack->rc_inp; 12060 so = inp->inp_socket; 12061 sb = &so->so_snd; 12062 kern_prefetch(sb, &do_a_prefetch); 12063 do_a_prefetch = 1; 12064 hpts_calling = inp->inp_hpts_calls; 12065 #ifdef KERN_TLS 12066 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 12067 #endif 12068 12069 NET_EPOCH_ASSERT(); 12070 INP_WLOCK_ASSERT(inp); 12071 #ifdef TCP_OFFLOAD 12072 if (tp->t_flags & TF_TOE) 12073 return (tcp_offload_output(tp)); 12074 #endif 12075 /* 12076 * For TFO connections in SYN_RECEIVED, only allow the initial 12077 * SYN|ACK and those sent by the retransmit timer. 12078 */ 12079 if (IS_FASTOPEN(tp->t_flags) && 12080 (tp->t_state == TCPS_SYN_RECEIVED) && 12081 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 12082 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 12083 return (0); 12084 #ifdef INET6 12085 if (rack->r_state) { 12086 /* Use the cache line loaded if possible */ 12087 isipv6 = rack->r_is_v6; 12088 } else { 12089 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 12090 } 12091 #endif 12092 early = 0; 12093 us_cts = tcp_get_usecs(&tv); 12094 cts = tcp_tv_to_mssectick(&tv); 12095 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 12096 inp->inp_in_hpts) { 12097 /* 12098 * We are on the hpts for some timer but not hptsi output. 12099 * Remove from the hpts unconditionally. 
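	 *
	 * (The rack_timer_cancel() call below is what performs the
	 * removal and clears the pending timer state.)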
12100 */ 12101 rack_timer_cancel(tp, rack, cts, __LINE__); 12102 } 12103 /* Are we pacing and late? */ 12104 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12105 TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { 12106 /* We are delayed */ 12107 delayed = us_cts - rack->r_ctl.rc_last_output_to; 12108 } else { 12109 delayed = 0; 12110 } 12111 /* Do the timers, which may override the pacer */ 12112 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 12113 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 12114 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 12115 return (0); 12116 } 12117 } 12118 if ((rack->r_timer_override) || 12119 (delayed) || 12120 (tp->t_state < TCPS_ESTABLISHED)) { 12121 if (tp->t_inpcb->inp_in_hpts) 12122 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 12123 } else if (tp->t_inpcb->inp_in_hpts) { 12124 /* 12125 * On the hpts you can't pass even if ACKNOW is on, we will 12126 * when the hpts fires. 12127 */ 12128 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 12129 return (0); 12130 } 12131 inp->inp_hpts_calls = 0; 12132 /* Finish out both pacing early and late accounting */ 12133 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12134 TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12135 early = rack->r_ctl.rc_last_output_to - us_cts; 12136 } else 12137 early = 0; 12138 if (delayed) { 12139 rack->r_ctl.rc_agg_delayed += delayed; 12140 rack->r_late = 1; 12141 } else if (early) { 12142 rack->r_ctl.rc_agg_early += early; 12143 rack->r_early = 1; 12144 } 12145 /* Now that early/late accounting is done turn off the flag */ 12146 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12147 rack->r_wanted_output = 0; 12148 rack->r_timer_override = 0; 12149 /* 12150 * For TFO connections in SYN_SENT or SYN_RECEIVED, 12151 * only allow the initial SYN or SYN|ACK and those sent 12152 * by the retransmit timer. 12153 */ 12154 if (IS_FASTOPEN(tp->t_flags) && 12155 ((tp->t_state == TCPS_SYN_RECEIVED) || 12156 (tp->t_state == TCPS_SYN_SENT)) && 12157 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12158 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12159 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12160 goto just_return_nolock; 12161 } 12162 /* 12163 * Determine length of data that should be transmitted, and flags 12164 * that will be used. If there is some data or critical controls 12165 * (SYN, RST) to send, then transmit; otherwise, investigate 12166 * further. 
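	 *
	 * The first step below is idle handling: if idle reduction is
	 * enabled and we have been idle for at least one retransmission
	 * timeout, the congestion control state is refreshed via
	 * rack_cc_after_idle(), and a long enough idle period is also
	 * accounted as a probe-RTT.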
12167 */ 12168 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 12169 if (tp->t_idle_reduce) { 12170 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 12171 rack_cc_after_idle(rack, tp); 12172 } 12173 tp->t_flags &= ~TF_LASTIDLE; 12174 if (idle) { 12175 if (tp->t_flags & TF_MORETOCOME) { 12176 tp->t_flags |= TF_LASTIDLE; 12177 idle = 0; 12178 } 12179 } 12180 if ((tp->snd_una == tp->snd_max) && 12181 rack->r_ctl.rc_went_idle_time && 12182 TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { 12183 idle = us_cts - rack->r_ctl.rc_went_idle_time; 12184 if (idle > rack_min_probertt_hold) { 12185 /* Count as a probe rtt */ 12186 if (rack->in_probe_rtt == 0) { 12187 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12188 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 12189 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 12190 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 12191 } else { 12192 rack_exit_probertt(rack, us_cts); 12193 } 12194 } 12195 idle = 0; 12196 } 12197 again: 12198 /* 12199 * If we've recently taken a timeout, snd_max will be greater than 12200 * snd_nxt. There may be SACK information that allows us to avoid 12201 * resending already delivered data. Adjust snd_nxt accordingly. 12202 */ 12203 sendalot = 0; 12204 us_cts = tcp_get_usecs(&tv); 12205 cts = tcp_tv_to_mssectick(&tv); 12206 tso = 0; 12207 mtu = 0; 12208 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12209 if (so->so_snd.sb_flags & SB_TLS_IFNET) { 12210 minseg = rack->r_ctl.rc_pace_min_segs; 12211 } else { 12212 minseg = segsiz; 12213 } 12214 sb_offset = tp->snd_max - tp->snd_una; 12215 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12216 #ifdef NETFLIX_SHARED_CWND 12217 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 12218 rack->rack_enable_scwnd) { 12219 /* We are doing cwnd sharing */ 12220 if (rack->rc_gp_filled && 12221 (rack->rack_attempted_scwnd == 0) && 12222 (rack->r_ctl.rc_scw == NULL) && 12223 tp->t_lib) { 12224 /* The pcbid is in, lets make an attempt */ 12225 counter_u64_add(rack_try_scwnd, 1); 12226 rack->rack_attempted_scwnd = 1; 12227 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 12228 &rack->r_ctl.rc_scw_index, 12229 segsiz); 12230 } 12231 if (rack->r_ctl.rc_scw && 12232 (rack->rack_scwnd_is_idle == 1) && 12233 (rack->rc_in_persist == 0) && 12234 sbavail(sb)) { 12235 /* we are no longer out of data */ 12236 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12237 rack->rack_scwnd_is_idle = 0; 12238 } 12239 if (rack->r_ctl.rc_scw) { 12240 /* First lets update and get the cwnd */ 12241 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 12242 rack->r_ctl.rc_scw_index, 12243 tp->snd_cwnd, tp->snd_wnd, segsiz); 12244 } 12245 } 12246 #endif 12247 flags = tcp_outflags[tp->t_state]; 12248 while (rack->rc_free_cnt < rack_free_cache) { 12249 rsm = rack_alloc(rack); 12250 if (rsm == NULL) { 12251 if (inp->inp_hpts_calls) 12252 /* Retry in a ms */ 12253 slot = (1 * HPTS_USEC_IN_MSEC); 12254 goto just_return_nolock; 12255 } 12256 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 12257 rack->rc_free_cnt++; 12258 rsm = NULL; 12259 } 12260 if (inp->inp_hpts_calls) 12261 inp->inp_hpts_calls = 0; 12262 sack_rxmit = 0; 12263 len = 0; 12264 rsm = NULL; 12265 if (flags & TH_RST) { 12266 SOCKBUF_LOCK(sb); 12267 goto send; 12268 } 12269 if (rack->r_ctl.rc_resend) { 12270 /* Retransmit timer */ 12271 rsm = rack->r_ctl.rc_resend; 12272 rack->r_ctl.rc_resend = 
NULL; 12273 rsm->r_flags &= ~RACK_TLP; 12274 len = rsm->r_end - rsm->r_start; 12275 sack_rxmit = 1; 12276 sendalot = 0; 12277 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12278 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12279 __func__, __LINE__, 12280 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12281 sb_offset = rsm->r_start - tp->snd_una; 12282 if (len >= segsiz) 12283 len = segsiz; 12284 } else if ((rack->rc_in_persist == 0) && 12285 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 12286 /* We have a retransmit that takes precedence */ 12287 rsm->r_flags &= ~RACK_TLP; 12288 if ((!IN_RECOVERY(tp->t_flags)) && 12289 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 12290 /* Enter recovery if not induced by a time-out */ 12291 rack->r_ctl.rc_rsm_start = rsm->r_start; 12292 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 12293 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 12294 rack_cong_signal(tp, NULL, CC_NDUPACK); 12295 /* 12296 * When we enter recovery we need to assure we send 12297 * one packet. 12298 */ 12299 if (rack->rack_no_prr == 0) { 12300 rack->r_ctl.rc_prr_sndcnt = segsiz; 12301 rack_log_to_prr(rack, 13, 0); 12302 } 12303 } 12304 #ifdef INVARIANTS 12305 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 12306 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 12307 tp, rack, rsm, rsm->r_start, tp->snd_una); 12308 } 12309 #endif 12310 len = rsm->r_end - rsm->r_start; 12311 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12312 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12313 __func__, __LINE__, 12314 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12315 sb_offset = rsm->r_start - tp->snd_una; 12316 /* Can we send it within the PRR boundary? */ 12317 if (rack->rack_no_prr == 0) { 12318 if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 12319 /* It does not fit */ 12320 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 12321 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12322 /* 12323 * prr is less than a segment, we 12324 * have more acks due in besides 12325 * what we need to resend. Lets not send 12326 * to avoid sending small pieces of 12327 * what we need to retransmit. 12328 */ 12329 len = 0; 12330 goto just_return_nolock; 12331 } 12332 len = rack->r_ctl.rc_prr_sndcnt; 12333 } 12334 } 12335 sendalot = 0; 12336 if (len >= segsiz) 12337 len = segsiz; 12338 if (len > 0) { 12339 sub_from_prr = 1; 12340 sack_rxmit = 1; 12341 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12342 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12343 min(len, segsiz)); 12344 counter_u64_add(rack_rtm_prr_retran, 1); 12345 } 12346 } else if (rack->r_ctl.rc_tlpsend) { 12347 /* Tail loss probe */ 12348 long cwin; 12349 long tlen; 12350 12351 doing_tlp = 1; 12352 /* 12353 * Check if we can do a TLP with a RACK'd packet 12354 * this can happen if we are not doing the rack 12355 * cheat and we skipped to a TLP and it 12356 * went off. 12357 */ 12358 rsm = rack->r_ctl.rc_tlpsend; 12359 rsm->r_flags |= RACK_TLP; 12360 rack->r_ctl.rc_tlpsend = NULL; 12361 sack_rxmit = 1; 12362 tlen = rsm->r_end - rsm->r_start; 12363 if (tlen > segsiz) 12364 tlen = segsiz; 12365 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12366 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12367 __func__, __LINE__, 12368 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12369 sb_offset = rsm->r_start - tp->snd_una; 12370 cwin = min(tp->snd_wnd, tlen); 12371 len = cwin; 12372 } 12373 /* 12374 * Enforce a connection sendmap count limit if set 12375 * as long as we are not retransmiting. 
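	 *
	 * When the limit is hit we simply bail out to the just_return
	 * path below (bumping the accounting counters) instead of
	 * allocating yet more sendmap entries.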
12376 */ 12377 if ((rsm == NULL) && 12378 (rack->do_detection == 0) && 12379 (V_tcp_map_entries_limit > 0) && 12380 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12381 counter_u64_add(rack_to_alloc_limited, 1); 12382 if (!rack->alloc_limit_reported) { 12383 rack->alloc_limit_reported = 1; 12384 counter_u64_add(rack_alloc_limited_conns, 1); 12385 } 12386 goto just_return_nolock; 12387 } 12388 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 12389 /* we are retransmitting the fin */ 12390 len--; 12391 if (len) { 12392 /* 12393 * When retransmitting data do *not* include the 12394 * FIN. This could happen from a TLP probe. 12395 */ 12396 flags &= ~TH_FIN; 12397 } 12398 } 12399 #ifdef INVARIANTS 12400 /* For debugging */ 12401 rack->r_ctl.rc_rsm_at_retran = rsm; 12402 #endif 12403 /* 12404 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12405 * state flags. 12406 */ 12407 if (tp->t_flags & TF_NEEDFIN) 12408 flags |= TH_FIN; 12409 if (tp->t_flags & TF_NEEDSYN) 12410 flags |= TH_SYN; 12411 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12412 void *end_rsm; 12413 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 12414 if (end_rsm) 12415 kern_prefetch(end_rsm, &prefetch_rsm); 12416 prefetch_rsm = 1; 12417 } 12418 SOCKBUF_LOCK(sb); 12419 /* 12420 * If snd_nxt == snd_max and we have transmitted a FIN, the 12421 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12422 * negative length. This can also occur when TCP opens up its 12423 * congestion window while receiving additional duplicate acks after 12424 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12425 * the fast-retransmit. 12426 * 12427 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12428 * set to snd_una, the sb_offset will be 0, and the length may wind 12429 * up 0. 12430 * 12431 * If sack_rxmit is true we are retransmitting from the scoreboard 12432 * in which case len is already set. 12433 */ 12434 if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 12435 uint32_t avail; 12436 12437 avail = sbavail(sb); 12438 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 12439 sb_offset = tp->snd_nxt - tp->snd_una; 12440 else 12441 sb_offset = 0; 12442 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 12443 if (rack->r_ctl.rc_tlp_new_data) { 12444 /* TLP is forcing out new data */ 12445 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 12446 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 12447 } 12448 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 12449 len = tp->snd_wnd; 12450 else 12451 len = rack->r_ctl.rc_tlp_new_data; 12452 rack->r_ctl.rc_tlp_new_data = 0; 12453 new_data_tlp = doing_tlp = 1; 12454 } else 12455 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 12456 if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { 12457 /* 12458 * For prr=off, we need to send only 1 MSS 12459 * at a time. We do this because another sack could 12460 * be arriving that causes us to send retransmits and 12461 * we don't want to be on a long pace due to a larger send 12462 * that keeps us from sending out the retransmit. 12463 */ 12464 len = segsiz; 12465 } 12466 } else { 12467 uint32_t outstanding; 12468 12469 /* 12470 * We are inside of a SACK recovery episode and are 12471 * sending new data, having retransmitted all the 12472 * data possible so far in the scoreboard. 
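			 *
			 * The amount of new data is bounded both by the
			 * peer's receive window and by the PRR send count,
			 * so PRR still governs how quickly we refill the
			 * pipe.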
12473 */ 12474 outstanding = tp->snd_max - tp->snd_una; 12475 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 12476 if (tp->snd_wnd > outstanding) { 12477 len = tp->snd_wnd - outstanding; 12478 /* Check to see if we have the data */ 12479 if ((sb_offset + len) > avail) { 12480 /* It does not all fit */ 12481 if (avail > sb_offset) 12482 len = avail - sb_offset; 12483 else 12484 len = 0; 12485 } 12486 } else 12487 len = 0; 12488 } else if (avail > sb_offset) 12489 len = avail - sb_offset; 12490 else 12491 len = 0; 12492 if (len > 0) { 12493 if (len > rack->r_ctl.rc_prr_sndcnt) 12494 len = rack->r_ctl.rc_prr_sndcnt; 12495 if (len > 0) { 12496 sub_from_prr = 1; 12497 counter_u64_add(rack_rtm_prr_newdata, 1); 12498 } 12499 } 12500 if (len > segsiz) { 12501 /* 12502 * We should never send more than a MSS when 12503 * retransmitting or sending new data in prr 12504 * mode unless the override flag is on. Most 12505 * likely the PRR algorithm is not going to 12506 * let us send a lot as well :-) 12507 */ 12508 if (rack->r_ctl.rc_prr_sendalot == 0) 12509 len = segsiz; 12510 } else if (len < segsiz) { 12511 /* 12512 * Do we send any? The idea here is if the 12513 * send empty's the socket buffer we want to 12514 * do it. However if not then lets just wait 12515 * for our prr_sndcnt to get bigger. 12516 */ 12517 long leftinsb; 12518 12519 leftinsb = sbavail(sb) - sb_offset; 12520 if (leftinsb > len) { 12521 /* This send does not empty the sb */ 12522 len = 0; 12523 } 12524 } 12525 } 12526 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 12527 /* 12528 * If you have not established 12529 * and are not doing FAST OPEN 12530 * no data please. 12531 */ 12532 if ((sack_rxmit == 0) && 12533 (!IS_FASTOPEN(tp->t_flags))){ 12534 len = 0; 12535 sb_offset = 0; 12536 } 12537 } 12538 if (prefetch_so_done == 0) { 12539 kern_prefetch(so, &prefetch_so_done); 12540 prefetch_so_done = 1; 12541 } 12542 /* 12543 * Lop off SYN bit if it has already been sent. However, if this is 12544 * SYN-SENT state and if segment contains data and if we don't know 12545 * that foreign host supports TAO, suppress sending segment. 12546 */ 12547 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 12548 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 12549 /* 12550 * When sending additional segments following a TFO SYN|ACK, 12551 * do not include the SYN bit. 12552 */ 12553 if (IS_FASTOPEN(tp->t_flags) && 12554 (tp->t_state == TCPS_SYN_RECEIVED)) 12555 flags &= ~TH_SYN; 12556 } 12557 /* 12558 * Be careful not to send data and/or FIN on SYN segments. This 12559 * measure is needed to prevent interoperability problems with not 12560 * fully conformant TCP implementations. 
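	 *
	 * TCP Fast Open is the exception that does allow data on a
	 * SYN; the TFO specific restrictions are applied right below.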
12561 */ 12562 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12563 len = 0; 12564 flags &= ~TH_FIN; 12565 } 12566 /* 12567 * On TFO sockets, ensure no data is sent in the following cases: 12568 * 12569 * - When retransmitting SYN|ACK on a passively-created socket 12570 * 12571 * - When retransmitting SYN on an actively created socket 12572 * 12573 * - When sending a zero-length cookie (cookie request) on an 12574 * actively created socket 12575 * 12576 * - When the socket is in the CLOSED state (RST is being sent) 12577 */ 12578 if (IS_FASTOPEN(tp->t_flags) && 12579 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12580 ((tp->t_state == TCPS_SYN_SENT) && 12581 (tp->t_tfo_client_cookie_len == 0)) || 12582 (flags & TH_RST))) { 12583 sack_rxmit = 0; 12584 len = 0; 12585 } 12586 /* Without fast-open there should never be data sent on a SYN */ 12587 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 12588 tp->snd_nxt = tp->iss; 12589 len = 0; 12590 } 12591 orig_len = len; 12592 if (len <= 0) { 12593 /* 12594 * If FIN has been sent but not acked, but we haven't been 12595 * called to retransmit, len will be < 0. Otherwise, window 12596 * shrank after we sent into it. If window shrank to 0, 12597 * cancel pending retransmit, pull snd_nxt back to (closed) 12598 * window, and set the persist timer if it isn't already 12599 * going. If the window didn't close completely, just wait 12600 * for an ACK. 12601 * 12602 * We also do a general check here to ensure that we will 12603 * set the persist timer when we have data to send, but a 12604 * 0-byte window. This makes sure the persist timer is set 12605 * even if the packet hits one of the "goto send" lines 12606 * below. 12607 */ 12608 len = 0; 12609 if ((tp->snd_wnd == 0) && 12610 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12611 (tp->snd_una == tp->snd_max) && 12612 (sb_offset < (int)sbavail(sb))) { 12613 tp->snd_nxt = tp->snd_una; 12614 rack_enter_persist(tp, rack, cts); 12615 } 12616 } else if ((rsm == NULL) && 12617 ((doing_tlp == 0) || (new_data_tlp == 1)) && 12618 (len < rack->r_ctl.rc_pace_max_segs)) { 12619 /* 12620 * We are not sending a maximum sized segment for 12621 * some reason. Should we not send anything (think 12622 * sws or persists)? 12623 */ 12624 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && 12625 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12626 (len < minseg) && 12627 (len < (int)(sbavail(sb) - sb_offset))) { 12628 /* 12629 * Here the rwnd is less than 12630 * the minimum pacing size, this is not a retransmit, 12631 * we are established and 12632 * the send is not the last in the socket buffer 12633 * we send nothing, and we may enter persists 12634 * if nothing is outstanding. 12635 */ 12636 len = 0; 12637 if (tp->snd_max == tp->snd_una) { 12638 /* 12639 * Nothing out we can 12640 * go into persists. 12641 */ 12642 rack_enter_persist(tp, rack, cts); 12643 tp->snd_nxt = tp->snd_una; 12644 } 12645 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 12646 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12647 (len < (int)(sbavail(sb) - sb_offset)) && 12648 (len < minseg)) { 12649 /* 12650 * Here we are not retransmitting, and 12651 * the cwnd is not so small that we could 12652 * not send at least a min size (rxt timer 12653 * not having gone off), We have 2 segments or 12654 * more already in flight, its not the tail end 12655 * of the socket buffer and the cwnd is blocking 12656 * us from sending out a minimum pacing segment size. 12657 * Lets not send anything. 
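			 *
			 * Waiting for further acks to open the cwnd avoids
			 * pacing out a run of sub-minimum sized segments.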
12658 */ 12659 len = 0; 12660 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 12661 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 12662 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12663 (len < (int)(sbavail(sb) - sb_offset)) && 12664 (TCPS_HAVEESTABLISHED(tp->t_state))) { 12665 /* 12666 * Here we have a send window but we have 12667 * filled it up and we can't send another pacing segment. 12668 * We also have in flight more than 2 segments 12669 * and we are not completing the sb i.e. we allow 12670 * the last bytes of the sb to go out even if 12671 * its not a full pacing segment. 12672 */ 12673 len = 0; 12674 } 12675 } 12676 /* len will be >= 0 after this point. */ 12677 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 12678 tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); 12679 /* 12680 * Decide if we can use TCP Segmentation Offloading (if supported by 12681 * hardware). 12682 * 12683 * TSO may only be used if we are in a pure bulk sending state. The 12684 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 12685 * options prevent using TSO. With TSO the TCP header is the same 12686 * (except for the sequence number) for all generated packets. This 12687 * makes it impossible to transmit any options which vary per 12688 * generated segment or packet. 12689 * 12690 * IPv4 handling has a clear separation of ip options and ip header 12691 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 12692 * the right thing below to provide length of just ip options and thus 12693 * checking for ipoptlen is enough to decide if ip options are present. 12694 */ 12695 12696 #ifdef INET6 12697 if (isipv6) 12698 ipoptlen = ip6_optlen(tp->t_inpcb); 12699 else 12700 #endif 12701 if (tp->t_inpcb->inp_options) 12702 ipoptlen = tp->t_inpcb->inp_options->m_len - 12703 offsetof(struct ipoption, ipopt_list); 12704 else 12705 ipoptlen = 0; 12706 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12707 /* 12708 * Pre-calculate here as we save another lookup into the darknesses 12709 * of IPsec that way and can actually decide if TSO is ok. 12710 */ 12711 #ifdef INET6 12712 if (isipv6 && IPSEC_ENABLED(ipv6)) 12713 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 12714 #ifdef INET 12715 else 12716 #endif 12717 #endif /* INET6 */ 12718 #ifdef INET 12719 if (IPSEC_ENABLED(ipv4)) 12720 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 12721 #endif /* INET */ 12722 #endif 12723 12724 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12725 ipoptlen += ipsec_optlen; 12726 #endif 12727 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 12728 (tp->t_port == 0) && 12729 ((tp->t_flags & TF_SIGNATURE) == 0) && 12730 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 12731 ipoptlen == 0) 12732 tso = 1; 12733 { 12734 uint32_t outstanding; 12735 12736 outstanding = tp->snd_max - tp->snd_una; 12737 if (tp->t_flags & TF_SENTFIN) { 12738 /* 12739 * If we sent a fin, snd_max is 1 higher than 12740 * snd_una 12741 */ 12742 outstanding--; 12743 } 12744 if (sack_rxmit) { 12745 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 12746 flags &= ~TH_FIN; 12747 } else { 12748 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 12749 sbused(sb))) 12750 flags &= ~TH_FIN; 12751 } 12752 } 12753 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 12754 (long)TCP_MAXWIN << tp->rcv_scale); 12755 12756 /* 12757 * Sender silly window avoidance. 
	 * We transmit under the following conditions when len is non-zero:
	 *
	 * - We have a full segment (or more with TSO).
	 * - This is the last buffer in a write()/send() and we are either
	 *   idle or running NODELAY.
	 * - We've timed out (e.g. persist timer).
	 * - We have more than 1/2 the maximum send window's worth of data
	 *   (the receiver may be limiting the window size).
	 * - We need to retransmit.
	 */
	if (len) {
		if (len >= segsiz) {
			goto send;
		}
		/*
		 * NOTE! on localhost connections an 'ack' from the remote
		 * end may occur synchronously with the output and cause us
		 * to flush a buffer queued with moretocome. XXX
		 *
		 */
		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
		    (idle || (tp->t_flags & TF_NODELAY)) &&
		    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
		    (tp->t_flags & TF_NOPUSH) == 0) {
			pass = 2;
			goto send;
		}
		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
			pass = 22;
			goto send;
		}
		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
			pass = 4;
			goto send;
		}
		if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {	/* retransmit case */
			pass = 5;
			goto send;
		}
		if (sack_rxmit) {
			pass = 6;
			goto send;
		}
		if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
		    (ctf_outstanding(tp) < (segsiz * 2))) {
			/*
			 * We have less than two MSS outstanding (delayed ack)
			 * and our rwnd will not let us send a full sized
			 * MSS. Let's go ahead and let this small segment
			 * out because we want to try to have at least two
			 * packets inflight to not be caught by delayed ack.
			 */
			pass = 12;
			goto send;
		}
	}
	/*
	 * Sending of standalone window updates.
	 *
	 * Window updates are important when we close our window due to a
	 * full socket buffer and are opening it again after the application
	 * reads data from it. Once the window has opened again and the
	 * remote end starts to send again the ACK clock takes over and
	 * provides the most current window information.
	 *
	 * We must avoid the silly window syndrome whereby every read from
	 * the receive buffer, no matter how small, causes a window update
	 * to be sent. We also should avoid sending a flurry of window
	 * updates when the socket buffer has queued a lot of data and the
	 * application is doing small reads.
	 *
	 * Prevent a flurry of pointless window updates by only sending an
	 * update when we can increase the advertised window by more than
	 * 1/4th of the socket buffer capacity. When the buffer is getting
	 * full or is very small be more aggressive and send an update
	 * whenever we can increase by two MSS sized segments. In all other
	 * situations the ACKs to new incoming data will carry further
	 * window increases.
	 *
	 * Don't send an independent window update if a delayed ACK is
	 * pending (it will get piggy-backed on it) or the remote side
	 * already has done a half-close and won't send more data. Skip
	 * this if the connection is in T/TCP half-open state.
12839 */ 12840 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 12841 !(tp->t_flags & TF_DELACK) && 12842 !TCPS_HAVERCVDFIN(tp->t_state)) { 12843 /* 12844 * "adv" is the amount we could increase the window, taking 12845 * into account that we are limited by TCP_MAXWIN << 12846 * tp->rcv_scale. 12847 */ 12848 int32_t adv; 12849 int oldwin; 12850 12851 adv = recwin; 12852 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 12853 oldwin = (tp->rcv_adv - tp->rcv_nxt); 12854 if (adv > oldwin) 12855 adv -= oldwin; 12856 else { 12857 /* We can't increase the window */ 12858 adv = 0; 12859 } 12860 } else 12861 oldwin = 0; 12862 12863 /* 12864 * If the new window size ends up being the same as or less 12865 * than the old size when it is scaled, then don't force 12866 * a window update. 12867 */ 12868 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 12869 goto dontupdate; 12870 12871 if (adv >= (int32_t)(2 * segsiz) && 12872 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 12873 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 12874 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 12875 pass = 7; 12876 goto send; 12877 } 12878 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 12879 pass = 23; 12880 goto send; 12881 } 12882 } 12883 dontupdate: 12884 12885 /* 12886 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 12887 * is also a catch-all for the retransmit timer timeout case. 12888 */ 12889 if (tp->t_flags & TF_ACKNOW) { 12890 pass = 8; 12891 goto send; 12892 } 12893 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 12894 pass = 9; 12895 goto send; 12896 } 12897 /* 12898 * If our state indicates that FIN should be sent and we have not 12899 * yet done so, then we need to send. 12900 */ 12901 if ((flags & TH_FIN) && 12902 (tp->snd_nxt == tp->snd_una)) { 12903 pass = 11; 12904 goto send; 12905 } 12906 /* 12907 * No reason to send a segment, just return. 12908 */ 12909 just_return: 12910 SOCKBUF_UNLOCK(sb); 12911 just_return_nolock: 12912 { 12913 int app_limited = CTF_JR_SENT_DATA; 12914 12915 if (tot_len_this_send > 0) { 12916 /* Make sure snd_nxt is up to max */ 12917 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 12918 tp->snd_nxt = tp->snd_max; 12919 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 12920 } else { 12921 int end_window = 0; 12922 uint32_t seq = tp->gput_ack; 12923 12924 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12925 if (rsm) { 12926 /* 12927 * Mark the last sent that we just-returned (hinting 12928 * that delayed ack may play a role in any rtt measurement). 12929 */ 12930 rsm->r_just_ret = 1; 12931 } 12932 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12933 rack->r_ctl.rc_agg_delayed = 0; 12934 rack->r_early = 0; 12935 rack->r_late = 0; 12936 rack->r_ctl.rc_agg_early = 0; 12937 if ((ctf_outstanding(tp) + 12938 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 12939 minseg)) >= tp->snd_wnd) { 12940 /* We are limited by the rwnd */ 12941 app_limited = CTF_JR_RWND_LIMITED; 12942 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12943 /* We are limited by whats available -- app limited */ 12944 app_limited = CTF_JR_APP_LIMITED; 12945 } else if ((idle == 0) && 12946 ((tp->t_flags & TF_NODELAY) == 0) && 12947 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12948 (len < segsiz)) { 12949 /* 12950 * No delay is not on and the 12951 * user is sending less than 1MSS. This 12952 * brings out SWS avoidance so we 12953 * don't send. Another app-limited case. 
				 */
				app_limited = CTF_JR_APP_LIMITED;
			} else if (tp->t_flags & TF_NOPUSH) {
				/*
				 * The user has requested no push of
				 * the last segment and we are
				 * at the last segment. Another app
				 * limited case.
				 */
				app_limited = CTF_JR_APP_LIMITED;
			} else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
				/* It's the cwnd */
				app_limited = CTF_JR_CWND_LIMITED;
			} else if (rack->rc_in_persist == 1) {
				/* We are in persists */
				app_limited = CTF_JR_PERSISTS;
			} else if (IN_RECOVERY(tp->t_flags) &&
				   (rack->rack_no_prr == 0) &&
				   (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
				app_limited = CTF_JR_PRR;
			} else {
				/* Now why here are we not sending? */
#ifdef NOW
#ifdef INVARIANTS
				panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
#endif
#endif
				app_limited = CTF_JR_ASSESSING;
			}
			/*
			 * App limited in some fashion, for our pacing GP
			 * measurements we don't want any gap (even cwnd).
			 * Close down the measurement window.
			 */
			if (rack_cwnd_block_ends_measure &&
			    ((app_limited == CTF_JR_CWND_LIMITED) ||
			     (app_limited == CTF_JR_PRR))) {
				/*
				 * The reason we are not sending is
				 * the cwnd (or prr). We have been configured
				 * to end the measurement window in
				 * this case.
				 */
				end_window = 1;
			} else if (app_limited == CTF_JR_PERSISTS) {
				/*
				 * We never end the measurement window
				 * in persists, though in theory we
				 * should be only entering after everything
				 * is acknowledged (so we will probably
				 * never come here).
				 */
				end_window = 0;
			} else if (rack_rwnd_block_ends_measure &&
				   (app_limited == CTF_JR_RWND_LIMITED)) {
				/*
				 * We are rwnd limited and have been
				 * configured to end the measurement
				 * window in this case.
				 */
				end_window = 1;
			} else if (app_limited == CTF_JR_APP_LIMITED) {
				/*
				 * A true application limited period, we have
				 * run out of data.
				 */
				end_window = 1;
			} else if (app_limited == CTF_JR_ASSESSING) {
				/*
				 * In the assessing case we hit the end of
				 * the if/else and had no known reason.
				 * This will panic us under invariants.
				 *
				 * If we get this out in logs we need to
				 * investigate which reason we missed.
				 */
				end_window = 1;
			}
			if (end_window) {
				uint8_t log = 0;

				if ((tp->t_flags & TF_GPUTINPROG) &&
				    SEQ_GT(tp->gput_ack, tp->snd_max)) {
					/* Mark the last packet as app limited */
					tp->gput_ack = tp->snd_max;
					log = 1;
				}
				rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
				if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
					if (rack->r_ctl.rc_app_limited_cnt == 0)
						rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
					else {
						/*
						 * Go out to the end app limited and mark
						 * this new one as next and move the end_appl up
						 * to this guy.
13050 */ 13051 if (rack->r_ctl.rc_end_appl) 13052 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 13053 rack->r_ctl.rc_end_appl = rsm; 13054 } 13055 rsm->r_flags |= RACK_APP_LIMITED; 13056 rack->r_ctl.rc_app_limited_cnt++; 13057 } 13058 if (log) 13059 rack_log_pacing_delay_calc(rack, 13060 rack->r_ctl.rc_app_limited_cnt, seq, 13061 tp->gput_ack, 0, 0, 4, __LINE__, NULL); 13062 } 13063 } 13064 if (slot) { 13065 /* set the rack tcb into the slot N */ 13066 counter_u64_add(rack_paced_segments, 1); 13067 } else if (tot_len_this_send) { 13068 counter_u64_add(rack_unpaced_segments, 1); 13069 } 13070 /* Check if we need to go into persists or not */ 13071 if ((rack->rc_in_persist == 0) && 13072 (tp->snd_max == tp->snd_una) && 13073 TCPS_HAVEESTABLISHED(tp->t_state) && 13074 sbavail(sb) && 13075 (sbavail(sb) > tp->snd_wnd) && 13076 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 13077 /* Yes lets make sure to move to persist before timer-start */ 13078 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 13079 } 13080 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 13081 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 13082 } 13083 #ifdef NETFLIX_SHARED_CWND 13084 if ((sbavail(sb) == 0) && 13085 rack->r_ctl.rc_scw) { 13086 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 13087 rack->rack_scwnd_is_idle = 1; 13088 } 13089 #endif 13090 return (0); 13091 13092 send: 13093 if ((flags & TH_FIN) && 13094 sbavail(sb)) { 13095 /* 13096 * We do not transmit a FIN 13097 * with data outstanding. We 13098 * need to make it so all data 13099 * is acked first. 13100 */ 13101 flags &= ~TH_FIN; 13102 } 13103 /* Enforce stack imposed max seg size if we have one */ 13104 if (rack->r_ctl.rc_pace_max_segs && 13105 (len > rack->r_ctl.rc_pace_max_segs)) { 13106 mark = 1; 13107 len = rack->r_ctl.rc_pace_max_segs; 13108 } 13109 SOCKBUF_LOCK_ASSERT(sb); 13110 if (len > 0) { 13111 if (len >= segsiz) 13112 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 13113 else 13114 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 13115 } 13116 /* 13117 * Before ESTABLISHED, force sending of initial options unless TCP 13118 * set not to do any options. NOTE: we assume that the IP/TCP header 13119 * plus TCP options always fit in a single mbuf, leaving room for a 13120 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 13121 * + optlen <= MCLBYTES 13122 */ 13123 optlen = 0; 13124 #ifdef INET6 13125 if (isipv6) 13126 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 13127 else 13128 #endif 13129 hdrlen = sizeof(struct tcpiphdr); 13130 13131 /* 13132 * Compute options for segment. We only have to care about SYN and 13133 * established connection segments. Options for SYN-ACK segments 13134 * are handled in TCP syncache. 13135 */ 13136 to.to_flags = 0; 13137 if ((tp->t_flags & TF_NOOPT) == 0) { 13138 /* Maximum segment size. */ 13139 if (flags & TH_SYN) { 13140 tp->snd_nxt = tp->iss; 13141 to.to_mss = tcp_mssopt(&inp->inp_inc); 13142 #ifdef NETFLIX_TCPOUDP 13143 if (tp->t_port) 13144 to.to_mss -= V_tcp_udp_tunneling_overhead; 13145 #endif 13146 to.to_flags |= TOF_MSS; 13147 13148 /* 13149 * On SYN or SYN|ACK transmits on TFO connections, 13150 * only include the TFO option if it is not a 13151 * retransmit, as the presence of the TFO option may 13152 * have caused the original SYN or SYN|ACK to have 13153 * been dropped by a middlebox. 
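			 *
			 * Falling back to a plain SYN or SYN|ACK on
			 * retransmit keeps a TFO-hostile middlebox from
			 * blocking the connection entirely.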
13154 */ 13155 if (IS_FASTOPEN(tp->t_flags) && 13156 (tp->t_rxtshift == 0)) { 13157 if (tp->t_state == TCPS_SYN_RECEIVED) { 13158 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 13159 to.to_tfo_cookie = 13160 (u_int8_t *)&tp->t_tfo_cookie.server; 13161 to.to_flags |= TOF_FASTOPEN; 13162 wanted_cookie = 1; 13163 } else if (tp->t_state == TCPS_SYN_SENT) { 13164 to.to_tfo_len = 13165 tp->t_tfo_client_cookie_len; 13166 to.to_tfo_cookie = 13167 tp->t_tfo_cookie.client; 13168 to.to_flags |= TOF_FASTOPEN; 13169 wanted_cookie = 1; 13170 /* 13171 * If we wind up having more data to 13172 * send with the SYN than can fit in 13173 * one segment, don't send any more 13174 * until the SYN|ACK comes back from 13175 * the other end. 13176 */ 13177 sendalot = 0; 13178 } 13179 } 13180 } 13181 /* Window scaling. */ 13182 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 13183 to.to_wscale = tp->request_r_scale; 13184 to.to_flags |= TOF_SCALE; 13185 } 13186 /* Timestamps. */ 13187 if ((tp->t_flags & TF_RCVD_TSTMP) || 13188 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 13189 to.to_tsval = cts + tp->ts_offset; 13190 to.to_tsecr = tp->ts_recent; 13191 to.to_flags |= TOF_TS; 13192 } 13193 /* Set receive buffer autosizing timestamp. */ 13194 if (tp->rfbuf_ts == 0 && 13195 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 13196 tp->rfbuf_ts = tcp_ts_getticks(); 13197 /* Selective ACK's. */ 13198 if (flags & TH_SYN) 13199 to.to_flags |= TOF_SACKPERM; 13200 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 13201 tp->rcv_numsacks > 0) { 13202 to.to_flags |= TOF_SACK; 13203 to.to_nsacks = tp->rcv_numsacks; 13204 to.to_sacks = (u_char *)tp->sackblks; 13205 } 13206 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13207 /* TCP-MD5 (RFC2385). */ 13208 if (tp->t_flags & TF_SIGNATURE) 13209 to.to_flags |= TOF_SIGNATURE; 13210 #endif /* TCP_SIGNATURE */ 13211 13212 /* Processing the options. */ 13213 hdrlen += optlen = tcp_addoptions(&to, opt); 13214 /* 13215 * If we wanted a TFO option to be added, but it was unable 13216 * to fit, ensure no data is sent. 13217 */ 13218 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 13219 !(to.to_flags & TOF_FASTOPEN)) 13220 len = 0; 13221 } 13222 #ifdef NETFLIX_TCPOUDP 13223 if (tp->t_port) { 13224 if (V_tcp_udp_tunneling_port == 0) { 13225 /* The port was removed?? */ 13226 SOCKBUF_UNLOCK(&so->so_snd); 13227 return (EHOSTUNREACH); 13228 } 13229 hdrlen += sizeof(struct udphdr); 13230 } 13231 #endif 13232 #ifdef INET6 13233 if (isipv6) 13234 ipoptlen = ip6_optlen(tp->t_inpcb); 13235 else 13236 #endif 13237 if (tp->t_inpcb->inp_options) 13238 ipoptlen = tp->t_inpcb->inp_options->m_len - 13239 offsetof(struct ipoption, ipopt_list); 13240 else 13241 ipoptlen = 0; 13242 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13243 ipoptlen += ipsec_optlen; 13244 #endif 13245 13246 #ifdef KERN_TLS 13247 /* force TSO for so TLS offload can get mss */ 13248 if (sb->sb_flags & SB_TLS_IFNET) { 13249 force_tso = 1; 13250 } 13251 #endif 13252 /* 13253 * Adjust data length if insertion of options will bump the packet 13254 * length beyond the t_maxseg length. Clear the FIN bit because we 13255 * cut off the tail of the segment. 
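	 *
	 * With TSO the length is instead clamped to the interface's
	 * maximum TSO payload and trimmed to a multiple of the segment
	 * size; TSO is turned back off if what remains fits in a
	 * single segment.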
13256 */ 13257 if (len + optlen + ipoptlen > tp->t_maxseg) { 13258 if (tso) { 13259 uint32_t if_hw_tsomax; 13260 uint32_t moff; 13261 int32_t max_len; 13262 13263 /* extract TSO information */ 13264 if_hw_tsomax = tp->t_tsomax; 13265 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13266 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13267 KASSERT(ipoptlen == 0, 13268 ("%s: TSO can't do IP options", __func__)); 13269 13270 /* 13271 * Check if we should limit by maximum payload 13272 * length: 13273 */ 13274 if (if_hw_tsomax != 0) { 13275 /* compute maximum TSO length */ 13276 max_len = (if_hw_tsomax - hdrlen - 13277 max_linkhdr); 13278 if (max_len <= 0) { 13279 len = 0; 13280 } else if (len > max_len) { 13281 sendalot = 1; 13282 len = max_len; 13283 mark = 2; 13284 } 13285 } 13286 /* 13287 * Prevent the last segment from being fractional 13288 * unless the send sockbuf can be emptied: 13289 */ 13290 max_len = (tp->t_maxseg - optlen); 13291 if (((sb_offset + len) < sbavail(sb)) && 13292 (hw_tls == 0)) { 13293 moff = len % (u_int)max_len; 13294 if (moff != 0) { 13295 mark = 3; 13296 len -= moff; 13297 } 13298 } 13299 /* 13300 * In case there are too many small fragments don't 13301 * use TSO: 13302 */ 13303 if (len <= segsiz) { 13304 mark = 4; 13305 tso = 0; 13306 } 13307 /* 13308 * Send the FIN in a separate segment after the bulk 13309 * sending is done. We don't trust the TSO 13310 * implementations to clear the FIN flag on all but 13311 * the last segment. 13312 */ 13313 if (tp->t_flags & TF_NEEDFIN) { 13314 sendalot = 4; 13315 } 13316 } else { 13317 mark = 5; 13318 if (optlen + ipoptlen >= tp->t_maxseg) { 13319 /* 13320 * Since we don't have enough space to put 13321 * the IP header chain and the TCP header in 13322 * one packet as required by RFC 7112, don't 13323 * send it. Also ensure that at least one 13324 * byte of the payload can be put into the 13325 * TCP segment. 13326 */ 13327 SOCKBUF_UNLOCK(&so->so_snd); 13328 error = EMSGSIZE; 13329 sack_rxmit = 0; 13330 goto out; 13331 } 13332 len = tp->t_maxseg - optlen - ipoptlen; 13333 sendalot = 5; 13334 } 13335 } else { 13336 tso = 0; 13337 mark = 6; 13338 } 13339 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13340 ("%s: len > IP_MAXPACKET", __func__)); 13341 #ifdef DIAGNOSTIC 13342 #ifdef INET6 13343 if (max_linkhdr + hdrlen > MCLBYTES) 13344 #else 13345 if (max_linkhdr + hdrlen > MHLEN) 13346 #endif 13347 panic("tcphdr too big"); 13348 #endif 13349 13350 /* 13351 * This KASSERT is here to catch edge cases at a well defined place. 13352 * Before, those had triggered (random) panic conditions further 13353 * down. 13354 */ 13355 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13356 if ((len == 0) && 13357 (flags & TH_FIN) && 13358 (sbused(sb))) { 13359 /* 13360 * We have outstanding data, don't send a fin by itself!. 13361 */ 13362 goto just_return; 13363 } 13364 /* 13365 * Grab a header mbuf, attaching a copy of data to be transmitted, 13366 * and initialize the header from the template for sends on this 13367 * connection. 13368 */ 13369 if (len) { 13370 uint32_t max_val; 13371 uint32_t moff; 13372 13373 if (rack->r_ctl.rc_pace_max_segs) 13374 max_val = rack->r_ctl.rc_pace_max_segs; 13375 else if (rack->rc_user_set_max_segs) 13376 max_val = rack->rc_user_set_max_segs * segsiz; 13377 else 13378 max_val = len; 13379 /* 13380 * We allow a limit on sending with hptsi. 
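		 *
		 * max_val is the pacing burst limit: the stack imposed
		 * rc_pace_max_segs if set, otherwise the user configured
		 * segment count, otherwise the full length.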
13381 */ 13382 if (len > max_val) { 13383 mark = 7; 13384 len = max_val; 13385 } 13386 #ifdef INET6 13387 if (MHLEN < hdrlen + max_linkhdr) 13388 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13389 else 13390 #endif 13391 m = m_gethdr(M_NOWAIT, MT_DATA); 13392 13393 if (m == NULL) { 13394 SOCKBUF_UNLOCK(sb); 13395 error = ENOBUFS; 13396 sack_rxmit = 0; 13397 goto out; 13398 } 13399 m->m_data += max_linkhdr; 13400 m->m_len = hdrlen; 13401 13402 /* 13403 * Start the m_copy functions from the closest mbuf to the 13404 * sb_offset in the socket buffer chain. 13405 */ 13406 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13407 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13408 m_copydata(mb, moff, (int)len, 13409 mtod(m, caddr_t)+hdrlen); 13410 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13411 sbsndptr_adv(sb, mb, len); 13412 m->m_len += len; 13413 } else { 13414 struct sockbuf *msb; 13415 13416 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13417 msb = NULL; 13418 else 13419 msb = sb; 13420 m->m_next = tcp_m_copym( 13421 mb, moff, &len, 13422 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 13423 ((rsm == NULL) ? hw_tls : 0) 13424 #ifdef NETFLIX_COPY_ARGS 13425 , &filled_all 13426 #endif 13427 ); 13428 if (len <= (tp->t_maxseg - optlen)) { 13429 /* 13430 * Must have ran out of mbufs for the copy 13431 * shorten it to no longer need tso. Lets 13432 * not put on sendalot since we are low on 13433 * mbufs. 13434 */ 13435 tso = 0; 13436 } 13437 if (m->m_next == NULL) { 13438 SOCKBUF_UNLOCK(sb); 13439 (void)m_free(m); 13440 error = ENOBUFS; 13441 sack_rxmit = 0; 13442 goto out; 13443 } 13444 } 13445 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 13446 if (rsm && (rsm->r_flags & RACK_TLP)) { 13447 /* 13448 * TLP should not count in retran count, but 13449 * in its own bin 13450 */ 13451 counter_u64_add(rack_tlp_retran, 1); 13452 counter_u64_add(rack_tlp_retran_bytes, len); 13453 } else { 13454 tp->t_sndrexmitpack++; 13455 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 13456 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 13457 } 13458 #ifdef STATS 13459 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 13460 len); 13461 #endif 13462 } else { 13463 KMOD_TCPSTAT_INC(tcps_sndpack); 13464 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 13465 #ifdef STATS 13466 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 13467 len); 13468 #endif 13469 } 13470 /* 13471 * If we're sending everything we've got, set PUSH. (This 13472 * will keep happy those implementations which only give 13473 * data to the user when a buffer fills or a PUSH comes in.) 
13474 */ 13475 if (sb_offset + len == sbused(sb) && 13476 sbused(sb) && 13477 !(flags & TH_SYN)) 13478 flags |= TH_PUSH; 13479 13480 SOCKBUF_UNLOCK(sb); 13481 } else { 13482 SOCKBUF_UNLOCK(sb); 13483 if (tp->t_flags & TF_ACKNOW) 13484 KMOD_TCPSTAT_INC(tcps_sndacks); 13485 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13486 KMOD_TCPSTAT_INC(tcps_sndctrl); 13487 else 13488 KMOD_TCPSTAT_INC(tcps_sndwinup); 13489 13490 m = m_gethdr(M_NOWAIT, MT_DATA); 13491 if (m == NULL) { 13492 error = ENOBUFS; 13493 sack_rxmit = 0; 13494 goto out; 13495 } 13496 #ifdef INET6 13497 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13498 MHLEN >= hdrlen) { 13499 M_ALIGN(m, hdrlen); 13500 } else 13501 #endif 13502 m->m_data += max_linkhdr; 13503 m->m_len = hdrlen; 13504 } 13505 SOCKBUF_UNLOCK_ASSERT(sb); 13506 m->m_pkthdr.rcvif = (struct ifnet *)0; 13507 #ifdef MAC 13508 mac_inpcb_create_mbuf(inp, m); 13509 #endif 13510 #ifdef INET6 13511 if (isipv6) { 13512 ip6 = mtod(m, struct ip6_hdr *); 13513 #ifdef NETFLIX_TCPOUDP 13514 if (tp->t_port) { 13515 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 13516 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13517 udp->uh_dport = tp->t_port; 13518 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13519 udp->uh_ulen = htons(ulen); 13520 th = (struct tcphdr *)(udp + 1); 13521 } else 13522 #endif 13523 th = (struct tcphdr *)(ip6 + 1); 13524 tcpip_fillheaders(inp, 13525 #ifdef NETFLIX_TCPOUDP 13526 tp->t_port, 13527 #endif 13528 ip6, th); 13529 } else 13530 #endif /* INET6 */ 13531 { 13532 ip = mtod(m, struct ip *); 13533 #ifdef TCPDEBUG 13534 ipov = (struct ipovly *)ip; 13535 #endif 13536 #ifdef NETFLIX_TCPOUDP 13537 if (tp->t_port) { 13538 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 13539 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13540 udp->uh_dport = tp->t_port; 13541 ulen = hdrlen + len - sizeof(struct ip); 13542 udp->uh_ulen = htons(ulen); 13543 th = (struct tcphdr *)(udp + 1); 13544 } else 13545 #endif 13546 th = (struct tcphdr *)(ip + 1); 13547 tcpip_fillheaders(inp, 13548 #ifdef NETFLIX_TCPOUDP 13549 tp->t_port, 13550 #endif 13551 ip, th); 13552 } 13553 /* 13554 * Fill in fields, remembering maximum advertised window for use in 13555 * delaying messages about window sizes. If resending a FIN, be sure 13556 * not to use a new sequence number. 13557 */ 13558 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 13559 tp->snd_nxt == tp->snd_max) 13560 tp->snd_nxt--; 13561 /* 13562 * If we are starting a connection, send ECN setup SYN packet. If we 13563 * are on a retransmit, we may resend those bits a number of times 13564 * as per RFC 3168. 13565 */ 13566 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 13567 if (tp->t_rxtshift >= 1) { 13568 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 13569 flags |= TH_ECE | TH_CWR; 13570 } else 13571 flags |= TH_ECE | TH_CWR; 13572 } 13573 /* Handle parallel SYN for ECN */ 13574 if ((tp->t_state == TCPS_SYN_RECEIVED) && 13575 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 13576 flags |= TH_ECE; 13577 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13578 } 13579 if (tp->t_state == TCPS_ESTABLISHED && 13580 (tp->t_flags2 & TF2_ECN_PERMIT)) { 13581 /* 13582 * If the peer has ECN, mark data packets with ECN capable 13583 * transmission (ECT). Ignore pure ack packets, 13584 * retransmissions. 
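		 *
		 * Retransmissions are filtered out by the SEQ_GEQ() and
		 * sack_rxmit checks below; only new data gets ECT0 and,
		 * when pending, the CWR flag.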
13585 */ 13586 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 13587 (sack_rxmit == 0)) { 13588 #ifdef INET6 13589 if (isipv6) 13590 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 13591 else 13592 #endif 13593 ip->ip_tos |= IPTOS_ECN_ECT0; 13594 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13595 /* 13596 * Reply with proper ECN notifications. 13597 * Only set CWR on new data segments. 13598 */ 13599 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 13600 flags |= TH_CWR; 13601 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 13602 } 13603 } 13604 if (tp->t_flags2 & TF2_ECN_SND_ECE) 13605 flags |= TH_ECE; 13606 } 13607 /* 13608 * If we are doing retransmissions, then snd_nxt will not reflect 13609 * the first unsent octet. For ACK only packets, we do not want the 13610 * sequence number of the retransmitted packet, we want the sequence 13611 * number of the next unsent octet. So, if there is no data (and no 13612 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13613 * ti_seq. But if we are in persist state, snd_max might reflect 13614 * one byte beyond the right edge of the window, so use snd_nxt in 13615 * that case, since we know we aren't doing a retransmission. 13616 * (retransmit and persist are mutually exclusive...) 13617 */ 13618 if (sack_rxmit == 0) { 13619 if (len || (flags & (TH_SYN | TH_FIN)) || 13620 rack->rc_in_persist) { 13621 th->th_seq = htonl(tp->snd_nxt); 13622 rack_seq = tp->snd_nxt; 13623 } else if (flags & TH_RST) { 13624 /* 13625 * For a Reset send the last cum ack in sequence 13626 * (this like any other choice may still generate a 13627 * challenge ack, if a ack-update packet is in 13628 * flight). 13629 */ 13630 th->th_seq = htonl(tp->snd_una); 13631 rack_seq = tp->snd_una; 13632 } else { 13633 th->th_seq = htonl(tp->snd_max); 13634 rack_seq = tp->snd_max; 13635 } 13636 } else { 13637 th->th_seq = htonl(rsm->r_start); 13638 rack_seq = rsm->r_start; 13639 } 13640 th->th_ack = htonl(tp->rcv_nxt); 13641 if (optlen) { 13642 bcopy(opt, th + 1, optlen); 13643 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13644 } 13645 th->th_flags = flags; 13646 /* 13647 * Calculate receive window. Don't shrink window, but avoid silly 13648 * window syndrome. 13649 * If a RST segment is sent, advertise a window of zero. 13650 */ 13651 if (flags & TH_RST) { 13652 recwin = 0; 13653 } else { 13654 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 13655 recwin < (long)segsiz) 13656 recwin = 0; 13657 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13658 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 13659 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 13660 } 13661 13662 /* 13663 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13664 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13665 * handled in syncache. 13666 */ 13667 if (flags & TH_SYN) 13668 th->th_win = htons((u_short) 13669 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13670 else { 13671 /* Avoid shrinking window with window scaling. */ 13672 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13673 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13674 } 13675 /* 13676 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13677 * window. This may cause the remote transmitter to stall. This 13678 * flag tells soreceive() to disable delayed acknowledgements when 13679 * draining the buffer. This can occur if the receiver is 13680 * attempting to read more data than can be buffered prior to 13681 * transmitting on the connection. 
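	 *
	 * The flag is cleared again as soon as a non-zero window is
	 * advertised.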
13682 */ 13683 if (th->th_win == 0) { 13684 tp->t_sndzerowin++; 13685 tp->t_flags |= TF_RXWIN0SENT; 13686 } else 13687 tp->t_flags &= ~TF_RXWIN0SENT; 13688 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 13689 13690 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13691 if (to.to_flags & TOF_SIGNATURE) { 13692 /* 13693 * Calculate MD5 signature and put it into the place 13694 * determined before. 13695 * NOTE: since TCP options buffer doesn't point into 13696 * mbuf's data, calculate offset and use it. 13697 */ 13698 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13699 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13700 /* 13701 * Do not send segment if the calculation of MD5 13702 * digest has failed. 13703 */ 13704 goto out; 13705 } 13706 } 13707 #endif 13708 13709 /* 13710 * Put TCP length in extended header, and then checksum extended 13711 * header and data. 13712 */ 13713 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13714 #ifdef INET6 13715 if (isipv6) { 13716 /* 13717 * ip6_plen is not need to be filled now, and will be filled 13718 * in ip6_output. 13719 */ 13720 if (tp->t_port) { 13721 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13722 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13723 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13724 th->th_sum = htons(0); 13725 UDPSTAT_INC(udps_opackets); 13726 } else { 13727 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13728 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13729 th->th_sum = in6_cksum_pseudo(ip6, 13730 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 13731 0); 13732 } 13733 } 13734 #endif 13735 #if defined(INET6) && defined(INET) 13736 else 13737 #endif 13738 #ifdef INET 13739 { 13740 if (tp->t_port) { 13741 m->m_pkthdr.csum_flags = CSUM_UDP; 13742 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13743 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13744 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13745 th->th_sum = htons(0); 13746 UDPSTAT_INC(udps_opackets); 13747 } else { 13748 m->m_pkthdr.csum_flags = CSUM_TCP; 13749 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13750 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13751 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13752 IPPROTO_TCP + len + optlen)); 13753 } 13754 /* IP version must be set here for ipv4/ipv6 checking later */ 13755 KASSERT(ip->ip_v == IPVERSION, 13756 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13757 } 13758 #endif 13759 /* 13760 * Enable TSO and specify the size of the segments. The TCP pseudo 13761 * header checksum is always provided. XXX: Fixme: This is currently 13762 * not the case for IPv6. 13763 */ 13764 if (tso || force_tso) { 13765 KASSERT(force_tso || len > tp->t_maxseg - optlen, 13766 ("%s: len <= tso_segsz", __func__)); 13767 m->m_pkthdr.csum_flags |= CSUM_TSO; 13768 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 13769 } 13770 KASSERT(len + hdrlen == m_length(m, NULL), 13771 ("%s: mbuf chain different than expected: %d + %u != %u", 13772 __func__, len, hdrlen, m_length(m, NULL))); 13773 13774 #ifdef TCP_HHOOK 13775 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 13776 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13777 #endif 13778 #ifdef TCPDEBUG 13779 /* 13780 * Trace. 
13781 */ 13782 if (so->so_options & SO_DEBUG) { 13783 u_short save = 0; 13784 13785 #ifdef INET6 13786 if (!isipv6) 13787 #endif 13788 { 13789 save = ipov->ih_len; 13790 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13791 * (th->th_off << 2) */ ); 13792 } 13793 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13794 #ifdef INET6 13795 if (!isipv6) 13796 #endif 13797 ipov->ih_len = save; 13798 } 13799 #endif /* TCPDEBUG */ 13800 13801 /* We're getting ready to send; log now. */ 13802 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13803 union tcp_log_stackspecific log; 13804 struct timeval tv; 13805 13806 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13807 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13808 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13809 if (rack->rack_no_prr) 13810 log.u_bbr.flex1 = 0; 13811 else 13812 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13813 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 13814 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 13815 log.u_bbr.flex4 = orig_len; 13816 if (filled_all) 13817 log.u_bbr.flex5 = 0x80000000; 13818 else 13819 log.u_bbr.flex5 = 0; 13820 /* Save off the early/late values */ 13821 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 13822 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 13823 log.u_bbr.bw_inuse = rack_get_bw(rack); 13824 if (rsm || sack_rxmit) { 13825 if (doing_tlp) 13826 log.u_bbr.flex8 = 2; 13827 else 13828 log.u_bbr.flex8 = 1; 13829 } else { 13830 log.u_bbr.flex8 = 0; 13831 } 13832 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 13833 log.u_bbr.flex7 = mark; 13834 log.u_bbr.pkts_out = tp->t_maxseg; 13835 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13836 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13837 log.u_bbr.lt_epoch = cwnd_to_use; 13838 log.u_bbr.delivered = sendalot; 13839 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13840 len, &log, false, NULL, NULL, 0, &tv); 13841 } else 13842 lgb = NULL; 13843 13844 /* 13845 * Fill in IP length and desired time to live and send to IP level. 13846 * There should be a better way to handle ttl and tos; we could keep 13847 * them in the template, but need a way to checksum without them. 13848 */ 13849 /* 13850 * m->m_pkthdr.len should have been set before cksum calcuration, 13851 * because in6_cksum() need it. 13852 */ 13853 #ifdef INET6 13854 if (isipv6) { 13855 /* 13856 * we separately set hoplimit for every segment, since the 13857 * user might want to change the value via setsockopt. Also, 13858 * desired default hop limit might be changed via Neighbor 13859 * Discovery. 13860 */ 13861 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13862 13863 /* 13864 * Set the packet size here for the benefit of DTrace 13865 * probes. ip6_output() will set it properly; it's supposed 13866 * to include the option header lengths as well. 13867 */ 13868 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13869 13870 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 13871 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13872 else 13873 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13874 13875 if (tp->t_state == TCPS_SYN_SENT) 13876 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13877 13878 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13879 /* TODO: IPv6 IP6TOS_ECT bit on */ 13880 error = ip6_output(m, inp->in6p_outputopts, 13881 &inp->inp_route6, 13882 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13883 NULL, NULL, inp); 13884 13885 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13886 mtu = inp->inp_route6.ro_nh->nh_mtu; 13887 } 13888 #endif /* INET6 */ 13889 #if defined(INET) && defined(INET6) 13890 else 13891 #endif 13892 #ifdef INET 13893 { 13894 ip->ip_len = htons(m->m_pkthdr.len); 13895 #ifdef INET6 13896 if (inp->inp_vflag & INP_IPV6PROTO) 13897 ip->ip_ttl = in6_selecthlim(inp, NULL); 13898 #endif /* INET6 */ 13899 /* 13900 * If we do path MTU discovery, then we set DF on every 13901 * packet. This might not be the best thing to do according 13902 * to RFC3390 Section 2. However the tcp hostcache migitates 13903 * the problem so it affects only the first tcp connection 13904 * with a host. 13905 * 13906 * NB: Don't set DF on small MTU/MSS to have a safe 13907 * fallback. 13908 */ 13909 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13910 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13911 if (tp->t_port == 0 || len < V_tcp_minmss) { 13912 ip->ip_off |= htons(IP_DF); 13913 } 13914 } else { 13915 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13916 } 13917 13918 if (tp->t_state == TCPS_SYN_SENT) 13919 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13920 13921 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13922 13923 error = ip_output(m, inp->inp_options, &inp->inp_route, 13924 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13925 inp); 13926 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13927 mtu = inp->inp_route.ro_nh->nh_mtu; 13928 } 13929 #endif /* INET */ 13930 13931 out: 13932 if (lgb) { 13933 lgb->tlb_errno = error; 13934 lgb = NULL; 13935 } 13936 /* 13937 * In transmit state, time the transmission and arrange for the 13938 * retransmit. In persist state, just set snd_max. 13939 */ 13940 if (error == 0) { 13941 rack->forced_ack = 0; /* If we send something zap the FA flag */ 13942 if (rsm && (doing_tlp == 0)) { 13943 /* Set we retransmitted */ 13944 rack->rc_gp_saw_rec = 1; 13945 } else { 13946 if (cwnd_to_use > tp->snd_ssthresh) { 13947 /* Set we sent in CA */ 13948 rack->rc_gp_saw_ca = 1; 13949 } else { 13950 /* Set we sent in SS */ 13951 rack->rc_gp_saw_ss = 1; 13952 } 13953 } 13954 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13955 (tp->t_flags & TF_SACK_PERMIT) && 13956 tp->rcv_numsacks > 0) 13957 tcp_clean_dsack_blocks(tp); 13958 tot_len_this_send += len; 13959 if (len == 0) 13960 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 13961 else if (len == 1) { 13962 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 13963 } else if (len > 1) { 13964 int idx; 13965 13966 idx = (len / segsiz) + 3; 13967 if (idx >= TCP_MSS_ACCT_ATIMER) 13968 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 13969 else 13970 counter_u64_add(rack_out_size[idx], 1); 13971 } 13972 if (hw_tls && len > 0) { 13973 if (filled_all) { 13974 counter_u64_add(rack_tls_filled, 1); 13975 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1); 13976 } else { 13977 if (rsm) { 13978 counter_u64_add(rack_tls_rxt, 1); 13979 rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1); 13980 } else if (doing_tlp) { 13981 counter_u64_add(rack_tls_tlp, 1); 13982 rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1); 13983 } else if ( (ctf_outstanding(tp) + minseg) > sbavail(sb)) { 13984 counter_u64_add(rack_tls_app, 1); 13985 rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1); 13986 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + minseg) > cwnd_to_use) { 13987 counter_u64_add(rack_tls_cwnd, 1); 13988 rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1); 13989 } else 
if ((ctf_outstanding(tp) + minseg) > tp->snd_wnd) { 13990 counter_u64_add(rack_tls_rwnd, 1); 13991 rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1); 13992 } else { 13993 rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1); 13994 counter_u64_add(rack_tls_other, 1); 13995 } 13996 } 13997 } 13998 } 13999 if (rack->rack_no_prr == 0) { 14000 if (sub_from_prr && (error == 0)) { 14001 if (rack->r_ctl.rc_prr_sndcnt >= len) 14002 rack->r_ctl.rc_prr_sndcnt -= len; 14003 else 14004 rack->r_ctl.rc_prr_sndcnt = 0; 14005 } 14006 } 14007 sub_from_prr = 0; 14008 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 14009 pass, rsm, us_cts); 14010 if ((error == 0) && 14011 (len > 0) && 14012 (tp->snd_una == tp->snd_max)) 14013 rack->r_ctl.rc_tlp_rxt_last_time = cts; 14014 /* Now are we in persists? */ 14015 if (rack->rc_in_persist == 0) { 14016 tcp_seq startseq = tp->snd_nxt; 14017 14018 /* Track our lost count */ 14019 if (rsm && (doing_tlp == 0)) 14020 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 14021 /* 14022 * Advance snd_nxt over sequence space of this segment. 14023 */ 14024 if (error) 14025 /* We don't log or do anything with errors */ 14026 goto nomore; 14027 if (doing_tlp == 0) { 14028 if (rsm == NULL) { 14029 /* 14030 * Not a retransmission of some 14031 * sort, new data is going out so 14032 * clear our TLP count and flag. 14033 */ 14034 rack->rc_tlp_in_progress = 0; 14035 rack->r_ctl.rc_tlp_cnt_out = 0; 14036 } 14037 } else { 14038 /* 14039 * We have just sent a TLP, mark that it is true 14040 * and make sure our in progress is set so we 14041 * continue to check the count. 14042 */ 14043 rack->rc_tlp_in_progress = 1; 14044 rack->r_ctl.rc_tlp_cnt_out++; 14045 } 14046 if (flags & (TH_SYN | TH_FIN)) { 14047 if (flags & TH_SYN) 14048 tp->snd_nxt++; 14049 if (flags & TH_FIN) { 14050 tp->snd_nxt++; 14051 tp->t_flags |= TF_SENTFIN; 14052 } 14053 } 14054 /* In the ENOBUFS case we do *not* update snd_max */ 14055 if (sack_rxmit) 14056 goto nomore; 14057 14058 tp->snd_nxt += len; 14059 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 14060 if (tp->snd_una == tp->snd_max) { 14061 /* 14062 * Update the time we just added data since 14063 * none was outstanding. 14064 */ 14065 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 14066 tp->t_acktime = ticks; 14067 } 14068 tp->snd_max = tp->snd_nxt; 14069 /* 14070 * Time this transmission if not a retransmission and 14071 * not currently timing anything. 14072 * This is only relevant in case of switching back to 14073 * the base stack. 14074 */ 14075 if (tp->t_rtttime == 0) { 14076 tp->t_rtttime = ticks; 14077 tp->t_rtseq = startseq; 14078 KMOD_TCPSTAT_INC(tcps_segstimed); 14079 } 14080 if (len && 14081 ((tp->t_flags & TF_GPUTINPROG) == 0)) 14082 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 14083 } 14084 } else { 14085 /* 14086 * Persist case, update snd_max but since we are in persist 14087 * mode (no window) we do not update snd_nxt. 14088 */ 14089 int32_t xlen = len; 14090 14091 if (error) 14092 goto nomore; 14093 14094 if (flags & TH_SYN) 14095 ++xlen; 14096 if (flags & TH_FIN) { 14097 ++xlen; 14098 tp->t_flags |= TF_SENTFIN; 14099 } 14100 /* In the ENOBUFS case we do *not* update snd_max */ 14101 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 14102 if (tp->snd_una == tp->snd_max) { 14103 /* 14104 * Update the time we just added data since 14105 * none was outstanding. 
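 *
 * Illustrative persist-case arithmetic: a one byte window probe
 * that also carries FIN gives xlen = 2 for the SEQ_GT() test
 * below, so snd_max is pulled forward while snd_nxt is left
 * untouched (there is no usable window to advance it into).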
14106 */ 14107 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 14108 tp->t_acktime = ticks; 14109 } 14110 tp->snd_max = tp->snd_nxt + len; 14111 } 14112 } 14113 nomore: 14114 if (error) { 14115 rack->r_ctl.rc_agg_delayed = 0; 14116 rack->r_early = 0; 14117 rack->r_late = 0; 14118 rack->r_ctl.rc_agg_early = 0; 14119 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 14120 /* 14121 * Failures do not advance the seq counter above. For the 14122 * case of ENOBUFS we will fall out and retry in 1ms with 14123 * the hpts. Everything else will just have to retransmit 14124 * with the timer. 14125 * 14126 * In any case, we do not want to loop around for another 14127 * send without a good reason. 14128 */ 14129 sendalot = 0; 14130 switch (error) { 14131 case EPERM: 14132 tp->t_softerror = error; 14133 return (error); 14134 case ENOBUFS: 14135 if (slot == 0) { 14136 /* 14137 * Pace us right away to retry in a some 14138 * time 14139 */ 14140 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 14141 if (rack->rc_enobuf < 126) 14142 rack->rc_enobuf++; 14143 if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { 14144 slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; 14145 } 14146 if (slot < (10 * HPTS_USEC_IN_MSEC)) 14147 slot = 10 * HPTS_USEC_IN_MSEC; 14148 } 14149 counter_u64_add(rack_saw_enobuf, 1); 14150 error = 0; 14151 goto enobufs; 14152 case EMSGSIZE: 14153 /* 14154 * For some reason the interface we used initially 14155 * to send segments changed to another or lowered 14156 * its MTU. If TSO was active we either got an 14157 * interface without TSO capabilits or TSO was 14158 * turned off. If we obtained mtu from ip_output() 14159 * then update it and try again. 14160 */ 14161 if (tso) 14162 tp->t_flags &= ~TF_TSO; 14163 if (mtu != 0) { 14164 tcp_mss_update(tp, -1, mtu, NULL, NULL); 14165 goto again; 14166 } 14167 slot = 10 * HPTS_USEC_IN_MSEC; 14168 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 14169 return (error); 14170 case ENETUNREACH: 14171 counter_u64_add(rack_saw_enetunreach, 1); 14172 case EHOSTDOWN: 14173 case EHOSTUNREACH: 14174 case ENETDOWN: 14175 if (TCPS_HAVERCVDSYN(tp->t_state)) { 14176 tp->t_softerror = error; 14177 } 14178 /* FALLTHROUGH */ 14179 default: 14180 slot = 10 * HPTS_USEC_IN_MSEC; 14181 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 14182 return (error); 14183 } 14184 } else { 14185 rack->rc_enobuf = 0; 14186 } 14187 KMOD_TCPSTAT_INC(tcps_sndtotal); 14188 14189 /* 14190 * Data sent (as far as we can tell). If this advertises a larger 14191 * window than any other segment, then remember the size of the 14192 * advertised window. Any pending ACK has now been sent. 14193 */ 14194 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 14195 tp->rcv_adv = tp->rcv_nxt + recwin; 14196 tp->last_ack_sent = tp->rcv_nxt; 14197 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 14198 enobufs: 14199 /* Assure when we leave that snd_nxt will point to top */ 14200 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 14201 tp->snd_nxt = tp->snd_max; 14202 if (sendalot) { 14203 /* Do we need to turn off sendalot? */ 14204 if (rack->r_ctl.rc_pace_max_segs && 14205 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 14206 /* We hit our max. 
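 *
 * Illustrative: with rc_pace_max_segs at, say, 64KB, a large
 * socket-buffer backlog is emitted at most 64KB per pass through
 * this function, and the hpts pacing timer armed below clocks out
 * the remainder.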
*/ 14207 sendalot = 0; 14208 } else if ((rack->rc_user_set_max_segs) && 14209 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 14210 /* We hit the user defined max */ 14211 sendalot = 0; 14212 } 14213 } 14214 if ((error == 0) && (flags & TH_FIN)) 14215 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 14216 if (flags & TH_RST) { 14217 /* 14218 * We don't send again after sending a RST. 14219 */ 14220 slot = 0; 14221 sendalot = 0; 14222 if (error == 0) 14223 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 14224 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 14225 /* 14226 * Get our pacing rate, if an error 14227 * occured in sending (ENOBUF) we would 14228 * hit the else if with slot preset. Other 14229 * errors return. 14230 */ 14231 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 14232 } 14233 if (rsm && 14234 rack->use_rack_rr) { 14235 /* Its a retransmit and we use the rack cheat? */ 14236 if ((slot == 0) || 14237 (rack->rc_always_pace == 0) || 14238 (rack->r_rr_config == 1)) { 14239 /* 14240 * We have no pacing set or we 14241 * are using old-style rack or 14242 * we are overriden to use the old 1ms pacing. 14243 */ 14244 slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; 14245 } 14246 } 14247 if (slot) { 14248 /* set the rack tcb into the slot N */ 14249 counter_u64_add(rack_paced_segments, 1); 14250 } else if (sendalot) { 14251 if (len) 14252 counter_u64_add(rack_unpaced_segments, 1); 14253 sack_rxmit = 0; 14254 goto again; 14255 } else if (len) { 14256 counter_u64_add(rack_unpaced_segments, 1); 14257 } 14258 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 14259 return (error); 14260 } 14261 14262 static void 14263 rack_update_seg(struct tcp_rack *rack) 14264 { 14265 uint32_t orig_val; 14266 14267 orig_val = rack->r_ctl.rc_pace_max_segs; 14268 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 14269 if (orig_val != rack->r_ctl.rc_pace_max_segs) 14270 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 14271 } 14272 14273 /* 14274 * rack_ctloutput() must drop the inpcb lock before performing copyin on 14275 * socket option arguments. When it re-acquires the lock after the copy, it 14276 * has to revalidate that the connection is still valid for the socket 14277 * option. 
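 *
 * In sketch form (mirroring what rack_set_sockopt() does below):
 *
 *	INP_WUNLOCK(inp);
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error)
 *		return (error);
 *	INP_WLOCK(inp);
 *	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 *		INP_WUNLOCK(inp);
 *		return (ECONNRESET);
 *	}
 *	tp = intotcpcb(inp);
 *
 * i.e. the tcpcb and rack pointers must be re-derived once the
 * lock is held again, since the connection may have gone away
 * during the copyin.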
14278 */ 14279 static int 14280 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 14281 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14282 { 14283 struct epoch_tracker et; 14284 uint64_t val; 14285 int32_t error = 0, optval; 14286 uint16_t ca, ss; 14287 14288 14289 switch (sopt->sopt_name) { 14290 case TCP_RACK_PROP_RATE: /* URL:prop_rate */ 14291 case TCP_RACK_PROP : /* URL:prop */ 14292 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 14293 case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ 14294 case TCP_RACK_PACE_REDUCE: /* Not used */ 14295 /* Pacing related ones */ 14296 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 14297 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 14298 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 14299 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 14300 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 14301 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 14302 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 14303 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 14304 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 14305 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 14306 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 14307 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 14308 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 14309 /* End pacing related */ 14310 case TCP_DELACK: 14311 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 14312 case TCP_RACK_MIN_TO: /* URL:min_to */ 14313 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 14314 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 14315 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 14316 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 14317 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 14318 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 14319 case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ 14320 case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ 14321 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 14322 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 14323 case TCP_RACK_DO_DETECTION: /* URL:detect */ 14324 case TCP_NO_PRR: /* URL:noprr */ 14325 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 14326 case TCP_DATA_AFTER_CLOSE: 14327 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 14328 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 14329 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 14330 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 14331 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 14332 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 14333 case TCP_RACK_PROFILE: /* URL:profile */ 14334 break; 14335 default: 14336 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14337 break; 14338 } 14339 INP_WUNLOCK(inp); 14340 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14341 if (error) 14342 return (error); 14343 INP_WLOCK(inp); 14344 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 14345 INP_WUNLOCK(inp); 14346 return (ECONNRESET); 14347 } 14348 tp = intotcpcb(inp); 14349 rack = (struct tcp_rack *)tp->t_fb_ptr; 14350 switch (sopt->sopt_name) { 14351 case TCP_RACK_PROFILE: 14352 RACK_OPTS_INC(tcp_profile); 14353 if (optval == 1) { 14354 /* pace_always=1 */ 14355 rack->rc_always_pace = 1; 14356 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14357 /* scwnd=1 */ 14358 rack->rack_enable_scwnd = 1; 14359 /* dynamic=100 */ 14360 rack->rc_gp_dyn_mul = 1; 14361 rack->r_ctl.rack_per_of_gp_ca = 100; 14362 /* rrr_conf=3 */ 14363 rack->r_rr_config = 3; 14364 /* npush=2 */ 14365 rack->r_ctl.rc_no_push_at_mrtt = 2; 14366 /* fillcw=1 */ 14367 rack->rc_pace_to_cwnd = 1; 14368 rack->rc_pace_fill_if_rttin_range = 
0; 14369 rack->rtt_limit_mul = 0; 14370 /* noprr=1 */ 14371 rack->rack_no_prr = 1; 14372 /* lscwnd=1 */ 14373 rack->r_limit_scw = 1; 14374 } else if (optval == 2) { 14375 /* pace_always=1 */ 14376 rack->rc_always_pace = 1; 14377 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14378 /* scwnd=1 */ 14379 rack->rack_enable_scwnd = 1; 14380 /* dynamic=100 */ 14381 rack->rc_gp_dyn_mul = 1; 14382 rack->r_ctl.rack_per_of_gp_ca = 100; 14383 /* rrr_conf=3 */ 14384 rack->r_rr_config = 3; 14385 /* npush=2 */ 14386 rack->r_ctl.rc_no_push_at_mrtt = 2; 14387 /* fillcw=1 */ 14388 rack->rc_pace_to_cwnd = 1; 14389 rack->rc_pace_fill_if_rttin_range = 0; 14390 rack->rtt_limit_mul = 0; 14391 /* noprr=1 */ 14392 rack->rack_no_prr = 1; 14393 /* lscwnd=0 */ 14394 rack->r_limit_scw = 0; 14395 } 14396 break; 14397 case TCP_SHARED_CWND_TIME_LIMIT: 14398 RACK_OPTS_INC(tcp_lscwnd); 14399 if (optval) 14400 rack->r_limit_scw = 1; 14401 else 14402 rack->r_limit_scw = 0; 14403 break; 14404 case TCP_RACK_PACE_TO_FILL: 14405 RACK_OPTS_INC(tcp_fillcw); 14406 if (optval == 0) 14407 rack->rc_pace_to_cwnd = 0; 14408 else 14409 rack->rc_pace_to_cwnd = 1; 14410 if ((optval >= rack_gp_rtt_maxmul) && 14411 rack_gp_rtt_maxmul && 14412 (optval < 0xf)) { 14413 rack->rc_pace_fill_if_rttin_range = 1; 14414 rack->rtt_limit_mul = optval; 14415 } else { 14416 rack->rc_pace_fill_if_rttin_range = 0; 14417 rack->rtt_limit_mul = 0; 14418 } 14419 break; 14420 case TCP_RACK_NO_PUSH_AT_MAX: 14421 RACK_OPTS_INC(tcp_npush); 14422 if (optval == 0) 14423 rack->r_ctl.rc_no_push_at_mrtt = 0; 14424 else if (optval < 0xff) 14425 rack->r_ctl.rc_no_push_at_mrtt = optval; 14426 else 14427 error = EINVAL; 14428 break; 14429 case TCP_SHARED_CWND_ENABLE: 14430 RACK_OPTS_INC(tcp_rack_scwnd); 14431 if (optval == 0) 14432 rack->rack_enable_scwnd = 0; 14433 else 14434 rack->rack_enable_scwnd = 1; 14435 break; 14436 case TCP_RACK_MBUF_QUEUE: 14437 /* Now do we use the LRO mbuf-queue feature */ 14438 RACK_OPTS_INC(tcp_rack_mbufq); 14439 if (optval) 14440 rack->r_mbuf_queue = 1; 14441 else 14442 rack->r_mbuf_queue = 0; 14443 if (rack->r_mbuf_queue || rack->rc_always_pace) 14444 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14445 else 14446 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14447 break; 14448 case TCP_RACK_NONRXT_CFG_RATE: 14449 RACK_OPTS_INC(tcp_rack_cfg_rate); 14450 if (optval == 0) 14451 rack->rack_rec_nonrxt_use_cr = 0; 14452 else 14453 rack->rack_rec_nonrxt_use_cr = 1; 14454 break; 14455 case TCP_NO_PRR: 14456 RACK_OPTS_INC(tcp_rack_noprr); 14457 if (optval == 0) 14458 rack->rack_no_prr = 0; 14459 else 14460 rack->rack_no_prr = 1; 14461 break; 14462 case TCP_TIMELY_DYN_ADJ: 14463 RACK_OPTS_INC(tcp_timely_dyn); 14464 if (optval == 0) 14465 rack->rc_gp_dyn_mul = 0; 14466 else { 14467 rack->rc_gp_dyn_mul = 1; 14468 if (optval >= 100) { 14469 /* 14470 * If the user sets something 100 or more 14471 * its the gp_ca value. 
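 *
 * e.g. (illustrative) setting TCP_TIMELY_DYN_ADJ to 250 both
 * enables the dynamic goodput multiplier (rc_gp_dyn_mul = 1) and
 * sets the congestion-avoidance pacing gain to 250% of the
 * measured goodput, whereas a value of 1 just enables the dynamic
 * behaviour and keeps the default gain.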
14472 */ 14473 rack->r_ctl.rack_per_of_gp_ca = optval; 14474 } 14475 } 14476 break; 14477 case TCP_RACK_DO_DETECTION: 14478 RACK_OPTS_INC(tcp_rack_do_detection); 14479 if (optval == 0) 14480 rack->do_detection = 0; 14481 else 14482 rack->do_detection = 1; 14483 break; 14484 case TCP_RACK_PROP_RATE: 14485 if ((optval <= 0) || (optval >= 100)) { 14486 error = EINVAL; 14487 break; 14488 } 14489 RACK_OPTS_INC(tcp_rack_prop_rate); 14490 rack->r_ctl.rc_prop_rate = optval; 14491 break; 14492 case TCP_RACK_TLP_USE: 14493 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 14494 error = EINVAL; 14495 break; 14496 } 14497 RACK_OPTS_INC(tcp_tlp_use); 14498 rack->rack_tlp_threshold_use = optval; 14499 break; 14500 case TCP_RACK_PROP: 14501 /* RACK proportional rate reduction (bool) */ 14502 RACK_OPTS_INC(tcp_rack_prop); 14503 rack->r_ctl.rc_prop_reduce = optval; 14504 break; 14505 case TCP_RACK_TLP_REDUCE: 14506 /* RACK TLP cwnd reduction (bool) */ 14507 RACK_OPTS_INC(tcp_rack_tlp_reduce); 14508 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 14509 break; 14510 case TCP_RACK_EARLY_RECOV: 14511 /* Should recovery happen early (bool) */ 14512 RACK_OPTS_INC(tcp_rack_early_recov); 14513 rack->r_ctl.rc_early_recovery = optval; 14514 break; 14515 14516 /* Pacing related ones */ 14517 case TCP_RACK_PACE_ALWAYS: 14518 /* 14519 * zero is old rack method, 1 is new 14520 * method using a pacing rate. 14521 */ 14522 RACK_OPTS_INC(tcp_rack_pace_always); 14523 if (optval > 0) 14524 rack->rc_always_pace = 1; 14525 else 14526 rack->rc_always_pace = 0; 14527 if (rack->r_mbuf_queue || rack->rc_always_pace) 14528 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14529 else 14530 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14531 /* A rate may be set irate or other, if so set seg size */ 14532 rack_update_seg(rack); 14533 break; 14534 case TCP_BBR_RACK_INIT_RATE: 14535 RACK_OPTS_INC(tcp_initial_rate); 14536 val = optval; 14537 /* Change from kbits per second to bytes per second */ 14538 val *= 1000; 14539 val /= 8; 14540 rack->r_ctl.init_rate = val; 14541 if (rack->rc_init_win != rack_default_init_window) { 14542 uint32_t win, snt; 14543 14544 /* 14545 * Options don't always get applied 14546 * in the order you think. So in order 14547 * to assure we update a cwnd we need 14548 * to check and see if we are still 14549 * where we should raise the cwnd. 14550 */ 14551 win = rc_init_window(rack); 14552 if (SEQ_GT(tp->snd_max, tp->iss)) 14553 snt = tp->snd_max - tp->iss; 14554 else 14555 snt = 0; 14556 if ((snt < win) && 14557 (tp->snd_cwnd < win)) 14558 tp->snd_cwnd = win; 14559 } 14560 if (rack->rc_always_pace) 14561 rack_update_seg(rack); 14562 break; 14563 case TCP_BBR_IWINTSO: 14564 RACK_OPTS_INC(tcp_initial_win); 14565 if (optval && (optval <= 0xff)) { 14566 uint32_t win, snt; 14567 14568 rack->rc_init_win = optval; 14569 win = rc_init_window(rack); 14570 if (SEQ_GT(tp->snd_max, tp->iss)) 14571 snt = tp->snd_max - tp->iss; 14572 else 14573 snt = 0; 14574 if ((snt < win) && 14575 (tp->t_srtt | 14576 #ifdef NETFLIX_PEAKRATE 14577 tp->t_maxpeakrate | 14578 #endif 14579 rack->r_ctl.init_rate)) { 14580 /* 14581 * We are not past the initial window 14582 * and we have some bases for pacing, 14583 * so we need to possibly adjust up 14584 * the cwnd. Note even if we don't set 14585 * the cwnd, its still ok to raise the rc_init_win 14586 * which can be used coming out of idle when we 14587 * would have a rate. 
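 *
 * Worked example (illustrative, assuming rc_init_window() comes to
 * roughly rc_init_win full-sized segments): with 1448 byte
 * segments and TCP_BBR_IWINTSO = 20, win is about 28960 bytes; if
 * less than that has been sent and snd_cwnd is still below it,
 * snd_cwnd is raised to win so the configured rate has a window
 * to pace against.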
14588 */ 14589 if (tp->snd_cwnd < win) 14590 tp->snd_cwnd = win; 14591 } 14592 if (rack->rc_always_pace) 14593 rack_update_seg(rack); 14594 } else 14595 error = EINVAL; 14596 break; 14597 case TCP_RACK_FORCE_MSEG: 14598 RACK_OPTS_INC(tcp_rack_force_max_seg); 14599 if (optval) 14600 rack->rc_force_max_seg = 1; 14601 else 14602 rack->rc_force_max_seg = 0; 14603 break; 14604 case TCP_RACK_PACE_MAX_SEG: 14605 /* Max segments size in a pace in bytes */ 14606 RACK_OPTS_INC(tcp_rack_max_seg); 14607 rack->rc_user_set_max_segs = optval; 14608 rack_set_pace_segments(tp, rack, __LINE__); 14609 break; 14610 case TCP_RACK_PACE_RATE_REC: 14611 /* Set the fixed pacing rate in Bytes per second ca */ 14612 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 14613 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14614 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14615 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14616 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14617 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14618 rack->use_fixed_rate = 1; 14619 rack_log_pacing_delay_calc(rack, 14620 rack->r_ctl.rc_fixed_pacing_rate_ss, 14621 rack->r_ctl.rc_fixed_pacing_rate_ca, 14622 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14623 __LINE__, NULL); 14624 break; 14625 14626 case TCP_RACK_PACE_RATE_SS: 14627 /* Set the fixed pacing rate in Bytes per second ca */ 14628 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 14629 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14630 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14631 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14632 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14633 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14634 rack->use_fixed_rate = 1; 14635 rack_log_pacing_delay_calc(rack, 14636 rack->r_ctl.rc_fixed_pacing_rate_ss, 14637 rack->r_ctl.rc_fixed_pacing_rate_ca, 14638 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14639 __LINE__, NULL); 14640 break; 14641 14642 case TCP_RACK_PACE_RATE_CA: 14643 /* Set the fixed pacing rate in Bytes per second ca */ 14644 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 14645 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14646 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14647 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14648 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14649 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14650 rack->use_fixed_rate = 1; 14651 rack_log_pacing_delay_calc(rack, 14652 rack->r_ctl.rc_fixed_pacing_rate_ss, 14653 rack->r_ctl.rc_fixed_pacing_rate_ca, 14654 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14655 __LINE__, NULL); 14656 break; 14657 case TCP_RACK_GP_INCREASE_REC: 14658 RACK_OPTS_INC(tcp_gp_inc_rec); 14659 rack->r_ctl.rack_per_of_gp_rec = optval; 14660 rack_log_pacing_delay_calc(rack, 14661 rack->r_ctl.rack_per_of_gp_ss, 14662 rack->r_ctl.rack_per_of_gp_ca, 14663 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14664 __LINE__, NULL); 14665 break; 14666 case TCP_RACK_GP_INCREASE_CA: 14667 RACK_OPTS_INC(tcp_gp_inc_ca); 14668 ca = optval; 14669 if (ca < 100) { 14670 /* 14671 * We don't allow any reduction 14672 * over the GP b/w. 14673 */ 14674 error = EINVAL; 14675 break; 14676 } 14677 rack->r_ctl.rack_per_of_gp_ca = ca; 14678 rack_log_pacing_delay_calc(rack, 14679 rack->r_ctl.rack_per_of_gp_ss, 14680 rack->r_ctl.rack_per_of_gp_ca, 14681 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14682 __LINE__, NULL); 14683 break; 14684 case TCP_RACK_GP_INCREASE_SS: 14685 RACK_OPTS_INC(tcp_gp_inc_ss); 14686 ss = optval; 14687 if (ss < 100) { 14688 /* 14689 * We don't allow any reduction 14690 * over the GP b/w. 
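 *
 * i.e. TCP_RACK_GP_INCREASE_SS is a percentage of the estimated
 * goodput to pace at while in slow start: 100 paces at the
 * estimate, 250 at 2.5 times it, and anything under 100 (which
 * would slow the connection down) is rejected with EINVAL, just
 * like the CA case above.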
14691 */ 14692 error = EINVAL; 14693 break; 14694 } 14695 rack->r_ctl.rack_per_of_gp_ss = ss; 14696 rack_log_pacing_delay_calc(rack, 14697 rack->r_ctl.rack_per_of_gp_ss, 14698 rack->r_ctl.rack_per_of_gp_ca, 14699 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14700 __LINE__, NULL); 14701 break; 14702 case TCP_RACK_RR_CONF: 14703 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 14704 if (optval && optval <= 3) 14705 rack->r_rr_config = optval; 14706 else 14707 rack->r_rr_config = 0; 14708 break; 14709 case TCP_BBR_HDWR_PACE: 14710 RACK_OPTS_INC(tcp_hdwr_pacing); 14711 if (optval){ 14712 if (rack->rack_hdrw_pacing == 0) { 14713 rack->rack_hdw_pace_ena = 1; 14714 rack->rack_attempt_hdwr_pace = 0; 14715 } else 14716 error = EALREADY; 14717 } else { 14718 rack->rack_hdw_pace_ena = 0; 14719 #ifdef RATELIMIT 14720 if (rack->rack_hdrw_pacing) { 14721 rack->rack_hdrw_pacing = 0; 14722 in_pcbdetach_txrtlmt(rack->rc_inp); 14723 } 14724 #endif 14725 } 14726 break; 14727 /* End Pacing related ones */ 14728 case TCP_RACK_PRR_SENDALOT: 14729 /* Allow PRR to send more than one seg */ 14730 RACK_OPTS_INC(tcp_rack_prr_sendalot); 14731 rack->r_ctl.rc_prr_sendalot = optval; 14732 break; 14733 case TCP_RACK_MIN_TO: 14734 /* Minimum time between rack t-o's in ms */ 14735 RACK_OPTS_INC(tcp_rack_min_to); 14736 rack->r_ctl.rc_min_to = optval; 14737 break; 14738 case TCP_RACK_EARLY_SEG: 14739 /* If early recovery max segments */ 14740 RACK_OPTS_INC(tcp_rack_early_seg); 14741 rack->r_ctl.rc_early_recovery_segs = optval; 14742 break; 14743 case TCP_RACK_REORD_THRESH: 14744 /* RACK reorder threshold (shift amount) */ 14745 RACK_OPTS_INC(tcp_rack_reord_thresh); 14746 if ((optval > 0) && (optval < 31)) 14747 rack->r_ctl.rc_reorder_shift = optval; 14748 else 14749 error = EINVAL; 14750 break; 14751 case TCP_RACK_REORD_FADE: 14752 /* Does reordering fade after ms time */ 14753 RACK_OPTS_INC(tcp_rack_reord_fade); 14754 rack->r_ctl.rc_reorder_fade = optval; 14755 break; 14756 case TCP_RACK_TLP_THRESH: 14757 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14758 RACK_OPTS_INC(tcp_rack_tlp_thresh); 14759 if (optval) 14760 rack->r_ctl.rc_tlp_threshold = optval; 14761 else 14762 error = EINVAL; 14763 break; 14764 case TCP_BBR_USE_RACK_RR: 14765 RACK_OPTS_INC(tcp_rack_rr); 14766 if (optval) 14767 rack->use_rack_rr = 1; 14768 else 14769 rack->use_rack_rr = 0; 14770 break; 14771 case TCP_RACK_PKT_DELAY: 14772 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14773 RACK_OPTS_INC(tcp_rack_pkt_delay); 14774 rack->r_ctl.rc_pkt_delay = optval; 14775 break; 14776 case TCP_RACK_TLP_INC_VAR: 14777 /* Does TLP include rtt variance in t-o */ 14778 error = EINVAL; 14779 break; 14780 case TCP_RACK_IDLE_REDUCE_HIGH: 14781 error = EINVAL; 14782 break; 14783 case TCP_DELACK: 14784 if (optval == 0) 14785 tp->t_delayed_ack = 0; 14786 else 14787 tp->t_delayed_ack = 1; 14788 if (tp->t_flags & TF_DELACK) { 14789 tp->t_flags &= ~TF_DELACK; 14790 tp->t_flags |= TF_ACKNOW; 14791 NET_EPOCH_ENTER(et); 14792 rack_output(tp); 14793 NET_EPOCH_EXIT(et); 14794 } 14795 break; 14796 14797 case TCP_BBR_RACK_RTT_USE: 14798 if ((optval != USE_RTT_HIGH) && 14799 (optval != USE_RTT_LOW) && 14800 (optval != USE_RTT_AVG)) 14801 error = EINVAL; 14802 else 14803 rack->r_ctl.rc_rate_sample_method = optval; 14804 break; 14805 case TCP_DATA_AFTER_CLOSE: 14806 if (optval) 14807 rack->rc_allow_data_af_clo = 1; 14808 else 14809 rack->rc_allow_data_af_clo = 0; 14810 break; 14811 case TCP_RACK_PACE_REDUCE: 14812 /* sysctl only now */ 14813 error = EINVAL; 14814 break; 14815 default: 14816 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14817 break; 14818 } 14819 #ifdef NETFLIX_STATS 14820 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 14821 #endif 14822 INP_WUNLOCK(inp); 14823 return (error); 14824 } 14825 14826 static int 14827 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 14828 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14829 { 14830 int32_t error, optval; 14831 uint64_t val; 14832 /* 14833 * Because all our options are either boolean or an int, we can just 14834 * pull everything into optval and then unlock and copy. If we ever 14835 * add a option that is not a int, then this will have quite an 14836 * impact to this routine. 
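 *
 * In sketch form the whole routine is:
 *
 *	switch (sopt->sopt_name) {
 *	case <option>:
 *		optval = <matching rack or tp field>;
 *		break;
 *	...
 *	}
 *	INP_WUNLOCK(inp);
 *	if (error == 0)
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));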
14837 */ 14838 error = 0; 14839 switch (sopt->sopt_name) { 14840 case TCP_RACK_PROFILE: 14841 /* You cannot retrieve a profile, its write only */ 14842 error = EINVAL; 14843 break; 14844 case TCP_RACK_PACE_TO_FILL: 14845 optval = rack->rc_pace_to_cwnd; 14846 break; 14847 case TCP_RACK_NO_PUSH_AT_MAX: 14848 optval = rack->r_ctl.rc_no_push_at_mrtt; 14849 break; 14850 case TCP_SHARED_CWND_ENABLE: 14851 optval = rack->rack_enable_scwnd; 14852 break; 14853 case TCP_RACK_NONRXT_CFG_RATE: 14854 optval = rack->rack_rec_nonrxt_use_cr; 14855 break; 14856 case TCP_NO_PRR: 14857 optval = rack->rack_no_prr; 14858 break; 14859 case TCP_RACK_DO_DETECTION: 14860 optval = rack->do_detection; 14861 break; 14862 case TCP_RACK_MBUF_QUEUE: 14863 /* Now do we use the LRO mbuf-queue feature */ 14864 optval = rack->r_mbuf_queue; 14865 break; 14866 case TCP_TIMELY_DYN_ADJ: 14867 optval = rack->rc_gp_dyn_mul; 14868 break; 14869 case TCP_BBR_IWINTSO: 14870 optval = rack->rc_init_win; 14871 break; 14872 case TCP_RACK_PROP_RATE: 14873 optval = rack->r_ctl.rc_prop_rate; 14874 break; 14875 case TCP_RACK_PROP: 14876 /* RACK proportional rate reduction (bool) */ 14877 optval = rack->r_ctl.rc_prop_reduce; 14878 break; 14879 case TCP_RACK_TLP_REDUCE: 14880 /* RACK TLP cwnd reduction (bool) */ 14881 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 14882 break; 14883 case TCP_RACK_EARLY_RECOV: 14884 /* Should recovery happen early (bool) */ 14885 optval = rack->r_ctl.rc_early_recovery; 14886 break; 14887 case TCP_RACK_PACE_REDUCE: 14888 /* RACK Hptsi reduction factor (divisor) */ 14889 error = EINVAL; 14890 break; 14891 case TCP_BBR_RACK_INIT_RATE: 14892 val = rack->r_ctl.init_rate; 14893 /* convert to kbits per sec */ 14894 val *= 8; 14895 val /= 1000; 14896 optval = (uint32_t)val; 14897 break; 14898 case TCP_RACK_FORCE_MSEG: 14899 optval = rack->rc_force_max_seg; 14900 break; 14901 case TCP_RACK_PACE_MAX_SEG: 14902 /* Max segments in a pace */ 14903 optval = rack->rc_user_set_max_segs; 14904 break; 14905 case TCP_RACK_PACE_ALWAYS: 14906 /* Use the always pace method */ 14907 optval = rack->rc_always_pace; 14908 break; 14909 case TCP_RACK_PRR_SENDALOT: 14910 /* Allow PRR to send more than one seg */ 14911 optval = rack->r_ctl.rc_prr_sendalot; 14912 break; 14913 case TCP_RACK_MIN_TO: 14914 /* Minimum time between rack t-o's in ms */ 14915 optval = rack->r_ctl.rc_min_to; 14916 break; 14917 case TCP_RACK_EARLY_SEG: 14918 /* If early recovery max segments */ 14919 optval = rack->r_ctl.rc_early_recovery_segs; 14920 break; 14921 case TCP_RACK_REORD_THRESH: 14922 /* RACK reorder threshold (shift amount) */ 14923 optval = rack->r_ctl.rc_reorder_shift; 14924 break; 14925 case TCP_RACK_REORD_FADE: 14926 /* Does reordering fade after ms time */ 14927 optval = rack->r_ctl.rc_reorder_fade; 14928 break; 14929 case TCP_BBR_USE_RACK_RR: 14930 /* Do we use the rack cheat for rxt */ 14931 optval = rack->use_rack_rr; 14932 break; 14933 case TCP_RACK_RR_CONF: 14934 optval = rack->r_rr_config; 14935 break; 14936 case TCP_BBR_HDWR_PACE: 14937 optval = rack->rack_hdw_pace_ena; 14938 break; 14939 case TCP_RACK_TLP_THRESH: 14940 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14941 optval = rack->r_ctl.rc_tlp_threshold; 14942 break; 14943 case TCP_RACK_PKT_DELAY: 14944 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14945 optval = rack->r_ctl.rc_pkt_delay; 14946 break; 14947 case TCP_RACK_TLP_USE: 14948 optval = rack->rack_tlp_threshold_use; 14949 break; 14950 case TCP_RACK_TLP_INC_VAR: 14951 /* Does TLP include rtt variance in t-o */ 14952 error = EINVAL; 14953 break; 14954 case TCP_RACK_IDLE_REDUCE_HIGH: 14955 error = EINVAL; 14956 break; 14957 case TCP_RACK_PACE_RATE_CA: 14958 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 14959 break; 14960 case TCP_RACK_PACE_RATE_SS: 14961 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 14962 break; 14963 case TCP_RACK_PACE_RATE_REC: 14964 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 14965 break; 14966 case TCP_RACK_GP_INCREASE_SS: 14967 optval = rack->r_ctl.rack_per_of_gp_ca; 14968 break; 14969 case TCP_RACK_GP_INCREASE_CA: 14970 optval = rack->r_ctl.rack_per_of_gp_ss; 14971 break; 14972 case TCP_BBR_RACK_RTT_USE: 14973 optval = rack->r_ctl.rc_rate_sample_method; 14974 break; 14975 case TCP_DELACK: 14976 optval = tp->t_delayed_ack; 14977 break; 14978 case TCP_DATA_AFTER_CLOSE: 14979 optval = rack->rc_allow_data_af_clo; 14980 break; 14981 case TCP_SHARED_CWND_TIME_LIMIT: 14982 optval = rack->r_limit_scw; 14983 break; 14984 default: 14985 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14986 break; 14987 } 14988 INP_WUNLOCK(inp); 14989 if (error == 0) { 14990 error = sooptcopyout(sopt, &optval, sizeof optval); 14991 } 14992 return (error); 14993 } 14994 14995 static int 14996 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 14997 { 14998 int32_t error = EINVAL; 14999 struct tcp_rack *rack; 15000 15001 rack = (struct tcp_rack *)tp->t_fb_ptr; 15002 if (rack == NULL) { 15003 /* Huh? */ 15004 goto out; 15005 } 15006 if (sopt->sopt_dir == SOPT_SET) { 15007 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 15008 } else if (sopt->sopt_dir == SOPT_GET) { 15009 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 15010 } 15011 out: 15012 INP_WUNLOCK(inp); 15013 return (error); 15014 } 15015 15016 static int 15017 rack_pru_options(struct tcpcb *tp, int flags) 15018 { 15019 if (flags & PRUS_OOB) 15020 return (EOPNOTSUPP); 15021 return (0); 15022 } 15023 15024 static struct tcp_function_block __tcp_rack = { 15025 .tfb_tcp_block_name = __XSTRING(STACKNAME), 15026 .tfb_tcp_output = rack_output, 15027 .tfb_do_queued_segments = ctf_do_queued_segments, 15028 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 15029 .tfb_tcp_do_segment = rack_do_segment, 15030 .tfb_tcp_ctloutput = rack_ctloutput, 15031 .tfb_tcp_fb_init = rack_init, 15032 .tfb_tcp_fb_fini = rack_fini, 15033 .tfb_tcp_timer_stop_all = rack_stopall, 15034 .tfb_tcp_timer_activate = rack_timer_activate, 15035 .tfb_tcp_timer_active = rack_timer_active, 15036 .tfb_tcp_timer_stop = rack_timer_stop, 15037 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 15038 .tfb_tcp_handoff_ok = rack_handoff_ok, 15039 .tfb_pru_options = rack_pru_options, 15040 }; 15041 15042 static const char *rack_stack_names[] = { 15043 __XSTRING(STACKNAME), 15044 #ifdef STACKALIAS 15045 __XSTRING(STACKALIAS), 15046 #endif 15047 }; 15048 15049 static int 15050 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 15051 { 15052 memset(mem, 0, size); 15053 return (0); 15054 } 15055 15056 static void 15057 rack_dtor(void *mem, int32_t size, void *arg) 15058 { 15059 15060 } 15061 15062 static bool rack_mod_inited = false; 15063 15064 static int 15065 tcp_addrack(module_t mod, int32_t type, void *data) 15066 { 15067 int32_t err = 0; 15068 int num_stacks; 15069 15070 switch (type) { 15071 case 
MOD_LOAD: 15072 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 15073 sizeof(struct rack_sendmap), 15074 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 15075 15076 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 15077 sizeof(struct tcp_rack), 15078 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 15079 15080 sysctl_ctx_init(&rack_sysctl_ctx); 15081 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 15082 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 15083 OID_AUTO, 15084 #ifdef STACKALIAS 15085 __XSTRING(STACKALIAS), 15086 #else 15087 __XSTRING(STACKNAME), 15088 #endif 15089 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 15090 ""); 15091 if (rack_sysctl_root == NULL) { 15092 printf("Failed to add sysctl node\n"); 15093 err = EFAULT; 15094 goto free_uma; 15095 } 15096 rack_init_sysctls(); 15097 num_stacks = nitems(rack_stack_names); 15098 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 15099 rack_stack_names, &num_stacks); 15100 if (err) { 15101 printf("Failed to register %s stack name for " 15102 "%s module\n", rack_stack_names[num_stacks], 15103 __XSTRING(MODNAME)); 15104 sysctl_ctx_free(&rack_sysctl_ctx); 15105 free_uma: 15106 uma_zdestroy(rack_zone); 15107 uma_zdestroy(rack_pcb_zone); 15108 rack_counter_destroy(); 15109 printf("Failed to register rack module -- err:%d\n", err); 15110 return (err); 15111 } 15112 tcp_lro_reg_mbufq(); 15113 rack_mod_inited = true; 15114 break; 15115 case MOD_QUIESCE: 15116 err = deregister_tcp_functions(&__tcp_rack, true, false); 15117 break; 15118 case MOD_UNLOAD: 15119 err = deregister_tcp_functions(&__tcp_rack, false, true); 15120 if (err == EBUSY) 15121 break; 15122 if (rack_mod_inited) { 15123 uma_zdestroy(rack_zone); 15124 uma_zdestroy(rack_pcb_zone); 15125 sysctl_ctx_free(&rack_sysctl_ctx); 15126 rack_counter_destroy(); 15127 rack_mod_inited = false; 15128 } 15129 tcp_lro_dereg_mbufq(); 15130 err = 0; 15131 break; 15132 default: 15133 return (EOPNOTSUPP); 15134 } 15135 return (err); 15136 } 15137 15138 static moduledata_t tcp_rack = { 15139 .name = __XSTRING(MODNAME), 15140 .evhand = tcp_addrack, 15141 .priv = 0 15142 }; 15143 15144 MODULE_VERSION(MODNAME, 1); 15145 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 15146 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 15147
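
/*
 * Usage sketch (illustrative, userland side): once this module is
 * loaded, an application can move an individual socket onto the
 * stack registered above and then tune it through the URL-tagged
 * socket options handled in rack_set_sockopt().  Assuming the
 * default STACKNAME of "rack":
 *
 *	struct tcp_function_set tfs;
 *	int one = 1;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 *	setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &one, sizeof(one));
 *
 * The name must match what register_tcp_functions_as_names() was
 * given via rack_stack_names[] above.
 */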