/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 0;
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
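/*
 * Note: the values above and below are only boot-time defaults; most of
 * them are exported as read/write sysctls in rack_init_sysctls() further
 * down in this file and can be tuned at runtime.
 */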
static int32_t rack_persist_min = 250;	/* 250ms */
static int32_t rack_persist_max = 2000;	/* 2 Second */
static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_hw_pace_adjust = 0;
/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds (30 * (2^12 - 1)
 * = 122,850 ms) before a connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000;	/* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;	/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;	/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in us */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 200000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250;	/* Must move at least 250 useconds to count as a lowering */
static int32_t rack_pace_one_seg = 0;	/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;	/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;	/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;	/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;	/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;	/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;	/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

int32_t rack_clear_counter = 0;

/*
 * Sysctl handler for the "clear" node registered in rack_init_sysctls();
 * writing a 1 zeroes all of the RACK counters below.
 */
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
    uint32_t stat;
    int32_t error;

    error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
    if (error || req->newptr == NULL)
        return error;

    error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
    if (error)
        return (error);
    if (stat == 1) {
#ifdef INVARIANTS
        printf("Clearing RACK counters\n");
#endif
        counter_u64_zero(rack_badfr);
        counter_u64_zero(rack_badfr_bytes);
        counter_u64_zero(rack_rtm_prr_retran);
        counter_u64_zero(rack_rtm_prr_newdata);
        counter_u64_zero(rack_timestamp_mismatch);
        counter_u64_zero(rack_reorder_seen);
        counter_u64_zero(rack_tlp_tot);
        counter_u64_zero(rack_tlp_newdata);
        counter_u64_zero(rack_tlp_retran);
        counter_u64_zero(rack_tlp_retran_bytes);
        counter_u64_zero(rack_tlp_retran_fail);
        counter_u64_zero(rack_to_tot);
        counter_u64_zero(rack_to_arm_rack);
        counter_u64_zero(rack_to_arm_tlp);
        counter_u64_zero(rack_paced_segments);
        counter_u64_zero(rack_calc_zero);
        counter_u64_zero(rack_calc_nonzero);
        counter_u64_zero(rack_unpaced_segments);
        counter_u64_zero(rack_saw_enobuf);
        counter_u64_zero(rack_saw_enetunreach);
        counter_u64_zero(rack_per_timer_hole);
        counter_u64_zero(rack_to_alloc_hard);
        counter_u64_zero(rack_to_alloc_emerg);
        counter_u64_zero(rack_sack_proc_all);
        counter_u64_zero(rack_sack_proc_short);
        counter_u64_zero(rack_sack_proc_restart);
        counter_u64_zero(rack_to_alloc);
        counter_u64_zero(rack_to_alloc_limited);
        counter_u64_zero(rack_alloc_limited_conns);
        counter_u64_zero(rack_split_limited);
        counter_u64_zero(rack_find_high);
        counter_u64_zero(rack_sack_attacks_detected);
        counter_u64_zero(rack_sack_attacks_reversed);
        counter_u64_zero(rack_sack_used_next_merge);
        counter_u64_zero(rack_sack_used_prev_merge);
        counter_u64_zero(rack_sack_splits);
        counter_u64_zero(rack_sack_skipped_acked);
        counter_u64_zero(rack_ack_total);
        counter_u64_zero(rack_express_sack);
        counter_u64_zero(rack_sack_total);
        counter_u64_zero(rack_move_none);
        counter_u64_zero(rack_move_some);
        counter_u64_zero(rack_used_tlpmethod);
        counter_u64_zero(rack_used_tlpmethod2);
        counter_u64_zero(rack_enter_tlp_calc);
        counter_u64_zero(rack_progress_drops);
        counter_u64_zero(rack_tlp_does_nada);
        counter_u64_zero(rack_try_scwnd);
        counter_u64_zero(rack_collapsed_win);
    }
    rack_clear_counter = 0;
    return (0);
}

static void
rack_init_sysctls(void)
{
    struct sysctl_oid *rack_counters;
    struct sysctl_oid *rack_attack;
    struct sysctl_oid *rack_pacing;
    struct sysctl_oid *rack_timely;
    struct sysctl_oid *rack_timers;
    struct sysctl_oid *rack_tlp;
    struct sysctl_oid *rack_misc;
    struct sysctl_oid *rack_measure;
    struct sysctl_oid *rack_probertt;

    rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "sack_attack",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Sack Attack Counters and Controls");
    rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "stats",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Counters");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rate_sample_method", CTLFLAG_RW,
        &rack_rate_sample_method, USE_RTT_LOW,
        "What method should we use for rate sampling 0=high, 1=low ");
    /* Probe rtt related controls */
    rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "probertt",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "ProbeRTT related Controls");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
        &rack_atexit_prtt_hbp, 130,
        "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
        &rack_atexit_prtt, 130,
        "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_mul", CTLFLAG_RW,
"gp_per_mul", CTLFLAG_RW, 624 &rack_per_of_gp_probertt, 60, 625 "What percentage of goodput do we pace at in probertt"); 626 SYSCTL_ADD_U16(&rack_sysctl_ctx, 627 SYSCTL_CHILDREN(rack_probertt), 628 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 629 &rack_per_of_gp_probertt_reduce, 10, 630 "What percentage of goodput do we reduce every gp_srtt"); 631 SYSCTL_ADD_U16(&rack_sysctl_ctx, 632 SYSCTL_CHILDREN(rack_probertt), 633 OID_AUTO, "gp_per_low", CTLFLAG_RW, 634 &rack_per_of_gp_lowthresh, 40, 635 "What percentage of goodput do we allow the multiplier to fall to"); 636 SYSCTL_ADD_U32(&rack_sysctl_ctx, 637 SYSCTL_CHILDREN(rack_probertt), 638 OID_AUTO, "time_between", CTLFLAG_RW, 639 & rack_time_between_probertt, 96000000, 640 "How many useconds between the lowest rtt falling must past before we enter probertt"); 641 SYSCTL_ADD_U32(&rack_sysctl_ctx, 642 SYSCTL_CHILDREN(rack_probertt), 643 OID_AUTO, "safety", CTLFLAG_RW, 644 &rack_probe_rtt_safety_val, 2000000, 645 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 646 SYSCTL_ADD_U32(&rack_sysctl_ctx, 647 SYSCTL_CHILDREN(rack_probertt), 648 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 649 &rack_probe_rtt_sets_cwnd, 0, 650 "Do we set the cwnd too (if always_lower is on)"); 651 SYSCTL_ADD_U32(&rack_sysctl_ctx, 652 SYSCTL_CHILDREN(rack_probertt), 653 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 654 &rack_max_drain_wait, 2, 655 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 656 SYSCTL_ADD_U32(&rack_sysctl_ctx, 657 SYSCTL_CHILDREN(rack_probertt), 658 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 659 &rack_must_drain, 1, 660 "We must drain this many gp_srtt's waiting for flight to reach goal"); 661 SYSCTL_ADD_U32(&rack_sysctl_ctx, 662 SYSCTL_CHILDREN(rack_probertt), 663 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 664 &rack_probertt_use_min_rtt_entry, 1, 665 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 666 SYSCTL_ADD_U32(&rack_sysctl_ctx, 667 SYSCTL_CHILDREN(rack_probertt), 668 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 669 &rack_probertt_use_min_rtt_exit, 0, 670 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 671 SYSCTL_ADD_U32(&rack_sysctl_ctx, 672 SYSCTL_CHILDREN(rack_probertt), 673 OID_AUTO, "length_div", CTLFLAG_RW, 674 &rack_probertt_gpsrtt_cnt_div, 0, 675 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 676 SYSCTL_ADD_U32(&rack_sysctl_ctx, 677 SYSCTL_CHILDREN(rack_probertt), 678 OID_AUTO, "length_mul", CTLFLAG_RW, 679 &rack_probertt_gpsrtt_cnt_mul, 0, 680 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 681 SYSCTL_ADD_U32(&rack_sysctl_ctx, 682 SYSCTL_CHILDREN(rack_probertt), 683 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 684 &rack_min_probertt_hold, 200000, 685 "What is the minimum time we hold probertt at target"); 686 SYSCTL_ADD_U32(&rack_sysctl_ctx, 687 SYSCTL_CHILDREN(rack_probertt), 688 OID_AUTO, "filter_life", CTLFLAG_RW, 689 &rack_probertt_filter_life, 10000000, 690 "What is the time for the filters life in useconds"); 691 SYSCTL_ADD_U32(&rack_sysctl_ctx, 692 SYSCTL_CHILDREN(rack_probertt), 693 OID_AUTO, "lower_within", CTLFLAG_RW, 694 &rack_probertt_lower_within, 10, 695 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 696 SYSCTL_ADD_U32(&rack_sysctl_ctx, 697 SYSCTL_CHILDREN(rack_probertt), 698 OID_AUTO, "must_move", CTLFLAG_RW, 699 &rack_min_rtt_movement, 250, 700 "How much is the minimum 
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
        &rack_probertt_clear_is, 1,
        "Do we clear I/S counts on exiting probe-rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
        &rack_max_drain_hbp, 1,
        "How many extra drain gpsrtt's do we get in highly buffered paths");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "hbp_threshold", CTLFLAG_RW,
        &rack_hbp_thresh, 3,
        "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
    /* Pacing related sysctls */
    rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "pacing",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Pacing related Controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "max_pace_over", CTLFLAG_RW,
        &rack_max_per_above, 30,
        "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "pace_to_one", CTLFLAG_RW,
        &rack_pace_one_seg, 0,
        "Do we allow low b/w pacing of 1MSS instead of two");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
        &rack_limit_time_with_srtt, 0,
        "Do we limit pacing time based on srtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "init_win", CTLFLAG_RW,
        &rack_default_init_window, 0,
        "Do we have a rack initial window 0 = system default");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW,
        &rack_hw_pace_adjust, 0,
        "What percentage do we raise the MSS by (11 = 1.1%)");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_ss", CTLFLAG_RW,
        &rack_per_of_gp_ss, 250,
        "If non zero, what percentage of goodput to pace at in slow start");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_ca", CTLFLAG_RW,
        &rack_per_of_gp_ca, 150,
        "If non zero, what percentage of goodput to pace at in congestion avoidance");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_rec", CTLFLAG_RW,
        &rack_per_of_gp_rec, 200,
        "If non zero, what percentage of goodput to pace at in recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "pace_max_seg", CTLFLAG_RW,
        &rack_hptsi_segments, 40,
        "What size is the max for TSO segments in pacing and burst mitigation");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "burst_reduces", CTLFLAG_RW,
        &rack_slot_reduction, 4,
        "When doing only burst mitigation what is the reduce divisor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "use_pacing", CTLFLAG_RW,
        &rack_pace_every_seg, 0,
        "If set we use pacing, if clear we use only the original burst mitigation");

    rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "timely",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Timely RTT Controls");
    /* Timely based GP dynamics */
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upper", CTLFLAG_RW,
        &rack_gp_per_bw_mul_up, 2,
        "Rack timely upper range for equal b/w (in percentage)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "lower", CTLFLAG_RW,
        &rack_gp_per_bw_mul_down, 4,
        "Rack timely lower range for equal b/w (in percentage)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
        &rack_gp_rtt_maxmul, 3,
        "Rack timely multiplier of lowest rtt for rtt_max");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_min_div", CTLFLAG_RW,
        &rack_gp_rtt_mindiv, 4,
        "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
        &rack_gp_rtt_minmul, 1,
        "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "decrease", CTLFLAG_RW,
        &rack_gp_decrease_per, 20,
        "Rack timely decrease percentage of our GP multiplication factor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "increase", CTLFLAG_RW,
        &rack_gp_increase_per, 2,
        "Rack timely increase percentage of our GP multiplication factor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "lowerbound", CTLFLAG_RW,
        &rack_per_lower_bound, 50,
        "Rack timely lowest percentage we allow GP multiplier to fall to");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upperboundss", CTLFLAG_RW,
        &rack_per_upper_bound_ss, 0,
        "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upperboundca", CTLFLAG_RW,
        &rack_per_upper_bound_ca, 0,
        "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "dynamicgp", CTLFLAG_RW,
        &rack_do_dyn_mul, 0,
        "Rack timely do we enable dynamic timely goodput by default");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "no_rec_red", CTLFLAG_RW,
        &rack_gp_no_rec_chg, 1,
        "Rack timely do we prohibit the recovery multiplier from being lowered");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
        &rack_timely_dec_clear, 6,
        "Rack timely what threshold do we count to before another boost during b/w descent");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "max_push_rise", CTLFLAG_RW,
        &rack_timely_max_push_rise, 3,
        "Rack timely how many times do we push up with b/w increase");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "max_push_drop", CTLFLAG_RW,
        &rack_timely_max_push_drop, 3,
        "Rack timely how many times do we push back on b/w descent");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "min_segs", CTLFLAG_RW,
        &rack_timely_min_segs, 4,
        "Rack timely when setting the cwnd what is the min num segments");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "noback_max", CTLFLAG_RW,
        &rack_use_max_for_nobackoff, 0,
        "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "interim_timely_only", CTLFLAG_RW,
        &rack_timely_int_timely_only, 0,
        "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "nonstop", CTLFLAG_RW,
        &rack_timely_no_stopping, 0,
        "Rack timely don't stop increase");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
        &rack_down_raise_thresh, 100,
        "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
        &rack_req_segs, 1,
        "Bottom dragging if not these many segments outstanding and room");

    /* TLP and Rack related parameters */
    rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "tlp",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "TLP and Rack related Controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "use_rrr", CTLFLAG_RW,
        &use_rack_rr, 1,
        "Do we use Rack Rapid Recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
        &rack_non_rxt_use_cr, 0,
        "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlpmethod", CTLFLAG_RW,
        &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
        "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "limit", CTLFLAG_RW,
        &rack_tlp_limit, 2,
        "How many TLP's can be sent without sending new data");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "use_greater", CTLFLAG_RW,
        &rack_tlp_use_greater, 1,
        "Should we use the rack_rtt time if its greater than srtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlpminto", CTLFLAG_RW,
        &rack_tlp_min, 10,
        "TLP minimum timeout per the specification (10ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "send_oldest", CTLFLAG_RW,
        &rack_always_send_oldest, 0,
        "Should we always send the oldest TLP and RACK-TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "rack_tlimit", CTLFLAG_RW,
        &rack_limited_retran, 0,
        "How many times can a rack timeout drive out sends");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlp_retry", CTLFLAG_RW,
        &rack_tlp_max_resend, 2,
        "How many times does TLP retry a single segment or multiple with no ACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
        &rack_lower_cwnd_at_tlp, 0,
        "When a TLP completes a retran should we enter recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "reorder_thresh", CTLFLAG_RW,
        &rack_reorder_thresh, 2,
        "What factor for rack will be added when seeing reordering (shift right)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
        &rack_tlp_thresh, 1,
        "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "reorder_fade", CTLFLAG_RW,
        &rack_reorder_fade, 0,
        "Does reorder detection fade, if so how many ms (0 means never)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "pktdelay", CTLFLAG_RW,
        &rack_pkt_delay, 1,
        "Extra RACK time (in ms) besides reordering thresh");

    /* Timer related controls */
    rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "timers",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Timer related controls");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "persmin", CTLFLAG_RW,
        &rack_persist_min, 250,
        "What is the minimum time in milliseconds between persists");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "persmax", CTLFLAG_RW,
        &rack_persist_max, 2000,
        "What is the largest delay in milliseconds between persists");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "delayed_ack", CTLFLAG_RW,
        &rack_delayed_ack_time, 200,
        "Delayed ack time (200ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "minrto", CTLFLAG_RW,
        &rack_rto_min, 0,
        "Minimum RTO in ms -- set with caution below 1000 due to TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "maxrto", CTLFLAG_RW,
        &rack_rto_max, 0,
        "Maximum RTO in ms -- should be at least as large as min_rto");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "minto", CTLFLAG_RW,
        &rack_min_to, 1,
        "Minimum rack timeout in milliseconds");
    /* Measure controls */
    rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "measure",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Measure related controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "wma_divisor", CTLFLAG_RW,
        &rack_wma_divisor, 8,
        "When doing b/w calculation what is the divisor for the WMA");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "end_cwnd", CTLFLAG_RW,
        &rack_cwnd_block_ends_measure, 0,
        "Does a cwnd just-return end the measurement window (app limited)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "end_rwnd", CTLFLAG_RW,
        &rack_rwnd_block_ends_measure, 0,
        "Does an rwnd just-return end the measurement window (app limited -- not persists)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_target", CTLFLAG_RW,
        &rack_def_data_window, 20,
        "What is the minimum target window (in mss) for a GP measurement");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "goal_bdp", CTLFLAG_RW,
        &rack_goal_bdp, 2,
        "What is the goal BDP to measure");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_srtts", CTLFLAG_RW,
        &rack_min_srtts, 1,
        "What is the minimum number of srtt's a GP measurement must span");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_measure_tim", CTLFLAG_RW,
        &rack_min_measure_usec, 0,
        "What is the minimum time for a measurement, if 0 this is off");
    /* Misc rack controls */
    rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "misc",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Misc related controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "shared_cwnd", CTLFLAG_RW,
        &rack_enable_shared_cwnd, 0,
        "Should RACK try to use the shared cwnd on connections where allowed");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
        &rack_limits_scwnd, 1,
        "Should RACK place low end time limits on the shared cwnd feature");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
        &rack_enable_mqueue_for_nonpaced, 0,
        "Should RACK use mbuf queuing for non-paced connections");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "iMac_dack", CTLFLAG_RW,
        &rack_use_imac_dack, 0,
        "Should RACK try to emulate iMac delayed ack");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "no_prr", CTLFLAG_RW,
        &rack_disable_prr, 0,
        "Should RACK not use prr and only pace (must have pacing on)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "bb_verbose", CTLFLAG_RW,
        &rack_verbose_logging, 0,
        "Should RACK black box logging be verbose");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "data_after_close", CTLFLAG_RW,
        &rack_ignore_data_after_close, 1,
        "Do we hold off sending a RST until all pending data is ack'd");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "no_sack_needed", CTLFLAG_RW,
        &rack_sack_not_required, 0,
        "Do we allow rack to run on connections not supporting SACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
        &rack_use_proportional_reduce, 0,
        "Should we proportionally reduce cwnd based on the number of losses");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "recovery_prop", CTLFLAG_RW,
        &rack_proportional_rate, 10,
        "What percent reduction per loss");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "prr_sendalot", CTLFLAG_RW,
        &rack_send_a_lot_in_prr, 1,
        "Send a lot in prr");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "earlyrecovery", CTLFLAG_RW,
        &rack_early_recovery, 1,
        "Do we do early recovery with rack");
    /* Sack Attacker detection stuff */
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
        &rack_highest_sack_thresh_seen, 0,
        "Highest sack to ack ratio seen");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
        &rack_highest_move_thresh_seen, 0,
        "Highest move to non-move ratio seen");
    rack_ack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "acktotal", CTLFLAG_RD,
        &rack_ack_total,
        "Total number of Ack's");
    rack_express_sack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
        &rack_express_sack,
        "Total express number of Sack's");
    rack_sack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "sacktotal", CTLFLAG_RD,
        &rack_sack_total,
        "Total number of SACKs");
    rack_move_none = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_none", CTLFLAG_RD,
        &rack_move_none,
        "Total number of SACK index reuse of positions under threshold");
    rack_move_some = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_some", CTLFLAG_RD,
        &rack_move_some,
        "Total number of SACK index reuse of positions over threshold");
    rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "attacks", CTLFLAG_RD,
        &rack_sack_attacks_detected,
        "Total number of SACK attackers that had sack disabled");
    rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "reversed", CTLFLAG_RD,
        &rack_sack_attacks_reversed,
        "Total number of SACK attackers that were later determined false positive");
    rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "nextmerge", CTLFLAG_RD,
        &rack_sack_used_next_merge,
        "Total number of times we used the next merge");
    rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "prevmerge", CTLFLAG_RD,
        &rack_sack_used_prev_merge,
        "Total number of times we used the prev merge");
    /* Counters */
    rack_badfr = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr", CTLFLAG_RD,
        &rack_badfr, "Total number of bad FRs");
    rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr_bytes", CTLFLAG_RD,
        &rack_badfr_bytes, "Total bytes of bad FRs");
    rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndret", CTLFLAG_RD,
        &rack_rtm_prr_retran,
        "Total number of prr based retransmits");
    rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndnew", CTLFLAG_RD,
        &rack_rtm_prr_newdata,
        "Total number of prr based new transmits");
    rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tsnf", CTLFLAG_RD,
        &rack_timestamp_mismatch,
        "Total number of timestamps that we could not find the reported ts");
    rack_find_high = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "findhigh", CTLFLAG_RD,
        &rack_find_high,
        "Total number of FIN causing find-high");
    rack_reorder_seen = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "reordering", CTLFLAG_RD,
        &rack_reorder_seen,
        "Total number of times we added delay due to reordering");
    rack_tlp_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_to_total", CTLFLAG_RD,
        &rack_tlp_tot,
        "Total number of tail loss probe expirations");
    rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_new", CTLFLAG_RD,
        &rack_tlp_newdata,
        "Total number of tail loss probe sending new data");
    rack_tlp_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran", CTLFLAG_RD,
        &rack_tlp_retran,
        "Total number of tail loss probe sending retransmitted data");
    rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
        &rack_tlp_retran_bytes,
        "Total bytes of tail loss probe sending retransmitted data");
    rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
        &rack_tlp_retran_fail,
        "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
    rack_to_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "rack_to_tot", CTLFLAG_RD,
        &rack_to_tot,
        "Total number of times the rack to expired");
    rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_rack", CTLFLAG_RD,
        &rack_to_arm_rack,
        "Total number of times the rack timer armed");
    rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_tlp", CTLFLAG_RD,
        &rack_to_arm_tlp,
        "Total number of times the tlp timer armed");
    rack_calc_zero = counter_u64_alloc(M_WAITOK);
    rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_zero", CTLFLAG_RD,
        &rack_calc_zero,
        "Total number of times pacing time worked out to zero");
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_nonzero", CTLFLAG_RD,
        &rack_calc_nonzero,
        "Total number of times pacing time worked out to non-zero");
    rack_paced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "paced", CTLFLAG_RD,
        &rack_paced_segments,
        "Total number of times a segment send caused hptsi");
    rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "unpaced", CTLFLAG_RD,
        &rack_unpaced_segments,
        "Total number of times a segment did not cause hptsi");
    rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enobufs", CTLFLAG_RD,
        &rack_saw_enobuf,
        "Total number of times a segment send returned ENOBUFS");
    rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
        &rack_saw_enetunreach,
        "Total number of times a segment send returned ENETUNREACH");
    rack_to_alloc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocs", CTLFLAG_RD,
        &rack_to_alloc,
        "Total allocations of tracking structures");
    rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allochard", CTLFLAG_RD,
        &rack_to_alloc_hard,
        "Total allocations done with sleeping the hard way");
    rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocemerg", CTLFLAG_RD,
        &rack_to_alloc_emerg,
        "Total allocations done from emergency cache");
    rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited", CTLFLAG_RD,
        &rack_to_alloc_limited,
        "Total allocations dropped due to limit");
    rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
        &rack_alloc_limited_conns,
        "Connections with allocations dropped due to limit");
    rack_split_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "split_limited", CTLFLAG_RD,
        &rack_split_limited,
        "Split allocations dropped due to limit");
    rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_long", CTLFLAG_RD,
        &rack_sack_proc_all,
        "Total times we had to walk whole list for sack processing");
    rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_restart", CTLFLAG_RD,
        &rack_sack_proc_restart,
        "Total times we had to walk whole list due to a restart");
    rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_short", CTLFLAG_RD,
        &rack_sack_proc_short,
        "Total times we took shortcut for sack processing");
    rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
        &rack_enter_tlp_calc,
        "Total times we called calc-tlp");
    rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
        &rack_used_tlpmethod,
        "Total number of times we hit TLP method 1");
    rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
        &rack_used_tlpmethod2,
        "Total number of times we hit TLP method 2");
times we hit TLP method 2"); 1378 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1379 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1380 SYSCTL_CHILDREN(rack_attack), 1381 OID_AUTO, "skipacked", CTLFLAG_RD, 1382 &rack_sack_skipped_acked, 1383 "Total number of times we skipped previously sacked"); 1384 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1385 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1386 SYSCTL_CHILDREN(rack_attack), 1387 OID_AUTO, "ofsplit", CTLFLAG_RD, 1388 &rack_sack_splits, 1389 "Total number of times we did the old fashion tree split"); 1390 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1391 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1392 SYSCTL_CHILDREN(rack_counters), 1393 OID_AUTO, "prog_drops", CTLFLAG_RD, 1394 &rack_progress_drops, 1395 "Total number of progress drops"); 1396 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1397 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1398 SYSCTL_CHILDREN(rack_counters), 1399 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1400 &rack_input_idle_reduces, 1401 "Total number of idle reductions on input"); 1402 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1403 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1404 SYSCTL_CHILDREN(rack_counters), 1405 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1406 &rack_collapsed_win, 1407 "Total number of collapsed windows"); 1408 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1409 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1410 SYSCTL_CHILDREN(rack_counters), 1411 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1412 &rack_tlp_does_nada, 1413 "Total number of nada tlp calls"); 1414 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1415 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1416 SYSCTL_CHILDREN(rack_counters), 1417 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1418 &rack_try_scwnd, 1419 "Total number of scwnd attempts"); 1420 1421 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1422 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1423 SYSCTL_CHILDREN(rack_counters), 1424 OID_AUTO, "timer_hole", CTLFLAG_RD, 1425 &rack_per_timer_hole, 1426 "Total persists start in timer hole"); 1427 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1428 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1429 OID_AUTO, "outsize", CTLFLAG_RD, 1430 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1431 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1432 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1433 OID_AUTO, "opts", CTLFLAG_RD, 1434 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1435 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1436 SYSCTL_CHILDREN(rack_sysctl_root), 1437 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1438 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1439 } 1440 1441 static __inline int 1442 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1443 { 1444 if (SEQ_GEQ(b->r_start, a->r_start) && 1445 SEQ_LT(b->r_start, a->r_end)) { 1446 /* 1447 * The entry b is within the 1448 * block a. i.e.: 1449 * a -- |-------------| 1450 * b -- |----| 1451 * <or> 1452 * b -- |------| 1453 * <or> 1454 * b -- |-----------| 1455 */ 1456 return (0); 1457 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1458 /* 1459 * b falls as either the next 1460 * sequence block after a so a 1461 * is said to be smaller than b. 1462 * i.e: 1463 * a -- |------| 1464 * b -- |--------| 1465 * or 1466 * b -- |-----| 1467 */ 1468 return (1); 1469 } 1470 /* 1471 * Whats left is where a is 1472 * larger than b. 
i.e: 1473 * a -- |-------| 1474 * b -- |---| 1475 * or even possibly 1476 * b -- |--------------| 1477 */ 1478 return (-1); 1479 } 1480 1481 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1482 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1483 1484 static uint32_t 1485 rc_init_window(struct tcp_rack *rack) 1486 { 1487 uint32_t win; 1488 1489 if (rack->rc_init_win == 0) { 1490 /* 1491 * Nothing set by the user, use the system stack 1492 * default. 1493 */ 1494 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1495 } 1496 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1497 return(win); 1498 } 1499 1500 static uint64_t 1501 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1502 { 1503 if (IN_RECOVERY(rack->rc_tp->t_flags)) 1504 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1505 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1506 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1507 else 1508 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1509 } 1510 1511 static uint64_t 1512 rack_get_bw(struct tcp_rack *rack) 1513 { 1514 if (rack->use_fixed_rate) { 1515 /* Return the fixed pacing rate */ 1516 return (rack_get_fixed_pacing_bw(rack)); 1517 } 1518 if (rack->r_ctl.gp_bw == 0) { 1519 /* 1520 * We have yet no b/w measurement, 1521 * if we have a user set initial bw 1522 * return it. If we don't have that and 1523 * we have an srtt, use the tcp IW (10) to 1524 * calculate a fictional b/w over the SRTT 1525 * which is more or less a guess. Note 1526 * we don't use our IW from rack on purpose 1527 * so if we have like IW=30, we are not 1528 * calculating a "huge" b/w. 1529 */ 1530 uint64_t bw, srtt; 1531 if (rack->r_ctl.init_rate) 1532 return (rack->r_ctl.init_rate); 1533 1534 /* Has the user set a max peak rate? */ 1535 #ifdef NETFLIX_PEAKRATE 1536 if (rack->rc_tp->t_maxpeakrate) 1537 return (rack->rc_tp->t_maxpeakrate); 1538 #endif 1539 /* Ok lets come up with the IW guess, if we have a srtt */ 1540 if (rack->rc_tp->t_srtt == 0) { 1541 /* 1542 * Go with old pacing method 1543 * i.e. burst mitigation only. 1544 */ 1545 return (0); 1546 } 1547 /* Ok lets get the initial TCP win (not racks) */ 1548 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1549 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 1550 bw *= (uint64_t)USECS_IN_SECOND; 1551 bw /= srtt; 1552 return (bw); 1553 } else { 1554 uint64_t bw; 1555 1556 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { 1557 /* Averaging is done, we can return the value */ 1558 bw = rack->r_ctl.gp_bw; 1559 } else { 1560 /* Still doing initial average must calculate */ 1561 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; 1562 } 1563 #ifdef NETFLIX_PEAKRATE 1564 if ((rack->rc_tp->t_maxpeakrate) && 1565 (bw > rack->rc_tp->t_maxpeakrate)) { 1566 /* The user has set a peak rate to pace at 1567 * don't allow us to pace faster than that. 
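* (Purely an illustrative example of the cap below: if the averaged b/w estimate works out to, say, 20 Mbps but t_maxpeakrate was set to 10 Mbps, it is the 10 Mbps peak rate that gets returned.)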
1568 */ 1569 return (rack->rc_tp->t_maxpeakrate); 1570 } 1571 #endif 1572 return (bw); 1573 } 1574 } 1575 1576 static uint16_t 1577 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1578 { 1579 if (rack->use_fixed_rate) { 1580 return (100); 1581 } else if (rack->in_probe_rtt && (rsm == NULL)) 1582 return(rack->r_ctl.rack_per_of_gp_probertt); 1583 else if ((IN_RECOVERY(rack->rc_tp->t_flags) && 1584 rack->r_ctl.rack_per_of_gp_rec)) { 1585 if (rsm) { 1586 /* a retransmission always use the recovery rate */ 1587 return(rack->r_ctl.rack_per_of_gp_rec); 1588 } else if (rack->rack_rec_nonrxt_use_cr) { 1589 /* Directed to use the configured rate */ 1590 goto configured_rate; 1591 } else if (rack->rack_no_prr && 1592 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1593 /* No PRR, lets just use the b/w estimate only */ 1594 return(100); 1595 } else { 1596 /* 1597 * Here we may have a non-retransmit but we 1598 * have no overrides, so just use the recovery 1599 * rate (prr is in effect). 1600 */ 1601 return(rack->r_ctl.rack_per_of_gp_rec); 1602 } 1603 } 1604 configured_rate: 1605 /* For the configured rate we look at our cwnd vs the ssthresh */ 1606 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1607 return (rack->r_ctl.rack_per_of_gp_ss); 1608 else 1609 return(rack->r_ctl.rack_per_of_gp_ca); 1610 } 1611 1612 static uint64_t 1613 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) 1614 { 1615 /* 1616 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 1617 */ 1618 uint64_t bw_est; 1619 uint64_t gain; 1620 1621 gain = (uint64_t)rack_get_output_gain(rack, rsm); 1622 bw_est = bw * gain; 1623 bw_est /= (uint64_t)100; 1624 /* Never fall below the minimum (def 64kbps) */ 1625 if (bw_est < RACK_MIN_BW) 1626 bw_est = RACK_MIN_BW; 1627 return (bw_est); 1628 } 1629 1630 static void 1631 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 1632 { 1633 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1634 union tcp_log_stackspecific log; 1635 struct timeval tv; 1636 1637 if ((mod != 1) && (rack_verbose_logging == 0)) { 1638 /* 1639 * We get 3 values currently for mod 1640 * 1 - We are retransmitting and this tells the reason. 1641 * 2 - We are clearing a dup-ack count. 1642 * 3 - We are incrementing a dup-ack count. 1643 * 1644 * The clear/increment are only logged 1645 * if you have BBverbose on. 
1646 */ 1647 return; 1648 } 1649 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1650 log.u_bbr.flex1 = tsused; 1651 log.u_bbr.flex2 = thresh; 1652 log.u_bbr.flex3 = rsm->r_flags; 1653 log.u_bbr.flex4 = rsm->r_dupack; 1654 log.u_bbr.flex5 = rsm->r_start; 1655 log.u_bbr.flex6 = rsm->r_end; 1656 log.u_bbr.flex8 = mod; 1657 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1658 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1659 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1660 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1661 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1662 &rack->rc_inp->inp_socket->so_rcv, 1663 &rack->rc_inp->inp_socket->so_snd, 1664 BBR_LOG_SETTINGS_CHG, 0, 1665 0, &log, false, &tv); 1666 } 1667 } 1668 1669 static void 1670 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 1671 { 1672 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1673 union tcp_log_stackspecific log; 1674 struct timeval tv; 1675 1676 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1677 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 1678 log.u_bbr.flex2 = to * 1000; 1679 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1680 log.u_bbr.flex4 = slot; 1681 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 1682 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1683 log.u_bbr.flex7 = rack->rc_in_persist; 1684 log.u_bbr.flex8 = which; 1685 if (rack->rack_no_prr) 1686 log.u_bbr.pkts_out = 0; 1687 else 1688 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1689 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1690 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1691 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1692 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1693 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1694 &rack->rc_inp->inp_socket->so_rcv, 1695 &rack->rc_inp->inp_socket->so_snd, 1696 BBR_LOG_TIMERSTAR, 0, 1697 0, &log, false, &tv); 1698 } 1699 } 1700 1701 static void 1702 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 1703 { 1704 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1705 union tcp_log_stackspecific log; 1706 struct timeval tv; 1707 1708 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1709 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1710 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1711 log.u_bbr.flex8 = to_num; 1712 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 1713 log.u_bbr.flex2 = rack->rc_rack_rtt; 1714 if (rsm == NULL) 1715 log.u_bbr.flex3 = 0; 1716 else 1717 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 1718 if (rack->rack_no_prr) 1719 log.u_bbr.flex5 = 0; 1720 else 1721 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1722 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1723 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1724 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1725 &rack->rc_inp->inp_socket->so_rcv, 1726 &rack->rc_inp->inp_socket->so_snd, 1727 BBR_LOG_RTO, 0, 1728 0, &log, false, &tv); 1729 } 1730 } 1731 1732 static void 1733 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 1734 struct rack_sendmap *rsm, int conf) 1735 { 1736 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1737 union tcp_log_stackspecific log; 1738 struct timeval tv; 1739 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1740 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1741 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1742 log.u_bbr.flex1 = t; 1743 log.u_bbr.flex2 = len; 1744 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; 1745 log.u_bbr.flex4 = 
rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; 1746 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; 1747 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 1748 log.u_bbr.flex7 = conf; 1749 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; 1750 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 1751 if (rack->rack_no_prr) 1752 log.u_bbr.pkts_out = 0; 1753 else 1754 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1755 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1756 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; 1757 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 1758 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1759 if (rsm) { 1760 log.u_bbr.pkt_epoch = rsm->r_start; 1761 log.u_bbr.lost = rsm->r_end; 1762 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 1763 } else { 1764 /* Its a SYN */ 1765 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 1766 log.u_bbr.lost = 0; 1767 log.u_bbr.cwnd_gain = 0; 1768 } 1769 /* Write out general bits of interest rrs here */ 1770 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 1771 log.u_bbr.use_lt_bw <<= 1; 1772 log.u_bbr.use_lt_bw |= rack->forced_ack; 1773 log.u_bbr.use_lt_bw <<= 1; 1774 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 1775 log.u_bbr.use_lt_bw <<= 1; 1776 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 1777 log.u_bbr.use_lt_bw <<= 1; 1778 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 1779 log.u_bbr.use_lt_bw <<= 1; 1780 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 1781 log.u_bbr.use_lt_bw <<= 1; 1782 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 1783 log.u_bbr.use_lt_bw <<= 1; 1784 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 1785 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 1786 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 1787 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 1788 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 1789 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 1790 TCP_LOG_EVENTP(tp, NULL, 1791 &rack->rc_inp->inp_socket->so_rcv, 1792 &rack->rc_inp->inp_socket->so_snd, 1793 BBR_LOG_BBRRTT, 0, 1794 0, &log, false, &tv); 1795 } 1796 } 1797 1798 static void 1799 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 1800 { 1801 /* 1802 * Log the rtt sample we are 1803 * applying to the srtt algorithm in 1804 * useconds. 
1805 */ 1806 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1807 union tcp_log_stackspecific log; 1808 struct timeval tv; 1809 1810 /* Convert our ms to a microsecond */ 1811 memset(&log, 0, sizeof(log)); 1812 log.u_bbr.flex1 = rtt * 1000; 1813 log.u_bbr.flex2 = rack->r_ctl.ack_count; 1814 log.u_bbr.flex3 = rack->r_ctl.sack_count; 1815 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 1816 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 1817 log.u_bbr.flex8 = rack->sack_attack_disable; 1818 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1819 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1820 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1821 &rack->rc_inp->inp_socket->so_rcv, 1822 &rack->rc_inp->inp_socket->so_snd, 1823 TCP_LOG_RTT, 0, 1824 0, &log, false, &tv); 1825 } 1826 } 1827 1828 static inline void 1829 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 1830 { 1831 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 1832 union tcp_log_stackspecific log; 1833 struct timeval tv; 1834 1835 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1836 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1837 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1838 log.u_bbr.flex1 = line; 1839 log.u_bbr.flex2 = tick; 1840 log.u_bbr.flex3 = tp->t_maxunacktime; 1841 log.u_bbr.flex4 = tp->t_acktime; 1842 log.u_bbr.flex8 = event; 1843 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1844 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1845 TCP_LOG_EVENTP(tp, NULL, 1846 &rack->rc_inp->inp_socket->so_rcv, 1847 &rack->rc_inp->inp_socket->so_snd, 1848 BBR_LOG_PROGRESS, 0, 1849 0, &log, false, &tv); 1850 } 1851 } 1852 1853 static void 1854 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 1855 { 1856 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1857 union tcp_log_stackspecific log; 1858 1859 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1860 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1861 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1862 log.u_bbr.flex1 = slot; 1863 if (rack->rack_no_prr) 1864 log.u_bbr.flex2 = 0; 1865 else 1866 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 1867 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 1868 log.u_bbr.flex8 = rack->rc_in_persist; 1869 log.u_bbr.timeStamp = cts; 1870 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1871 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1872 &rack->rc_inp->inp_socket->so_rcv, 1873 &rack->rc_inp->inp_socket->so_snd, 1874 BBR_LOG_BBRSND, 0, 1875 0, &log, false, tv); 1876 } 1877 } 1878 1879 static void 1880 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1881 { 1882 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1883 union tcp_log_stackspecific log; 1884 struct timeval tv; 1885 1886 memset(&log, 0, sizeof(log)); 1887 log.u_bbr.flex1 = did_out; 1888 log.u_bbr.flex2 = nxt_pkt; 1889 log.u_bbr.flex3 = way_out; 1890 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1891 if (rack->rack_no_prr) 1892 log.u_bbr.flex5 = 0; 1893 else 1894 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1895 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 1896 log.u_bbr.flex7 = rack->r_wanted_output; 1897 log.u_bbr.flex8 = rack->rc_in_persist; 1898 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1899 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1900 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1901 
TCP_LOG_EVENTP(rack->rc_tp, NULL, 1902 &rack->rc_inp->inp_socket->so_rcv, 1903 &rack->rc_inp->inp_socket->so_snd, 1904 BBR_LOG_DOSEG_DONE, 0, 1905 0, &log, false, &tv); 1906 } 1907 } 1908 1909 static void 1910 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) 1911 { 1912 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1913 union tcp_log_stackspecific log; 1914 struct timeval tv; 1915 uint32_t cts; 1916 1917 memset(&log, 0, sizeof(log)); 1918 cts = tcp_get_usecs(&tv); 1919 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 1920 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 1921 log.u_bbr.flex4 = len; 1922 log.u_bbr.flex5 = orig_len; 1923 log.u_bbr.flex6 = rack->r_ctl.rc_sacked; 1924 log.u_bbr.flex7 = mod; 1925 log.u_bbr.flex8 = frm; 1926 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1927 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1928 TCP_LOG_EVENTP(tp, NULL, 1929 &tp->t_inpcb->inp_socket->so_rcv, 1930 &tp->t_inpcb->inp_socket->so_snd, 1931 TCP_HDWR_TLS, 0, 1932 0, &log, false, &tv); 1933 } 1934 } 1935 1936 static void 1937 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 1938 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 1939 { 1940 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1941 union tcp_log_stackspecific log; 1942 struct timeval tv; 1943 1944 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1945 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1946 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1947 log.u_bbr.flex1 = slot; 1948 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1949 log.u_bbr.flex4 = reason; 1950 if (rack->rack_no_prr) 1951 log.u_bbr.flex5 = 0; 1952 else 1953 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1954 log.u_bbr.flex7 = hpts_calling; 1955 log.u_bbr.flex8 = rack->rc_in_persist; 1956 log.u_bbr.lt_epoch = cwnd_to_use; 1957 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1958 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1959 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1960 &rack->rc_inp->inp_socket->so_rcv, 1961 &rack->rc_inp->inp_socket->so_snd, 1962 BBR_LOG_JUSTRET, 0, 1963 tlen, &log, false, &tv); 1964 } 1965 } 1966 1967 static void 1968 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 1969 struct timeval *tv, uint32_t flags_on_entry) 1970 { 1971 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1972 union tcp_log_stackspecific log; 1973 1974 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1975 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1976 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1977 log.u_bbr.flex1 = line; 1978 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 1979 log.u_bbr.flex3 = flags_on_entry; 1980 log.u_bbr.flex4 = us_cts; 1981 if (rack->rack_no_prr) 1982 log.u_bbr.flex5 = 0; 1983 else 1984 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1985 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1986 log.u_bbr.flex7 = hpts_removed; 1987 log.u_bbr.flex8 = 1; 1988 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 1989 log.u_bbr.timeStamp = us_cts; 1990 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1991 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1992 &rack->rc_inp->inp_socket->so_rcv, 1993 &rack->rc_inp->inp_socket->so_snd, 1994 BBR_LOG_TIMERCANC, 0, 1995 0, &log, false, tv); 1996 } 1997 } 1998 1999 static void 2000 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2001 uint32_t flex1, uint32_t flex2, 2002 uint32_t flex3, uint32_t flex4, 2003 uint32_t flex5, uint32_t 
flex6, 2004 uint16_t flex7, uint8_t mod) 2005 { 2006 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2007 union tcp_log_stackspecific log; 2008 struct timeval tv; 2009 2010 if (mod == 1) { 2011 /* No you can't use 1, its for the real to cancel */ 2012 return; 2013 } 2014 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2015 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2016 log.u_bbr.flex1 = flex1; 2017 log.u_bbr.flex2 = flex2; 2018 log.u_bbr.flex3 = flex3; 2019 log.u_bbr.flex4 = flex4; 2020 log.u_bbr.flex5 = flex5; 2021 log.u_bbr.flex6 = flex6; 2022 log.u_bbr.flex7 = flex7; 2023 log.u_bbr.flex8 = mod; 2024 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2025 &rack->rc_inp->inp_socket->so_rcv, 2026 &rack->rc_inp->inp_socket->so_snd, 2027 BBR_LOG_TIMERCANC, 0, 2028 0, &log, false, &tv); 2029 } 2030 } 2031 2032 static void 2033 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2034 { 2035 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2036 union tcp_log_stackspecific log; 2037 struct timeval tv; 2038 2039 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2040 log.u_bbr.flex1 = timers; 2041 log.u_bbr.flex2 = ret; 2042 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2043 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2044 log.u_bbr.flex5 = cts; 2045 if (rack->rack_no_prr) 2046 log.u_bbr.flex6 = 0; 2047 else 2048 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2049 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2050 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2051 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2052 &rack->rc_inp->inp_socket->so_rcv, 2053 &rack->rc_inp->inp_socket->so_snd, 2054 BBR_LOG_TO_PROCESS, 0, 2055 0, &log, false, &tv); 2056 } 2057 } 2058 2059 static void 2060 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2061 { 2062 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2063 union tcp_log_stackspecific log; 2064 struct timeval tv; 2065 2066 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2067 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2068 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2069 if (rack->rack_no_prr) 2070 log.u_bbr.flex3 = 0; 2071 else 2072 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2073 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2074 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2075 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2076 log.u_bbr.flex8 = frm; 2077 log.u_bbr.pkts_out = orig_cwnd; 2078 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2079 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2080 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2081 &rack->rc_inp->inp_socket->so_rcv, 2082 &rack->rc_inp->inp_socket->so_snd, 2083 BBR_LOG_BBRUPD, 0, 2084 0, &log, false, &tv); 2085 } 2086 } 2087 2088 #ifdef NETFLIX_EXP_DETECTION 2089 static void 2090 rack_log_sad(struct tcp_rack *rack, int event) 2091 { 2092 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2093 union tcp_log_stackspecific log; 2094 struct timeval tv; 2095 2096 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2097 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2098 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2099 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2100 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2101 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2102 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2103 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2104 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2105 log.u_bbr.lt_epoch |= rack->do_detection; 2106 log.u_bbr.applimited = tcp_map_minimum; 2107 log.u_bbr.flex7 = rack->sack_attack_disable; 2108 
log.u_bbr.flex8 = event; 2109 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2110 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2111 log.u_bbr.delivered = tcp_sad_decay_val; 2112 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2113 &rack->rc_inp->inp_socket->so_rcv, 2114 &rack->rc_inp->inp_socket->so_snd, 2115 TCP_SAD_DETECTION, 0, 2116 0, &log, false, &tv); 2117 } 2118 } 2119 #endif 2120 2121 static void 2122 rack_counter_destroy(void) 2123 { 2124 counter_u64_free(rack_ack_total); 2125 counter_u64_free(rack_express_sack); 2126 counter_u64_free(rack_sack_total); 2127 counter_u64_free(rack_move_none); 2128 counter_u64_free(rack_move_some); 2129 counter_u64_free(rack_sack_attacks_detected); 2130 counter_u64_free(rack_sack_attacks_reversed); 2131 counter_u64_free(rack_sack_used_next_merge); 2132 counter_u64_free(rack_sack_used_prev_merge); 2133 counter_u64_free(rack_badfr); 2134 counter_u64_free(rack_badfr_bytes); 2135 counter_u64_free(rack_rtm_prr_retran); 2136 counter_u64_free(rack_rtm_prr_newdata); 2137 counter_u64_free(rack_timestamp_mismatch); 2138 counter_u64_free(rack_find_high); 2139 counter_u64_free(rack_reorder_seen); 2140 counter_u64_free(rack_tlp_tot); 2141 counter_u64_free(rack_tlp_newdata); 2142 counter_u64_free(rack_tlp_retran); 2143 counter_u64_free(rack_tlp_retran_bytes); 2144 counter_u64_free(rack_tlp_retran_fail); 2145 counter_u64_free(rack_to_tot); 2146 counter_u64_free(rack_to_arm_rack); 2147 counter_u64_free(rack_to_arm_tlp); 2148 counter_u64_free(rack_calc_zero); 2149 counter_u64_free(rack_calc_nonzero); 2150 counter_u64_free(rack_paced_segments); 2151 counter_u64_free(rack_unpaced_segments); 2152 counter_u64_free(rack_saw_enobuf); 2153 counter_u64_free(rack_saw_enetunreach); 2154 counter_u64_free(rack_to_alloc); 2155 counter_u64_free(rack_to_alloc_hard); 2156 counter_u64_free(rack_to_alloc_emerg); 2157 counter_u64_free(rack_to_alloc_limited); 2158 counter_u64_free(rack_alloc_limited_conns); 2159 counter_u64_free(rack_split_limited); 2160 counter_u64_free(rack_sack_proc_all); 2161 counter_u64_free(rack_sack_proc_restart); 2162 counter_u64_free(rack_sack_proc_short); 2163 counter_u64_free(rack_enter_tlp_calc); 2164 counter_u64_free(rack_used_tlpmethod); 2165 counter_u64_free(rack_used_tlpmethod2); 2166 counter_u64_free(rack_sack_skipped_acked); 2167 counter_u64_free(rack_sack_splits); 2168 counter_u64_free(rack_progress_drops); 2169 counter_u64_free(rack_input_idle_reduces); 2170 counter_u64_free(rack_collapsed_win); 2171 counter_u64_free(rack_tlp_does_nada); 2172 counter_u64_free(rack_try_scwnd); 2173 counter_u64_free(rack_per_timer_hole); 2174 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2175 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2176 } 2177 2178 static struct rack_sendmap * 2179 rack_alloc(struct tcp_rack *rack) 2180 { 2181 struct rack_sendmap *rsm; 2182 2183 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2184 if (rsm) { 2185 rack->r_ctl.rc_num_maps_alloced++; 2186 counter_u64_add(rack_to_alloc, 1); 2187 return (rsm); 2188 } 2189 if (rack->rc_free_cnt) { 2190 counter_u64_add(rack_to_alloc_emerg, 1); 2191 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2192 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2193 rack->rc_free_cnt--; 2194 return (rsm); 2195 } 2196 return (NULL); 2197 } 2198 2199 static struct rack_sendmap * 2200 rack_alloc_full_limit(struct tcp_rack *rack) 2201 { 2202 if ((V_tcp_map_entries_limit > 0) && 2203 (rack->do_detection == 0) && 2204 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2205 
counter_u64_add(rack_to_alloc_limited, 1); 2206 if (!rack->alloc_limit_reported) { 2207 rack->alloc_limit_reported = 1; 2208 counter_u64_add(rack_alloc_limited_conns, 1); 2209 } 2210 return (NULL); 2211 } 2212 return (rack_alloc(rack)); 2213 } 2214 2215 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2216 static struct rack_sendmap * 2217 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2218 { 2219 struct rack_sendmap *rsm; 2220 2221 if (limit_type) { 2222 /* currently there is only one limit type */ 2223 if (V_tcp_map_split_limit > 0 && 2224 (rack->do_detection == 0) && 2225 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2226 counter_u64_add(rack_split_limited, 1); 2227 if (!rack->alloc_limit_reported) { 2228 rack->alloc_limit_reported = 1; 2229 counter_u64_add(rack_alloc_limited_conns, 1); 2230 } 2231 return (NULL); 2232 } 2233 } 2234 2235 /* allocate and mark in the limit type, if set */ 2236 rsm = rack_alloc(rack); 2237 if (rsm != NULL && limit_type) { 2238 rsm->r_limit_type = limit_type; 2239 rack->r_ctl.rc_num_split_allocs++; 2240 } 2241 return (rsm); 2242 } 2243 2244 static void 2245 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2246 { 2247 if (rsm->r_flags & RACK_APP_LIMITED) { 2248 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2249 rack->r_ctl.rc_app_limited_cnt--; 2250 } 2251 } 2252 if (rsm->r_limit_type) { 2253 /* currently there is only one limit type */ 2254 rack->r_ctl.rc_num_split_allocs--; 2255 } 2256 if (rsm == rack->r_ctl.rc_first_appl) { 2257 if (rack->r_ctl.rc_app_limited_cnt == 0) 2258 rack->r_ctl.rc_first_appl = NULL; 2259 else { 2260 /* Follow the next one out */ 2261 struct rack_sendmap fe; 2262 2263 fe.r_start = rsm->r_nseq_appl; 2264 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2265 } 2266 } 2267 if (rsm == rack->r_ctl.rc_resend) 2268 rack->r_ctl.rc_resend = NULL; 2269 if (rsm == rack->r_ctl.rc_rsm_at_retran) 2270 rack->r_ctl.rc_rsm_at_retran = NULL; 2271 if (rsm == rack->r_ctl.rc_end_appl) 2272 rack->r_ctl.rc_end_appl = NULL; 2273 if (rack->r_ctl.rc_tlpsend == rsm) 2274 rack->r_ctl.rc_tlpsend = NULL; 2275 if (rack->r_ctl.rc_sacklast == rsm) 2276 rack->r_ctl.rc_sacklast = NULL; 2277 if (rack->rc_free_cnt < rack_free_cache) { 2278 memset(rsm, 0, sizeof(struct rack_sendmap)); 2279 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 2280 rsm->r_limit_type = 0; 2281 rack->rc_free_cnt++; 2282 return; 2283 } 2284 rack->r_ctl.rc_num_maps_alloced--; 2285 uma_zfree(rack_zone, rsm); 2286 } 2287 2288 static uint32_t 2289 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 2290 { 2291 uint64_t srtt, bw, len, tim; 2292 uint32_t segsiz, def_len, minl; 2293 2294 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2295 def_len = rack_def_data_window * segsiz; 2296 if (rack->rc_gp_filled == 0) { 2297 /* 2298 * We have no measurement (IW is in flight?) so 2299 * we can only guess using our data_window sysctl 2300 * value (usually 100MSS). 2301 */ 2302 return (def_len); 2303 } 2304 /* 2305 * Now we have a number of factors to consider. 2306 * 2307 * 1) We have a desired BDP which is usually 2308 * at least 2. 2309 * 2) We have a minimum number of rtt's usually 1 SRTT 2310 * but we allow it too to be more. 2311 * 3) We want to make sure a measurement last N useconds (if 2312 * we have set rack_min_measure_usec. 2313 * 2314 * We handle the first concern here by trying to create a data 2315 * window of max(rack_def_data_window, DesiredBDP). 
The 2316 * second concern we handle in not letting the measurement 2317 * window end normally until at least the required SRTT's 2318 * have gone by which is done further below in 2319 * rack_enough_for_measurement(). Finally the third concern 2320 * we also handle here by calculating how long that time 2321 * would take at the current BW and then return the 2322 * max of our first calculation and that length. Note 2323 * that if rack_min_measure_usec is 0, we don't deal 2324 * with concern 3. Also for both Concern 1 and 3 an 2325 * application limited period could end the measurement 2326 * earlier. 2327 * 2328 * So lets calculate the BDP with the "known" b/w using 2329 * the SRTT has our rtt and then multiply it by the 2330 * goal. 2331 */ 2332 bw = rack_get_bw(rack); 2333 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 2334 len = bw * srtt; 2335 len /= (uint64_t)HPTS_USEC_IN_SEC; 2336 len *= max(1, rack_goal_bdp); 2337 /* Now we need to round up to the nearest MSS */ 2338 len = roundup(len, segsiz); 2339 if (rack_min_measure_usec) { 2340 /* Now calculate our min length for this b/w */ 2341 tim = rack_min_measure_usec; 2342 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 2343 if (minl == 0) 2344 minl = 1; 2345 minl = roundup(minl, segsiz); 2346 if (len < minl) 2347 len = minl; 2348 } 2349 /* 2350 * Now if we have a very small window we want 2351 * to attempt to get the window that is 2352 * as small as possible. This happens on 2353 * low b/w connections and we don't want to 2354 * span huge numbers of rtt's between measurements. 2355 * 2356 * We basically include 2 over our "MIN window" so 2357 * that the measurement can be shortened (possibly) by 2358 * an ack'ed packet. 2359 */ 2360 if (len < def_len) 2361 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 2362 else 2363 return (max((uint32_t)len, def_len)); 2364 2365 } 2366 2367 static int 2368 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) 2369 { 2370 uint32_t tim, srtts, segsiz; 2371 2372 /* 2373 * Has enough time passed for the GP measurement to be valid? 2374 */ 2375 if ((tp->snd_max == tp->snd_una) || 2376 (th_ack == tp->snd_max)){ 2377 /* All is acked */ 2378 return (1); 2379 } 2380 if (SEQ_LT(th_ack, tp->gput_seq)) { 2381 /* Not enough bytes yet */ 2382 return (0); 2383 } 2384 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2385 if (SEQ_LT(th_ack, tp->gput_ack) && 2386 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 2387 /* Not enough bytes yet */ 2388 return (0); 2389 } 2390 if (rack->r_ctl.rc_first_appl && 2391 (rack->r_ctl.rc_first_appl->r_start == th_ack)) { 2392 /* 2393 * We are up to the app limited point 2394 * we have to measure irrespective of the time.. 2395 */ 2396 return (1); 2397 } 2398 /* Now what about time? 
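* We require at least rack_min_srtts worth of gp_srtt to have elapsed since the measurement began; e.g. (illustrative numbers) with rack_min_srtts of 1 and a gp_srtt of 40ms, at least 40ms of receive time must have passed before the check below says yes.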
*/ 2399 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 2400 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 2401 if (tim >= srtts) { 2402 return (1); 2403 } 2404 /* Nope not even a full SRTT has passed */ 2405 return (0); 2406 } 2407 2408 static void 2409 rack_log_timely(struct tcp_rack *rack, 2410 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 2411 uint64_t up_bnd, int line, uint8_t method) 2412 { 2413 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2414 union tcp_log_stackspecific log; 2415 struct timeval tv; 2416 2417 memset(&log, 0, sizeof(log)); 2418 log.u_bbr.flex1 = logged; 2419 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 2420 log.u_bbr.flex2 <<= 4; 2421 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 2422 log.u_bbr.flex2 <<= 4; 2423 log.u_bbr.flex2 |= rack->rc_gp_incr; 2424 log.u_bbr.flex2 <<= 4; 2425 log.u_bbr.flex2 |= rack->rc_gp_bwred; 2426 log.u_bbr.flex3 = rack->rc_gp_incr; 2427 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2428 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 2429 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 2430 log.u_bbr.flex7 = rack->rc_gp_bwred; 2431 log.u_bbr.flex8 = method; 2432 log.u_bbr.cur_del_rate = cur_bw; 2433 log.u_bbr.delRate = low_bnd; 2434 log.u_bbr.bw_inuse = up_bnd; 2435 log.u_bbr.rttProp = rack_get_bw(rack); 2436 log.u_bbr.pkt_epoch = line; 2437 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2438 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2439 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2440 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2441 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2442 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 2443 log.u_bbr.cwnd_gain <<= 1; 2444 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 2445 log.u_bbr.cwnd_gain <<= 1; 2446 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 2447 log.u_bbr.cwnd_gain <<= 1; 2448 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 2449 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 2450 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2451 &rack->rc_inp->inp_socket->so_rcv, 2452 &rack->rc_inp->inp_socket->so_snd, 2453 TCP_TIMELY_WORK, 0, 2454 0, &log, false, &tv); 2455 } 2456 } 2457 2458 static int 2459 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 2460 { 2461 /* 2462 * Before we increase we need to know if 2463 * the estimate just made was less than 2464 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 2465 * 2466 * If we already are pacing at a fast enough 2467 * rate to push us faster there is no sense of 2468 * increasing. 2469 * 2470 * We first caculate our actual pacing rate (ss or ca multipler 2471 * times our cur_bw). 2472 * 2473 * Then we take the last measured rate and multipy by our 2474 * maximum pacing overage to give us a max allowable rate. 2475 * 2476 * If our act_rate is smaller than our max_allowable rate 2477 * then we should increase. Else we should hold steady. 2478 * 2479 */ 2480 uint64_t act_rate, max_allow_rate; 2481 2482 if (rack_timely_no_stopping) 2483 return (1); 2484 2485 if ((cur_bw == 0) || (last_bw_est == 0)) { 2486 /* 2487 * Initial startup case or 2488 * everything is acked case. 2489 */ 2490 rack_log_timely(rack, mult, cur_bw, 0, 0, 2491 __LINE__, 9); 2492 return (1); 2493 } 2494 if (mult <= 100) { 2495 /* 2496 * We can always pace at or slightly above our rate. 
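* (A gain of 100 or less means we are pacing at or below the measured b/w, so allowing a raise cannot overshoot; the general act_rate vs. max_allow_rate comparison below handles gains above 100.)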
2497 */ 2498 rack_log_timely(rack, mult, cur_bw, 0, 0, 2499 __LINE__, 9); 2500 return (1); 2501 } 2502 act_rate = cur_bw * (uint64_t)mult; 2503 act_rate /= 100; 2504 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 2505 max_allow_rate /= 100; 2506 if (act_rate < max_allow_rate) { 2507 /* 2508 * Here the rate we are actually pacing at 2509 * is smaller than 10% above our last measurement. 2510 * This means we are pacing below what we would 2511 * like to try to achieve (plus some wiggle room). 2512 */ 2513 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2514 __LINE__, 9); 2515 return (1); 2516 } else { 2517 /* 2518 * Here we are already pacing at least rack_max_per_above(10%) 2519 * what we are getting back. This indicates most likely 2520 * that we are being limited (cwnd/rwnd/app) and can't 2521 * get any more b/w. There is no sense of trying to 2522 * raise up the pacing rate its not speeding us up 2523 * and we already are pacing faster than we are getting. 2524 */ 2525 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2526 __LINE__, 8); 2527 return (0); 2528 } 2529 } 2530 2531 static void 2532 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 2533 { 2534 /* 2535 * When we drag bottom, we want to assure 2536 * that no multiplier is below 1.0, if so 2537 * we want to restore it to at least that. 2538 */ 2539 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 2540 /* This is unlikely we usually do not touch recovery */ 2541 rack->r_ctl.rack_per_of_gp_rec = 100; 2542 } 2543 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 2544 rack->r_ctl.rack_per_of_gp_ca = 100; 2545 } 2546 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 2547 rack->r_ctl.rack_per_of_gp_ss = 100; 2548 } 2549 } 2550 2551 static void 2552 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 2553 { 2554 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 2555 rack->r_ctl.rack_per_of_gp_ca = 100; 2556 } 2557 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 2558 rack->r_ctl.rack_per_of_gp_ss = 100; 2559 } 2560 } 2561 2562 static void 2563 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 2564 { 2565 int32_t calc, logged, plus; 2566 2567 logged = 0; 2568 2569 if (override) { 2570 /* 2571 * override is passed when we are 2572 * loosing b/w and making one last 2573 * gasp at trying to not loose out 2574 * to a new-reno flow. 2575 */ 2576 goto extra_boost; 2577 } 2578 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 2579 if (rack->rc_gp_incr && 2580 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 2581 /* 2582 * Reset and get 5 strokes more before the boost. Note 2583 * that the count is 0 based so we have to add one. 
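* The assignment below simply scales rack_gp_increase_per by RACK_TIMELY_CNT_BOOST, i.e. one jump worth roughly five normal per-step increases (an illustrative restatement of the code that follows).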
2584 */ 2585 extra_boost: 2586 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 2587 rack->rc_gp_timely_inc_cnt = 0; 2588 } else 2589 plus = (uint32_t)rack_gp_increase_per; 2590 /* Must be at least 1% increase for true timely increases */ 2591 if ((plus < 1) && 2592 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 2593 plus = 1; 2594 if (rack->rc_gp_saw_rec && 2595 (rack->rc_gp_no_rec_chg == 0) && 2596 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2597 rack->r_ctl.rack_per_of_gp_rec)) { 2598 /* We have been in recovery ding it too */ 2599 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 2600 if (calc > 0xffff) 2601 calc = 0xffff; 2602 logged |= 1; 2603 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 2604 if (rack_per_upper_bound_ss && 2605 (rack->rc_dragged_bottom == 0) && 2606 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 2607 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 2608 } 2609 if (rack->rc_gp_saw_ca && 2610 (rack->rc_gp_saw_ss == 0) && 2611 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2612 rack->r_ctl.rack_per_of_gp_ca)) { 2613 /* In CA */ 2614 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 2615 if (calc > 0xffff) 2616 calc = 0xffff; 2617 logged |= 2; 2618 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 2619 if (rack_per_upper_bound_ca && 2620 (rack->rc_dragged_bottom == 0) && 2621 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 2622 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 2623 } 2624 if (rack->rc_gp_saw_ss && 2625 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2626 rack->r_ctl.rack_per_of_gp_ss)) { 2627 /* In SS */ 2628 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 2629 if (calc > 0xffff) 2630 calc = 0xffff; 2631 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 2632 if (rack_per_upper_bound_ss && 2633 (rack->rc_dragged_bottom == 0) && 2634 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 2635 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 2636 logged |= 4; 2637 } 2638 if (logged && 2639 (rack->rc_gp_incr == 0)){ 2640 /* Go into increment mode */ 2641 rack->rc_gp_incr = 1; 2642 rack->rc_gp_timely_inc_cnt = 0; 2643 } 2644 if (rack->rc_gp_incr && 2645 logged && 2646 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 2647 rack->rc_gp_timely_inc_cnt++; 2648 } 2649 rack_log_timely(rack, logged, plus, 0, 0, 2650 __LINE__, 1); 2651 } 2652 2653 static uint32_t 2654 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 2655 { 2656 /* 2657 * norm_grad = rtt_diff / minrtt; 2658 * new_per = curper * (1 - B * norm_grad) 2659 * 2660 * B = rack_gp_decrease_per (default 10%) 2661 * rtt_dif = input var current rtt-diff 2662 * curper = input var current percentage 2663 * minrtt = from rack filter 2664 * 2665 */ 2666 uint64_t perf; 2667 2668 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2669 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 2670 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 2671 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 2672 (uint64_t)1000000)) / 2673 (uint64_t)1000000); 2674 if (perf > curper) { 2675 /* TSNH */ 2676 perf = curper - 1; 2677 } 2678 return ((uint32_t)perf); 2679 } 2680 2681 static uint32_t 2682 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 2683 { 2684 /* 2685 * highrttthresh 2686 * result = curper * (1 - (B * ( 1 - ------ )) 2687 * gp_srtt 2688 * 2689 * B = rack_gp_decrease_per (default 10%) 2690 * highrttthresh = filter_min * rack_gp_rtt_maxmul 2691 */ 2692 uint64_t perf; 2693 uint32_t 
highrttthresh; 2694 2695 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 2696 2697 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2698 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 2699 ((uint64_t)highrttthresh * (uint64_t)1000000) / 2700 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 2701 return (perf); 2702 } 2703 2704 static void 2705 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 2706 { 2707 uint64_t logvar, logvar2, logvar3; 2708 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 2709 2710 if (rack->rc_gp_incr) { 2711 /* Turn off increment counting */ 2712 rack->rc_gp_incr = 0; 2713 rack->rc_gp_timely_inc_cnt = 0; 2714 } 2715 ss_red = ca_red = rec_red = 0; 2716 logged = 0; 2717 /* Calculate the reduction value */ 2718 if (rtt_diff < 0) { 2719 rtt_diff *= -1; 2720 } 2721 /* Must be at least 1% reduction */ 2722 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 2723 /* We have been in recovery ding it too */ 2724 if (timely_says == 2) { 2725 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 2726 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2727 if (alt < new_per) 2728 val = alt; 2729 else 2730 val = new_per; 2731 } else 2732 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2733 if (rack->r_ctl.rack_per_of_gp_rec > val) { 2734 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 2735 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 2736 } else { 2737 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2738 rec_red = 0; 2739 } 2740 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 2741 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2742 logged |= 1; 2743 } 2744 if (rack->rc_gp_saw_ss) { 2745 /* Sent in SS */ 2746 if (timely_says == 2) { 2747 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 2748 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2749 if (alt < new_per) 2750 val = alt; 2751 else 2752 val = new_per; 2753 } else 2754 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 2755 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 2756 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 2757 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 2758 } else { 2759 ss_red = new_per; 2760 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2761 logvar = new_per; 2762 logvar <<= 32; 2763 logvar |= alt; 2764 logvar2 = (uint32_t)rtt; 2765 logvar2 <<= 32; 2766 logvar2 |= (uint32_t)rtt_diff; 2767 logvar3 = rack_gp_rtt_maxmul; 2768 logvar3 <<= 32; 2769 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2770 rack_log_timely(rack, timely_says, 2771 logvar2, logvar3, 2772 logvar, __LINE__, 10); 2773 } 2774 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 2775 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2776 logged |= 4; 2777 } else if (rack->rc_gp_saw_ca) { 2778 /* Sent in CA */ 2779 if (timely_says == 2) { 2780 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 2781 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2782 if (alt < new_per) 2783 val = alt; 2784 else 2785 val = new_per; 2786 } else 2787 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 2788 if (rack->r_ctl.rack_per_of_gp_ca > val) { 2789 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 2790 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 2791 } else { 2792 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2793 ca_red = 0; 2794 logvar = new_per; 2795 logvar <<= 32; 2796 logvar |= alt; 2797 logvar2 = (uint32_t)rtt; 2798 logvar2 <<= 32; 2799 logvar2 |= (uint32_t)rtt_diff; 2800 logvar3 = rack_gp_rtt_maxmul; 2801 logvar3 <<= 32; 2802 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2803 rack_log_timely(rack, timely_says, 2804 logvar2, logvar3, 2805 logvar, __LINE__, 10); 2806 } 2807 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 2808 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2809 logged |= 2; 2810 } 2811 if (rack->rc_gp_timely_dec_cnt < 0x7) { 2812 rack->rc_gp_timely_dec_cnt++; 2813 if (rack_timely_dec_clear && 2814 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 2815 rack->rc_gp_timely_dec_cnt = 0; 2816 } 2817 logvar = ss_red; 2818 logvar <<= 32; 2819 logvar |= ca_red; 2820 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 2821 __LINE__, 2); 2822 } 2823 2824 static void 2825 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 2826 uint32_t rtt, uint32_t line, uint8_t reas) 2827 { 2828 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2829 union tcp_log_stackspecific log; 2830 struct timeval tv; 2831 2832 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2833 log.u_bbr.flex1 = line; 2834 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 2835 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 2836 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2837 log.u_bbr.flex5 = rtt; 2838 log.u_bbr.flex6 = rack->rc_highly_buffered; 2839 log.u_bbr.flex6 <<= 1; 2840 log.u_bbr.flex6 |= rack->forced_ack; 2841 log.u_bbr.flex6 <<= 1; 2842 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 2843 log.u_bbr.flex6 <<= 1; 2844 log.u_bbr.flex6 |= rack->in_probe_rtt; 2845 log.u_bbr.flex6 <<= 1; 2846 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 2847 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 2848 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 2849 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 2850 log.u_bbr.flex8 = reas; 2851 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2852 log.u_bbr.delRate = rack_get_bw(rack); 2853 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 2854 log.u_bbr.cur_del_rate <<= 32; 2855 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 2856 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 2857 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2858 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2859 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2860 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2861 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 2862 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 2863 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2864 log.u_bbr.rttProp = us_cts; 2865 log.u_bbr.rttProp <<= 32; 2866 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 2867 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2868 &rack->rc_inp->inp_socket->so_rcv, 2869 &rack->rc_inp->inp_socket->so_snd, 2870 BBR_LOG_RTT_SHRINKS, 0, 2871 0, &log, false, &rack->r_ctl.act_rcv_time); 2872 } 2873 } 2874 2875 static void 2876 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 2877 { 2878 uint64_t bwdp; 2879 2880 bwdp = rack_get_bw(rack); 2881 bwdp *= (uint64_t)rtt; 2882 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 2883 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 2884 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 2885 /* 2886 * A window protocol must be able to have 4 packets 2887 * outstanding as the floor in order to function 2888 * (especially considering delayed ack :D). 2889 */ 2890 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 2891 } 2892 } 2893 2894 static void 2895 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 2896 { 2897 /** 2898 * ProbeRTT is a bit different in rack_pacing than in 2899 * BBR. It is like BBR in that it uses the lowering of 2900 * the RTT as a signal that we saw something new and 2901 * counts from there for how long between. But it is 2902 * different in that its quite simple. It does not 2903 * play with the cwnd and wait until we get down 2904 * to N segments outstanding and hold that for 2905 * 200ms. Instead it just sets the pacing reduction 2906 * rate to a set percentage (70 by default) and hold 2907 * that for a number of recent GP Srtt's. 2908 */ 2909 uint32_t segsiz; 2910 2911 if (rack->rc_gp_dyn_mul == 0) 2912 return; 2913 2914 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 2915 /* We are idle */ 2916 return; 2917 } 2918 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 2919 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 2920 /* 2921 * Stop the goodput now, the idea here is 2922 * that future measurements with in_probe_rtt 2923 * won't register if they are not greater so 2924 * we want to get what info (if any) is available 2925 * now. 2926 */ 2927 rack_do_goodput_measurement(rack->rc_tp, rack, 2928 rack->rc_tp->snd_una, __LINE__); 2929 } 2930 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 2931 rack->r_ctl.rc_time_probertt_entered = us_cts; 2932 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 2933 rack->r_ctl.rc_pace_min_segs); 2934 rack->in_probe_rtt = 1; 2935 rack->measure_saw_probe_rtt = 1; 2936 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 2937 rack->r_ctl.rc_time_probertt_starts = 0; 2938 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 2939 if (rack_probertt_use_min_rtt_entry) 2940 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 2941 else 2942 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 2943 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 2944 __LINE__, RACK_RTTS_ENTERPROBE); 2945 } 2946 2947 static void 2948 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 2949 { 2950 struct rack_sendmap *rsm; 2951 uint32_t segsiz; 2952 2953 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 2954 rack->r_ctl.rc_pace_min_segs); 2955 rack->in_probe_rtt = 0; 2956 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 2957 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 2958 /* 2959 * Stop the goodput now, the idea here is 2960 * that future measurements with in_probe_rtt 2961 * won't register if they are not greater so 2962 * we want to get what info (if any) is available 2963 * now. 2964 */ 2965 rack_do_goodput_measurement(rack->rc_tp, rack, 2966 rack->rc_tp->snd_una, __LINE__); 2967 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 2968 /* 2969 * We don't have enough data to make a measurement. 2970 * So lets just stop and start here after exiting 2971 * probe-rtt. We probably are not interested in 2972 * the results anyway. 2973 */ 2974 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 2975 } 2976 /* 2977 * Measurements through the current snd_max are going 2978 * to be limited by the slower pacing rate. 2979 * 2980 * We need to mark these as app-limited so we 2981 * don't collapse the b/w. 
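* (Concretely, the code below tags the newest sendmap entry with RACK_APP_LIMITED and links it through r_nseq_appl onto the existing app-limited chain.)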
2982 */ 2983 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 2984 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 2985 if (rack->r_ctl.rc_app_limited_cnt == 0) 2986 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 2987 else { 2988 /* 2989 * Go out to the end app limited and mark 2990 * this new one as next and move the end_appl up 2991 * to this guy. 2992 */ 2993 if (rack->r_ctl.rc_end_appl) 2994 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 2995 rack->r_ctl.rc_end_appl = rsm; 2996 } 2997 rsm->r_flags |= RACK_APP_LIMITED; 2998 rack->r_ctl.rc_app_limited_cnt++; 2999 } 3000 /* 3001 * Now, we need to examine our pacing rate multipliers. 3002 * If its under 100%, we need to kick it back up to 3003 * 100%. We also don't let it be over our "max" above 3004 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3005 * Note setting clamp_atexit_prtt to 0 has the effect 3006 * of setting CA/SS to 100% always at exit (which is 3007 * the default behavior). 3008 */ 3009 if (rack_probertt_clear_is) { 3010 rack->rc_gp_incr = 0; 3011 rack->rc_gp_bwred = 0; 3012 rack->rc_gp_timely_inc_cnt = 0; 3013 rack->rc_gp_timely_dec_cnt = 0; 3014 } 3015 /* Do we do any clamping at exit? */ 3016 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3017 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3018 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3019 } 3020 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3021 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3022 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3023 } 3024 /* 3025 * Lets set rtt_diff to 0, so that we will get a "boost" 3026 * after exiting. 3027 */ 3028 rack->r_ctl.rc_rtt_diff = 0; 3029 3030 /* Clear all flags so we start fresh */ 3031 rack->rc_tp->t_bytes_acked = 0; 3032 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3033 /* 3034 * If configured to, set the cwnd and ssthresh to 3035 * our targets. 3036 */ 3037 if (rack_probe_rtt_sets_cwnd) { 3038 uint64_t ebdp; 3039 uint32_t setto; 3040 3041 /* Set ssthresh so we get into CA once we hit our target */ 3042 if (rack_probertt_use_min_rtt_exit == 1) { 3043 /* Set to min rtt */ 3044 rack_set_prtt_target(rack, segsiz, 3045 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3046 } else if (rack_probertt_use_min_rtt_exit == 2) { 3047 /* Set to current gp rtt */ 3048 rack_set_prtt_target(rack, segsiz, 3049 rack->r_ctl.rc_gp_srtt); 3050 } else if (rack_probertt_use_min_rtt_exit == 3) { 3051 /* Set to entry gp rtt */ 3052 rack_set_prtt_target(rack, segsiz, 3053 rack->r_ctl.rc_entry_gp_rtt); 3054 } else { 3055 uint64_t sum; 3056 uint32_t setval; 3057 3058 sum = rack->r_ctl.rc_entry_gp_rtt; 3059 sum *= 10; 3060 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3061 if (sum >= 20) { 3062 /* 3063 * A highly buffered path needs 3064 * cwnd space for timely to work. 3065 * Lets set things up as if 3066 * we are heading back here again. 3067 */ 3068 setval = rack->r_ctl.rc_entry_gp_rtt; 3069 } else if (sum >= 15) { 3070 /* 3071 * Lets take the smaller of the 3072 * two since we are just somewhat 3073 * buffered. 3074 */ 3075 setval = rack->r_ctl.rc_gp_srtt; 3076 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3077 setval = rack->r_ctl.rc_entry_gp_rtt; 3078 } else { 3079 /* 3080 * Here we are not highly buffered 3081 * and should pick the min we can to 3082 * keep from causing loss. 
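* (For reference, the ratio computed above is in tenths: a sum of 20 or more means the entry gp rtt was at least twice the current gp_srtt, 15 or more at least 1.5 times it.)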
3083 */ 3084 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3085 } 3086 rack_set_prtt_target(rack, segsiz, 3087 setval); 3088 } 3089 if (rack_probe_rtt_sets_cwnd > 1) { 3090 /* There is a percentage here to boost */ 3091 ebdp = rack->r_ctl.rc_target_probertt_flight; 3092 ebdp *= rack_probe_rtt_sets_cwnd; 3093 ebdp /= 100; 3094 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3095 } else 3096 setto = rack->r_ctl.rc_target_probertt_flight; 3097 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3098 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3099 /* Enforce a min */ 3100 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3101 } 3102 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3103 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3104 } 3105 rack_log_rtt_shrinks(rack, us_cts, 3106 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3107 __LINE__, RACK_RTTS_EXITPROBE); 3108 /* Clear times last so log has all the info */ 3109 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3110 rack->r_ctl.rc_time_probertt_entered = us_cts; 3111 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3112 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3113 } 3114 3115 static void 3116 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3117 { 3118 /* Check in on probe-rtt */ 3119 if (rack->rc_gp_filled == 0) { 3120 /* We do not do p-rtt unless we have gp measurements */ 3121 return; 3122 } 3123 if (rack->in_probe_rtt) { 3124 uint64_t no_overflow; 3125 uint32_t endtime, must_stay; 3126 3127 if (rack->r_ctl.rc_went_idle_time && 3128 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3129 /* 3130 * We went idle during prtt, just exit now. 3131 */ 3132 rack_exit_probertt(rack, us_cts); 3133 } else if (rack_probe_rtt_safety_val && 3134 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3135 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3136 /* 3137 * Probe RTT safety value triggered! 3138 */ 3139 rack_log_rtt_shrinks(rack, us_cts, 3140 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3141 __LINE__, RACK_RTTS_SAFETY); 3142 rack_exit_probertt(rack, us_cts); 3143 } 3144 /* Calculate the max we will wait */ 3145 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3146 if (rack->rc_highly_buffered) 3147 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3148 /* Calculate the min we must wait */ 3149 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3150 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3151 TSTMP_LT(us_cts, endtime)) { 3152 uint32_t calc; 3153 /* Do we lower more? 
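 * A sketch of the reduction below, with purely illustrative
 * values: if the probe-rtt pacing percentage started at 70
 * and rack_per_of_gp_probertt_reduce is 10, then after two
 * full gp_srtt's spent in probe-rtt calc is 2 and the pacing
 * percentage becomes 70 - (2 * 10) = 50, never dropping
 * below rack_per_of_gp_lowthresh.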
*/ 3154 no_exit: 3155 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3156 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3157 else 3158 calc = 0; 3159 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3160 if (calc) { 3161 /* Maybe */ 3162 calc *= rack_per_of_gp_probertt_reduce; 3163 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3164 /* Limit it too */ 3165 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3166 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3167 } 3168 /* We must reach target or the time set */ 3169 return; 3170 } 3171 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3172 if ((TSTMP_LT(us_cts, must_stay) && 3173 rack->rc_highly_buffered) || 3174 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3175 rack->r_ctl.rc_target_probertt_flight)) { 3176 /* We are not past the must_stay time */ 3177 goto no_exit; 3178 } 3179 rack_log_rtt_shrinks(rack, us_cts, 3180 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3181 __LINE__, RACK_RTTS_REACHTARGET); 3182 rack->r_ctl.rc_time_probertt_starts = us_cts; 3183 if (rack->r_ctl.rc_time_probertt_starts == 0) 3184 rack->r_ctl.rc_time_probertt_starts = 1; 3185 /* Restore back to our rate we want to pace at in prtt */ 3186 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3187 } 3188 /* 3189 * Setup our end time, some number of gp_srtts plus 200ms. 3190 */ 3191 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3192 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3193 if (rack_probertt_gpsrtt_cnt_div) 3194 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3195 else 3196 endtime = 0; 3197 endtime += rack_min_probertt_hold; 3198 endtime += rack->r_ctl.rc_time_probertt_starts; 3199 if (TSTMP_GEQ(us_cts, endtime)) { 3200 /* yes, exit probertt */ 3201 rack_exit_probertt(rack, us_cts); 3202 } 3203 3204 } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3205 /* Go into probertt, its been too long since we went lower */ 3206 rack_enter_probertt(rack, us_cts); 3207 } 3208 } 3209 3210 static void 3211 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3212 uint32_t rtt, int32_t rtt_diff) 3213 { 3214 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3215 uint32_t losses; 3216 3217 if ((rack->rc_gp_dyn_mul == 0) || 3218 (rack->use_fixed_rate) || 3219 (rack->in_probe_rtt) || 3220 (rack->rc_always_pace == 0)) { 3221 /* No dynamic GP multipler in play */ 3222 return; 3223 } 3224 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3225 cur_bw = rack_get_bw(rack); 3226 /* Calculate our up and down range */ 3227 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3228 up_bnd /= 100; 3229 up_bnd += rack->r_ctl.last_gp_comp_bw; 3230 3231 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3232 subfr /= 100; 3233 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3234 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3235 /* 3236 * This is the case where our RTT is above 3237 * the max target and we have been configured 3238 * to just do timely no bonus up stuff in that case. 3239 * 3240 * There are two configurations, set to 1, and we 3241 * just do timely if we are over our max. If its 3242 * set above 1 then we slam the multipliers down 3243 * to 100 and then decrement per timely. 
3244 */ 3245 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3246 __LINE__, 3); 3247 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3248 rack_validate_multipliers_at_or_below_100(rack); 3249 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3250 } else if ((last_bw_est < low_bnd) && !losses) { 3251 /* 3252 * We are decreasing this is a bit complicated this 3253 * means we are loosing ground. This could be 3254 * because another flow entered and we are competing 3255 * for b/w with it. This will push the RTT up which 3256 * makes timely unusable unless we want to get shoved 3257 * into a corner and just be backed off (the age 3258 * old problem with delay based CC). 3259 * 3260 * On the other hand if it was a route change we 3261 * would like to stay somewhat contained and not 3262 * blow out the buffers. 3263 */ 3264 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3265 __LINE__, 3); 3266 rack->r_ctl.last_gp_comp_bw = cur_bw; 3267 if (rack->rc_gp_bwred == 0) { 3268 /* Go into reduction counting */ 3269 rack->rc_gp_bwred = 1; 3270 rack->rc_gp_timely_dec_cnt = 0; 3271 } 3272 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3273 (timely_says == 0)) { 3274 /* 3275 * Push another time with a faster pacing 3276 * to try to gain back (we include override to 3277 * get a full raise factor). 3278 */ 3279 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3280 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3281 (timely_says == 0) || 3282 (rack_down_raise_thresh == 0)) { 3283 /* 3284 * Do an override up in b/w if we were 3285 * below the threshold or if the threshold 3286 * is zero we always do the raise. 3287 */ 3288 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3289 } else { 3290 /* Log it stays the same */ 3291 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3292 __LINE__, 11); 3293 } 3294 rack->rc_gp_timely_dec_cnt++; 3295 /* We are not incrementing really no-count */ 3296 rack->rc_gp_incr = 0; 3297 rack->rc_gp_timely_inc_cnt = 0; 3298 } else { 3299 /* 3300 * Lets just use the RTT 3301 * information and give up 3302 * pushing. 3303 */ 3304 goto use_timely; 3305 } 3306 } else if ((timely_says != 2) && 3307 !losses && 3308 (last_bw_est > up_bnd)) { 3309 /* 3310 * We are increasing b/w lets keep going, updating 3311 * our b/w and ignoring any timely input, unless 3312 * of course we are at our max raise (if there is one). 3313 */ 3314 3315 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3316 __LINE__, 3); 3317 rack->r_ctl.last_gp_comp_bw = cur_bw; 3318 if (rack->rc_gp_saw_ss && 3319 rack_per_upper_bound_ss && 3320 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3321 /* 3322 * In cases where we can't go higher 3323 * we should just use timely. 3324 */ 3325 goto use_timely; 3326 } 3327 if (rack->rc_gp_saw_ca && 3328 rack_per_upper_bound_ca && 3329 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3330 /* 3331 * In cases where we can't go higher 3332 * we should just use timely. 
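 *
 * To make the up_bnd/low_bnd comparison used throughout this
 * function concrete (numbers here are made up): if the last
 * comparison b/w was 100 Mbps and rack_gp_per_bw_mul_up and
 * rack_gp_per_bw_mul_down were 2 and 4 percent, up_bnd would
 * be 102 Mbps and low_bnd 96 Mbps; estimates falling inside
 * that band end up in the use_timely case below.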
3333 */
3334 goto use_timely;
3335 }
3336 rack->rc_gp_bwred = 0;
3337 rack->rc_gp_timely_dec_cnt = 0;
3338 /* You get a set number of pushes if timely is trying to reduce */
3339 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
3340 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3341 } else {
3342 /* Log it stays the same */
3343 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0,
3344 __LINE__, 12);
3345 }
3346 return;
3347 } else {
3348 /*
3349 * We are staying between the lower and upper range bounds
3350 * so use timely to decide.
3351 */
3352 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
3353 __LINE__, 3);
3354 use_timely:
3355 if (timely_says) {
3356 rack->rc_gp_incr = 0;
3357 rack->rc_gp_timely_inc_cnt = 0;
3358 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
3359 !losses &&
3360 (last_bw_est < low_bnd)) {
3361 /* We are losing ground */
3362 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3363 rack->rc_gp_timely_dec_cnt++;
3364 /* We are not incrementing really no-count */
3365 rack->rc_gp_incr = 0;
3366 rack->rc_gp_timely_inc_cnt = 0;
3367 } else
3368 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
3369 } else {
3370 rack->rc_gp_bwred = 0;
3371 rack->rc_gp_timely_dec_cnt = 0;
3372 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
3373 }
3374 }
3375 }
3376
3377 static int32_t
3378 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
3379 {
3380 int32_t timely_says;
3381 uint64_t log_mult, log_rtt_a_diff;
3382
3383 log_rtt_a_diff = rtt;
3384 log_rtt_a_diff <<= 32;
3385 log_rtt_a_diff |= (uint32_t)rtt_diff;
3386 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
3387 rack_gp_rtt_maxmul)) {
3388 /* Reduce the b/w multiplier */
3389 timely_says = 2;
3390 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
3391 log_mult <<= 32;
3392 log_mult |= prev_rtt;
3393 rack_log_timely(rack, timely_says, log_mult,
3394 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3395 log_rtt_a_diff, __LINE__, 4);
3396 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
3397 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
3398 max(rack_gp_rtt_mindiv, 1)))) {
3399 /* Increase the b/w multiplier */
3400 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
3401 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
3402 max(rack_gp_rtt_mindiv, 1));
3403 log_mult <<= 32;
3404 log_mult |= prev_rtt;
3405 timely_says = 0;
3406 rack_log_timely(rack, timely_says, log_mult,
3407 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
3408 log_rtt_a_diff, __LINE__, 5);
3409 } else {
3410 /*
3411 * Use a gradient to decide; the timely gradient
3412 * is:
3413 * grad = rc_rtt_diff / min_rtt;
3414 *
3415 * anything below or equal to 0 will be
3416 * an increase indication. Anything above
3417 * zero is a decrease. Note we take care
3418 * of the actual gradient calculation
3419 * in the reduction (it's not needed for
3420 * increase).
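 *
 * For example, with a filtered min_rtt of 20000 usecs an
 * rc_rtt_diff of +2000 usecs gives a gradient of 0.1 and a
 * decrease indication, while an rc_rtt_diff of -2000 usecs
 * (the rtt is shrinking) gives a negative gradient and an
 * increase indication.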
3421 */ 3422 log_mult = prev_rtt; 3423 if (rtt_diff <= 0) { 3424 /* 3425 * Rttdiff is less than zero, increase the 3426 * b/w multipler (its 0 or negative) 3427 */ 3428 timely_says = 0; 3429 rack_log_timely(rack, timely_says, log_mult, 3430 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 3431 } else { 3432 /* Reduce the b/w multipler */ 3433 timely_says = 1; 3434 rack_log_timely(rack, timely_says, log_mult, 3435 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 3436 } 3437 } 3438 return (timely_says); 3439 } 3440 3441 static void 3442 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 3443 tcp_seq th_ack, int line) 3444 { 3445 uint64_t tim, bytes_ps, ltim, stim, utim; 3446 uint32_t segsiz, bytes, reqbytes, us_cts; 3447 int32_t gput, new_rtt_diff, timely_says; 3448 3449 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3450 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3451 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 3452 tim = us_cts - tp->gput_ts; 3453 else 3454 tim = 0; 3455 3456 if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) 3457 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 3458 else 3459 stim = 0; 3460 /* 3461 * Use the larger of the send time or ack time. This prevents us 3462 * from being influenced by ack artifacts to come up with too 3463 * high of measurement. Note that since we are spanning over many more 3464 * bytes in most of our measurements hopefully that is less likely to 3465 * occur. 3466 */ 3467 if (tim > stim) 3468 utim = max(tim, 1); 3469 else 3470 utim = max(stim, 1); 3471 /* Lets validate utim */ 3472 ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); 3473 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 3474 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 3475 if ((tim == 0) && (stim == 0)) { 3476 /* 3477 * Invalid measurement time, maybe 3478 * all on one ack/one send? 3479 */ 3480 bytes = 0; 3481 bytes_ps = 0; 3482 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3483 0, 0, 0, 10, __LINE__, NULL); 3484 goto skip_measurement; 3485 } 3486 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 3487 /* We never made a us_rtt measurement? */ 3488 bytes = 0; 3489 bytes_ps = 0; 3490 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3491 0, 0, 0, 10, __LINE__, NULL); 3492 goto skip_measurement; 3493 } 3494 /* 3495 * Calculate the maximum possible b/w this connection 3496 * could have. We base our calculation on the lowest 3497 * rtt we have seen during the measurement and the 3498 * largest rwnd the client has given us in that time. This 3499 * forms a BDP that is the maximum that we could ever 3500 * get to the client. Anything larger is not valid. 3501 * 3502 * I originally had code here that rejected measurements 3503 * where the time was less than 1/2 the latest us_rtt. 3504 * But after thinking on that I realized its wrong since 3505 * say you had a 150Mbps or even 1Gbps link, and you 3506 * were a long way away.. example I am in Europe (100ms rtt) 3507 * talking to my 1Gbps link in S.C. Now measuring say 150,000 3508 * bytes my time would be 1.2ms, and yet my rtt would say 3509 * the measurement was invalid the time was < 50ms. The 3510 * same thing is true for 150Mb (8ms of time). 3511 * 3512 * A better way I realized is to look at what the maximum 3513 * the connection could possibly do. This is gated on 3514 * the lowest RTT we have seen and the highest rwnd. 
3515 * We should in theory never exceed that, if we are 3516 * then something on the path is storing up packets 3517 * and then feeding them all at once to our endpoint 3518 * messing up our measurement. 3519 */ 3520 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 3521 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 3522 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 3523 if (SEQ_LT(th_ack, tp->gput_seq)) { 3524 /* No measurement can be made */ 3525 bytes = 0; 3526 bytes_ps = 0; 3527 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3528 0, 0, 0, 10, __LINE__, NULL); 3529 goto skip_measurement; 3530 } else 3531 bytes = (th_ack - tp->gput_seq); 3532 bytes_ps = (uint64_t)bytes; 3533 /* 3534 * Don't measure a b/w for pacing unless we have gotten at least 3535 * an initial windows worth of data in this measurement interval. 3536 * 3537 * Small numbers of bytes get badly influenced by delayed ack and 3538 * other artifacts. Note we take the initial window or our 3539 * defined minimum GP (defaulting to 10 which hopefully is the 3540 * IW). 3541 */ 3542 if (rack->rc_gp_filled == 0) { 3543 /* 3544 * The initial estimate is special. We 3545 * have blasted out an IW worth of packets 3546 * without a real valid ack ts results. We 3547 * then setup the app_limited_needs_set flag, 3548 * this should get the first ack in (probably 2 3549 * MSS worth) to be recorded as the timestamp. 3550 * We thus allow a smaller number of bytes i.e. 3551 * IW - 2MSS. 3552 */ 3553 reqbytes -= (2 * segsiz); 3554 /* Also lets fill previous for our first measurement to be neutral */ 3555 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3556 } 3557 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 3558 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3559 rack->r_ctl.rc_app_limited_cnt, 3560 0, 0, 10, __LINE__, NULL); 3561 goto skip_measurement; 3562 } 3563 /* 3564 * We now need to calculate the Timely like status so 3565 * we can update (possibly) the b/w multipliers. 3566 */ 3567 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 3568 if (rack->rc_gp_filled == 0) { 3569 /* No previous reading */ 3570 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 3571 } else { 3572 if (rack->measure_saw_probe_rtt == 0) { 3573 /* 3574 * We don't want a probertt to be counted 3575 * since it will be negative incorrectly. We 3576 * expect to be reducing the RTT when we 3577 * pace at a slower rate. 3578 */ 3579 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 3580 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 3581 } 3582 } 3583 timely_says = rack_make_timely_judgement(rack, 3584 rack->r_ctl.rc_gp_srtt, 3585 rack->r_ctl.rc_rtt_diff, 3586 rack->r_ctl.rc_prev_gp_srtt 3587 ); 3588 bytes_ps *= HPTS_USEC_IN_SEC; 3589 bytes_ps /= utim; 3590 if (bytes_ps > rack->r_ctl.last_max_bw) { 3591 /* 3592 * Something is on path playing 3593 * since this b/w is not possible based 3594 * on our BDP (highest rwnd and lowest rtt 3595 * we saw in the measurement window). 3596 * 3597 * Another option here would be to 3598 * instead skip the measurement. 
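 *
 * As a worked example of the cap: with a largest rwnd of
 * 1,000,000 bytes and a lowest rtt of 10,000 usecs seen in
 * the window, last_max_bw = 1,000,000 * 1,000,000 / 10,000
 * = 100,000,000 bytes/sec (about 800 Mbps); any bytes_ps
 * above that is clamped rather than believed.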
3599 */
3600 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
3601 bytes_ps, rack->r_ctl.last_max_bw, 0,
3602 11, __LINE__, NULL);
3603 bytes_ps = rack->r_ctl.last_max_bw;
3604 }
3605 /* We store gp for b/w in bytes per second */
3606 if (rack->rc_gp_filled == 0) {
3607 /* Initial measurement */
3608 if (bytes_ps) {
3609 rack->r_ctl.gp_bw = bytes_ps;
3610 rack->rc_gp_filled = 1;
3611 rack->r_ctl.num_avg = 1;
3612 rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
3613 } else {
3614 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3615 rack->r_ctl.rc_app_limited_cnt,
3616 0, 0, 10, __LINE__, NULL);
3617 }
3618 if (rack->rc_inp->inp_in_hpts &&
3619 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
3620 /*
3621 * Ok we can't trust the pacer in this case
3622 * where we transition from un-paced to paced.
3623 * Or for that matter when the burst mitigation
3624 * was making a wild guess and got it wrong.
3625 * Stop the pacer and clear up all the aggregate
3626 * delays etc.
3627 */
3628 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3629 rack->r_ctl.rc_hpts_flags = 0;
3630 rack->r_ctl.rc_last_output_to = 0;
3631 }
3632 } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) {
3633 /* Still a small number, run an average */
3634 rack->r_ctl.gp_bw += bytes_ps;
3635 rack->r_ctl.num_avg++;
3636 if (rack->r_ctl.num_avg >= RACK_REQ_AVG) {
3637 /* We have collected enough to move forward */
3638 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg;
3639 }
3640 } else {
3641 /*
3642 * We want to take 1/wma of the goodput and add it in to 7/8th
3643 * of the old value weighted by the srtt. So if your measurement
3644 * period is say 2 SRTT's long you would get 1/4 as the
3645 * value, if it was like 1/2 SRTT then you would get 1/16th.
3646 *
3647 * But we must be careful not to take too much i.e. if the
3648 * srtt is say 20ms and the measurement is taken over
3649 * 400ms our weight would be 400/20 i.e. 20. On the
3650 * other hand if we get a measurement over 1ms with a
3651 * 10ms rtt we only want to take a much smaller portion.
3652 */
3653 uint64_t resid_bw, subpart, addpart, srtt;
3654
3655 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
3656 if (srtt == 0) {
3657 /*
3658 * Strange, why did t_srtt go back to zero?
3659 */
3660 if (rack->r_ctl.rc_rack_min_rtt)
3661 srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC);
3662 else
3663 srtt = HPTS_USEC_IN_MSEC;
3664 }
3665 /*
3666 * XXXrrs: Note for reviewers, in playing with
3667 * dynamic pacing I discovered this GP calculation
3668 * as done originally leads to some undesired results.
3669 * Basically you can get longer measurements contributing
3670 * too much to the WMA. Thus I changed it if you are doing
3671 * dynamic adjustments to only do the apportioned adjustment
3672 * if we have a very small (time wise) measurement. Longer
3673 * measurements just get their weight (defaulting to 1/8)
3674 * added to the WMA. We may want to think about changing
3675 * this to always do that for both sides i.e. dynamic
3676 * and non-dynamic... but considering lots of folks
3677 * were playing with this I did not want to change the
3678 * calculation per se without your thoughts... Lawrence?
3679 * Peter??
3680 */
3681 if (rack->rc_gp_dyn_mul == 0) {
3682 subpart = rack->r_ctl.gp_bw * utim;
3683 subpart /= (srtt * 8);
3684 if (subpart < (rack->r_ctl.gp_bw / 2)) {
3685 /*
3686 * The b/w update takes no more
3687 * away than 1/2 our running total
3688 * so factor it in.
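 *
 * A small worked example of this non-dynamic branch: if the
 * measurement interval equals exactly one srtt then
 * subpart = gp_bw * srtt / (srtt * 8) = gp_bw / 8 and
 * addpart = bytes_ps / 8, i.e. the classic 7/8 old + 1/8 new
 * moving average; a 2-srtt interval weights the new sample
 * at 2/8, and so on until the 1/2 clamp in the else branch
 * below takes over. The dynamic branch further down applies
 * the same idea but scales short measurements by utim/srtt
 * against rack_wma_divisor.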
3689 */ 3690 addpart = bytes_ps * utim; 3691 addpart /= (srtt * 8); 3692 } else { 3693 /* 3694 * Don't allow a single measurement 3695 * to account for more than 1/2 of the 3696 * WMA. This could happen on a retransmission 3697 * where utim becomes huge compared to 3698 * srtt (multiple retransmissions when using 3699 * the sending rate which factors in all the 3700 * transmissions from the first one). 3701 */ 3702 subpart = rack->r_ctl.gp_bw / 2; 3703 addpart = bytes_ps / 2; 3704 } 3705 resid_bw = rack->r_ctl.gp_bw - subpart; 3706 rack->r_ctl.gp_bw = resid_bw + addpart; 3707 } else { 3708 if ((utim / srtt) <= 1) { 3709 /* 3710 * The b/w update was over a small period 3711 * of time. The idea here is to prevent a small 3712 * measurement time period from counting 3713 * too much. So we scale it based on the 3714 * time so it attributes less than 1/rack_wma_divisor 3715 * of its measurement. 3716 */ 3717 subpart = rack->r_ctl.gp_bw * utim; 3718 subpart /= (srtt * rack_wma_divisor); 3719 addpart = bytes_ps * utim; 3720 addpart /= (srtt * rack_wma_divisor); 3721 } else { 3722 /* 3723 * The scaled measurement was long 3724 * enough so lets just add in the 3725 * portion of the measurment i.e. 1/rack_wma_divisor 3726 */ 3727 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 3728 addpart = bytes_ps / rack_wma_divisor; 3729 } 3730 if ((rack->measure_saw_probe_rtt == 0) || 3731 (bytes_ps > rack->r_ctl.gp_bw)) { 3732 /* 3733 * For probe-rtt we only add it in 3734 * if its larger, all others we just 3735 * add in. 3736 */ 3737 resid_bw = rack->r_ctl.gp_bw - subpart; 3738 rack->r_ctl.gp_bw = resid_bw + addpart; 3739 } 3740 } 3741 } 3742 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 3743 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 3744 rack_update_multiplier(rack, timely_says, bytes_ps, 3745 rack->r_ctl.rc_gp_srtt, 3746 rack->r_ctl.rc_rtt_diff); 3747 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 3748 rack_get_bw(rack), 3, line, NULL); 3749 /* reset the gp srtt and setup the new prev */ 3750 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3751 /* Record the lost count for the next measurement */ 3752 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 3753 /* 3754 * We restart our diffs based on the gpsrtt in the 3755 * measurement window. 3756 */ 3757 rack->rc_gp_rtt_set = 0; 3758 rack->rc_gp_saw_rec = 0; 3759 rack->rc_gp_saw_ca = 0; 3760 rack->rc_gp_saw_ss = 0; 3761 rack->rc_dragged_bottom = 0; 3762 skip_measurement: 3763 3764 #ifdef STATS 3765 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3766 gput); 3767 /* 3768 * XXXLAS: This is a temporary hack, and should be 3769 * chained off VOI_TCP_GPUT when stats(9) grows an 3770 * API to deal with chained VOIs. 3771 */ 3772 if (tp->t_stats_gput_prev > 0) 3773 stats_voi_update_abs_s32(tp->t_stats, 3774 VOI_TCP_GPUT_ND, 3775 ((gput - tp->t_stats_gput_prev) * 100) / 3776 tp->t_stats_gput_prev); 3777 #endif 3778 tp->t_flags &= ~TF_GPUTINPROG; 3779 tp->t_stats_gput_prev = gput; 3780 /* 3781 * Now are we app limited now and there is space from where we 3782 * were to where we want to go? 3783 * 3784 * We don't do the other case i.e. non-applimited here since 3785 * the next send will trigger us picking up the missing data. 
3786 */ 3787 if (rack->r_ctl.rc_first_appl && 3788 TCPS_HAVEESTABLISHED(tp->t_state) && 3789 rack->r_ctl.rc_app_limited_cnt && 3790 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 3791 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 3792 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3793 /* 3794 * Yep there is enough outstanding to make a measurement here. 3795 */ 3796 struct rack_sendmap *rsm, fe; 3797 3798 tp->t_flags |= TF_GPUTINPROG; 3799 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 3800 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 3801 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3802 rack->app_limited_needs_set = 0; 3803 tp->gput_seq = th_ack; 3804 if (rack->in_probe_rtt) 3805 rack->measure_saw_probe_rtt = 1; 3806 else if ((rack->measure_saw_probe_rtt) && 3807 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 3808 rack->measure_saw_probe_rtt = 0; 3809 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 3810 /* There is a full window to gain info from */ 3811 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 3812 } else { 3813 /* We can only measure up to the applimited point */ 3814 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 3815 } 3816 /* 3817 * Now we need to find the timestamp of the send at tp->gput_seq 3818 * for the send based measurement. 3819 */ 3820 fe.r_start = tp->gput_seq; 3821 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3822 if (rsm) { 3823 /* Ok send-based limit is set */ 3824 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 3825 /* 3826 * Move back to include the earlier part 3827 * so our ack time lines up right (this may 3828 * make an overlapping measurement but thats 3829 * ok). 3830 */ 3831 tp->gput_seq = rsm->r_start; 3832 } 3833 if (rsm->r_flags & RACK_ACKED) 3834 tp->gput_ts = rsm->r_ack_arrival; 3835 else 3836 rack->app_limited_needs_set = 1; 3837 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 3838 } else { 3839 /* 3840 * If we don't find the rsm due to some 3841 * send-limit set the current time, which 3842 * basically disables the send-limit. 
3843 */ 3844 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 3845 } 3846 rack_log_pacing_delay_calc(rack, 3847 tp->gput_seq, 3848 tp->gput_ack, 3849 (uint64_t)rsm, 3850 tp->gput_ts, 3851 rack->r_ctl.rc_app_limited_cnt, 3852 9, 3853 __LINE__, NULL); 3854 } 3855 } 3856 3857 /* 3858 * CC wrapper hook functions 3859 */ 3860 static void 3861 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 3862 uint16_t type, int32_t recovery) 3863 { 3864 INP_WLOCK_ASSERT(tp->t_inpcb); 3865 tp->ccv->nsegs = nsegs; 3866 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 3867 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 3868 uint32_t max; 3869 3870 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 3871 if (tp->ccv->bytes_this_ack > max) { 3872 tp->ccv->bytes_this_ack = max; 3873 } 3874 } 3875 if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) 3876 tp->ccv->flags |= CCF_CWND_LIMITED; 3877 else 3878 tp->ccv->flags &= ~CCF_CWND_LIMITED; 3879 #ifdef STATS 3880 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 3881 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 3882 #endif 3883 if ((tp->t_flags & TF_GPUTINPROG) && 3884 rack_enough_for_measurement(tp, rack, th->th_ack)) { 3885 /* Measure the Goodput */ 3886 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); 3887 #ifdef NETFLIX_PEAKRATE 3888 if ((type == CC_ACK) && 3889 (tp->t_maxpeakrate)) { 3890 /* 3891 * We update t_peakrate_thr. This gives us roughly 3892 * one update per round trip time. Note 3893 * it will only be used if pace_always is off i.e 3894 * we don't do this for paced flows. 3895 */ 3896 tcp_update_peakrate_thr(tp); 3897 } 3898 #endif 3899 } 3900 if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { 3901 tp->t_bytes_acked += tp->ccv->bytes_this_ack; 3902 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 3903 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 3904 tp->ccv->flags |= CCF_ABC_SENTAWND; 3905 } 3906 } else { 3907 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3908 tp->t_bytes_acked = 0; 3909 } 3910 if (CC_ALGO(tp)->ack_received != NULL) { 3911 /* XXXLAS: Find a way to live without this */ 3912 tp->ccv->curack = th->th_ack; 3913 CC_ALGO(tp)->ack_received(tp->ccv, type); 3914 } 3915 #ifdef STATS 3916 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 3917 #endif 3918 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 3919 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 3920 } 3921 #ifdef NETFLIX_PEAKRATE 3922 /* we enforce max peak rate if it is set and we are not pacing */ 3923 if ((rack->rc_always_pace == 0) && 3924 tp->t_peakrate_thr && 3925 (tp->snd_cwnd > tp->t_peakrate_thr)) { 3926 tp->snd_cwnd = tp->t_peakrate_thr; 3927 } 3928 #endif 3929 } 3930 3931 static void 3932 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 3933 { 3934 struct tcp_rack *rack; 3935 3936 rack = (struct tcp_rack *)tp->t_fb_ptr; 3937 INP_WLOCK_ASSERT(tp->t_inpcb); 3938 /* 3939 * If we are doing PRR and have enough 3940 * room to send <or> we are pacing and prr 3941 * is disabled we will want to see if we 3942 * can send data (by setting r_wanted_output to 3943 * true). 
3944 */ 3945 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 3946 rack->rack_no_prr) 3947 rack->r_wanted_output = 1; 3948 } 3949 3950 static void 3951 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 3952 { 3953 struct tcp_rack *rack; 3954 uint32_t orig_cwnd; 3955 3956 orig_cwnd = tp->snd_cwnd; 3957 INP_WLOCK_ASSERT(tp->t_inpcb); 3958 rack = (struct tcp_rack *)tp->t_fb_ptr; 3959 if (rack->rc_not_backing_off == 0) { 3960 /* only alert CC if we alerted when we entered */ 3961 if (CC_ALGO(tp)->post_recovery != NULL) { 3962 tp->ccv->curack = th->th_ack; 3963 CC_ALGO(tp)->post_recovery(tp->ccv); 3964 } 3965 if (tp->snd_cwnd > tp->snd_ssthresh) { 3966 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 3967 tp->snd_cwnd = tp->snd_ssthresh; 3968 } 3969 } 3970 if ((rack->rack_no_prr == 0) && 3971 (rack->r_ctl.rc_prr_sndcnt > 0)) { 3972 /* Suck the next prr cnt back into cwnd */ 3973 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 3974 rack->r_ctl.rc_prr_sndcnt = 0; 3975 rack_log_to_prr(rack, 1, 0); 3976 } 3977 rack_log_to_prr(rack, 14, orig_cwnd); 3978 tp->snd_recover = tp->snd_una; 3979 EXIT_RECOVERY(tp->t_flags); 3980 } 3981 3982 static void 3983 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 3984 { 3985 struct tcp_rack *rack; 3986 3987 INP_WLOCK_ASSERT(tp->t_inpcb); 3988 3989 rack = (struct tcp_rack *)tp->t_fb_ptr; 3990 switch (type) { 3991 case CC_NDUPACK: 3992 tp->t_flags &= ~TF_WASFRECOVERY; 3993 tp->t_flags &= ~TF_WASCRECOVERY; 3994 if (!IN_FASTRECOVERY(tp->t_flags)) { 3995 rack->r_ctl.rc_prr_delivered = 0; 3996 rack->r_ctl.rc_prr_out = 0; 3997 if (rack->rack_no_prr == 0) { 3998 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 3999 rack_log_to_prr(rack, 2, 0); 4000 } 4001 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4002 tp->snd_recover = tp->snd_max; 4003 if (tp->t_flags2 & TF2_ECN_PERMIT) 4004 tp->t_flags2 |= TF2_ECN_SND_CWR; 4005 } 4006 break; 4007 case CC_ECN: 4008 if (!IN_CONGRECOVERY(tp->t_flags) || 4009 /* 4010 * Allow ECN reaction on ACK to CWR, if 4011 * that data segment was also CE marked. 4012 */ 4013 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 4014 EXIT_CONGRECOVERY(tp->t_flags); 4015 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4016 tp->snd_recover = tp->snd_max + 1; 4017 if (tp->t_flags2 & TF2_ECN_PERMIT) 4018 tp->t_flags2 |= TF2_ECN_SND_CWR; 4019 } 4020 break; 4021 case CC_RTO: 4022 tp->t_dupacks = 0; 4023 tp->t_bytes_acked = 0; 4024 EXIT_RECOVERY(tp->t_flags); 4025 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4026 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4027 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4028 if (tp->t_flags2 & TF2_ECN_PERMIT) 4029 tp->t_flags2 |= TF2_ECN_SND_CWR; 4030 break; 4031 case CC_RTO_ERR: 4032 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4033 /* RTO was unnecessary, so reset everything. */ 4034 tp->snd_cwnd = tp->snd_cwnd_prev; 4035 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4036 tp->snd_recover = tp->snd_recover_prev; 4037 if (tp->t_flags & TF_WASFRECOVERY) { 4038 ENTER_FASTRECOVERY(tp->t_flags); 4039 tp->t_flags &= ~TF_WASFRECOVERY; 4040 } 4041 if (tp->t_flags & TF_WASCRECOVERY) { 4042 ENTER_CONGRECOVERY(tp->t_flags); 4043 tp->t_flags &= ~TF_WASCRECOVERY; 4044 } 4045 tp->snd_nxt = tp->snd_max; 4046 tp->t_badrxtwin = 0; 4047 break; 4048 } 4049 /* 4050 * If we are below our max rtt, don't 4051 * signal the CC control to change things. 4052 * instead set it up so that we are in 4053 * recovery but not going to back off. 
4054 */ 4055 4056 if (rack->rc_highly_buffered) { 4057 /* 4058 * Do we use the higher rtt for 4059 * our threshold to not backoff (like CDG)? 4060 */ 4061 uint32_t rtt_mul, rtt_div; 4062 4063 if (rack_use_max_for_nobackoff) { 4064 rtt_mul = (rack_gp_rtt_maxmul - 1); 4065 rtt_div = 1; 4066 } else { 4067 rtt_mul = rack_gp_rtt_minmul; 4068 rtt_div = max(rack_gp_rtt_mindiv , 1); 4069 } 4070 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + 4071 ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / 4072 rtt_div))) { 4073 /* below our min threshold */ 4074 rack->rc_not_backing_off = 1; 4075 ENTER_RECOVERY(rack->rc_tp->t_flags); 4076 rack_log_rtt_shrinks(rack, 0, 4077 rtt_mul, 4078 rtt_div, 4079 RACK_RTTS_NOBACKOFF); 4080 return; 4081 } 4082 } 4083 rack->rc_not_backing_off = 0; 4084 if (CC_ALGO(tp)->cong_signal != NULL) { 4085 if (th != NULL) 4086 tp->ccv->curack = th->th_ack; 4087 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4088 } 4089 } 4090 4091 static inline void 4092 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4093 { 4094 uint32_t i_cwnd; 4095 4096 INP_WLOCK_ASSERT(tp->t_inpcb); 4097 4098 #ifdef NETFLIX_STATS 4099 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4100 if (tp->t_state == TCPS_ESTABLISHED) 4101 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4102 #endif 4103 if (CC_ALGO(tp)->after_idle != NULL) 4104 CC_ALGO(tp)->after_idle(tp->ccv); 4105 4106 if (tp->snd_cwnd == 1) 4107 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4108 else 4109 i_cwnd = rc_init_window(rack); 4110 4111 /* 4112 * Being idle is no differnt than the initial window. If the cc 4113 * clamps it down below the initial window raise it to the initial 4114 * window. 4115 */ 4116 if (tp->snd_cwnd < i_cwnd) { 4117 tp->snd_cwnd = i_cwnd; 4118 } 4119 } 4120 4121 /* 4122 * Indicate whether this ack should be delayed. We can delay the ack if 4123 * following conditions are met: 4124 * - There is no delayed ack timer in progress. 4125 * - Our last ack wasn't a 0-sized window. We never want to delay 4126 * the ack that opens up a 0-sized window. 4127 * - LRO wasn't used for this segment. We make sure by checking that the 4128 * segment size is not larger than the MSS. 4129 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4130 * connection. 4131 */ 4132 #define DELAY_ACK(tp, tlen) \ 4133 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4134 ((tp->t_flags & TF_DELACK) == 0) && \ 4135 (tlen <= tp->t_maxseg) && \ 4136 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4137 4138 static struct rack_sendmap * 4139 rack_find_lowest_rsm(struct tcp_rack *rack) 4140 { 4141 struct rack_sendmap *rsm; 4142 4143 /* 4144 * Walk the time-order transmitted list looking for an rsm that is 4145 * not acked. This will be the one that was sent the longest time 4146 * ago that is still outstanding. 4147 */ 4148 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4149 if (rsm->r_flags & RACK_ACKED) { 4150 continue; 4151 } 4152 goto finish; 4153 } 4154 finish: 4155 return (rsm); 4156 } 4157 4158 static struct rack_sendmap * 4159 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 4160 { 4161 struct rack_sendmap *prsm; 4162 4163 /* 4164 * Walk the sequence order list backward until we hit and arrive at 4165 * the highest seq not acked. In theory when this is called it 4166 * should be the last segment (which it was not). 
4167 */
4168 counter_u64_add(rack_find_high, 1);
4169 prsm = rsm;
4170 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
4171 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
4172 continue;
4173 }
4174 return (prsm);
4175 }
4176 return (NULL);
4177 }
4178
4179 static uint32_t
4180 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
4181 {
4182 int32_t lro;
4183 uint32_t thresh;
4184
4185 /*
4186 * lro is the flag we use to determine if we have seen reordering.
4187 * If it gets set we have seen reordering. The reorder logic either
4188 * works in one of two ways:
4189 *
4190 * If reorder-fade is configured, then we track the last time we saw
4191 * re-ordering occur. If we reach the point where enough time has
4192 * passed we no longer consider reordering to be occurring.
4193 *
4194 * Or if reorder-fade is 0, then once we see reordering we consider
4195 * the connection to always be subject to reordering and just set lro
4196 * to 1.
4197 *
4198 * In the end if lro is non-zero we add the extra time for
4199 * reordering in.
4200 */
4201 if (srtt == 0)
4202 srtt = 1;
4203 if (rack->r_ctl.rc_reorder_ts) {
4204 if (rack->r_ctl.rc_reorder_fade) {
4205 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
4206 lro = cts - rack->r_ctl.rc_reorder_ts;
4207 if (lro == 0) {
4208 /*
4209 * No time has passed since the last
4210 * reorder, mark it as reordering.
4211 */
4212 lro = 1;
4213 }
4214 } else {
4215 /* Negative time? */
4216 lro = 0;
4217 }
4218 if (lro > rack->r_ctl.rc_reorder_fade) {
4219 /* Turn off reordering seen too */
4220 rack->r_ctl.rc_reorder_ts = 0;
4221 lro = 0;
4222 }
4223 } else {
4224 /* Reordering does not fade */
4225 lro = 1;
4226 }
4227 } else {
4228 lro = 0;
4229 }
4230 thresh = srtt + rack->r_ctl.rc_pkt_delay;
4231 if (lro) {
4232 /* It must be set, if not you get 1/4 rtt */
4233 if (rack->r_ctl.rc_reorder_shift)
4234 thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
4235 else
4236 thresh += (srtt >> 2);
4237 } else {
4238 thresh += 1;
4239 }
4240 /* We don't let the rack timeout be above an RTO */
4241 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
4242 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
4243 }
4244 /* And we don't want it above the RTO max either */
4245 if (thresh > rack_rto_max) {
4246 thresh = rack_rto_max;
4247 }
4248 return (thresh);
4249 }
4250
4251 static uint32_t
4252 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
4253 struct rack_sendmap *rsm, uint32_t srtt)
4254 {
4255 struct rack_sendmap *prsm;
4256 uint32_t thresh, len;
4257 int segsiz;
4258
4259 if (srtt == 0)
4260 srtt = 1;
4261 if (rack->r_ctl.rc_tlp_threshold)
4262 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
4263 else
4264 thresh = (srtt * 2);
4265
4266 /* Get the previous sent packet, if any */
4267 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4268 counter_u64_add(rack_enter_tlp_calc, 1);
4269 len = rsm->r_end - rsm->r_start;
4270 if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
4271 /* Exactly like the ID */
4272 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
4273 uint32_t alt_thresh;
4274 /*
4275 * Compensate for delayed-ack with the d-ack time.
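 *
 * For instance, with an srtt of 40 ms and a delayed-ack
 * allowance (rack_delayed_ack_time) of, say, 200 ms,
 * alt_thresh works out to 40 + 20 + 200 = 260 ms and the
 * TLP threshold is raised to that if the base calculation
 * above was smaller.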
4276 */ 4277 counter_u64_add(rack_used_tlpmethod, 1); 4278 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4279 if (alt_thresh > thresh) 4280 thresh = alt_thresh; 4281 } 4282 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 4283 /* 2.1 behavior */ 4284 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 4285 if (prsm && (len <= segsiz)) { 4286 /* 4287 * Two packets outstanding, thresh should be (2*srtt) + 4288 * possible inter-packet delay (if any). 4289 */ 4290 uint32_t inter_gap = 0; 4291 int idx, nidx; 4292 4293 counter_u64_add(rack_used_tlpmethod, 1); 4294 idx = rsm->r_rtr_cnt - 1; 4295 nidx = prsm->r_rtr_cnt - 1; 4296 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4297 /* Yes it was sent later (or at the same time) */ 4298 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4299 } 4300 thresh += inter_gap; 4301 } else if (len <= segsiz) { 4302 /* 4303 * Possibly compensate for delayed-ack. 4304 */ 4305 uint32_t alt_thresh; 4306 4307 counter_u64_add(rack_used_tlpmethod2, 1); 4308 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4309 if (alt_thresh > thresh) 4310 thresh = alt_thresh; 4311 } 4312 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 4313 /* 2.2 behavior */ 4314 if (len <= segsiz) { 4315 uint32_t alt_thresh; 4316 /* 4317 * Compensate for delayed-ack with the d-ack time. 4318 */ 4319 counter_u64_add(rack_used_tlpmethod, 1); 4320 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4321 if (alt_thresh > thresh) 4322 thresh = alt_thresh; 4323 } 4324 } 4325 /* Not above an RTO */ 4326 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 4327 thresh = TICKS_2_MSEC(tp->t_rxtcur); 4328 } 4329 /* Not above a RTO max */ 4330 if (thresh > rack_rto_max) { 4331 thresh = rack_rto_max; 4332 } 4333 /* Apply user supplied min TLP */ 4334 if (thresh < rack_tlp_min) { 4335 thresh = rack_tlp_min; 4336 } 4337 return (thresh); 4338 } 4339 4340 static uint32_t 4341 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 4342 { 4343 /* 4344 * We want the rack_rtt which is the 4345 * last rtt we measured. However if that 4346 * does not exist we fallback to the srtt (which 4347 * we probably will never do) and then as a last 4348 * resort we use RACK_INITIAL_RTO if no srtt is 4349 * yet set. 4350 */ 4351 if (rack->rc_rack_rtt) 4352 return(rack->rc_rack_rtt); 4353 else if (tp->t_srtt == 0) 4354 return(RACK_INITIAL_RTO); 4355 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 4356 } 4357 4358 static struct rack_sendmap * 4359 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 4360 { 4361 /* 4362 * Check to see that we don't need to fall into recovery. We will 4363 * need to do so if our oldest transmit is past the time we should 4364 * have had an ack. 
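 *
 * In other words, recovery is entered once
 * (tsused - r_tim_lastsent) for the oldest outstanding
 * segment reaches the threshold from rack_calc_thresh_rack()
 * above: srtt plus the configured packet delay, plus an
 * extra srtt >> rc_reorder_shift when reordering has been
 * seen.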
4365 */ 4366 struct tcp_rack *rack; 4367 struct rack_sendmap *rsm; 4368 int32_t idx; 4369 uint32_t srtt, thresh; 4370 4371 rack = (struct tcp_rack *)tp->t_fb_ptr; 4372 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 4373 return (NULL); 4374 } 4375 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4376 if (rsm == NULL) 4377 return (NULL); 4378 4379 if (rsm->r_flags & RACK_ACKED) { 4380 rsm = rack_find_lowest_rsm(rack); 4381 if (rsm == NULL) 4382 return (NULL); 4383 } 4384 idx = rsm->r_rtr_cnt - 1; 4385 srtt = rack_grab_rtt(tp, rack); 4386 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 4387 if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { 4388 return (NULL); 4389 } 4390 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 4391 return (NULL); 4392 } 4393 /* Ok if we reach here we are over-due and this guy can be sent */ 4394 if (IN_RECOVERY(tp->t_flags) == 0) { 4395 /* 4396 * For the one that enters us into recovery record undo 4397 * info. 4398 */ 4399 rack->r_ctl.rc_rsm_start = rsm->r_start; 4400 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4401 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4402 } 4403 rack_cong_signal(tp, NULL, CC_NDUPACK); 4404 return (rsm); 4405 } 4406 4407 static uint32_t 4408 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 4409 { 4410 int32_t t; 4411 int32_t tt; 4412 uint32_t ret_val; 4413 4414 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 4415 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 4416 rack_persist_min, rack_persist_max); 4417 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 4418 tp->t_rxtshift++; 4419 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 4420 ret_val = (uint32_t)tt; 4421 return (ret_val); 4422 } 4423 4424 static uint32_t 4425 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 4426 { 4427 /* 4428 * Start the FR timer, we do this based on getting the first one in 4429 * the rc_tmap. Note that if its NULL we must stop the timer. in all 4430 * events we need to stop the running timer (if its running) before 4431 * starting the new one. 
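 *
 * The value returned is how long, in milliseconds, the
 * chosen timer (RXT, RACK, TLP or persist) should run; a
 * return of 0 means no timer needs to be armed.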
4432 */ 4433 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 4434 uint32_t srtt_cur; 4435 int32_t idx; 4436 int32_t is_tlp_timer = 0; 4437 struct rack_sendmap *rsm; 4438 4439 if (rack->t_timers_stopped) { 4440 /* All timers have been stopped none are to run */ 4441 return (0); 4442 } 4443 if (rack->rc_in_persist) { 4444 /* We can't start any timer in persists */ 4445 return (rack_get_persists_timer_val(tp, rack)); 4446 } 4447 rack->rc_on_min_to = 0; 4448 if ((tp->t_state < TCPS_ESTABLISHED) || 4449 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 4450 goto activate_rxt; 4451 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4452 if ((rsm == NULL) || sup_rack) { 4453 /* Nothing on the send map */ 4454 activate_rxt: 4455 time_since_sent = 0; 4456 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4457 if (rsm) { 4458 idx = rsm->r_rtr_cnt - 1; 4459 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4460 tstmp_touse = rsm->r_tim_lastsent[idx]; 4461 else 4462 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4463 if (TSTMP_GT(cts, tstmp_touse)) 4464 time_since_sent = cts - tstmp_touse; 4465 } 4466 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4467 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 4468 to = TICKS_2_MSEC(tp->t_rxtcur); 4469 if (to > time_since_sent) 4470 to -= time_since_sent; 4471 else 4472 to = rack->r_ctl.rc_min_to; 4473 if (to == 0) 4474 to = 1; 4475 return (to); 4476 } 4477 return (0); 4478 } 4479 if (rsm->r_flags & RACK_ACKED) { 4480 rsm = rack_find_lowest_rsm(rack); 4481 if (rsm == NULL) { 4482 /* No lowest? */ 4483 goto activate_rxt; 4484 } 4485 } 4486 if (rack->sack_attack_disable) { 4487 /* 4488 * We don't want to do 4489 * any TLP's if you are an attacker. 4490 * Though if you are doing what 4491 * is expected you may still have 4492 * SACK-PASSED marks. 4493 */ 4494 goto activate_rxt; 4495 } 4496 /* Convert from ms to usecs */ 4497 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 4498 if ((tp->t_flags & TF_SENTFIN) && 4499 ((tp->snd_max - tp->snd_una) == 1) && 4500 (rsm->r_flags & RACK_HAS_FIN)) { 4501 /* 4502 * We don't start a rack timer if all we have is a 4503 * FIN outstanding. 4504 */ 4505 goto activate_rxt; 4506 } 4507 if ((rack->use_rack_rr == 0) && 4508 (IN_RECOVERY(tp->t_flags)) && 4509 (rack->rack_no_prr == 0) && 4510 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 4511 /* 4512 * We are not cheating, in recovery and 4513 * not enough ack's to yet get our next 4514 * retransmission out. 4515 * 4516 * Note that classified attackers do not 4517 * get to use the rack-cheat. 4518 */ 4519 goto activate_tlp; 4520 } 4521 srtt = rack_grab_rtt(tp, rack); 4522 thresh = rack_calc_thresh_rack(rack, srtt, cts); 4523 idx = rsm->r_rtr_cnt - 1; 4524 exp = rsm->r_tim_lastsent[idx] + thresh; 4525 if (SEQ_GEQ(exp, cts)) { 4526 to = exp - cts; 4527 if (to < rack->r_ctl.rc_min_to) { 4528 to = rack->r_ctl.rc_min_to; 4529 if (rack->r_rr_config == 3) 4530 rack->rc_on_min_to = 1; 4531 } 4532 } else { 4533 to = rack->r_ctl.rc_min_to; 4534 if (rack->r_rr_config == 3) 4535 rack->rc_on_min_to = 1; 4536 } 4537 } else { 4538 /* Ok we need to do a TLP not RACK */ 4539 activate_tlp: 4540 if ((rack->rc_tlp_in_progress != 0) && 4541 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 4542 /* 4543 * The previous send was a TLP and we have sent 4544 * N TLP's without sending new data. 
4545 */ 4546 goto activate_rxt; 4547 } 4548 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 4549 if (rsm == NULL) { 4550 /* We found no rsm to TLP with. */ 4551 goto activate_rxt; 4552 } 4553 if (rsm->r_flags & RACK_HAS_FIN) { 4554 /* If its a FIN we dont do TLP */ 4555 rsm = NULL; 4556 goto activate_rxt; 4557 } 4558 idx = rsm->r_rtr_cnt - 1; 4559 time_since_sent = 0; 4560 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4561 tstmp_touse = rsm->r_tim_lastsent[idx]; 4562 else 4563 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4564 if (TSTMP_GT(cts, tstmp_touse)) 4565 time_since_sent = cts - tstmp_touse; 4566 is_tlp_timer = 1; 4567 if (tp->t_srtt) { 4568 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 4569 srtt = TICKS_2_MSEC(srtt_cur); 4570 } else 4571 srtt = RACK_INITIAL_RTO; 4572 /* 4573 * If the SRTT is not keeping up and the 4574 * rack RTT has spiked we want to use 4575 * the last RTT not the smoothed one. 4576 */ 4577 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) 4578 srtt = rack_grab_rtt(tp, rack); 4579 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 4580 if (thresh > time_since_sent) 4581 to = thresh - time_since_sent; 4582 else { 4583 to = rack->r_ctl.rc_min_to; 4584 rack_log_alt_to_to_cancel(rack, 4585 thresh, /* flex1 */ 4586 time_since_sent, /* flex2 */ 4587 tstmp_touse, /* flex3 */ 4588 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 4589 rsm->r_tim_lastsent[idx], 4590 srtt, 4591 idx, 99); 4592 } 4593 if (to > TCPTV_REXMTMAX) { 4594 /* 4595 * If the TLP time works out to larger than the max 4596 * RTO lets not do TLP.. just RTO. 4597 */ 4598 goto activate_rxt; 4599 } 4600 } 4601 if (is_tlp_timer == 0) { 4602 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 4603 } else { 4604 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 4605 } 4606 if (to == 0) 4607 to = 1; 4608 return (to); 4609 } 4610 4611 static void 4612 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4613 { 4614 if (rack->rc_in_persist == 0) { 4615 if (tp->t_flags & TF_GPUTINPROG) { 4616 /* 4617 * Stop the goodput now, the calling of the 4618 * measurement function clears the flag. 4619 */ 4620 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 4621 } 4622 #ifdef NETFLIX_SHARED_CWND 4623 if (rack->r_ctl.rc_scw) { 4624 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4625 rack->rack_scwnd_is_idle = 1; 4626 } 4627 #endif 4628 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 4629 if (rack->r_ctl.rc_went_idle_time == 0) 4630 rack->r_ctl.rc_went_idle_time = 1; 4631 rack_timer_cancel(tp, rack, cts, __LINE__); 4632 tp->t_rxtshift = 0; 4633 rack->rc_in_persist = 1; 4634 } 4635 } 4636 4637 static void 4638 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4639 { 4640 if (rack->rc_inp->inp_in_hpts) { 4641 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4642 rack->r_ctl.rc_hpts_flags = 0; 4643 } 4644 #ifdef NETFLIX_SHARED_CWND 4645 if (rack->r_ctl.rc_scw) { 4646 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4647 rack->rack_scwnd_is_idle = 0; 4648 } 4649 #endif 4650 if (rack->rc_gp_dyn_mul && 4651 (rack->use_fixed_rate == 0) && 4652 (rack->rc_always_pace)) { 4653 /* 4654 * Do we count this as if a probe-rtt just 4655 * finished? 
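 *
 * The test below, with illustrative numbers: idle_min starts
 * at rack_min_probertt_hold and, when the gpsrtt divisor is
 * set, grows by gp_srtt * cnt_mul / cnt_div (e.g. a 50,000
 * usec gp_srtt with a 3/1 ratio adds 150,000 usecs); if we
 * sat idle in persists at least that long, the idle period
 * is counted as having satisfied probe-rtt.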
4656 */ 4657 uint32_t time_idle, idle_min; 4658 4659 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 4660 idle_min = rack_min_probertt_hold; 4661 if (rack_probertt_gpsrtt_cnt_div) { 4662 uint64_t extra; 4663 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 4664 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 4665 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 4666 idle_min += (uint32_t)extra; 4667 } 4668 if (time_idle >= idle_min) { 4669 /* Yes, we count it as a probe-rtt. */ 4670 uint32_t us_cts; 4671 4672 us_cts = tcp_get_usecs(NULL); 4673 if (rack->in_probe_rtt == 0) { 4674 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4675 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 4676 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 4677 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 4678 } else { 4679 rack_exit_probertt(rack, us_cts); 4680 } 4681 } 4682 } 4683 rack->rc_in_persist = 0; 4684 rack->r_ctl.rc_went_idle_time = 0; 4685 tp->t_rxtshift = 0; 4686 rack->r_ctl.rc_agg_delayed = 0; 4687 rack->r_early = 0; 4688 rack->r_late = 0; 4689 rack->r_ctl.rc_agg_early = 0; 4690 } 4691 4692 static void 4693 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 4694 struct hpts_diag *diag, struct timeval *tv) 4695 { 4696 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 4697 union tcp_log_stackspecific log; 4698 4699 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4700 log.u_bbr.flex1 = diag->p_nxt_slot; 4701 log.u_bbr.flex2 = diag->p_cur_slot; 4702 log.u_bbr.flex3 = diag->slot_req; 4703 log.u_bbr.flex4 = diag->inp_hptsslot; 4704 log.u_bbr.flex5 = diag->slot_remaining; 4705 log.u_bbr.flex6 = diag->need_new_to; 4706 log.u_bbr.flex7 = diag->p_hpts_active; 4707 log.u_bbr.flex8 = diag->p_on_min_sleep; 4708 /* Hijack other fields as needed */ 4709 log.u_bbr.epoch = diag->have_slept; 4710 log.u_bbr.lt_epoch = diag->yet_to_sleep; 4711 log.u_bbr.pkts_out = diag->co_ret; 4712 log.u_bbr.applimited = diag->hpts_sleep_time; 4713 log.u_bbr.delivered = diag->p_prev_slot; 4714 log.u_bbr.inflight = diag->p_runningtick; 4715 log.u_bbr.bw_inuse = diag->wheel_tick; 4716 log.u_bbr.rttProp = diag->wheel_cts; 4717 log.u_bbr.timeStamp = cts; 4718 log.u_bbr.delRate = diag->maxticks; 4719 log.u_bbr.cur_del_rate = diag->p_curtick; 4720 log.u_bbr.cur_del_rate <<= 32; 4721 log.u_bbr.cur_del_rate |= diag->p_lasttick; 4722 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4723 &rack->rc_inp->inp_socket->so_rcv, 4724 &rack->rc_inp->inp_socket->so_snd, 4725 BBR_LOG_HPTSDIAG, 0, 4726 0, &log, false, tv); 4727 } 4728 4729 } 4730 4731 static void 4732 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 4733 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 4734 { 4735 struct hpts_diag diag; 4736 struct inpcb *inp; 4737 struct timeval tv; 4738 uint32_t delayed_ack = 0; 4739 uint32_t hpts_timeout; 4740 uint8_t stopped; 4741 uint32_t left = 0; 4742 uint32_t us_cts; 4743 4744 inp = tp->t_inpcb; 4745 if ((tp->t_state == TCPS_CLOSED) || 4746 (tp->t_state == TCPS_LISTEN)) { 4747 return; 4748 } 4749 if (inp->inp_in_hpts) { 4750 /* Already on the pacer */ 4751 return; 4752 } 4753 stopped = rack->rc_tmr_stopped; 4754 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 4755 left = rack->r_ctl.rc_timer_exp - cts; 4756 } 4757 rack->r_ctl.rc_timer_exp = 0; 4758 rack->r_ctl.rc_hpts_flags = 0; 4759 us_cts = tcp_get_usecs(&tv); 4760 /* Now early/late accounting */ 4761 if (rack->r_early) { 4762 /* 4763 * We have a early carry over set, 4764 * 
we can always add more time so we 4765 * can always make this compensation. 4766 */ 4767 slot += rack->r_ctl.rc_agg_early; 4768 rack->r_early = 0; 4769 rack->r_ctl.rc_agg_early = 0; 4770 } 4771 if (rack->r_late) { 4772 /* 4773 * This is harder, we can 4774 * compensate some but it 4775 * really depends on what 4776 * the current pacing time is. 4777 */ 4778 if (rack->r_ctl.rc_agg_delayed >= slot) { 4779 /* 4780 * We can't compensate for it all. 4781 * And we have to have some time 4782 * on the clock. We always have a min 4783 * 10 slots (10 x 10 i.e. 100 usecs). 4784 */ 4785 if (slot <= HPTS_TICKS_PER_USEC) { 4786 /* We gain delay */ 4787 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 4788 slot = HPTS_TICKS_PER_USEC; 4789 } else { 4790 /* We take off some */ 4791 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 4792 slot = HPTS_TICKS_PER_USEC; 4793 } 4794 } else { 4795 slot -= rack->r_ctl.rc_agg_delayed; 4796 rack->r_ctl.rc_agg_delayed = 0; 4797 /* Make sure we have 100 useconds at minimum */ 4798 if (slot < HPTS_TICKS_PER_USEC) { 4799 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 4800 slot = HPTS_TICKS_PER_USEC; 4801 } 4802 if (rack->r_ctl.rc_agg_delayed == 0) 4803 rack->r_late = 0; 4804 } 4805 } 4806 if (slot) { 4807 /* We are pacing too */ 4808 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 4809 } 4810 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 4811 #ifdef NETFLIX_EXP_DETECTION 4812 if (rack->sack_attack_disable && 4813 (slot < tcp_sad_pacing_interval)) { 4814 /* 4815 * We have a potential attacker on 4816 * the line. We have possibly some 4817 * (or now) pacing time set. We want to 4818 * slow down the processing of sacks by some 4819 * amount (if it is an attacker). Set the default 4820 * slot for attackers in place (unless the orginal 4821 * interval is longer). Its stored in 4822 * micro-seconds, so lets convert to msecs. 4823 */ 4824 slot = tcp_sad_pacing_interval; 4825 } 4826 #endif 4827 if (tp->t_flags & TF_DELACK) { 4828 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 4829 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 4830 } 4831 if (delayed_ack && ((hpts_timeout == 0) || 4832 (delayed_ack < hpts_timeout))) 4833 hpts_timeout = delayed_ack; 4834 else 4835 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4836 /* 4837 * If no timers are going to run and we will fall off the hptsi 4838 * wheel, we resort to a keep-alive timer if its configured. 4839 */ 4840 if ((hpts_timeout == 0) && 4841 (slot == 0)) { 4842 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4843 (tp->t_state <= TCPS_CLOSING)) { 4844 /* 4845 * Ok we have no timer (persists, rack, tlp, rxt or 4846 * del-ack), we don't have segments being paced. So 4847 * all that is left is the keepalive timer. 4848 */ 4849 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 4850 /* Get the established keep-alive time */ 4851 hpts_timeout = TP_KEEPIDLE(tp); 4852 } else { 4853 /* Get the initial setup keep-alive time */ 4854 hpts_timeout = TP_KEEPINIT(tp); 4855 } 4856 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 4857 if (rack->in_probe_rtt) { 4858 /* 4859 * We want to instead not wake up a long time from 4860 * now but to wake up about the time we would 4861 * exit probe-rtt and initiate a keep-alive ack. 4862 * This will get us out of probe-rtt and update 4863 * our min-rtt. 
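 *
 * Since hpts_timeout is kept in milliseconds here, the hold
 * time (stored in microseconds) is divided by
 * HPTS_USEC_IN_MSEC below; a 200,000 usec hold, for example,
 * becomes a 200 ms timeout.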
4864 */ 4865 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); 4866 } 4867 } 4868 } 4869 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 4870 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 4871 /* 4872 * RACK, TLP, persists and RXT timers all are restartable 4873 * based on actions input .. i.e we received a packet (ack 4874 * or sack) and that changes things (rw, or snd_una etc). 4875 * Thus we can restart them with a new value. For 4876 * keep-alive, delayed_ack we keep track of what was left 4877 * and restart the timer with a smaller value. 4878 */ 4879 if (left < hpts_timeout) 4880 hpts_timeout = left; 4881 } 4882 if (hpts_timeout) { 4883 /* 4884 * Hack alert for now we can't time-out over 2,147,483 4885 * seconds (a bit more than 596 hours), which is probably ok 4886 * :). 4887 */ 4888 if (hpts_timeout > 0x7ffffffe) 4889 hpts_timeout = 0x7ffffffe; 4890 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 4891 } 4892 if ((rack->rc_gp_filled == 0) && 4893 (hpts_timeout < slot) && 4894 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 4895 /* 4896 * We have no good estimate yet for the 4897 * old clunky burst mitigation or the 4898 * real pacing. And the tlp or rxt is smaller 4899 * than the pacing calculation. Lets not 4900 * pace that long since we know the calculation 4901 * so far is not accurate. 4902 */ 4903 slot = hpts_timeout; 4904 } 4905 rack->r_ctl.last_pacing_time = slot; 4906 if (slot) { 4907 rack->r_ctl.rc_last_output_to = us_cts + slot; 4908 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4909 if ((rack->rc_gp_filled == 0) || 4910 rack->pacing_longer_than_rtt) { 4911 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 4912 } else { 4913 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4914 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 4915 (rack->r_rr_config != 3)) 4916 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4917 else 4918 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4919 } 4920 } 4921 if ((rack->use_rack_rr) && 4922 (rack->r_rr_config < 2) && 4923 ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { 4924 /* 4925 * Arrange for the hpts to kick back in after the 4926 * t-o if the t-o does not cause a send. 
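 * For example (illustrative numbers): with a 1 msec rack t-o and a
 * 3000 usec pacing slot, the condition below holds and we sleep only
 * for the 1 msec t-o; if that t-o does not cause a send, the hpts can
 * then kick back in for the remaining pacing delay.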
4927 */ 4928 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4929 __LINE__, &diag); 4930 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4931 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4932 } else { 4933 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 4934 __LINE__, &diag); 4935 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4936 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 4937 } 4938 } else if (hpts_timeout) { 4939 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4940 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 4941 /* For a rack timer, don't wake us */ 4942 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4943 if (rack->r_rr_config != 3) 4944 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4945 else 4946 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4947 } else { 4948 /* All other timers wake us up */ 4949 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 4950 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4951 } 4952 } 4953 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4954 __LINE__, &diag); 4955 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4956 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4957 } else { 4958 /* No timer starting */ 4959 #ifdef INVARIANTS 4960 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 4961 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 4962 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 4963 } 4964 #endif 4965 } 4966 rack->rc_tmr_stopped = 0; 4967 if (slot) 4968 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 4969 } 4970 4971 /* 4972 * RACK Timer, here we simply do logging and house keeping. 4973 * the normal rack_output() function will call the 4974 * appropriate thing to check if we need to do a RACK retransmit. 4975 * We return 1, saying don't proceed with rack_output only 4976 * when all timers have been stopped (destroyed PCB?). 4977 */ 4978 static int 4979 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4980 { 4981 /* 4982 * This timer simply provides an internal trigger to send out data. 4983 * The check_recovery_mode call will see if there are needed 4984 * retransmissions, if so we will enter fast-recovery. The output 4985 * call may or may not do the same thing depending on sysctl 4986 * settings. 4987 */ 4988 struct rack_sendmap *rsm; 4989 int32_t recovery; 4990 4991 if (tp->t_timers->tt_flags & TT_STOPPED) { 4992 return (1); 4993 } 4994 recovery = IN_RECOVERY(tp->t_flags); 4995 counter_u64_add(rack_to_tot, 1); 4996 if (rack->r_state && (rack->r_state != tp->t_state)) 4997 rack_set_state(tp, rack); 4998 rack->rc_on_min_to = 0; 4999 rsm = rack_check_recovery_mode(tp, cts); 5000 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5001 if (rsm) { 5002 uint32_t rtt; 5003 5004 rack->r_ctl.rc_resend = rsm; 5005 if (rack->use_rack_rr) { 5006 /* 5007 * Don't accumulate extra pacing delay 5008 * we are allowing the rack timer to 5009 * over-ride pacing i.e. rrr takes precedence 5010 * if the pacing interval is longer than the rrr 5011 * time (in other words we get the min pacing 5012 * time versus rrr pacing time). 
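 * For example (illustrative numbers): if the pacing calculation wanted
 * 40 msec between sends but rrr allows a retransmission every 1 msec,
 * the override below lets the retransmission go out on the 1 msec rrr
 * schedule rather than waiting out the full pacing interval.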
5013 */ 5014 rack->r_timer_override = 1; 5015 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5016 } 5017 rtt = rack->rc_rack_rtt; 5018 if (rtt == 0) 5019 rtt = 1; 5020 if (rack->rack_no_prr == 0) { 5021 if ((recovery == 0) && 5022 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5023 /* 5024 * The rack-timeout that enter's us into recovery 5025 * will force out one MSS and set us up so that we 5026 * can do one more send in 2*rtt (transitioning the 5027 * rack timeout into a rack-tlp). 5028 */ 5029 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5030 rack->r_timer_override = 1; 5031 rack_log_to_prr(rack, 3, 0); 5032 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 5033 rack->use_rack_rr) { 5034 /* 5035 * When a rack timer goes, if the rack rr is 5036 * on, arrange it so we can send a full segment 5037 * overriding prr (though we pay a price for this 5038 * for future new sends). 5039 */ 5040 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5041 rack_log_to_prr(rack, 4, 0); 5042 } 5043 } 5044 } 5045 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5046 if (rsm == NULL) { 5047 /* restart a timer and return 1 */ 5048 rack_start_hpts_timer(rack, tp, cts, 5049 0, 0, 0); 5050 return (1); 5051 } 5052 return (0); 5053 } 5054 5055 static __inline void 5056 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5057 struct rack_sendmap *rsm, uint32_t start) 5058 { 5059 int idx; 5060 5061 nrsm->r_start = start; 5062 nrsm->r_end = rsm->r_end; 5063 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5064 nrsm->r_flags = rsm->r_flags; 5065 nrsm->r_dupack = rsm->r_dupack; 5066 nrsm->usec_orig_send = rsm->usec_orig_send; 5067 nrsm->r_rtr_bytes = 0; 5068 rsm->r_end = nrsm->r_start; 5069 nrsm->r_just_ret = rsm->r_just_ret; 5070 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 5071 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 5072 } 5073 } 5074 5075 static struct rack_sendmap * 5076 rack_merge_rsm(struct tcp_rack *rack, 5077 struct rack_sendmap *l_rsm, 5078 struct rack_sendmap *r_rsm) 5079 { 5080 /* 5081 * We are merging two ack'd RSM's, 5082 * the l_rsm is on the left (lower seq 5083 * values) and the r_rsm is on the right 5084 * (higher seq value). The simplest way 5085 * to merge these is to move the right 5086 * one into the left. I don't think there 5087 * is any reason we need to try to find 5088 * the oldest (or last oldest retransmitted). 5089 */ 5090 struct rack_sendmap *rm; 5091 5092 l_rsm->r_end = r_rsm->r_end; 5093 if (l_rsm->r_dupack < r_rsm->r_dupack) 5094 l_rsm->r_dupack = r_rsm->r_dupack; 5095 if (r_rsm->r_rtr_bytes) 5096 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 5097 if (r_rsm->r_in_tmap) { 5098 /* This really should not happen */ 5099 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 5100 r_rsm->r_in_tmap = 0; 5101 } 5102 5103 /* Now the flags */ 5104 if (r_rsm->r_flags & RACK_HAS_FIN) 5105 l_rsm->r_flags |= RACK_HAS_FIN; 5106 if (r_rsm->r_flags & RACK_TLP) 5107 l_rsm->r_flags |= RACK_TLP; 5108 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 5109 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 5110 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 5111 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 5112 /* 5113 * If both are app-limited then let the 5114 * free lower the count. If right is app 5115 * limited and left is not, transfer. 
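 * For example (illustrative sequence ranges): merging l_rsm
 * [1000, 2000) with r_rsm [2000, 3000) leaves l_rsm covering
 * [1000, 3000); if only r_rsm carried RACK_APP_LIMITED (a send made
 * while application limited), the flag moves to the surviving l_rsm
 * and rc_first_appl is repointed so the app-limited accounting still
 * references a live map entry.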
5116 */ 5117 l_rsm->r_flags |= RACK_APP_LIMITED; 5118 r_rsm->r_flags &= ~RACK_APP_LIMITED; 5119 if (r_rsm == rack->r_ctl.rc_first_appl) 5120 rack->r_ctl.rc_first_appl = l_rsm; 5121 } 5122 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 5123 #ifdef INVARIANTS 5124 if (rm != r_rsm) { 5125 panic("removing head in rack:%p rsm:%p rm:%p", 5126 rack, r_rsm, rm); 5127 } 5128 #endif 5129 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 5130 /* Transfer the split limit to the map we free */ 5131 r_rsm->r_limit_type = l_rsm->r_limit_type; 5132 l_rsm->r_limit_type = 0; 5133 } 5134 rack_free(rack, r_rsm); 5135 return(l_rsm); 5136 } 5137 5138 /* 5139 * TLP Timer, here we simply setup what segment we want to 5140 * have the TLP expire on, the normal rack_output() will then 5141 * send it out. 5142 * 5143 * We return 1, saying don't proceed with rack_output only 5144 * when all timers have been stopped (destroyed PCB?). 5145 */ 5146 static int 5147 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5148 { 5149 /* 5150 * Tail Loss Probe. 5151 */ 5152 struct rack_sendmap *rsm = NULL; 5153 struct rack_sendmap *insret; 5154 struct socket *so; 5155 uint32_t amm, old_prr_snd = 0; 5156 uint32_t out, avail; 5157 int collapsed_win = 0; 5158 5159 if (tp->t_timers->tt_flags & TT_STOPPED) { 5160 return (1); 5161 } 5162 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5163 /* Its not time yet */ 5164 return (0); 5165 } 5166 if (ctf_progress_timeout_check(tp, true)) { 5167 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5168 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5169 return (1); 5170 } 5171 /* 5172 * A TLP timer has expired. We have been idle for 2 rtts. So we now 5173 * need to figure out how to force a full MSS segment out. 5174 */ 5175 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 5176 counter_u64_add(rack_tlp_tot, 1); 5177 if (rack->r_state && (rack->r_state != tp->t_state)) 5178 rack_set_state(tp, rack); 5179 so = tp->t_inpcb->inp_socket; 5180 avail = sbavail(&so->so_snd); 5181 out = tp->snd_max - tp->snd_una; 5182 if (out > tp->snd_wnd) { 5183 /* special case, we need a retransmission */ 5184 collapsed_win = 1; 5185 goto need_retran; 5186 } 5187 /* 5188 * Check our send oldest always settings, and if 5189 * there is an oldest to send jump to the need_retran. 5190 */ 5191 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 5192 goto need_retran; 5193 5194 if (avail > out) { 5195 /* New data is available */ 5196 amm = avail - out; 5197 if (amm > ctf_fixed_maxseg(tp)) { 5198 amm = ctf_fixed_maxseg(tp); 5199 if ((amm + out) > tp->snd_wnd) { 5200 /* We are rwnd limited */ 5201 goto need_retran; 5202 } 5203 } else if (amm < ctf_fixed_maxseg(tp)) { 5204 /* not enough to fill a MTU */ 5205 goto need_retran; 5206 } 5207 if (IN_RECOVERY(tp->t_flags)) { 5208 /* Unlikely */ 5209 if (rack->rack_no_prr == 0) { 5210 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 5211 if (out + amm <= tp->snd_wnd) { 5212 rack->r_ctl.rc_prr_sndcnt = amm; 5213 rack_log_to_prr(rack, 4, 0); 5214 } 5215 } else 5216 goto need_retran; 5217 } else { 5218 /* Set the send-new override */ 5219 if (out + amm <= tp->snd_wnd) 5220 rack->r_ctl.rc_tlp_new_data = amm; 5221 else 5222 goto need_retran; 5223 } 5224 rack->r_ctl.rc_tlpsend = NULL; 5225 counter_u64_add(rack_tlp_newdata, 1); 5226 goto send; 5227 } 5228 need_retran: 5229 /* 5230 * Ok we need to arrange the last un-acked segment to be re-sent, or 5231 * optionally the first un-acked segment. 
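 * For example (illustrative sequence ranges): with three outstanding
 * maps [100, 1100), [1100, 2100) and [2100, 3100), the default path
 * below probes with the highest one, [2100, 3100) (RB_MAX, skipping
 * anything already ACKED), while rack_always_send_oldest instead
 * probes with the oldest entry on the tmap.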
5232 */ 5233 if (collapsed_win == 0) { 5234 if (rack_always_send_oldest) 5235 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5236 else { 5237 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5238 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 5239 rsm = rack_find_high_nonack(rack, rsm); 5240 } 5241 } 5242 if (rsm == NULL) { 5243 counter_u64_add(rack_tlp_does_nada, 1); 5244 #ifdef TCP_BLACKBOX 5245 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5246 #endif 5247 goto out; 5248 } 5249 } else { 5250 /* 5251 * We must find the last segment 5252 * that was acceptable by the client. 5253 */ 5254 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5255 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 5256 /* Found one */ 5257 break; 5258 } 5259 } 5260 if (rsm == NULL) { 5261 /* None? if so send the first */ 5262 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5263 if (rsm == NULL) { 5264 counter_u64_add(rack_tlp_does_nada, 1); 5265 #ifdef TCP_BLACKBOX 5266 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5267 #endif 5268 goto out; 5269 } 5270 } 5271 } 5272 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 5273 /* 5274 * We need to split this the last segment in two. 5275 */ 5276 struct rack_sendmap *nrsm; 5277 5278 nrsm = rack_alloc_full_limit(rack); 5279 if (nrsm == NULL) { 5280 /* 5281 * No memory to split, we will just exit and punt 5282 * off to the RXT timer. 5283 */ 5284 counter_u64_add(rack_tlp_does_nada, 1); 5285 goto out; 5286 } 5287 rack_clone_rsm(rack, nrsm, rsm, 5288 (rsm->r_end - ctf_fixed_maxseg(tp))); 5289 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5290 #ifdef INVARIANTS 5291 if (insret != NULL) { 5292 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5293 nrsm, insret, rack, rsm); 5294 } 5295 #endif 5296 if (rsm->r_in_tmap) { 5297 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5298 nrsm->r_in_tmap = 1; 5299 } 5300 rsm->r_flags &= (~RACK_HAS_FIN); 5301 rsm = nrsm; 5302 } 5303 rack->r_ctl.rc_tlpsend = rsm; 5304 send: 5305 rack->r_timer_override = 1; 5306 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5307 return (0); 5308 out: 5309 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5310 return (0); 5311 } 5312 5313 /* 5314 * Delayed ack Timer, here we simply need to setup the 5315 * ACK_NOW flag and remove the DELACK flag. From there 5316 * the output routine will send the ack out. 5317 * 5318 * We only return 1, saying don't proceed, if all timers 5319 * are stopped (destroyed PCB?). 5320 */ 5321 static int 5322 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5323 { 5324 if (tp->t_timers->tt_flags & TT_STOPPED) { 5325 return (1); 5326 } 5327 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 5328 tp->t_flags &= ~TF_DELACK; 5329 tp->t_flags |= TF_ACKNOW; 5330 KMOD_TCPSTAT_INC(tcps_delack); 5331 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5332 return (0); 5333 } 5334 5335 /* 5336 * Persists timer, here we simply send the 5337 * same thing as a keepalive will. 5338 * the one byte send. 5339 * 5340 * We only return 1, saying don't proceed, if all timers 5341 * are stopped (destroyed PCB?). 
5342 */ 5343 static int 5344 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5345 { 5346 struct tcptemp *t_template; 5347 struct inpcb *inp; 5348 int32_t retval = 1; 5349 5350 inp = tp->t_inpcb; 5351 5352 if (tp->t_timers->tt_flags & TT_STOPPED) { 5353 return (1); 5354 } 5355 if (rack->rc_in_persist == 0) 5356 return (0); 5357 if (ctf_progress_timeout_check(tp, false)) { 5358 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5359 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5360 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5361 return (1); 5362 } 5363 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 5364 /* 5365 * Persistence timer into zero window. Force a byte to be output, if 5366 * possible. 5367 */ 5368 KMOD_TCPSTAT_INC(tcps_persisttimeo); 5369 /* 5370 * Hack: if the peer is dead/unreachable, we do not time out if the 5371 * window is closed. After a full backoff, drop the connection if 5372 * the idle time (no responses to probes) reaches the maximum 5373 * backoff that we would use if retransmitting. 5374 */ 5375 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 5376 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 5377 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 5378 KMOD_TCPSTAT_INC(tcps_persistdrop); 5379 retval = 1; 5380 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5381 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5382 goto out; 5383 } 5384 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 5385 tp->snd_una == tp->snd_max) 5386 rack_exit_persist(tp, rack, cts); 5387 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 5388 /* 5389 * If the user has closed the socket then drop a persisting 5390 * connection after a much reduced timeout. 5391 */ 5392 if (tp->t_state > TCPS_CLOSE_WAIT && 5393 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 5394 retval = 1; 5395 KMOD_TCPSTAT_INC(tcps_persistdrop); 5396 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5397 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5398 goto out; 5399 } 5400 t_template = tcpip_maketemplate(rack->rc_inp); 5401 if (t_template) { 5402 /* only set it if we were answered */ 5403 if (rack->forced_ack == 0) { 5404 rack->forced_ack = 1; 5405 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5406 } 5407 tcp_respond(tp, t_template->tt_ipgen, 5408 &t_template->tt_t, (struct mbuf *)NULL, 5409 tp->rcv_nxt, tp->snd_una - 1, 0); 5410 /* This sends an ack */ 5411 if (tp->t_flags & TF_DELACK) 5412 tp->t_flags &= ~TF_DELACK; 5413 free(t_template, M_TEMP); 5414 } 5415 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5416 tp->t_rxtshift++; 5417 out: 5418 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 5419 rack_start_hpts_timer(rack, tp, cts, 5420 0, 0, 0); 5421 return (retval); 5422 } 5423 5424 /* 5425 * If a keepalive goes off, we had no other timers 5426 * happening. We always return 1 here since this 5427 * routine either drops the connection or sends 5428 * out a segment with respond. 5429 */ 5430 static int 5431 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5432 { 5433 struct tcptemp *t_template; 5434 struct inpcb *inp; 5435 5436 if (tp->t_timers->tt_flags & TT_STOPPED) { 5437 return (1); 5438 } 5439 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 5440 inp = tp->t_inpcb; 5441 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 5442 /* 5443 * Keep-alive timer went off; send something or drop connection if 5444 * idle for too long. 
5445 */ 5446 KMOD_TCPSTAT_INC(tcps_keeptimeo); 5447 if (tp->t_state < TCPS_ESTABLISHED) 5448 goto dropit; 5449 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5450 tp->t_state <= TCPS_CLOSING) { 5451 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 5452 goto dropit; 5453 /* 5454 * Send a packet designed to force a response if the peer is 5455 * up and reachable: either an ACK if the connection is 5456 * still alive, or an RST if the peer has closed the 5457 * connection due to timeout or reboot. Using sequence 5458 * number tp->snd_una-1 causes the transmitted zero-length 5459 * segment to lie outside the receive window; by the 5460 * protocol spec, this requires the correspondent TCP to 5461 * respond. 5462 */ 5463 KMOD_TCPSTAT_INC(tcps_keepprobe); 5464 t_template = tcpip_maketemplate(inp); 5465 if (t_template) { 5466 if (rack->forced_ack == 0) { 5467 rack->forced_ack = 1; 5468 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5469 } 5470 tcp_respond(tp, t_template->tt_ipgen, 5471 &t_template->tt_t, (struct mbuf *)NULL, 5472 tp->rcv_nxt, tp->snd_una - 1, 0); 5473 free(t_template, M_TEMP); 5474 } 5475 } 5476 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 5477 return (1); 5478 dropit: 5479 KMOD_TCPSTAT_INC(tcps_keepdrops); 5480 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 5481 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5482 return (1); 5483 } 5484 5485 /* 5486 * Retransmit helper function, clear up all the ack 5487 * flags and take care of important book keeping. 5488 */ 5489 static void 5490 rack_remxt_tmr(struct tcpcb *tp) 5491 { 5492 /* 5493 * The retransmit timer went off, all sack'd blocks must be 5494 * un-acked. 5495 */ 5496 struct rack_sendmap *rsm, *trsm = NULL; 5497 struct tcp_rack *rack; 5498 int32_t cnt = 0; 5499 5500 rack = (struct tcp_rack *)tp->t_fb_ptr; 5501 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 5502 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 5503 if (rack->r_state && (rack->r_state != tp->t_state)) 5504 rack_set_state(tp, rack); 5505 /* 5506 * Ideally we would like to be able to 5507 * mark SACK-PASS on anything not acked here. 5508 * However, if we do that we would burst out 5509 * all that data 1ms apart. This would be unwise, 5510 * so for now we will just let the normal rxt timer 5511 * and tlp timer take care of it. 
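 * For illustration (made-up numbers): if 100 segments of 1448 bytes
 * were outstanding and we marked SACK-PASS on every one of them here,
 * roughly 145 KB of retransmissions would go out only about 1 msec
 * apart, which is exactly the burst we avoid by deferring to the rxt
 * and tlp timers.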
5512 */ 5513 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5514 if (rsm->r_flags & RACK_ACKED) { 5515 cnt++; 5516 rsm->r_dupack = 0; 5517 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5518 if (rsm->r_in_tmap == 0) { 5519 /* We must re-add it back to the tlist */ 5520 if (trsm == NULL) { 5521 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5522 } else { 5523 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 5524 } 5525 rsm->r_in_tmap = 1; 5526 } 5527 } 5528 trsm = rsm; 5529 if (rsm->r_flags & RACK_ACKED) 5530 rsm->r_flags |= RACK_WAS_ACKED; 5531 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 5532 } 5533 /* Clear the count (we just un-acked them) */ 5534 rack->r_ctl.rc_sacked = 0; 5535 rack->r_ctl.rc_agg_delayed = 0; 5536 rack->r_early = 0; 5537 rack->r_ctl.rc_agg_early = 0; 5538 rack->r_late = 0; 5539 /* Clear the tlp rtx mark */ 5540 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5541 rack->r_ctl.rc_prr_sndcnt = 0; 5542 rack_log_to_prr(rack, 6, 0); 5543 rack->r_timer_override = 1; 5544 } 5545 5546 static void 5547 rack_cc_conn_init(struct tcpcb *tp) 5548 { 5549 struct tcp_rack *rack; 5550 5551 rack = (struct tcp_rack *)tp->t_fb_ptr; 5552 cc_conn_init(tp); 5553 /* 5554 * We want a chance to stay in slowstart as 5555 * we create a connection. TCP spec says that 5556 * initially ssthresh is infinite. For our 5557 * purposes that is the snd_wnd. 5558 */ 5559 if (tp->snd_ssthresh < tp->snd_wnd) { 5560 tp->snd_ssthresh = tp->snd_wnd; 5561 } 5562 /* 5563 * We also want to assure a IW worth of 5564 * data can get inflight. 5565 */ 5566 if (rc_init_window(rack) < tp->snd_cwnd) 5567 tp->snd_cwnd = rc_init_window(rack); 5568 } 5569 5570 /* 5571 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 5572 * we will setup to retransmit the lowest seq number outstanding. 5573 */ 5574 static int 5575 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5576 { 5577 int32_t rexmt; 5578 struct inpcb *inp; 5579 int32_t retval = 0; 5580 bool isipv6; 5581 5582 inp = tp->t_inpcb; 5583 if (tp->t_timers->tt_flags & TT_STOPPED) { 5584 return (1); 5585 } 5586 if (ctf_progress_timeout_check(tp, false)) { 5587 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5588 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5589 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5590 return (1); 5591 } 5592 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 5593 if (TCPS_HAVEESTABLISHED(tp->t_state) && 5594 (tp->snd_una == tp->snd_max)) { 5595 /* Nothing outstanding .. nothing to do */ 5596 return (0); 5597 } 5598 /* 5599 * Retransmission timer went off. Message has not been acked within 5600 * retransmit interval. Back off to a longer retransmit interval 5601 * and retransmit one segment. 5602 */ 5603 rack_remxt_tmr(tp); 5604 if ((rack->r_ctl.rc_resend == NULL) || 5605 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 5606 /* 5607 * If the rwnd collapsed on 5608 * the one we are retransmitting 5609 * it does not count against the 5610 * rxt count. 5611 */ 5612 tp->t_rxtshift++; 5613 } 5614 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5615 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5616 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5617 retval = 1; 5618 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5619 tcp_set_inp_to_drop(rack->rc_inp, 5620 (tp->t_softerror ? 
(uint16_t) tp->t_softerror : ETIMEDOUT)); 5621 goto out; 5622 } 5623 if (tp->t_state == TCPS_SYN_SENT) { 5624 /* 5625 * If the SYN was retransmitted, indicate CWND to be limited 5626 * to 1 segment in cc_conn_init(). 5627 */ 5628 tp->snd_cwnd = 1; 5629 } else if (tp->t_rxtshift == 1) { 5630 /* 5631 * first retransmit; record ssthresh and cwnd so they can be 5632 * recovered if this turns out to be a "bad" retransmit. A 5633 * retransmit is considered "bad" if an ACK for this segment 5634 * is received within RTT/2 interval; the assumption here is 5635 * that the ACK was already in flight. See "On Estimating 5636 * End-to-End Network Path Properties" by Allman and Paxson 5637 * for more details. 5638 */ 5639 tp->snd_cwnd_prev = tp->snd_cwnd; 5640 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5641 tp->snd_recover_prev = tp->snd_recover; 5642 if (IN_FASTRECOVERY(tp->t_flags)) 5643 tp->t_flags |= TF_WASFRECOVERY; 5644 else 5645 tp->t_flags &= ~TF_WASFRECOVERY; 5646 if (IN_CONGRECOVERY(tp->t_flags)) 5647 tp->t_flags |= TF_WASCRECOVERY; 5648 else 5649 tp->t_flags &= ~TF_WASCRECOVERY; 5650 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5651 tp->t_flags |= TF_PREVVALID; 5652 } else 5653 tp->t_flags &= ~TF_PREVVALID; 5654 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5655 if ((tp->t_state == TCPS_SYN_SENT) || 5656 (tp->t_state == TCPS_SYN_RECEIVED)) 5657 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 5658 else 5659 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5660 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5661 max(MSEC_2_TICKS(rack_rto_min), rexmt), 5662 MSEC_2_TICKS(rack_rto_max)); 5663 /* 5664 * We enter the path for PLMTUD if connection is established or, if 5665 * connection is FIN_WAIT_1 status, reason for the last is that if 5666 * amount of data we send is very small, we could send it in couple 5667 * of packets and process straight to FIN. In that case we won't 5668 * catch ESTABLISHED state. 5669 */ 5670 #ifdef INET6 5671 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5672 #else 5673 isipv6 = false; 5674 #endif 5675 if (((V_tcp_pmtud_blackhole_detect == 1) || 5676 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5677 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5678 ((tp->t_state == TCPS_ESTABLISHED) || 5679 (tp->t_state == TCPS_FIN_WAIT_1))) { 5680 /* 5681 * Idea here is that at each stage of mtu probe (usually, 5682 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5683 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5684 * should take care of that. 5685 */ 5686 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5687 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5688 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5689 tp->t_rxtshift % 2 == 0)) { 5690 /* 5691 * Enter Path MTU Black-hole Detection mechanism: - 5692 * Disable Path MTU Discovery (IP "DF" bit). - 5693 * Reduce MTU to lower value than what we negotiated 5694 * with peer. 5695 */ 5696 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5697 /* Record that we may have found a black hole. */ 5698 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5699 /* Keep track of previous MSS. */ 5700 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5701 } 5702 5703 /* 5704 * Reduce the MSS to blackhole value or to the 5705 * default in an attempt to retransmit. 5706 */ 5707 #ifdef INET6 5708 if (isipv6 && 5709 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5710 /* Use the sysctl tuneable blackhole MSS. 
*/ 5711 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5712 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5713 } else if (isipv6) { 5714 /* Use the default MSS. */ 5715 tp->t_maxseg = V_tcp_v6mssdflt; 5716 /* 5717 * Disable Path MTU Discovery when we switch 5718 * to minmss. 5719 */ 5720 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5721 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5722 } 5723 #endif 5724 #if defined(INET6) && defined(INET) 5725 else 5726 #endif 5727 #ifdef INET 5728 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5729 /* Use the sysctl tuneable blackhole MSS. */ 5730 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5731 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5732 } else { 5733 /* Use the default MSS. */ 5734 tp->t_maxseg = V_tcp_mssdflt; 5735 /* 5736 * Disable Path MTU Discovery when we switch 5737 * to minmss. 5738 */ 5739 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5740 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5741 } 5742 #endif 5743 } else { 5744 /* 5745 * If further retransmissions are still unsuccessful 5746 * with a lowered MTU, maybe this isn't a blackhole 5747 * and we restore the previous MSS and blackhole 5748 * detection flags. The limit '6' is determined by 5749 * giving each probe stage (1448, 1188, 524) 2 5750 * chances to recover. 5751 */ 5752 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5753 (tp->t_rxtshift >= 6)) { 5754 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5755 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5756 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5757 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5758 } 5759 } 5760 } 5761 /* 5762 * If we backed off this far, our srtt estimate is probably bogus. 5763 * Clobber it so we'll take the next rtt measurement as our srtt; 5764 * move the current srtt into rttvar to keep the current retransmit 5765 * times until then. 5766 */ 5767 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5768 #ifdef INET6 5769 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 5770 in6_losing(tp->t_inpcb); 5771 else 5772 #endif 5773 in_losing(tp->t_inpcb); 5774 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5775 tp->t_srtt = 0; 5776 } 5777 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5778 tp->snd_recover = tp->snd_max; 5779 tp->t_flags |= TF_ACKNOW; 5780 tp->t_rtttime = 0; 5781 rack_cong_signal(tp, NULL, CC_RTO); 5782 out: 5783 return (retval); 5784 } 5785 5786 static int 5787 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 5788 { 5789 int32_t ret = 0; 5790 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5791 5792 if (timers == 0) { 5793 return (0); 5794 } 5795 if (tp->t_state == TCPS_LISTEN) { 5796 /* no timers on listen sockets */ 5797 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5798 return (0); 5799 return (1); 5800 } 5801 if ((timers & PACE_TMR_RACK) && 5802 rack->rc_on_min_to) { 5803 /* 5804 * For the rack timer when we 5805 * are on a min-timeout (which means rrr_conf = 3) 5806 * we don't want to check the timer. It may 5807 * be going off for a pace and thats ok we 5808 * want to send the retransmit (if its ready). 5809 * 5810 * If its on a normal rack timer (non-min) then 5811 * we will check if its expired. 
5812 */ 5813 goto skip_time_check; 5814 } 5815 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5816 uint32_t left; 5817 5818 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5819 ret = -1; 5820 rack_log_to_processing(rack, cts, ret, 0); 5821 return (0); 5822 } 5823 if (hpts_calling == 0) { 5824 /* 5825 * A user send or queued mbuf (sack) has called us? We 5826 * return 0 and let the pacing guards 5827 * deal with it if they should or 5828 * should not cause a send. 5829 */ 5830 ret = -2; 5831 rack_log_to_processing(rack, cts, ret, 0); 5832 return (0); 5833 } 5834 /* 5835 * Ok our timer went off early and we are not paced false 5836 * alarm, go back to sleep. 5837 */ 5838 ret = -3; 5839 left = rack->r_ctl.rc_timer_exp - cts; 5840 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 5841 rack_log_to_processing(rack, cts, ret, left); 5842 return (1); 5843 } 5844 skip_time_check: 5845 rack->rc_tmr_stopped = 0; 5846 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5847 if (timers & PACE_TMR_DELACK) { 5848 ret = rack_timeout_delack(tp, rack, cts); 5849 } else if (timers & PACE_TMR_RACK) { 5850 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5851 ret = rack_timeout_rack(tp, rack, cts); 5852 } else if (timers & PACE_TMR_TLP) { 5853 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5854 ret = rack_timeout_tlp(tp, rack, cts); 5855 } else if (timers & PACE_TMR_RXT) { 5856 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5857 ret = rack_timeout_rxt(tp, rack, cts); 5858 } else if (timers & PACE_TMR_PERSIT) { 5859 ret = rack_timeout_persist(tp, rack, cts); 5860 } else if (timers & PACE_TMR_KEEP) { 5861 ret = rack_timeout_keepalive(tp, rack, cts); 5862 } 5863 rack_log_to_processing(rack, cts, ret, timers); 5864 return (ret); 5865 } 5866 5867 static void 5868 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 5869 { 5870 struct timeval tv; 5871 uint32_t us_cts, flags_on_entry; 5872 uint8_t hpts_removed = 0; 5873 5874 flags_on_entry = rack->r_ctl.rc_hpts_flags; 5875 us_cts = tcp_get_usecs(&tv); 5876 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 5877 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 5878 ((tp->snd_max - tp->snd_una) == 0))) { 5879 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5880 hpts_removed = 1; 5881 /* If we were not delayed cancel out the flag. */ 5882 if ((tp->snd_max - tp->snd_una) == 0) 5883 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5884 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5885 } 5886 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5887 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5888 if (rack->rc_inp->inp_in_hpts && 5889 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 5890 /* 5891 * Canceling timer's when we have no output being 5892 * paced. We also must remove ourselves from the 5893 * hpts. 
5894 */ 5895 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5896 hpts_removed = 1; 5897 } 5898 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 5899 } 5900 if (hpts_removed == 0) 5901 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5902 } 5903 5904 static void 5905 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 5906 { 5907 return; 5908 } 5909 5910 static int 5911 rack_stopall(struct tcpcb *tp) 5912 { 5913 struct tcp_rack *rack; 5914 rack = (struct tcp_rack *)tp->t_fb_ptr; 5915 rack->t_timers_stopped = 1; 5916 return (0); 5917 } 5918 5919 static void 5920 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 5921 { 5922 return; 5923 } 5924 5925 static int 5926 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 5927 { 5928 return (0); 5929 } 5930 5931 static void 5932 rack_stop_all_timers(struct tcpcb *tp) 5933 { 5934 struct tcp_rack *rack; 5935 5936 /* 5937 * Assure no timers are running. 5938 */ 5939 if (tcp_timer_active(tp, TT_PERSIST)) { 5940 /* We enter in persists, set the flag appropriately */ 5941 rack = (struct tcp_rack *)tp->t_fb_ptr; 5942 rack->rc_in_persist = 1; 5943 } 5944 tcp_timer_suspend(tp, TT_PERSIST); 5945 tcp_timer_suspend(tp, TT_REXMT); 5946 tcp_timer_suspend(tp, TT_KEEP); 5947 tcp_timer_suspend(tp, TT_DELACK); 5948 } 5949 5950 static void 5951 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 5952 struct rack_sendmap *rsm, uint32_t ts) 5953 { 5954 int32_t idx; 5955 5956 rsm->r_rtr_cnt++; 5957 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5958 rsm->r_dupack = 0; 5959 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 5960 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 5961 rsm->r_flags |= RACK_OVERMAX; 5962 } 5963 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 5964 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 5965 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 5966 } 5967 idx = rsm->r_rtr_cnt - 1; 5968 rsm->r_tim_lastsent[idx] = ts; 5969 if (rsm->r_flags & RACK_ACKED) { 5970 /* Problably MTU discovery messing with us */ 5971 rsm->r_flags &= ~RACK_ACKED; 5972 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 5973 } 5974 if (rsm->r_in_tmap) { 5975 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5976 rsm->r_in_tmap = 0; 5977 } 5978 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5979 rsm->r_in_tmap = 1; 5980 if (rsm->r_flags & RACK_SACK_PASSED) { 5981 /* We have retransmitted due to the SACK pass */ 5982 rsm->r_flags &= ~RACK_SACK_PASSED; 5983 rsm->r_flags |= RACK_WAS_SACKPASS; 5984 } 5985 } 5986 5987 static uint32_t 5988 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 5989 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) 5990 { 5991 /* 5992 * We (re-)transmitted starting at rsm->r_start for some length 5993 * (possibly less than r_end. 5994 */ 5995 struct rack_sendmap *nrsm, *insret; 5996 uint32_t c_end; 5997 int32_t len; 5998 5999 len = *lenp; 6000 c_end = rsm->r_start + len; 6001 if (SEQ_GEQ(c_end, rsm->r_end)) { 6002 /* 6003 * We retransmitted the whole piece or more than the whole 6004 * slopping into the next rsm. 6005 */ 6006 rack_update_rsm(tp, rack, rsm, ts); 6007 if (c_end == rsm->r_end) { 6008 *lenp = 0; 6009 return (0); 6010 } else { 6011 int32_t act_len; 6012 6013 /* Hangs over the end return whats left */ 6014 act_len = rsm->r_end - rsm->r_start; 6015 *lenp = (len - act_len); 6016 return (rsm->r_end); 6017 } 6018 /* We don't get out of this block. 
*/ 6019 } 6020 /* 6021 * Here we retransmitted less than the whole thing which means we 6022 * have to split this into what was transmitted and what was not. 6023 */ 6024 nrsm = rack_alloc_full_limit(rack); 6025 if (nrsm == NULL) { 6026 /* 6027 * We can't get memory, so lets not proceed. 6028 */ 6029 *lenp = 0; 6030 return (0); 6031 } 6032 /* 6033 * So here we are going to take the original rsm and make it what we 6034 * retransmitted. nrsm will be the tail portion we did not 6035 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 6036 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 6037 * 1, 6 and the new piece will be 6, 11. 6038 */ 6039 rack_clone_rsm(rack, nrsm, rsm, c_end); 6040 nrsm->r_dupack = 0; 6041 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 6042 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6043 #ifdef INVARIANTS 6044 if (insret != NULL) { 6045 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6046 nrsm, insret, rack, rsm); 6047 } 6048 #endif 6049 if (rsm->r_in_tmap) { 6050 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6051 nrsm->r_in_tmap = 1; 6052 } 6053 rsm->r_flags &= (~RACK_HAS_FIN); 6054 rack_update_rsm(tp, rack, rsm, ts); 6055 *lenp = 0; 6056 return (0); 6057 } 6058 6059 static void 6060 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 6061 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 6062 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) 6063 { 6064 struct tcp_rack *rack; 6065 struct rack_sendmap *rsm, *nrsm, *insret, fe; 6066 register uint32_t snd_max, snd_una; 6067 6068 /* 6069 * Add to the RACK log of packets in flight or retransmitted. If 6070 * there is a TS option we will use the TS echoed, if not we will 6071 * grab a TS. 6072 * 6073 * Retransmissions will increment the count and move the ts to its 6074 * proper place. Note that if options do not include TS's then we 6075 * won't be able to effectively use the ACK for an RTT on a retran. 6076 * 6077 * Notes about r_start and r_end. Lets consider a send starting at 6078 * sequence 1 for 10 bytes. In such an example the r_start would be 6079 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 6080 * This means that r_end is actually the first sequence for the next 6081 * slot (11). 6082 * 6083 */ 6084 /* 6085 * If err is set what do we do XXXrrs? should we not add the thing? 6086 * -- i.e. return if err != 0 or should we pretend we sent it? -- 6087 * i.e. proceed with add ** do this for now. 6088 */ 6089 INP_WLOCK_ASSERT(tp->t_inpcb); 6090 if (err) 6091 /* 6092 * We don't log errors -- we could but snd_max does not 6093 * advance in this case either. 6094 */ 6095 return; 6096 6097 if (th_flags & TH_RST) { 6098 /* 6099 * We don't log resets and we return immediately from 6100 * sending 6101 */ 6102 return; 6103 } 6104 rack = (struct tcp_rack *)tp->t_fb_ptr; 6105 snd_una = tp->snd_una; 6106 if (SEQ_LEQ((seq_out + len), snd_una)) { 6107 /* Are sending an old segment to induce an ack (keep-alive)? */ 6108 return; 6109 } 6110 if (SEQ_LT(seq_out, snd_una)) { 6111 /* huh? should we panic? */ 6112 uint32_t end; 6113 6114 end = seq_out + len; 6115 seq_out = snd_una; 6116 if (SEQ_GEQ(end, seq_out)) 6117 len = end - seq_out; 6118 else 6119 len = 0; 6120 } 6121 snd_max = tp->snd_max; 6122 if (th_flags & (TH_SYN | TH_FIN)) { 6123 /* 6124 * The call to rack_log_output is made before bumping 6125 * snd_max. 
This means we can record one extra byte on a SYN 6126 * or FIN if seq_out is adding more on and a FIN is present 6127 * (and we are not resending). 6128 */ 6129 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 6130 len++; 6131 if (th_flags & TH_FIN) 6132 len++; 6133 if (SEQ_LT(snd_max, tp->snd_nxt)) { 6134 /* 6135 * The add/update as not been done for the FIN/SYN 6136 * yet. 6137 */ 6138 snd_max = tp->snd_nxt; 6139 } 6140 } 6141 if (len == 0) { 6142 /* We don't log zero window probes */ 6143 return; 6144 } 6145 rack->r_ctl.rc_time_last_sent = ts; 6146 if (IN_RECOVERY(tp->t_flags)) { 6147 rack->r_ctl.rc_prr_out += len; 6148 } 6149 /* First question is it a retransmission or new? */ 6150 if (seq_out == snd_max) { 6151 /* Its new */ 6152 again: 6153 rsm = rack_alloc(rack); 6154 if (rsm == NULL) { 6155 /* 6156 * Hmm out of memory and the tcb got destroyed while 6157 * we tried to wait. 6158 */ 6159 return; 6160 } 6161 if (th_flags & TH_FIN) { 6162 rsm->r_flags = RACK_HAS_FIN; 6163 } else { 6164 rsm->r_flags = 0; 6165 } 6166 rsm->r_tim_lastsent[0] = ts; 6167 rsm->r_rtr_cnt = 1; 6168 rsm->r_rtr_bytes = 0; 6169 rsm->usec_orig_send = us_cts; 6170 if (th_flags & TH_SYN) { 6171 /* The data space is one beyond snd_una */ 6172 rsm->r_flags |= RACK_HAS_SIN; 6173 rsm->r_start = seq_out + 1; 6174 rsm->r_end = rsm->r_start + (len - 1); 6175 } else { 6176 /* Normal case */ 6177 rsm->r_start = seq_out; 6178 rsm->r_end = rsm->r_start + len; 6179 } 6180 rsm->r_dupack = 0; 6181 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6182 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6183 #ifdef INVARIANTS 6184 if (insret != NULL) { 6185 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6186 nrsm, insret, rack, rsm); 6187 } 6188 #endif 6189 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6190 rsm->r_in_tmap = 1; 6191 /* 6192 * Special case detection, is there just a single 6193 * packet outstanding when we are not in recovery? 6194 * 6195 * If this is true mark it so. 6196 */ 6197 if ((IN_RECOVERY(tp->t_flags) == 0) && 6198 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 6199 struct rack_sendmap *prsm; 6200 6201 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6202 if (prsm) 6203 prsm->r_one_out_nr = 1; 6204 } 6205 return; 6206 } 6207 /* 6208 * If we reach here its a retransmission and we need to find it. 6209 */ 6210 memset(&fe, 0, sizeof(fe)); 6211 more: 6212 if (hintrsm && (hintrsm->r_start == seq_out)) { 6213 rsm = hintrsm; 6214 hintrsm = NULL; 6215 } else { 6216 /* No hints sorry */ 6217 rsm = NULL; 6218 } 6219 if ((rsm) && (rsm->r_start == seq_out)) { 6220 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6221 if (len == 0) { 6222 return; 6223 } else { 6224 goto more; 6225 } 6226 } 6227 /* Ok it was not the last pointer go through it the hard way. 
*/ 6228 refind: 6229 fe.r_start = seq_out; 6230 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6231 if (rsm) { 6232 if (rsm->r_start == seq_out) { 6233 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6234 if (len == 0) { 6235 return; 6236 } else { 6237 goto refind; 6238 } 6239 } 6240 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6241 /* Transmitted within this piece */ 6242 /* 6243 * Ok we must split off the front and then let the 6244 * update do the rest 6245 */ 6246 nrsm = rack_alloc_full_limit(rack); 6247 if (nrsm == NULL) { 6248 rack_update_rsm(tp, rack, rsm, ts); 6249 return; 6250 } 6251 /* 6252 * copy rsm to nrsm and then trim the front of rsm 6253 * to not include this part. 6254 */ 6255 rack_clone_rsm(rack, nrsm, rsm, seq_out); 6256 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6257 #ifdef INVARIANTS 6258 if (insret != NULL) { 6259 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6260 nrsm, insret, rack, rsm); 6261 } 6262 #endif 6263 if (rsm->r_in_tmap) { 6264 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6265 nrsm->r_in_tmap = 1; 6266 } 6267 rsm->r_flags &= (~RACK_HAS_FIN); 6268 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 6269 if (len == 0) { 6270 return; 6271 } else if (len > 0) 6272 goto refind; 6273 } 6274 } 6275 /* 6276 * Hmm not found in map did they retransmit both old and on into the 6277 * new? 6278 */ 6279 if (seq_out == tp->snd_max) { 6280 goto again; 6281 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6282 #ifdef INVARIANTS 6283 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6284 seq_out, len, tp->snd_una, tp->snd_max); 6285 printf("Starting Dump of all rack entries\n"); 6286 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6287 printf("rsm:%p start:%u end:%u\n", 6288 rsm, rsm->r_start, rsm->r_end); 6289 } 6290 printf("Dump complete\n"); 6291 panic("seq_out not found rack:%p tp:%p", 6292 rack, tp); 6293 #endif 6294 } else { 6295 #ifdef INVARIANTS 6296 /* 6297 * Hmm beyond sndmax? (only if we are using the new rtt-pack 6298 * flag) 6299 */ 6300 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6301 seq_out, len, tp->snd_max, tp); 6302 #endif 6303 } 6304 } 6305 6306 /* 6307 * Record one of the RTT updates from an ack into 6308 * our sample structure. 6309 */ 6310 6311 static void 6312 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 6313 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 6314 { 6315 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6316 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 6317 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 6318 } 6319 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6320 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 6321 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 6322 } 6323 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 6324 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 6325 rack->r_ctl.rc_gp_lowrtt = us_rtt; 6326 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 6327 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 6328 } 6329 if ((confidence == 1) && 6330 ((rsm == NULL) || 6331 (rsm->r_just_ret) || 6332 (rsm->r_one_out_nr && 6333 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 6334 /* 6335 * If the rsm had a just return 6336 * hit it then we can't trust the 6337 * rtt measurement for buffer deterimination 6338 * Note that a confidence of 2, indicates 6339 * SACK'd which overrides the r_just_ret or 6340 * the r_one_out_nr. 
If it was a CUM-ACK and 6341 * we had only two outstanding, but get an 6342 * ack for only 1. Then that also lowers our 6343 * confidence. 6344 */ 6345 confidence = 0; 6346 } 6347 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6348 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 6349 if (rack->r_ctl.rack_rs.confidence == 0) { 6350 /* 6351 * We take anything with no current confidence 6352 * saved. 6353 */ 6354 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6355 rack->r_ctl.rack_rs.confidence = confidence; 6356 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6357 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 6358 /* 6359 * Once we have a confident number, 6360 * we can update it with a smaller 6361 * value since this confident number 6362 * may include the DSACK time until 6363 * the next segment (the second one) arrived. 6364 */ 6365 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6366 rack->r_ctl.rack_rs.confidence = confidence; 6367 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6368 } 6369 } 6370 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 6371 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 6372 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 6373 rack->r_ctl.rack_rs.rs_rtt_cnt++; 6374 } 6375 6376 /* 6377 * Collect new round-trip time estimate 6378 * and update averages and current timeout. 6379 */ 6380 static void 6381 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 6382 { 6383 int32_t delta; 6384 uint32_t o_srtt, o_var; 6385 int32_t hrtt_up = 0; 6386 int32_t rtt; 6387 6388 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 6389 /* No valid sample */ 6390 return; 6391 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 6392 /* We are to use the lowest RTT seen in a single ack */ 6393 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 6394 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 6395 /* We are to use the highest RTT seen in a single ack */ 6396 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 6397 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 6398 /* We are to use the average RTT seen in a single ack */ 6399 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 6400 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 6401 } else { 6402 #ifdef INVARIANTS 6403 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 6404 #endif 6405 return; 6406 } 6407 if (rtt == 0) 6408 rtt = 1; 6409 if (rack->rc_gp_rtt_set == 0) { 6410 /* 6411 * With no RTT we have to accept 6412 * even one we are not confident of. 6413 */ 6414 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 6415 rack->rc_gp_rtt_set = 1; 6416 } else if (rack->r_ctl.rack_rs.confidence) { 6417 /* update the running gp srtt */ 6418 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 6419 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 6420 } 6421 if (rack->r_ctl.rack_rs.confidence) { 6422 /* 6423 * record the low and high for highly buffered path computation, 6424 * we only do this if we are confident (not a retransmission). 6425 */ 6426 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 6427 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6428 hrtt_up = 1; 6429 } 6430 if (rack->rc_highly_buffered == 0) { 6431 /* 6432 * Currently once we declare a path has 6433 * highly buffered there is no going 6434 * back, which may be a problem... 
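 * For illustration (the threshold is a sysctl and may differ): if the
 * lowest us_rtt we have measured is 10 msec and the highest confident
 * measurement grows beyond rack_hbp_thresh times that (80 msec if the
 * threshold were 8), the check below latches rc_highly_buffered.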
6435 */ 6436 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 6437 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 6438 rack->r_ctl.rc_highest_us_rtt, 6439 rack->r_ctl.rc_lowest_us_rtt, 6440 RACK_RTTS_SEEHBP); 6441 rack->rc_highly_buffered = 1; 6442 } 6443 } 6444 } 6445 if ((rack->r_ctl.rack_rs.confidence) || 6446 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 6447 /* 6448 * If we are highly confident of it <or> it was 6449 * never retransmitted we accept it as the last us_rtt. 6450 */ 6451 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6452 /* The lowest rtt can be set if its was not retransmited */ 6453 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 6454 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6455 if (rack->r_ctl.rc_lowest_us_rtt == 0) 6456 rack->r_ctl.rc_lowest_us_rtt = 1; 6457 } 6458 } 6459 rack_log_rtt_sample(rack, rtt); 6460 o_srtt = tp->t_srtt; 6461 o_var = tp->t_rttvar; 6462 rack = (struct tcp_rack *)tp->t_fb_ptr; 6463 if (tp->t_srtt != 0) { 6464 /* 6465 * srtt is stored as fixed point with 5 bits after the 6466 * binary point (i.e., scaled by 8). The following magic is 6467 * equivalent to the smoothing algorithm in rfc793 with an 6468 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 6469 * Adjust rtt to origin 0. 6470 */ 6471 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 6472 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 6473 6474 tp->t_srtt += delta; 6475 if (tp->t_srtt <= 0) 6476 tp->t_srtt = 1; 6477 6478 /* 6479 * We accumulate a smoothed rtt variance (actually, a 6480 * smoothed mean difference), then set the retransmit timer 6481 * to smoothed rtt + 4 times the smoothed variance. rttvar 6482 * is stored as fixed point with 4 bits after the binary 6483 * point (scaled by 16). The following is equivalent to 6484 * rfc793 smoothing with an alpha of .75 (rttvar = 6485 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 6486 * wired-in beta. 6487 */ 6488 if (delta < 0) 6489 delta = -delta; 6490 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 6491 tp->t_rttvar += delta; 6492 if (tp->t_rttvar <= 0) 6493 tp->t_rttvar = 1; 6494 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 6495 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6496 } else { 6497 /* 6498 * No rtt measurement yet - use the unsmoothed rtt. Set the 6499 * variance to half the rtt (so our first retransmit happens 6500 * at 3*rtt). 6501 */ 6502 tp->t_srtt = rtt << TCP_RTT_SHIFT; 6503 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 6504 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6505 } 6506 KMOD_TCPSTAT_INC(tcps_rttupdated); 6507 tp->t_rttupdated++; 6508 #ifdef STATS 6509 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 6510 #endif 6511 tp->t_rxtshift = 0; 6512 6513 /* 6514 * the retransmit should happen at rtt + 4 * rttvar. Because of the 6515 * way we do the smoothing, srtt and rttvar will each average +1/2 6516 * tick of bias. When we compute the retransmit timer, we want 1/2 6517 * tick of rounding and 1 extra tick because of +-1/2 tick 6518 * uncertainty in the firing of the timer. The bias will give us 6519 * exactly the 1.5 tick we need. But, because the bias is 6520 * statistical, we have to test that we don't drop below the minimum 6521 * feasible timer (which is 2 ticks). 
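 * As a rough worked example (illustrative numbers): with a smoothed
 * rtt around 100 msec and an rttvar around 20 msec, TCP_REXMTVAL()
 * comes out near srtt + 4 * rttvar = 180 msec, which TCPT_RANGESET()
 * below then clamps between the rack_rto_min and rack_rto_max bounds.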
6522 */ 6523 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 6524 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 6525 tp->t_softerror = 0; 6526 } 6527 6528 static void 6529 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 6530 uint32_t t, uint32_t cts) 6531 { 6532 /* 6533 * For this RSM, we acknowledged the data from a previous 6534 * transmission, not the last one we made. This means we did a false 6535 * retransmit. 6536 */ 6537 struct tcp_rack *rack; 6538 6539 if (rsm->r_flags & RACK_HAS_FIN) { 6540 /* 6541 * The sending of the FIN often is multiple sent when we 6542 * have everything outstanding ack'd. We ignore this case 6543 * since its over now. 6544 */ 6545 return; 6546 } 6547 if (rsm->r_flags & RACK_TLP) { 6548 /* 6549 * We expect TLP's to have this occur. 6550 */ 6551 return; 6552 } 6553 rack = (struct tcp_rack *)tp->t_fb_ptr; 6554 /* should we undo cc changes and exit recovery? */ 6555 if (IN_RECOVERY(tp->t_flags)) { 6556 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 6557 /* 6558 * Undo what we ratched down and exit recovery if 6559 * possible 6560 */ 6561 EXIT_RECOVERY(tp->t_flags); 6562 tp->snd_recover = tp->snd_una; 6563 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 6564 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 6565 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 6566 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 6567 } 6568 } 6569 if (rsm->r_flags & RACK_WAS_SACKPASS) { 6570 /* 6571 * We retransmitted based on a sack and the earlier 6572 * retransmission ack'd it - re-ordering is occuring. 6573 */ 6574 counter_u64_add(rack_reorder_seen, 1); 6575 rack->r_ctl.rc_reorder_ts = cts; 6576 } 6577 counter_u64_add(rack_badfr, 1); 6578 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 6579 } 6580 6581 static void 6582 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 6583 { 6584 /* 6585 * Apply to filter the inbound us-rtt at us_cts. 6586 */ 6587 uint32_t old_rtt; 6588 6589 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 6590 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 6591 us_rtt, us_cts); 6592 if (rack->r_ctl.last_pacing_time && 6593 rack->rc_gp_dyn_mul && 6594 (rack->r_ctl.last_pacing_time > us_rtt)) 6595 rack->pacing_longer_than_rtt = 1; 6596 else 6597 rack->pacing_longer_than_rtt = 0; 6598 if (old_rtt > us_rtt) { 6599 /* We just hit a new lower rtt time */ 6600 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 6601 __LINE__, RACK_RTTS_NEWRTT); 6602 /* 6603 * Only count it if its lower than what we saw within our 6604 * calculated range. 6605 */ 6606 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 6607 if (rack_probertt_lower_within && 6608 rack->rc_gp_dyn_mul && 6609 (rack->use_fixed_rate == 0) && 6610 (rack->rc_always_pace)) { 6611 /* 6612 * We are seeing a new lower rtt very close 6613 * to the time that we would have entered probe-rtt. 6614 * This is probably due to the fact that a peer flow 6615 * has entered probe-rtt. Lets go in now too. 
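 * For illustration (sysctl defaults may differ): if
 * rack_probertt_lower_within were 10 and the time between probe-rtts
 * 9.6 seconds, val below works out to 960 msec, so a qualifying new
 * low rtt seen within the last 960 msec before the next scheduled
 * probe-rtt pulls us into probe-rtt early.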
6616 */ 6617 uint32_t val; 6618 6619 val = rack_probertt_lower_within * rack_time_between_probertt; 6620 val /= 100; 6621 if ((rack->in_probe_rtt == 0) && 6622 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 6623 rack_enter_probertt(rack, us_cts); 6624 } 6625 } 6626 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6627 } 6628 } 6629 } 6630 6631 static int 6632 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 6633 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 6634 { 6635 int32_t i; 6636 uint32_t t, len_acked; 6637 6638 if ((rsm->r_flags & RACK_ACKED) || 6639 (rsm->r_flags & RACK_WAS_ACKED)) 6640 /* Already done */ 6641 return (0); 6642 6643 if (ack_type == CUM_ACKED) { 6644 if (SEQ_GT(th_ack, rsm->r_end)) 6645 len_acked = rsm->r_end - rsm->r_start; 6646 else 6647 len_acked = th_ack - rsm->r_start; 6648 } else 6649 len_acked = rsm->r_end - rsm->r_start; 6650 if (rsm->r_rtr_cnt == 1) { 6651 uint32_t us_rtt; 6652 6653 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6654 if ((int)t <= 0) 6655 t = 1; 6656 if (!tp->t_rttlow || tp->t_rttlow > t) 6657 tp->t_rttlow = t; 6658 if (!rack->r_ctl.rc_rack_min_rtt || 6659 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6660 rack->r_ctl.rc_rack_min_rtt = t; 6661 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6662 rack->r_ctl.rc_rack_min_rtt = 1; 6663 } 6664 } 6665 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; 6666 if (us_rtt == 0) 6667 us_rtt = 1; 6668 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 6669 if (ack_type == SACKED) 6670 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 6671 else { 6672 /* 6673 * For cum-ack we are only confident if what 6674 * is being acked is included in a measurement. 6675 * Otherwise it could be an idle period that 6676 * includes Delayed-ack time. 6677 */ 6678 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 6679 (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); 6680 } 6681 if ((rsm->r_flags & RACK_TLP) && 6682 (!IN_RECOVERY(tp->t_flags))) { 6683 /* Segment was a TLP and our retrans matched */ 6684 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 6685 rack->r_ctl.rc_rsm_start = tp->snd_max; 6686 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 6687 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 6688 rack_cong_signal(tp, NULL, CC_NDUPACK); 6689 /* 6690 * When we enter recovery we need to assure 6691 * we send one packet. 6692 */ 6693 if (rack->rack_no_prr == 0) { 6694 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6695 rack_log_to_prr(rack, 7, 0); 6696 } 6697 } 6698 } 6699 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6700 /* New more recent rack_tmit_time */ 6701 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6702 rack->rc_rack_rtt = t; 6703 } 6704 return (1); 6705 } 6706 /* 6707 * We clear the soft/rxtshift since we got an ack. 6708 * There is no assurance we will call the commit() function 6709 * so we need to clear these to avoid incorrect handling. 6710 */ 6711 tp->t_rxtshift = 0; 6712 tp->t_softerror = 0; 6713 if ((to->to_flags & TOF_TS) && 6714 (ack_type == CUM_ACKED) && 6715 (to->to_tsecr) && 6716 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 6717 /* 6718 * Now which timestamp does it match? In this block the ACK 6719 * must be coming from a previous transmission. 
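 * For illustration: if the segment was sent twice, once at
 * r_tim_lastsent[0] and again at r_tim_lastsent[1], and the echoed
 * timestamp matches the first send, the peer is acking the original
 * transmission. We take t from that earlier send time and treat the
 * later retransmission as unneeded (rack_earlier_retran).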
6720 */ 6721 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6722 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 6723 t = cts - rsm->r_tim_lastsent[i]; 6724 if ((int)t <= 0) 6725 t = 1; 6726 if ((i + 1) < rsm->r_rtr_cnt) { 6727 /* Likely */ 6728 rack_earlier_retran(tp, rsm, t, cts); 6729 } 6730 if (!tp->t_rttlow || tp->t_rttlow > t) 6731 tp->t_rttlow = t; 6732 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6733 rack->r_ctl.rc_rack_min_rtt = t; 6734 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6735 rack->r_ctl.rc_rack_min_rtt = 1; 6736 } 6737 } 6738 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 6739 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6740 /* New more recent rack_tmit_time */ 6741 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6742 rack->rc_rack_rtt = t; 6743 } 6744 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, 6745 rsm->r_rtr_cnt); 6746 return (1); 6747 } 6748 } 6749 goto ts_not_found; 6750 } else { 6751 /* 6752 * Ok its a SACK block that we retransmitted. or a windows 6753 * machine without timestamps. We can tell nothing from the 6754 * time-stamp since its not there or the time the peer last 6755 * recieved a segment that moved forward its cum-ack point. 6756 */ 6757 ts_not_found: 6758 i = rsm->r_rtr_cnt - 1; 6759 t = cts - rsm->r_tim_lastsent[i]; 6760 if ((int)t <= 0) 6761 t = 1; 6762 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6763 /* 6764 * We retransmitted and the ack came back in less 6765 * than the smallest rtt we have observed. We most 6766 * likey did an improper retransmit as outlined in 6767 * 4.2 Step 3 point 2 in the rack-draft. 6768 */ 6769 i = rsm->r_rtr_cnt - 2; 6770 t = cts - rsm->r_tim_lastsent[i]; 6771 rack_earlier_retran(tp, rsm, t, cts); 6772 } else if (rack->r_ctl.rc_rack_min_rtt) { 6773 /* 6774 * We retransmitted it and the retransmit did the 6775 * job. 6776 */ 6777 if (!rack->r_ctl.rc_rack_min_rtt || 6778 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6779 rack->r_ctl.rc_rack_min_rtt = t; 6780 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6781 rack->r_ctl.rc_rack_min_rtt = 1; 6782 } 6783 } 6784 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 6785 /* New more recent rack_tmit_time */ 6786 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 6787 rack->rc_rack_rtt = t; 6788 } 6789 return (1); 6790 } 6791 } 6792 return (0); 6793 } 6794 6795 /* 6796 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 6797 */ 6798 static void 6799 rack_log_sack_passed(struct tcpcb *tp, 6800 struct tcp_rack *rack, struct rack_sendmap *rsm) 6801 { 6802 struct rack_sendmap *nrsm; 6803 6804 nrsm = rsm; 6805 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 6806 rack_head, r_tnext) { 6807 if (nrsm == rsm) { 6808 /* Skip orginal segment he is acked */ 6809 continue; 6810 } 6811 if (nrsm->r_flags & RACK_ACKED) { 6812 /* 6813 * Skip ack'd segments, though we 6814 * should not see these, since tmap 6815 * should not have ack'd segments. 6816 */ 6817 continue; 6818 } 6819 if (nrsm->r_flags & RACK_SACK_PASSED) { 6820 /* 6821 * We found one that is already marked 6822 * passed, we have been here before and 6823 * so all others below this are marked. 
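 * For illustration: if the tmap holds A, B, C, D in send order and D was
 * just sacked, we walk backwards from D marking C, B and A. If the walk
 * finds C already flagged from a previous pass, everything sent before C
 * was flagged on that pass as well, so stopping here is safe.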
6824 */ 6825 break; 6826 } 6827 nrsm->r_flags |= RACK_SACK_PASSED; 6828 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 6829 } 6830 } 6831 6832 static void 6833 rack_need_set_test(struct tcpcb *tp, 6834 struct tcp_rack *rack, 6835 struct rack_sendmap *rsm, 6836 tcp_seq th_ack, 6837 int line, 6838 int use_which) 6839 { 6840 6841 if ((tp->t_flags & TF_GPUTINPROG) && 6842 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6843 /* 6844 * We were app limited, and this ack 6845 * butts up or goes beyond the point where we want 6846 * to start our next measurement. We need 6847 * to record the new gput_ts as here and 6848 * possibly update the start sequence. 6849 */ 6850 uint32_t seq, ts; 6851 6852 if (rsm->r_rtr_cnt > 1) { 6853 /* 6854 * This is a retransmit, can we 6855 * really make any assessment at this 6856 * point? We are not really sure of 6857 * the timestamp, is it this or the 6858 * previous transmission? 6859 * 6860 * Lets wait for something better that 6861 * is not retransmitted. 6862 */ 6863 return; 6864 } 6865 seq = tp->gput_seq; 6866 ts = tp->gput_ts; 6867 rack->app_limited_needs_set = 0; 6868 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 6869 /* Do we start at a new end? */ 6870 if ((use_which == RACK_USE_BEG) && 6871 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 6872 /* 6873 * When we get an ACK that just eats 6874 * up some of the rsm, we set RACK_USE_BEG 6875 * since whats at r_start (i.e. th_ack) 6876 * is left unacked and thats where the 6877 * measurement not starts. 6878 */ 6879 tp->gput_seq = rsm->r_start; 6880 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6881 } 6882 if ((use_which == RACK_USE_END) && 6883 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6884 /* 6885 * We use the end when the cumack 6886 * is moving forward and completely 6887 * deleting the rsm passed so basically 6888 * r_end holds th_ack. 6889 * 6890 * For SACK's we also want to use the end 6891 * since this piece just got sacked and 6892 * we want to target anything after that 6893 * in our measurement. 6894 */ 6895 tp->gput_seq = rsm->r_end; 6896 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6897 } 6898 if (use_which == RACK_USE_END_OR_THACK) { 6899 /* 6900 * special case for ack moving forward, 6901 * not a sack, we need to move all the 6902 * way up to where this ack cum-ack moves 6903 * to. 6904 */ 6905 if (SEQ_GT(th_ack, rsm->r_end)) 6906 tp->gput_seq = th_ack; 6907 else 6908 tp->gput_seq = rsm->r_end; 6909 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6910 } 6911 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 6912 /* 6913 * We moved beyond this guy's range, re-calculate 6914 * the new end point. 6915 */ 6916 if (rack->rc_gp_filled == 0) { 6917 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 6918 } else { 6919 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 6920 } 6921 } 6922 /* 6923 * We are moving the goal post, we may be able to clear the 6924 * measure_saw_probe_rtt flag. 6925 */ 6926 if ((rack->in_probe_rtt == 0) && 6927 (rack->measure_saw_probe_rtt) && 6928 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 6929 rack->measure_saw_probe_rtt = 0; 6930 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 6931 seq, tp->gput_seq, 0, 5, line, NULL); 6932 if (rack->rc_gp_filled && 6933 ((tp->gput_ack - tp->gput_seq) < 6934 max(rc_init_window(rack), (MIN_GP_WIN * 6935 ctf_fixed_maxseg(tp))))) { 6936 /* 6937 * There is no sense of continuing this measurement 6938 * because its too small to gain us anything we 6939 * trust. 
Skip it and that way we can start a new 6940 * measurement quicker. 6941 */ 6942 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 6943 0, 0, 0, 6, __LINE__, NULL); 6944 tp->t_flags &= ~TF_GPUTINPROG; 6945 } 6946 } 6947 } 6948 6949 static uint32_t 6950 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 6951 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 6952 { 6953 uint32_t start, end, changed = 0; 6954 struct rack_sendmap stack_map; 6955 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 6956 int32_t used_ref = 1; 6957 int moved = 0; 6958 6959 start = sack->start; 6960 end = sack->end; 6961 rsm = *prsm; 6962 memset(&fe, 0, sizeof(fe)); 6963 do_rest_ofb: 6964 if ((rsm == NULL) || 6965 (SEQ_LT(end, rsm->r_start)) || 6966 (SEQ_GEQ(start, rsm->r_end)) || 6967 (SEQ_LT(start, rsm->r_start))) { 6968 /* 6969 * We are not in the right spot, 6970 * find the correct spot in the tree. 6971 */ 6972 used_ref = 0; 6973 fe.r_start = start; 6974 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6975 moved++; 6976 } 6977 if (rsm == NULL) { 6978 /* TSNH */ 6979 goto out; 6980 } 6981 /* Ok we have an ACK for some piece of this rsm */ 6982 if (rsm->r_start != start) { 6983 if ((rsm->r_flags & RACK_ACKED) == 0) { 6984 /** 6985 * Need to split this in two pieces the before and after, 6986 * the before remains in the map, the after must be 6987 * added. In other words we have: 6988 * rsm |--------------| 6989 * sackblk |-------> 6990 * rsm will become 6991 * rsm |---| 6992 * and nrsm will be the sacked piece 6993 * nrsm |----------| 6994 * 6995 * But before we start down that path lets 6996 * see if the sack spans over on top of 6997 * the next guy and it is already sacked. 6998 */ 6999 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7000 if (next && (next->r_flags & RACK_ACKED) && 7001 SEQ_GEQ(end, next->r_start)) { 7002 /** 7003 * So the next one is already acked, and 7004 * we can thus by hookery use our stack_map 7005 * to reflect the piece being sacked and 7006 * then adjust the two tree entries moving 7007 * the start and ends around. So we start like: 7008 * rsm |------------| (not-acked) 7009 * next |-----------| (acked) 7010 * sackblk |--------> 7011 * We want to end like so: 7012 * rsm |------| (not-acked) 7013 * next |-----------------| (acked) 7014 * nrsm |-----| 7015 * Where nrsm is a temporary stack piece we 7016 * use to update all the gizmos. 7017 */ 7018 /* Copy up our fudge block */ 7019 nrsm = &stack_map; 7020 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7021 /* Now adjust our tree blocks */ 7022 rsm->r_end = start; 7023 next->r_start = start; 7024 /* Clear out the dup ack count of the remainder */ 7025 rsm->r_dupack = 0; 7026 rsm->r_just_ret = 0; 7027 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7028 /* Now lets make sure our fudge block is right */ 7029 nrsm->r_start = start; 7030 /* Now lets update all the stats and such */ 7031 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7032 if (rack->app_limited_needs_set) 7033 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7034 changed += (nrsm->r_end - nrsm->r_start); 7035 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7036 if (nrsm->r_flags & RACK_SACK_PASSED) { 7037 counter_u64_add(rack_reorder_seen, 1); 7038 rack->r_ctl.rc_reorder_ts = cts; 7039 } 7040 /* 7041 * Now we want to go up from rsm (the 7042 * one left un-acked) to the next one 7043 * in the tmap. 
We do this so when 7044 * we walk backwards we include marking 7045 * sack-passed on rsm (The one passed in 7046 * is skipped since it is generally called 7047 * on something sacked before removing it 7048 * from the tmap). 7049 */ 7050 if (rsm->r_in_tmap) { 7051 nrsm = TAILQ_NEXT(rsm, r_tnext); 7052 /* 7053 * Now that we have the next 7054 * one walk backwards from there. 7055 */ 7056 if (nrsm && nrsm->r_in_tmap) 7057 rack_log_sack_passed(tp, rack, nrsm); 7058 } 7059 /* Now are we done? */ 7060 if (SEQ_LT(end, next->r_end) || 7061 (end == next->r_end)) { 7062 /* Done with block */ 7063 goto out; 7064 } 7065 counter_u64_add(rack_sack_used_next_merge, 1); 7066 /* Postion for the next block */ 7067 start = next->r_end; 7068 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 7069 if (rsm == NULL) 7070 goto out; 7071 } else { 7072 /** 7073 * We can't use any hookery here, so we 7074 * need to split the map. We enter like 7075 * so: 7076 * rsm |--------| 7077 * sackblk |-----> 7078 * We will add the new block nrsm and 7079 * that will be the new portion, and then 7080 * fall through after reseting rsm. So we 7081 * split and look like this: 7082 * rsm |----| 7083 * sackblk |-----> 7084 * nrsm |---| 7085 * We then fall through reseting 7086 * rsm to nrsm, so the next block 7087 * picks it up. 7088 */ 7089 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7090 if (nrsm == NULL) { 7091 /* 7092 * failed XXXrrs what can we do but loose the sack 7093 * info? 7094 */ 7095 goto out; 7096 } 7097 counter_u64_add(rack_sack_splits, 1); 7098 rack_clone_rsm(rack, nrsm, rsm, start); 7099 rsm->r_just_ret = 0; 7100 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7101 #ifdef INVARIANTS 7102 if (insret != NULL) { 7103 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7104 nrsm, insret, rack, rsm); 7105 } 7106 #endif 7107 if (rsm->r_in_tmap) { 7108 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7109 nrsm->r_in_tmap = 1; 7110 } 7111 rsm->r_flags &= (~RACK_HAS_FIN); 7112 /* Position us to point to the new nrsm that starts the sack blk */ 7113 rsm = nrsm; 7114 } 7115 } else { 7116 /* Already sacked this piece */ 7117 counter_u64_add(rack_sack_skipped_acked, 1); 7118 moved++; 7119 if (end == rsm->r_end) { 7120 /* Done with block */ 7121 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7122 goto out; 7123 } else if (SEQ_LT(end, rsm->r_end)) { 7124 /* A partial sack to a already sacked block */ 7125 moved++; 7126 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7127 goto out; 7128 } else { 7129 /* 7130 * The end goes beyond this guy 7131 * repostion the start to the 7132 * next block. 7133 */ 7134 start = rsm->r_end; 7135 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7136 if (rsm == NULL) 7137 goto out; 7138 } 7139 } 7140 } 7141 if (SEQ_GEQ(end, rsm->r_end)) { 7142 /** 7143 * The end of this block is either beyond this guy or right 7144 * at this guy. I.e.: 7145 * rsm --- |-----| 7146 * end |-----| 7147 * <or> 7148 * end |---------| 7149 */ 7150 if ((rsm->r_flags & RACK_ACKED) == 0) { 7151 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7152 changed += (rsm->r_end - rsm->r_start); 7153 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7154 if (rsm->r_in_tmap) /* should be true */ 7155 rack_log_sack_passed(tp, rack, rsm); 7156 /* Is Reordering occuring? 
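 * For illustration: SACK_PASSED on this rsm means something we sent after
 * it was sacked first; now that this earlier send is itself being sacked,
 * the network delivered things out of order, so we note the time in
 * rc_reorder_ts and bump the reorder counter.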
*/ 7157 if (rsm->r_flags & RACK_SACK_PASSED) { 7158 rsm->r_flags &= ~RACK_SACK_PASSED; 7159 counter_u64_add(rack_reorder_seen, 1); 7160 rack->r_ctl.rc_reorder_ts = cts; 7161 } 7162 if (rack->app_limited_needs_set) 7163 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7164 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7165 rsm->r_flags |= RACK_ACKED; 7166 rsm->r_flags &= ~RACK_TLP; 7167 if (rsm->r_in_tmap) { 7168 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7169 rsm->r_in_tmap = 0; 7170 } 7171 } else { 7172 counter_u64_add(rack_sack_skipped_acked, 1); 7173 moved++; 7174 } 7175 if (end == rsm->r_end) { 7176 /* This block only - done, setup for next */ 7177 goto out; 7178 } 7179 /* 7180 * There is more not coverend by this rsm move on 7181 * to the next block in the RB tree. 7182 */ 7183 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7184 start = rsm->r_end; 7185 rsm = nrsm; 7186 if (rsm == NULL) 7187 goto out; 7188 goto do_rest_ofb; 7189 } 7190 /** 7191 * The end of this sack block is smaller than 7192 * our rsm i.e.: 7193 * rsm --- |-----| 7194 * end |--| 7195 */ 7196 if ((rsm->r_flags & RACK_ACKED) == 0) { 7197 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7198 if (prev && (prev->r_flags & RACK_ACKED)) { 7199 /** 7200 * Goal, we want the right remainder of rsm to shrink 7201 * in place and span from (rsm->r_start = end) to rsm->r_end. 7202 * We want to expand prev to go all the way 7203 * to prev->r_end <- end. 7204 * so in the tree we have before: 7205 * prev |--------| (acked) 7206 * rsm |-------| (non-acked) 7207 * sackblk |-| 7208 * We churn it so we end up with 7209 * prev |----------| (acked) 7210 * rsm |-----| (non-acked) 7211 * nrsm |-| (temporary) 7212 */ 7213 nrsm = &stack_map; 7214 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7215 prev->r_end = end; 7216 rsm->r_start = end; 7217 /* Now adjust nrsm (stack copy) to be 7218 * the one that is the small 7219 * piece that was "sacked". 7220 */ 7221 nrsm->r_end = end; 7222 rsm->r_dupack = 0; 7223 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7224 /* 7225 * Now nrsm is our new little piece 7226 * that is acked (which was merged 7227 * to prev). Update the rtt and changed 7228 * based on that. Also check for reordering. 7229 */ 7230 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7231 if (rack->app_limited_needs_set) 7232 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7233 changed += (nrsm->r_end - nrsm->r_start); 7234 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7235 if (nrsm->r_flags & RACK_SACK_PASSED) { 7236 counter_u64_add(rack_reorder_seen, 1); 7237 rack->r_ctl.rc_reorder_ts = cts; 7238 } 7239 rsm = prev; 7240 counter_u64_add(rack_sack_used_prev_merge, 1); 7241 } else { 7242 /** 7243 * This is the case where our previous 7244 * block is not acked either, so we must 7245 * split the block in two. 7246 */ 7247 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7248 if (nrsm == NULL) { 7249 /* failed rrs what can we do but loose the sack info? */ 7250 goto out; 7251 } 7252 /** 7253 * In this case nrsm becomes 7254 * nrsm->r_start = end; 7255 * nrsm->r_end = rsm->r_end; 7256 * which is un-acked. 7257 * <and> 7258 * rsm->r_end = nrsm->r_start; 7259 * i.e. the remaining un-acked 7260 * piece is left on the left 7261 * hand side. 
7262 * 7263 * So we start like this 7264 * rsm |----------| (not acked) 7265 * sackblk |---| 7266 * build it so we have 7267 * rsm |---| (acked) 7268 * nrsm |------| (not acked) 7269 */ 7270 counter_u64_add(rack_sack_splits, 1); 7271 rack_clone_rsm(rack, nrsm, rsm, end); 7272 rsm->r_flags &= (~RACK_HAS_FIN); 7273 rsm->r_just_ret = 0; 7274 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7275 #ifdef INVARIANTS 7276 if (insret != NULL) { 7277 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7278 nrsm, insret, rack, rsm); 7279 } 7280 #endif 7281 if (rsm->r_in_tmap) { 7282 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7283 nrsm->r_in_tmap = 1; 7284 } 7285 nrsm->r_dupack = 0; 7286 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7287 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7288 changed += (rsm->r_end - rsm->r_start); 7289 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7290 if (rsm->r_in_tmap) /* should be true */ 7291 rack_log_sack_passed(tp, rack, rsm); 7292 /* Is Reordering occuring? */ 7293 if (rsm->r_flags & RACK_SACK_PASSED) { 7294 rsm->r_flags &= ~RACK_SACK_PASSED; 7295 counter_u64_add(rack_reorder_seen, 1); 7296 rack->r_ctl.rc_reorder_ts = cts; 7297 } 7298 if (rack->app_limited_needs_set) 7299 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7300 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7301 rsm->r_flags |= RACK_ACKED; 7302 rsm->r_flags &= ~RACK_TLP; 7303 if (rsm->r_in_tmap) { 7304 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7305 rsm->r_in_tmap = 0; 7306 } 7307 } 7308 } else if (start != end){ 7309 /* 7310 * The block was already acked. 7311 */ 7312 counter_u64_add(rack_sack_skipped_acked, 1); 7313 moved++; 7314 } 7315 out: 7316 if (rsm && (rsm->r_flags & RACK_ACKED)) { 7317 /* 7318 * Now can we merge where we worked 7319 * with either the previous or 7320 * next block? 7321 */ 7322 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7323 while (next) { 7324 if (next->r_flags & RACK_ACKED) { 7325 /* yep this and next can be merged */ 7326 rsm = rack_merge_rsm(rack, rsm, next); 7327 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7328 } else 7329 break; 7330 } 7331 /* Now what about the previous? */ 7332 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7333 while (prev) { 7334 if (prev->r_flags & RACK_ACKED) { 7335 /* yep the previous and this can be merged */ 7336 rsm = rack_merge_rsm(rack, prev, rsm); 7337 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7338 } else 7339 break; 7340 } 7341 } 7342 if (used_ref == 0) { 7343 counter_u64_add(rack_sack_proc_all, 1); 7344 } else { 7345 counter_u64_add(rack_sack_proc_short, 1); 7346 } 7347 /* Save off the next one for quick reference. */ 7348 if (rsm) 7349 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7350 else 7351 nrsm = NULL; 7352 *prsm = rack->r_ctl.rc_sacklast = nrsm; 7353 /* Pass back the moved. 
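 * The caller (rack_log_ack) feeds this into its sack_moved_extra
 * accounting: a non-zero value means we had to step across map entries
 * without sacking anything new, which is one of the signals the
 * SACK-attack heuristics look at.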
*/ 7354 *moved_two = moved; 7355 return (changed); 7356 } 7357 7358 static void inline 7359 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 7360 { 7361 struct rack_sendmap *tmap; 7362 7363 tmap = NULL; 7364 while (rsm && (rsm->r_flags & RACK_ACKED)) { 7365 /* Its no longer sacked, mark it so */ 7366 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7367 #ifdef INVARIANTS 7368 if (rsm->r_in_tmap) { 7369 panic("rack:%p rsm:%p flags:0x%x in tmap?", 7370 rack, rsm, rsm->r_flags); 7371 } 7372 #endif 7373 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 7374 /* Rebuild it into our tmap */ 7375 if (tmap == NULL) { 7376 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7377 tmap = rsm; 7378 } else { 7379 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7380 tmap = rsm; 7381 } 7382 tmap->r_in_tmap = 1; 7383 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7384 } 7385 /* 7386 * Now lets possibly clear the sack filter so we start 7387 * recognizing sacks that cover this area. 7388 */ 7389 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 7390 7391 } 7392 7393 static void 7394 rack_do_decay(struct tcp_rack *rack) 7395 { 7396 struct timeval res; 7397 7398 #define timersub(tvp, uvp, vvp) \ 7399 do { \ 7400 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 7401 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 7402 if ((vvp)->tv_usec < 0) { \ 7403 (vvp)->tv_sec--; \ 7404 (vvp)->tv_usec += 1000000; \ 7405 } \ 7406 } while (0) 7407 7408 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 7409 #undef timersub 7410 7411 rack->r_ctl.input_pkt++; 7412 if ((rack->rc_in_persist) || 7413 (res.tv_sec >= 1) || 7414 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 7415 /* 7416 * Check for decay of non-SAD, 7417 * we want all SAD detection metrics to 7418 * decay 1/4 per second (or more) passed. 7419 */ 7420 uint32_t pkt_delta; 7421 7422 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 7423 /* Update our saved tracking values */ 7424 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 7425 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 7426 /* Now do we escape without decay? */ 7427 #ifdef NETFLIX_EXP_DETECTION 7428 if (rack->rc_in_persist || 7429 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 7430 (pkt_delta < tcp_sad_low_pps)){ 7431 /* 7432 * We don't decay idle connections 7433 * or ones that have a low input pps. 
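 * Otherwise the counters below are scaled down by tcp_sad_decay_val so
 * that stale history does not dominate the ack/sack and move ratios
 * computed later in rack_log_ack().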
7434 */ 7435 return; 7436 } 7437 /* Decay the counters */ 7438 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 7439 tcp_sad_decay_val); 7440 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 7441 tcp_sad_decay_val); 7442 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 7443 tcp_sad_decay_val); 7444 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 7445 tcp_sad_decay_val); 7446 #endif 7447 } 7448 } 7449 7450 static void 7451 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 7452 { 7453 uint32_t changed, entered_recovery = 0; 7454 struct tcp_rack *rack; 7455 struct rack_sendmap *rsm, *rm; 7456 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7457 register uint32_t th_ack; 7458 int32_t i, j, k, num_sack_blks = 0; 7459 uint32_t cts, acked, ack_point, sack_changed = 0; 7460 int loop_start = 0, moved_two = 0; 7461 uint32_t tsused; 7462 7463 INP_WLOCK_ASSERT(tp->t_inpcb); 7464 if (th->th_flags & TH_RST) { 7465 /* We don't log resets */ 7466 return; 7467 } 7468 rack = (struct tcp_rack *)tp->t_fb_ptr; 7469 cts = tcp_ts_getticks(); 7470 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7471 changed = 0; 7472 th_ack = th->th_ack; 7473 if (rack->sack_attack_disable == 0) 7474 rack_do_decay(rack); 7475 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 7476 /* 7477 * You only get credit for 7478 * MSS and greater (and you get extra 7479 * credit for larger cum-ack moves). 7480 */ 7481 int ac; 7482 7483 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 7484 rack->r_ctl.ack_count += ac; 7485 counter_u64_add(rack_ack_total, ac); 7486 } 7487 if (rack->r_ctl.ack_count > 0xfff00000) { 7488 /* 7489 * reduce the number to keep us under 7490 * a uint32_t. 7491 */ 7492 rack->r_ctl.ack_count /= 2; 7493 rack->r_ctl.sack_count /= 2; 7494 } 7495 if (SEQ_GT(th_ack, tp->snd_una)) { 7496 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 7497 tp->t_acktime = ticks; 7498 } 7499 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 7500 changed = th_ack - rsm->r_start; 7501 if (changed) { 7502 /* 7503 * The ACK point is advancing to th_ack, we must drop off 7504 * the packets in the rack log and calculate any eligble 7505 * RTT's. 7506 */ 7507 rack->r_wanted_output = 1; 7508 more: 7509 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7510 if (rsm == NULL) { 7511 if ((th_ack - 1) == tp->iss) { 7512 /* 7513 * For the SYN incoming case we will not 7514 * have called tcp_output for the sending of 7515 * the SYN, so there will be no map. All 7516 * other cases should probably be a panic. 7517 */ 7518 goto proc_sack; 7519 } 7520 if (tp->t_flags & TF_SENTFIN) { 7521 /* if we send a FIN we will not hav a map */ 7522 goto proc_sack; 7523 } 7524 #ifdef INVARIANTS 7525 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 7526 tp, 7527 th, tp->t_state, rack, 7528 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 7529 #endif 7530 goto proc_sack; 7531 } 7532 if (SEQ_LT(th_ack, rsm->r_start)) { 7533 /* Huh map is missing this */ 7534 #ifdef INVARIANTS 7535 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 7536 rsm->r_start, 7537 th_ack, tp->t_state, rack->r_state); 7538 #endif 7539 goto proc_sack; 7540 } 7541 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 7542 /* Now do we consume the whole thing? */ 7543 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7544 /* Its all consumed. 
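 * For illustration: if this rsm covers 1000-2000 and th_ack is 2500, the
 * whole entry is freed below and left becomes 500, so we loop back to
 * "more" and keep trimming the next map entry.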
*/ 7545 uint32_t left; 7546 uint8_t newly_acked; 7547 7548 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7549 rsm->r_rtr_bytes = 0; 7550 /* Record the time of highest cumack sent */ 7551 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7552 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7553 #ifdef INVARIANTS 7554 if (rm != rsm) { 7555 panic("removing head in rack:%p rsm:%p rm:%p", 7556 rack, rsm, rm); 7557 } 7558 #endif 7559 if (rsm->r_in_tmap) { 7560 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7561 rsm->r_in_tmap = 0; 7562 } 7563 newly_acked = 1; 7564 if (rsm->r_flags & RACK_ACKED) { 7565 /* 7566 * It was acked on the scoreboard -- remove 7567 * it from total 7568 */ 7569 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7570 newly_acked = 0; 7571 } else if (rsm->r_flags & RACK_SACK_PASSED) { 7572 /* 7573 * There are segments ACKED on the 7574 * scoreboard further up. We are seeing 7575 * reordering. 7576 */ 7577 rsm->r_flags &= ~RACK_SACK_PASSED; 7578 counter_u64_add(rack_reorder_seen, 1); 7579 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7580 rsm->r_flags |= RACK_ACKED; 7581 rack->r_ctl.rc_reorder_ts = cts; 7582 } 7583 left = th_ack - rsm->r_end; 7584 if (rack->app_limited_needs_set && newly_acked) 7585 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 7586 /* Free back to zone */ 7587 rack_free(rack, rsm); 7588 if (left) { 7589 goto more; 7590 } 7591 goto proc_sack; 7592 } 7593 if (rsm->r_flags & RACK_ACKED) { 7594 /* 7595 * It was acked on the scoreboard -- remove it from 7596 * total for the part being cum-acked. 7597 */ 7598 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7599 } 7600 /* 7601 * Clear the dup ack count for 7602 * the piece that remains. 7603 */ 7604 rsm->r_dupack = 0; 7605 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7606 if (rsm->r_rtr_bytes) { 7607 /* 7608 * It was retransmitted adjust the 7609 * sack holes for what was acked. 7610 */ 7611 int ack_am; 7612 7613 ack_am = (th_ack - rsm->r_start); 7614 if (ack_am >= rsm->r_rtr_bytes) { 7615 rack->r_ctl.rc_holes_rxt -= ack_am; 7616 rsm->r_rtr_bytes -= ack_am; 7617 } 7618 } 7619 /* 7620 * Update where the piece starts and record 7621 * the time of send of highest cumack sent. 7622 */ 7623 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7624 rsm->r_start = th_ack; 7625 if (rack->app_limited_needs_set) 7626 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 7627 } 7628 proc_sack: 7629 /* Check for reneging */ 7630 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7631 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 7632 /* 7633 * The peer has moved snd_una up to 7634 * the edge of this send, i.e. one 7635 * that it had previously acked. The only 7636 * way that can be true if the peer threw 7637 * away data (space issues) that it had 7638 * previously sacked (else it would have 7639 * given us snd_una up to (rsm->r_end). 7640 * We need to undo the acked markings here. 7641 * 7642 * Note we have to look to make sure th_ack is 7643 * our rsm->r_start in case we get an old ack 7644 * where th_ack is behind snd_una. 
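 * For illustration: if this head rsm spans 1000-2000 and was previously
 * sacked (RACK_ACKED), a cum-ack of exactly 1000 instead of 2000 or more
 * means the peer forgot the sacked data, so we strip the ACKED markings
 * and re-queue those entries for possible retransmission.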
7645 */ 7646 rack_peer_reneges(rack, rsm, th->th_ack); 7647 } 7648 if ((to->to_flags & TOF_SACK) == 0) { 7649 /* We are done nothing left */ 7650 goto out; 7651 } 7652 /* Sack block processing */ 7653 if (SEQ_GT(th_ack, tp->snd_una)) 7654 ack_point = th_ack; 7655 else 7656 ack_point = tp->snd_una; 7657 for (i = 0; i < to->to_nsacks; i++) { 7658 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7659 &sack, sizeof(sack)); 7660 sack.start = ntohl(sack.start); 7661 sack.end = ntohl(sack.end); 7662 if (SEQ_GT(sack.end, sack.start) && 7663 SEQ_GT(sack.start, ack_point) && 7664 SEQ_LT(sack.start, tp->snd_max) && 7665 SEQ_GT(sack.end, ack_point) && 7666 SEQ_LEQ(sack.end, tp->snd_max)) { 7667 sack_blocks[num_sack_blks] = sack; 7668 num_sack_blks++; 7669 #ifdef NETFLIX_STATS 7670 } else if (SEQ_LEQ(sack.start, th_ack) && 7671 SEQ_LEQ(sack.end, th_ack)) { 7672 /* 7673 * Its a D-SACK block. 7674 */ 7675 tcp_record_dsack(sack.start, sack.end); 7676 #endif 7677 } 7678 } 7679 /* 7680 * Sort the SACK blocks so we can update the rack scoreboard with 7681 * just one pass. 7682 */ 7683 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 7684 num_sack_blks, th->th_ack); 7685 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 7686 if (num_sack_blks == 0) { 7687 /* Nothing to sack (DSACKs?) */ 7688 goto out_with_totals; 7689 } 7690 if (num_sack_blks < 2) { 7691 /* Only one, we don't need to sort */ 7692 goto do_sack_work; 7693 } 7694 /* Sort the sacks */ 7695 for (i = 0; i < num_sack_blks; i++) { 7696 for (j = i + 1; j < num_sack_blks; j++) { 7697 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7698 sack = sack_blocks[i]; 7699 sack_blocks[i] = sack_blocks[j]; 7700 sack_blocks[j] = sack; 7701 } 7702 } 7703 } 7704 /* 7705 * Now are any of the sack block ends the same (yes some 7706 * implementations send these)? 7707 */ 7708 again: 7709 if (num_sack_blks == 0) 7710 goto out_with_totals; 7711 if (num_sack_blks > 1) { 7712 for (i = 0; i < num_sack_blks; i++) { 7713 for (j = i + 1; j < num_sack_blks; j++) { 7714 if (sack_blocks[i].end == sack_blocks[j].end) { 7715 /* 7716 * Ok these two have the same end we 7717 * want the smallest end and then 7718 * throw away the larger and start 7719 * again. 7720 */ 7721 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7722 /* 7723 * The second block covers 7724 * more area use that 7725 */ 7726 sack_blocks[i].start = sack_blocks[j].start; 7727 } 7728 /* 7729 * Now collapse out the dup-sack and 7730 * lower the count 7731 */ 7732 for (k = (j + 1); k < num_sack_blks; k++) { 7733 sack_blocks[j].start = sack_blocks[k].start; 7734 sack_blocks[j].end = sack_blocks[k].end; 7735 j++; 7736 } 7737 num_sack_blks--; 7738 goto again; 7739 } 7740 } 7741 } 7742 } 7743 do_sack_work: 7744 /* 7745 * First lets look to see if 7746 * we have retransmitted and 7747 * can use the transmit next? 7748 */ 7749 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7750 if (rsm && 7751 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 7752 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 7753 /* 7754 * We probably did the FR and the next 7755 * SACK in continues as we would expect. 
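 * For illustration: after a fast retransmit the first (lowest) sack block
 * usually overlaps the head of the tmap, so we hand that block to
 * rack_proc_sack_blk() first and, if it is the only block, we are done
 * without walking the rest of the scoreboard.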
7756 */ 7757 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 7758 if (acked) { 7759 rack->r_wanted_output = 1; 7760 changed += acked; 7761 sack_changed += acked; 7762 } 7763 if (num_sack_blks == 1) { 7764 /* 7765 * This is what we would expect from 7766 * a normal implementation to happen 7767 * after we have retransmitted the FR, 7768 * i.e the sack-filter pushes down 7769 * to 1 block and the next to be retransmitted 7770 * is the sequence in the sack block (has more 7771 * are acked). Count this as ACK'd data to boost 7772 * up the chances of recovering any false positives. 7773 */ 7774 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 7775 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 7776 counter_u64_add(rack_express_sack, 1); 7777 if (rack->r_ctl.ack_count > 0xfff00000) { 7778 /* 7779 * reduce the number to keep us under 7780 * a uint32_t. 7781 */ 7782 rack->r_ctl.ack_count /= 2; 7783 rack->r_ctl.sack_count /= 2; 7784 } 7785 goto out_with_totals; 7786 } else { 7787 /* 7788 * Start the loop through the 7789 * rest of blocks, past the first block. 7790 */ 7791 moved_two = 0; 7792 loop_start = 1; 7793 } 7794 } 7795 /* Its a sack of some sort */ 7796 rack->r_ctl.sack_count++; 7797 if (rack->r_ctl.sack_count > 0xfff00000) { 7798 /* 7799 * reduce the number to keep us under 7800 * a uint32_t. 7801 */ 7802 rack->r_ctl.ack_count /= 2; 7803 rack->r_ctl.sack_count /= 2; 7804 } 7805 counter_u64_add(rack_sack_total, 1); 7806 if (rack->sack_attack_disable) { 7807 /* An attacker disablement is in place */ 7808 if (num_sack_blks > 1) { 7809 rack->r_ctl.sack_count += (num_sack_blks - 1); 7810 rack->r_ctl.sack_moved_extra++; 7811 counter_u64_add(rack_move_some, 1); 7812 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 7813 rack->r_ctl.sack_moved_extra /= 2; 7814 rack->r_ctl.sack_noextra_move /= 2; 7815 } 7816 } 7817 goto out; 7818 } 7819 rsm = rack->r_ctl.rc_sacklast; 7820 for (i = loop_start; i < num_sack_blks; i++) { 7821 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 7822 if (acked) { 7823 rack->r_wanted_output = 1; 7824 changed += acked; 7825 sack_changed += acked; 7826 } 7827 if (moved_two) { 7828 /* 7829 * If we did not get a SACK for at least a MSS and 7830 * had to move at all, or if we moved more than our 7831 * threshold, it counts against the "extra" move. 7832 */ 7833 rack->r_ctl.sack_moved_extra += moved_two; 7834 counter_u64_add(rack_move_some, 1); 7835 } else { 7836 /* 7837 * else we did not have to move 7838 * any more than we would expect. 7839 */ 7840 rack->r_ctl.sack_noextra_move++; 7841 counter_u64_add(rack_move_none, 1); 7842 } 7843 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 7844 /* 7845 * If the SACK was not a full MSS then 7846 * we add to sack_count the number of 7847 * MSS's (or possibly more than 7848 * a MSS if its a TSO send) we had to skip by. 7849 */ 7850 rack->r_ctl.sack_count += moved_two; 7851 counter_u64_add(rack_sack_total, moved_two); 7852 } 7853 /* 7854 * Now we need to setup for the next 7855 * round. First we make sure we won't 7856 * exceed the size of our uint32_t on 7857 * the various counts, and then clear out 7858 * moved_two. 
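 * For illustration: when either move counter crosses 0xfff00000 both are
 * halved together, which keeps their ratio (and hence the SAD
 * move-threshold comparison) essentially unchanged while avoiding
 * uint32_t overflow.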
7859 */ 7860 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 7861 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 7862 rack->r_ctl.sack_moved_extra /= 2; 7863 rack->r_ctl.sack_noextra_move /= 2; 7864 } 7865 if (rack->r_ctl.sack_count > 0xfff00000) { 7866 rack->r_ctl.ack_count /= 2; 7867 rack->r_ctl.sack_count /= 2; 7868 } 7869 moved_two = 0; 7870 } 7871 out_with_totals: 7872 if (num_sack_blks > 1) { 7873 /* 7874 * You get an extra stroke if 7875 * you have more than one sack-blk, this 7876 * could be where we are skipping forward 7877 * and the sack-filter is still working, or 7878 * it could be an attacker constantly 7879 * moving us. 7880 */ 7881 rack->r_ctl.sack_moved_extra++; 7882 counter_u64_add(rack_move_some, 1); 7883 } 7884 out: 7885 #ifdef NETFLIX_EXP_DETECTION 7886 if ((rack->do_detection || tcp_force_detection) && 7887 tcp_sack_to_ack_thresh && 7888 tcp_sack_to_move_thresh && 7889 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 7890 /* 7891 * We have thresholds set to find 7892 * possible attackers and disable sack. 7893 * Check them. 7894 */ 7895 uint64_t ackratio, moveratio, movetotal; 7896 7897 /* Log detecting */ 7898 rack_log_sad(rack, 1); 7899 ackratio = (uint64_t)(rack->r_ctl.sack_count); 7900 ackratio *= (uint64_t)(1000); 7901 if (rack->r_ctl.ack_count) 7902 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 7903 else { 7904 /* We really should not hit here */ 7905 ackratio = 1000; 7906 } 7907 if ((rack->sack_attack_disable == 0) && 7908 (ackratio > rack_highest_sack_thresh_seen)) 7909 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 7910 movetotal = rack->r_ctl.sack_moved_extra; 7911 movetotal += rack->r_ctl.sack_noextra_move; 7912 moveratio = rack->r_ctl.sack_moved_extra; 7913 moveratio *= (uint64_t)1000; 7914 if (movetotal) 7915 moveratio /= movetotal; 7916 else { 7917 /* No moves, thats pretty good */ 7918 moveratio = 0; 7919 } 7920 if ((rack->sack_attack_disable == 0) && 7921 (moveratio > rack_highest_move_thresh_seen)) 7922 rack_highest_move_thresh_seen = (uint32_t)moveratio; 7923 if (rack->sack_attack_disable == 0) { 7924 if ((ackratio > tcp_sack_to_ack_thresh) && 7925 (moveratio > tcp_sack_to_move_thresh)) { 7926 /* Disable sack processing */ 7927 rack->sack_attack_disable = 1; 7928 if (rack->r_rep_attack == 0) { 7929 rack->r_rep_attack = 1; 7930 counter_u64_add(rack_sack_attacks_detected, 1); 7931 } 7932 if (tcp_attack_on_turns_on_logging) { 7933 /* 7934 * Turn on logging, used for debugging 7935 * false positives. 
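 * (For illustration, with made-up counters: sack_count = 600 against
 * ack_count = 100 gives an ackratio of 6000; crossing both the ack and
 * move thresholds is what landed us in this disable path, and the code
 * below also clamps snd_cwnd to the current flight size so forged SACKs
 * cannot keep inflating it.)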
7936 */ 7937 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 7938 } 7939 /* Clamp the cwnd at flight size */ 7940 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 7941 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 7942 rack_log_sad(rack, 2); 7943 } 7944 } else { 7945 /* We are sack-disabled check for false positives */ 7946 if ((ackratio <= tcp_restoral_thresh) || 7947 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 7948 rack->sack_attack_disable = 0; 7949 rack_log_sad(rack, 3); 7950 /* Restart counting */ 7951 rack->r_ctl.sack_count = 0; 7952 rack->r_ctl.sack_moved_extra = 0; 7953 rack->r_ctl.sack_noextra_move = 1; 7954 rack->r_ctl.ack_count = max(1, 7955 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 7956 7957 if (rack->r_rep_reverse == 0) { 7958 rack->r_rep_reverse = 1; 7959 counter_u64_add(rack_sack_attacks_reversed, 1); 7960 } 7961 /* Restore the cwnd */ 7962 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 7963 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 7964 } 7965 } 7966 } 7967 #endif 7968 if (changed) { 7969 /* Something changed cancel the rack timer */ 7970 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 7971 } 7972 tsused = tcp_ts_getticks(); 7973 rsm = tcp_rack_output(tp, rack, tsused); 7974 if ((!IN_RECOVERY(tp->t_flags)) && 7975 rsm) { 7976 /* Enter recovery */ 7977 rack->r_ctl.rc_rsm_start = rsm->r_start; 7978 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7979 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7980 entered_recovery = 1; 7981 rack_cong_signal(tp, NULL, CC_NDUPACK); 7982 /* 7983 * When we enter recovery we need to assure we send 7984 * one packet. 7985 */ 7986 if (rack->rack_no_prr == 0) { 7987 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 7988 rack_log_to_prr(rack, 8, 0); 7989 } 7990 rack->r_timer_override = 1; 7991 rack->r_early = 0; 7992 rack->r_ctl.rc_agg_early = 0; 7993 } else if (IN_RECOVERY(tp->t_flags) && 7994 rsm && 7995 (rack->r_rr_config == 3)) { 7996 /* 7997 * Assure we can output and we get no 7998 * remembered pace time except the retransmit. 
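 * Setting r_timer_override and clearing PACE_PKT_OUTPUT below lets the
 * retransmission held in rc_resend go out on the next pass through the
 * output path instead of waiting for the pacer slot to expire.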
7999 */ 8000 rack->r_timer_override = 1; 8001 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8002 rack->r_ctl.rc_resend = rsm; 8003 } 8004 if (IN_RECOVERY(tp->t_flags) && 8005 (rack->rack_no_prr == 0) && 8006 (entered_recovery == 0)) { 8007 /* Deal with PRR here (in recovery only) */ 8008 uint32_t pipe, snd_una; 8009 8010 rack->r_ctl.rc_prr_delivered += changed; 8011 /* Compute prr_sndcnt */ 8012 if (SEQ_GT(tp->snd_una, th_ack)) { 8013 snd_una = tp->snd_una; 8014 } else { 8015 snd_una = th_ack; 8016 } 8017 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 8018 if (pipe > tp->snd_ssthresh) { 8019 long sndcnt; 8020 8021 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 8022 if (rack->r_ctl.rc_prr_recovery_fs > 0) 8023 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 8024 else { 8025 rack->r_ctl.rc_prr_sndcnt = 0; 8026 rack_log_to_prr(rack, 9, 0); 8027 sndcnt = 0; 8028 } 8029 sndcnt++; 8030 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 8031 sndcnt -= rack->r_ctl.rc_prr_out; 8032 else 8033 sndcnt = 0; 8034 rack->r_ctl.rc_prr_sndcnt = sndcnt; 8035 rack_log_to_prr(rack, 10, 0); 8036 } else { 8037 uint32_t limit; 8038 8039 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 8040 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 8041 else 8042 limit = 0; 8043 if (changed > limit) 8044 limit = changed; 8045 limit += ctf_fixed_maxseg(tp); 8046 if (tp->snd_ssthresh > pipe) { 8047 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 8048 rack_log_to_prr(rack, 11, 0); 8049 } else { 8050 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 8051 rack_log_to_prr(rack, 12, 0); 8052 } 8053 } 8054 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 8055 ((rack->rc_inp->inp_in_hpts == 0) && 8056 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 8057 /* 8058 * If you are pacing output you don't want 8059 * to override. 8060 */ 8061 rack->r_early = 0; 8062 rack->r_ctl.rc_agg_early = 0; 8063 rack->r_timer_override = 1; 8064 } 8065 } 8066 } 8067 8068 static void 8069 rack_strike_dupack(struct tcp_rack *rack) 8070 { 8071 struct rack_sendmap *rsm; 8072 8073 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8074 if (rsm && (rsm->r_dupack < 0xff)) { 8075 rsm->r_dupack++; 8076 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 8077 rack->r_wanted_output = 1; 8078 rack->r_timer_override = 1; 8079 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 8080 } else { 8081 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 8082 } 8083 } 8084 } 8085 8086 static void 8087 rack_check_bottom_drag(struct tcpcb *tp, 8088 struct tcp_rack *rack, 8089 struct socket *so, int32_t acked) 8090 { 8091 uint32_t segsiz, minseg; 8092 8093 segsiz = ctf_fixed_maxseg(tp); 8094 minseg = segsiz; 8095 8096 if (tp->snd_max == tp->snd_una) { 8097 /* 8098 * We are doing dynamic pacing and we are way 8099 * under. Basically everything got acked while 8100 * we were still waiting on the pacer to expire. 8101 * 8102 * This means we need to boost the b/w in 8103 * addition to any earlier boosting of 8104 * the multipler. 8105 */ 8106 rack->rc_dragged_bottom = 1; 8107 rack_validate_multipliers_at_or_above100(rack); 8108 /* 8109 * Lets use the segment bytes acked plus 8110 * the lowest RTT seen as the basis to 8111 * form a b/w estimate. This will be off 8112 * due to the fact that the true estimate 8113 * should be around 1/2 the time of the RTT 8114 * but we can settle for that. 
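 * For illustration (made-up numbers): acked = 14600 bytes with an
 * rs_us_rtt of 10000 usecs gives calc_bw = 14600 * 1000000 / 10000 =
 * 1,460,000, which is then capped by last_max_bw if we have one, and by
 * ONE_POINT_TWO_MEG when no goodput measurement has been made yet.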
8115 */ 8116 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 8117 acked) { 8118 uint64_t bw, calc_bw, rtt; 8119 8120 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8121 bw = acked; 8122 calc_bw = bw * 1000000; 8123 calc_bw /= rtt; 8124 if (rack->r_ctl.last_max_bw && 8125 (rack->r_ctl.last_max_bw < calc_bw)) { 8126 /* 8127 * If we have a last calculated max bw 8128 * enforce it. 8129 */ 8130 calc_bw = rack->r_ctl.last_max_bw; 8131 } 8132 /* now plop it in */ 8133 if (rack->rc_gp_filled == 0) { 8134 if (calc_bw > ONE_POINT_TWO_MEG) { 8135 /* 8136 * If we have no measurement 8137 * don't let us set in more than 8138 * 1.2Mbps. If we are still too 8139 * low after pacing with this we 8140 * will hopefully have a max b/w 8141 * available to sanity check things. 8142 */ 8143 calc_bw = ONE_POINT_TWO_MEG; 8144 } 8145 rack->r_ctl.rc_rtt_diff = 0; 8146 rack->r_ctl.gp_bw = calc_bw; 8147 rack->rc_gp_filled = 1; 8148 rack->r_ctl.num_avg = RACK_REQ_AVG; 8149 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8150 } else if (calc_bw > rack->r_ctl.gp_bw) { 8151 rack->r_ctl.rc_rtt_diff = 0; 8152 rack->r_ctl.num_avg = RACK_REQ_AVG; 8153 rack->r_ctl.gp_bw = calc_bw; 8154 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8155 } else 8156 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8157 /* 8158 * For acks over 1mss we do a extra boost to simulate 8159 * where we would get 2 acks (we want 110 for the mul). 8160 */ 8161 if (acked > segsiz) 8162 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8163 } else { 8164 /* 8165 * Huh, this should not be, settle 8166 * for just an old increase. 8167 */ 8168 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8169 } 8170 } else if ((IN_RECOVERY(tp->t_flags) == 0) && 8171 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 8172 minseg)) && 8173 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 8174 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 8175 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 8176 (segsiz * rack_req_segs))) { 8177 /* 8178 * We are doing dynamic GP pacing and 8179 * we have everything except 1MSS or less 8180 * bytes left out. We are still pacing away. 8181 * And there is data that could be sent, This 8182 * means we are inserting delayed ack time in 8183 * our measurements because we are pacing too slow. 8184 */ 8185 rack_validate_multipliers_at_or_above100(rack); 8186 rack->rc_dragged_bottom = 1; 8187 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8188 } 8189 } 8190 8191 /* 8192 * Return value of 1, we do not need to call rack_process_data(). 8193 * return value of 0, rack_process_data can be called. 8194 * For ret_val if its 0 the TCP is locked, if its non-zero 8195 * its unlocked and probably unsafe to touch the TCB. 
8196 */ 8197 static int 8198 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8199 struct tcpcb *tp, struct tcpopt *to, 8200 uint32_t tiwin, int32_t tlen, 8201 int32_t * ofia, int32_t thflags, int32_t * ret_val) 8202 { 8203 int32_t ourfinisacked = 0; 8204 int32_t nsegs, acked_amount; 8205 int32_t acked; 8206 struct mbuf *mfree; 8207 struct tcp_rack *rack; 8208 int32_t under_pacing = 0; 8209 int32_t recovery = 0; 8210 8211 rack = (struct tcp_rack *)tp->t_fb_ptr; 8212 if (SEQ_GT(th->th_ack, tp->snd_max)) { 8213 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 8214 rack->r_wanted_output = 1; 8215 return (1); 8216 } 8217 if (rack->rc_gp_filled && 8218 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 8219 under_pacing = 1; 8220 } 8221 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 8222 if (rack->rc_in_persist) 8223 tp->t_rxtshift = 0; 8224 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 8225 rack_strike_dupack(rack); 8226 rack_log_ack(tp, to, th); 8227 } 8228 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8229 /* 8230 * Old ack, behind (or duplicate to) the last one rcv'd 8231 * Note: Should mark reordering is occuring! We should also 8232 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 8233 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 8234 * retran and> ack 3 8235 */ 8236 return (0); 8237 } 8238 /* 8239 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 8240 * something we sent. 8241 */ 8242 if (tp->t_flags & TF_NEEDSYN) { 8243 /* 8244 * T/TCP: Connection was half-synchronized, and our SYN has 8245 * been ACK'd (so connection is now fully synchronized). Go 8246 * to non-starred state, increment snd_una for ACK of SYN, 8247 * and check if we can do window scaling. 8248 */ 8249 tp->t_flags &= ~TF_NEEDSYN; 8250 tp->snd_una++; 8251 /* Do window scaling? */ 8252 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8253 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8254 tp->rcv_scale = tp->request_r_scale; 8255 /* Send window already scaled. */ 8256 } 8257 } 8258 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8259 INP_WLOCK_ASSERT(tp->t_inpcb); 8260 8261 acked = BYTES_THIS_ACK(tp, th); 8262 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 8263 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8264 /* 8265 * If we just performed our first retransmit, and the ACK arrives 8266 * within our recovery window, then it was a mistake to do the 8267 * retransmit in the first place. Recover our original cwnd and 8268 * ssthresh, and proceed to transmit where we left off. 8269 */ 8270 if (tp->t_flags & TF_PREVVALID) { 8271 tp->t_flags &= ~TF_PREVVALID; 8272 if (tp->t_rxtshift == 1 && 8273 (int)(ticks - tp->t_badrxtwin) < 0) 8274 rack_cong_signal(tp, th, CC_RTO_ERR); 8275 } 8276 if (acked) { 8277 /* assure we are not backed off */ 8278 tp->t_rxtshift = 0; 8279 rack->rc_tlp_in_progress = 0; 8280 rack->r_ctl.rc_tlp_cnt_out = 0; 8281 /* 8282 * If it is the RXT timer we want to 8283 * stop it, so we can restart a TLP. 8284 */ 8285 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 8286 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8287 #ifdef NETFLIX_HTTP_LOGGING 8288 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 8289 #endif 8290 } 8291 /* 8292 * If we have a timestamp reply, update smoothed round trip time. If 8293 * no timestamp is present but transmit timer is running and timed 8294 * sequence number was acked, update smoothed round trip time. 
Since 8295 * we now have an rtt measurement, cancel the timer backoff (cf., 8296 * Phil Karn's retransmit alg.). Recompute the initial retransmit 8297 * timer. 8298 * 8299 * Some boxes send broken timestamp replies during the SYN+ACK 8300 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8301 * and blow up the retransmit timer. 8302 */ 8303 /* 8304 * If all outstanding data is acked, stop retransmit timer and 8305 * remember to restart (more output or persist). If there is more 8306 * data to be acked, restart retransmit timer, using current 8307 * (possibly backed-off) value. 8308 */ 8309 if (acked == 0) { 8310 if (ofia) 8311 *ofia = ourfinisacked; 8312 return (0); 8313 } 8314 if (rack->r_ctl.rc_early_recovery) { 8315 if (IN_RECOVERY(tp->t_flags)) { 8316 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8317 (SEQ_LT(th->th_ack, tp->snd_max))) { 8318 tcp_rack_partialack(tp, th); 8319 } else { 8320 rack_post_recovery(tp, th); 8321 recovery = 1; 8322 } 8323 } 8324 } 8325 /* 8326 * Let the congestion control algorithm update congestion control 8327 * related information. This typically means increasing the 8328 * congestion window. 8329 */ 8330 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 8331 SOCKBUF_LOCK(&so->so_snd); 8332 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 8333 tp->snd_wnd -= acked_amount; 8334 mfree = sbcut_locked(&so->so_snd, acked_amount); 8335 if ((sbused(&so->so_snd) == 0) && 8336 (acked > acked_amount) && 8337 (tp->t_state >= TCPS_FIN_WAIT_1) && 8338 (tp->t_flags & TF_SENTFIN)) { 8339 /* 8340 * We must be sure our fin 8341 * was sent and acked (we can be 8342 * in FIN_WAIT_1 without having 8343 * sent the fin). 8344 */ 8345 ourfinisacked = 1; 8346 } 8347 /* NB: sowwakeup_locked() does an implicit unlock. */ 8348 sowwakeup_locked(so); 8349 m_freem(mfree); 8350 if (rack->r_ctl.rc_early_recovery == 0) { 8351 if (IN_RECOVERY(tp->t_flags)) { 8352 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8353 (SEQ_LT(th->th_ack, tp->snd_max))) { 8354 tcp_rack_partialack(tp, th); 8355 } else { 8356 rack_post_recovery(tp, th); 8357 } 8358 } 8359 } 8360 tp->snd_una = th->th_ack; 8361 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 8362 tp->snd_recover = tp->snd_una; 8363 8364 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 8365 tp->snd_nxt = tp->snd_una; 8366 } 8367 if (under_pacing && 8368 (rack->use_fixed_rate == 0) && 8369 (rack->in_probe_rtt == 0) && 8370 rack->rc_gp_dyn_mul && 8371 rack->rc_always_pace) { 8372 /* Check if we are dragging bottom */ 8373 rack_check_bottom_drag(tp, rack, so, acked); 8374 } 8375 if (tp->snd_una == tp->snd_max) { 8376 /* Nothing left outstanding */ 8377 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 8378 if (rack->r_ctl.rc_went_idle_time == 0) 8379 rack->r_ctl.rc_went_idle_time = 1; 8380 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 8381 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8382 tp->t_acktime = 0; 8383 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8384 /* Set need output so persist might get set */ 8385 rack->r_wanted_output = 1; 8386 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8387 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8388 (sbavail(&so->so_snd) == 0) && 8389 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 8390 /* 8391 * The socket was gone and the 8392 * peer sent data, time to 8393 * reset him. 
8394 */ 8395 *ret_val = 1; 8396 /* tcp_close will kill the inp pre-log the Reset */ 8397 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 8398 tp = tcp_close(tp); 8399 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 8400 return (1); 8401 } 8402 } 8403 if (ofia) 8404 *ofia = ourfinisacked; 8405 return (0); 8406 } 8407 8408 static void 8409 rack_collapsed_window(struct tcp_rack *rack) 8410 { 8411 /* 8412 * Now we must walk the 8413 * send map and divide the 8414 * ones left stranded. These 8415 * guys can't cause us to abort 8416 * the connection and are really 8417 * "unsent". However if a buggy 8418 * client actually did keep some 8419 * of the data i.e. collapsed the win 8420 * and refused to ack and then opened 8421 * the win and acked that data. We would 8422 * get into an ack war, the simplier 8423 * method then of just pretending we 8424 * did not send those segments something 8425 * won't work. 8426 */ 8427 struct rack_sendmap *rsm, *nrsm, fe, *insret; 8428 tcp_seq max_seq; 8429 8430 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 8431 memset(&fe, 0, sizeof(fe)); 8432 fe.r_start = max_seq; 8433 /* Find the first seq past or at maxseq */ 8434 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8435 if (rsm == NULL) { 8436 /* Nothing to do strange */ 8437 rack->rc_has_collapsed = 0; 8438 return; 8439 } 8440 /* 8441 * Now do we need to split at 8442 * the collapse point? 8443 */ 8444 if (SEQ_GT(max_seq, rsm->r_start)) { 8445 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8446 if (nrsm == NULL) { 8447 /* We can't get a rsm, mark all? */ 8448 nrsm = rsm; 8449 goto no_split; 8450 } 8451 /* Clone it */ 8452 rack_clone_rsm(rack, nrsm, rsm, max_seq); 8453 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8454 #ifdef INVARIANTS 8455 if (insret != NULL) { 8456 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8457 nrsm, insret, rack, rsm); 8458 } 8459 #endif 8460 if (rsm->r_in_tmap) { 8461 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8462 nrsm->r_in_tmap = 1; 8463 } 8464 /* 8465 * Set in the new RSM as the 8466 * collapsed starting point 8467 */ 8468 rsm = nrsm; 8469 } 8470 no_split: 8471 counter_u64_add(rack_collapsed_win, 1); 8472 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 8473 nrsm->r_flags |= RACK_RWND_COLLAPSED; 8474 rack->rc_has_collapsed = 1; 8475 } 8476 } 8477 8478 static void 8479 rack_un_collapse_window(struct tcp_rack *rack) 8480 { 8481 struct rack_sendmap *rsm; 8482 8483 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 8484 if (rsm->r_flags & RACK_RWND_COLLAPSED) 8485 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8486 else 8487 break; 8488 } 8489 rack->rc_has_collapsed = 0; 8490 } 8491 8492 static void 8493 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 8494 int32_t tlen, int32_t tfo_syn) 8495 { 8496 if (DELAY_ACK(tp, tlen) || tfo_syn) { 8497 if (rack->rc_dack_mode && 8498 (tlen > 500) && 8499 (rack->rc_dack_toggle == 1)) { 8500 goto no_delayed_ack; 8501 } 8502 rack_timer_cancel(tp, rack, 8503 rack->r_ctl.rc_rcvtime, __LINE__); 8504 tp->t_flags |= TF_DELACK; 8505 } else { 8506 no_delayed_ack: 8507 rack->r_wanted_output = 1; 8508 tp->t_flags |= TF_ACKNOW; 8509 if (rack->rc_dack_mode) { 8510 if (tp->t_flags & TF_DELACK) 8511 rack->rc_dack_toggle = 1; 8512 else 8513 rack->rc_dack_toggle = 0; 8514 } 8515 } 8516 } 8517 /* 8518 * Return value of 1, the TCB is unlocked and most 8519 * likely gone, return value of 0, the TCP is still 8520 * locked. 
8521 */ 8522 static int 8523 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8524 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8525 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8526 { 8527 /* 8528 * Update window information. Don't look at window if no ACK: TAC's 8529 * send garbage on first SYN. 8530 */ 8531 int32_t nsegs; 8532 int32_t tfo_syn; 8533 struct tcp_rack *rack; 8534 8535 rack = (struct tcp_rack *)tp->t_fb_ptr; 8536 INP_WLOCK_ASSERT(tp->t_inpcb); 8537 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8538 if ((thflags & TH_ACK) && 8539 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8540 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8541 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8542 /* keep track of pure window updates */ 8543 if (tlen == 0 && 8544 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8545 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8546 tp->snd_wnd = tiwin; 8547 tp->snd_wl1 = th->th_seq; 8548 tp->snd_wl2 = th->th_ack; 8549 if (tp->snd_wnd > tp->max_sndwnd) 8550 tp->max_sndwnd = tp->snd_wnd; 8551 rack->r_wanted_output = 1; 8552 } else if (thflags & TH_ACK) { 8553 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8554 tp->snd_wnd = tiwin; 8555 tp->snd_wl1 = th->th_seq; 8556 tp->snd_wl2 = th->th_ack; 8557 } 8558 } 8559 if (tp->snd_wnd < ctf_outstanding(tp)) 8560 /* The peer collapsed the window */ 8561 rack_collapsed_window(rack); 8562 else if (rack->rc_has_collapsed) 8563 rack_un_collapse_window(rack); 8564 /* Was persist timer active and now we have window space? */ 8565 if ((rack->rc_in_persist != 0) && 8566 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8567 rack->r_ctl.rc_pace_min_segs))) { 8568 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8569 tp->snd_nxt = tp->snd_max; 8570 /* Make sure we output to start the timer */ 8571 rack->r_wanted_output = 1; 8572 } 8573 /* Do we enter persists? */ 8574 if ((rack->rc_in_persist == 0) && 8575 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8576 TCPS_HAVEESTABLISHED(tp->t_state) && 8577 (tp->snd_max == tp->snd_una) && 8578 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8579 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8580 /* 8581 * Here the rwnd is less than 8582 * the pacing size, we are established, 8583 * nothing is outstanding, and there is 8584 * data to send. Enter persists. 8585 */ 8586 tp->snd_nxt = tp->snd_una; 8587 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8588 } 8589 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8590 m_freem(m); 8591 return (0); 8592 } 8593 /* 8594 * don't process the URG bit, ignore them drag 8595 * along the up. 8596 */ 8597 tp->rcv_up = tp->rcv_nxt; 8598 INP_WLOCK_ASSERT(tp->t_inpcb); 8599 8600 /* 8601 * Process the segment text, merging it into the TCP sequencing 8602 * queue, and arranging for acknowledgment of receipt if necessary. 8603 * This process logically involves adjusting tp->rcv_wnd as data is 8604 * presented to the user (this happens in tcp_usrreq.c, case 8605 * PRU_RCVD). If a FIN has already been received on this connection 8606 * then we just ignore the text. 
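* The common in-order case is handled inline below; out-of-order
* data is handed to tcp_reass() and forces an immediate ACK
* (TF_ACKNOW).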
8607 */ 8608 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8609 IS_FASTOPEN(tp->t_flags)); 8610 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8611 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8612 tcp_seq save_start = th->th_seq; 8613 tcp_seq save_rnxt = tp->rcv_nxt; 8614 int save_tlen = tlen; 8615 8616 m_adj(m, drop_hdrlen); /* delayed header drop */ 8617 /* 8618 * Insert segment which includes th into TCP reassembly 8619 * queue with control block tp. Set thflags to whether 8620 * reassembly now includes a segment with FIN. This handles 8621 * the common case inline (segment is the next to be 8622 * received on an established connection, and the queue is 8623 * empty), avoiding linkage into and removal from the queue 8624 * and repetition of various conversions. Set DELACK for 8625 * segments received in order, but ack immediately when 8626 * segments are out of order (so fast retransmit can work). 8627 */ 8628 if (th->th_seq == tp->rcv_nxt && 8629 SEGQ_EMPTY(tp) && 8630 (TCPS_HAVEESTABLISHED(tp->t_state) || 8631 tfo_syn)) { 8632 #ifdef NETFLIX_SB_LIMITS 8633 u_int mcnt, appended; 8634 8635 if (so->so_rcv.sb_shlim) { 8636 mcnt = m_memcnt(m); 8637 appended = 0; 8638 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8639 CFO_NOSLEEP, NULL) == false) { 8640 counter_u64_add(tcp_sb_shlim_fails, 1); 8641 m_freem(m); 8642 return (0); 8643 } 8644 } 8645 #endif 8646 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 8647 tp->rcv_nxt += tlen; 8648 if (tlen && 8649 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8650 (tp->t_fbyte_in == 0)) { 8651 tp->t_fbyte_in = ticks; 8652 if (tp->t_fbyte_in == 0) 8653 tp->t_fbyte_in = 1; 8654 if (tp->t_fbyte_out && tp->t_fbyte_in) 8655 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8656 } 8657 thflags = th->th_flags & TH_FIN; 8658 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8659 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8660 SOCKBUF_LOCK(&so->so_rcv); 8661 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8662 m_freem(m); 8663 } else 8664 #ifdef NETFLIX_SB_LIMITS 8665 appended = 8666 #endif 8667 sbappendstream_locked(&so->so_rcv, m, 0); 8668 /* NB: sorwakeup_locked() does an implicit unlock. */ 8669 sorwakeup_locked(so); 8670 #ifdef NETFLIX_SB_LIMITS 8671 if (so->so_rcv.sb_shlim && appended != mcnt) 8672 counter_fo_release(so->so_rcv.sb_shlim, 8673 mcnt - appended); 8674 #endif 8675 } else { 8676 /* 8677 * XXX: Due to the header drop above "th" is 8678 * theoretically invalid by now. Fortunately 8679 * m_adj() doesn't actually frees any mbufs when 8680 * trimming from the head. 8681 */ 8682 tcp_seq temp = save_start; 8683 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8684 tp->t_flags |= TF_ACKNOW; 8685 } 8686 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 8687 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8688 /* 8689 * DSACK actually handled in the fastpath 8690 * above. 8691 */ 8692 RACK_OPTS_INC(tcp_sack_path_1); 8693 tcp_update_sack_list(tp, save_start, 8694 save_start + save_tlen); 8695 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8696 if ((tp->rcv_numsacks >= 1) && 8697 (tp->sackblks[0].end == save_start)) { 8698 /* 8699 * Partial overlap, recorded at todrop 8700 * above. 8701 */ 8702 RACK_OPTS_INC(tcp_sack_path_2a); 8703 tcp_update_sack_list(tp, 8704 tp->sackblks[0].start, 8705 tp->sackblks[0].end); 8706 } else { 8707 RACK_OPTS_INC(tcp_sack_path_2b); 8708 tcp_update_dsack_list(tp, save_start, 8709 save_start + save_tlen); 8710 } 8711 } else if (tlen >= save_tlen) { 8712 /* Update of sackblks. 
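* The original segment range is reported as a duplicate (DSACK)
* via tcp_update_dsack_list() below.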
*/ 8713 RACK_OPTS_INC(tcp_sack_path_3); 8714 tcp_update_dsack_list(tp, save_start, 8715 save_start + save_tlen); 8716 } else if (tlen > 0) { 8717 RACK_OPTS_INC(tcp_sack_path_4); 8718 tcp_update_dsack_list(tp, save_start, 8719 save_start + tlen); 8720 } 8721 } 8722 } else { 8723 m_freem(m); 8724 thflags &= ~TH_FIN; 8725 } 8726 8727 /* 8728 * If FIN is received ACK the FIN and let the user know that the 8729 * connection is closing. 8730 */ 8731 if (thflags & TH_FIN) { 8732 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8733 socantrcvmore(so); 8734 /* 8735 * If connection is half-synchronized (ie NEEDSYN 8736 * flag on) then delay ACK, so it may be piggybacked 8737 * when SYN is sent. Otherwise, since we received a 8738 * FIN then no more input can be expected, send ACK 8739 * now. 8740 */ 8741 if (tp->t_flags & TF_NEEDSYN) { 8742 rack_timer_cancel(tp, rack, 8743 rack->r_ctl.rc_rcvtime, __LINE__); 8744 tp->t_flags |= TF_DELACK; 8745 } else { 8746 tp->t_flags |= TF_ACKNOW; 8747 } 8748 tp->rcv_nxt++; 8749 } 8750 switch (tp->t_state) { 8751 /* 8752 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8753 * CLOSE_WAIT state. 8754 */ 8755 case TCPS_SYN_RECEIVED: 8756 tp->t_starttime = ticks; 8757 /* FALLTHROUGH */ 8758 case TCPS_ESTABLISHED: 8759 rack_timer_cancel(tp, rack, 8760 rack->r_ctl.rc_rcvtime, __LINE__); 8761 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8762 break; 8763 8764 /* 8765 * If still in FIN_WAIT_1 STATE FIN has not been 8766 * acked so enter the CLOSING state. 8767 */ 8768 case TCPS_FIN_WAIT_1: 8769 rack_timer_cancel(tp, rack, 8770 rack->r_ctl.rc_rcvtime, __LINE__); 8771 tcp_state_change(tp, TCPS_CLOSING); 8772 break; 8773 8774 /* 8775 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8776 * starting the time-wait timer, turning off the 8777 * other standard timers. 8778 */ 8779 case TCPS_FIN_WAIT_2: 8780 rack_timer_cancel(tp, rack, 8781 rack->r_ctl.rc_rcvtime, __LINE__); 8782 tcp_twstart(tp); 8783 return (1); 8784 } 8785 } 8786 /* 8787 * Return any desired output. 8788 */ 8789 if ((tp->t_flags & TF_ACKNOW) || 8790 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 8791 rack->r_wanted_output = 1; 8792 } 8793 INP_WLOCK_ASSERT(tp->t_inpcb); 8794 return (0); 8795 } 8796 8797 /* 8798 * Here nothing is really faster, its just that we 8799 * have broken out the fast-data path also just like 8800 * the fast-ack. 8801 */ 8802 static int 8803 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8804 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8805 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8806 { 8807 int32_t nsegs; 8808 int32_t newsize = 0; /* automatic sockbuf scaling */ 8809 struct tcp_rack *rack; 8810 #ifdef NETFLIX_SB_LIMITS 8811 u_int mcnt, appended; 8812 #endif 8813 #ifdef TCPDEBUG 8814 /* 8815 * The size of tcp_saveipgen must be the size of the max ip header, 8816 * now IPv6. 8817 */ 8818 u_char tcp_saveipgen[IP6_HDR_LEN]; 8819 struct tcphdr tcp_savetcp; 8820 short ostate = 0; 8821 8822 #endif 8823 /* 8824 * If last ACK falls within this segment's sequence numbers, record 8825 * the timestamp. NOTE that the test is modified according to the 8826 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
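* Before doing so, the checks below verify that this segment
* qualifies for the fast path at all; any miss returns 0 and the
* caller falls back to the slow path.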
8827 */ 8828 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 8829 return (0); 8830 } 8831 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8832 return (0); 8833 } 8834 if (tiwin && tiwin != tp->snd_wnd) { 8835 return (0); 8836 } 8837 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8838 return (0); 8839 } 8840 if (__predict_false((to->to_flags & TOF_TS) && 8841 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8842 return (0); 8843 } 8844 if (__predict_false((th->th_ack != tp->snd_una))) { 8845 return (0); 8846 } 8847 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8848 return (0); 8849 } 8850 if ((to->to_flags & TOF_TS) != 0 && 8851 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8852 tp->ts_recent_age = tcp_ts_getticks(); 8853 tp->ts_recent = to->to_tsval; 8854 } 8855 rack = (struct tcp_rack *)tp->t_fb_ptr; 8856 /* 8857 * This is a pure, in-sequence data packet with nothing on the 8858 * reassembly queue and we have enough buffer space to take it. 8859 */ 8860 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8861 8862 #ifdef NETFLIX_SB_LIMITS 8863 if (so->so_rcv.sb_shlim) { 8864 mcnt = m_memcnt(m); 8865 appended = 0; 8866 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8867 CFO_NOSLEEP, NULL) == false) { 8868 counter_u64_add(tcp_sb_shlim_fails, 1); 8869 m_freem(m); 8870 return (1); 8871 } 8872 } 8873 #endif 8874 /* Clean receiver SACK report if present */ 8875 if (tp->rcv_numsacks) 8876 tcp_clean_sackreport(tp); 8877 KMOD_TCPSTAT_INC(tcps_preddat); 8878 tp->rcv_nxt += tlen; 8879 if (tlen && 8880 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8881 (tp->t_fbyte_in == 0)) { 8882 tp->t_fbyte_in = ticks; 8883 if (tp->t_fbyte_in == 0) 8884 tp->t_fbyte_in = 1; 8885 if (tp->t_fbyte_out && tp->t_fbyte_in) 8886 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8887 } 8888 /* 8889 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 8890 */ 8891 tp->snd_wl1 = th->th_seq; 8892 /* 8893 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 8894 */ 8895 tp->rcv_up = tp->rcv_nxt; 8896 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8897 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8898 #ifdef TCPDEBUG 8899 if (so->so_options & SO_DEBUG) 8900 tcp_trace(TA_INPUT, ostate, tp, 8901 (void *)tcp_saveipgen, &tcp_savetcp, 0); 8902 #endif 8903 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 8904 8905 /* Add data to socket buffer. */ 8906 SOCKBUF_LOCK(&so->so_rcv); 8907 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8908 m_freem(m); 8909 } else { 8910 /* 8911 * Set new socket buffer size. Give up when limit is 8912 * reached. 8913 */ 8914 if (newsize) 8915 if (!sbreserve_locked(&so->so_rcv, 8916 newsize, so, NULL)) 8917 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 8918 m_adj(m, drop_hdrlen); /* delayed header drop */ 8919 #ifdef NETFLIX_SB_LIMITS 8920 appended = 8921 #endif 8922 sbappendstream_locked(&so->so_rcv, m, 0); 8923 ctf_calc_rwin(so, tp); 8924 } 8925 /* NB: sorwakeup_locked() does an implicit unlock. */ 8926 sorwakeup_locked(so); 8927 #ifdef NETFLIX_SB_LIMITS 8928 if (so->so_rcv.sb_shlim && mcnt != appended) 8929 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 8930 #endif 8931 rack_handle_delayed_ack(tp, rack, tlen, 0); 8932 if (tp->snd_una == tp->snd_max) 8933 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8934 return (1); 8935 } 8936 8937 /* 8938 * This subfunction is used to try to highly optimize the 8939 * fast path. We again allow window updates that are 8940 * in sequence to remain in the fast-path. We also add 8941 * in the __predict's to attempt to help the compiler. 
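* Most of the disqualifying checks below are wrapped in
* __predict_false() for that reason.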
8942 * Note that if we return a 0, then we can *not* process 8943 * it and the caller should push the packet into the 8944 * slow-path. 8945 */ 8946 static int 8947 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8948 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8949 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 8950 { 8951 int32_t acked; 8952 int32_t nsegs; 8953 #ifdef TCPDEBUG 8954 /* 8955 * The size of tcp_saveipgen must be the size of the max ip header, 8956 * now IPv6. 8957 */ 8958 u_char tcp_saveipgen[IP6_HDR_LEN]; 8959 struct tcphdr tcp_savetcp; 8960 short ostate = 0; 8961 #endif 8962 int32_t under_pacing = 0; 8963 struct tcp_rack *rack; 8964 8965 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8966 /* Old ack, behind (or duplicate to) the last one rcv'd */ 8967 return (0); 8968 } 8969 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 8970 /* Above what we have sent? */ 8971 return (0); 8972 } 8973 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8974 /* We are retransmitting */ 8975 return (0); 8976 } 8977 if (__predict_false(tiwin == 0)) { 8978 /* zero window */ 8979 return (0); 8980 } 8981 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 8982 /* We need a SYN or a FIN, unlikely.. */ 8983 return (0); 8984 } 8985 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 8986 /* Timestamp is behind .. old ack with seq wrap? */ 8987 return (0); 8988 } 8989 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 8990 /* Still recovering */ 8991 return (0); 8992 } 8993 rack = (struct tcp_rack *)tp->t_fb_ptr; 8994 if (rack->r_ctl.rc_sacked) { 8995 /* We have sack holes on our scoreboard */ 8996 return (0); 8997 } 8998 /* Ok if we reach here, we can process a fast-ack */ 8999 if (rack->rc_gp_filled && 9000 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9001 under_pacing = 1; 9002 } 9003 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9004 rack_log_ack(tp, to, th); 9005 /* Did the window get updated? */ 9006 if (tiwin != tp->snd_wnd) { 9007 tp->snd_wnd = tiwin; 9008 tp->snd_wl1 = th->th_seq; 9009 if (tp->snd_wnd > tp->max_sndwnd) 9010 tp->max_sndwnd = tp->snd_wnd; 9011 } 9012 /* Do we exit persists? */ 9013 if ((rack->rc_in_persist != 0) && 9014 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 9015 rack->r_ctl.rc_pace_min_segs))) { 9016 rack_exit_persist(tp, rack, cts); 9017 } 9018 /* Do we enter persists? */ 9019 if ((rack->rc_in_persist == 0) && 9020 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 9021 TCPS_HAVEESTABLISHED(tp->t_state) && 9022 (tp->snd_max == tp->snd_una) && 9023 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 9024 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 9025 /* 9026 * Here the rwnd is less than 9027 * the pacing size, we are established, 9028 * nothing is outstanding, and there is 9029 * data to send. Enter persists. 9030 */ 9031 tp->snd_nxt = tp->snd_una; 9032 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 9033 } 9034 /* 9035 * If last ACK falls within this segment's sequence numbers, record 9036 * the timestamp. NOTE that the test is modified according to the 9037 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 9038 */ 9039 if ((to->to_flags & TOF_TS) != 0 && 9040 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 9041 tp->ts_recent_age = tcp_ts_getticks(); 9042 tp->ts_recent = to->to_tsval; 9043 } 9044 /* 9045 * This is a pure ack for outstanding data. 
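* No data is queued to the socket buffer here; the work below is
* mostly send-side book-keeping (snd_una, the send buffer, timers
* and pacing state).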
9046 */ 9047 KMOD_TCPSTAT_INC(tcps_predack); 9048 9049 /* 9050 * "bad retransmit" recovery. 9051 */ 9052 if (tp->t_flags & TF_PREVVALID) { 9053 tp->t_flags &= ~TF_PREVVALID; 9054 if (tp->t_rxtshift == 1 && 9055 (int)(ticks - tp->t_badrxtwin) < 0) 9056 rack_cong_signal(tp, th, CC_RTO_ERR); 9057 } 9058 /* 9059 * Recalculate the transmit timer / rtt. 9060 * 9061 * Some boxes send broken timestamp replies during the SYN+ACK 9062 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9063 * and blow up the retransmit timer. 9064 */ 9065 acked = BYTES_THIS_ACK(tp, th); 9066 9067 #ifdef TCP_HHOOK 9068 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 9069 hhook_run_tcp_est_in(tp, th, to); 9070 #endif 9071 9072 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9073 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9074 sbdrop(&so->so_snd, acked); 9075 if (acked) { 9076 /* assure we are not backed off */ 9077 tp->t_rxtshift = 0; 9078 rack->rc_tlp_in_progress = 0; 9079 rack->r_ctl.rc_tlp_cnt_out = 0; 9080 /* 9081 * If it is the RXT timer we want to 9082 * stop it, so we can restart a TLP. 9083 */ 9084 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9085 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9086 #ifdef NETFLIX_HTTP_LOGGING 9087 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9088 #endif 9089 } 9090 /* 9091 * Let the congestion control algorithm update congestion control 9092 * related information. This typically means increasing the 9093 * congestion window. 9094 */ 9095 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 9096 9097 tp->snd_una = th->th_ack; 9098 if (tp->snd_wnd < ctf_outstanding(tp)) { 9099 /* The peer collapsed the window */ 9100 rack_collapsed_window(rack); 9101 } else if (rack->rc_has_collapsed) 9102 rack_un_collapse_window(rack); 9103 9104 /* 9105 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 9106 */ 9107 tp->snd_wl2 = th->th_ack; 9108 tp->t_dupacks = 0; 9109 m_freem(m); 9110 /* ND6_HINT(tp); *//* Some progress has been made. */ 9111 9112 /* 9113 * If all outstanding data are acked, stop retransmit timer, 9114 * otherwise restart timer using current (possibly backed-off) 9115 * value. If process is waiting for space, wakeup/selwakeup/signal. 9116 * If data are ready to send, let tcp_output decide between more 9117 * output or persist. 9118 */ 9119 #ifdef TCPDEBUG 9120 if (so->so_options & SO_DEBUG) 9121 tcp_trace(TA_INPUT, ostate, tp, 9122 (void *)tcp_saveipgen, 9123 &tcp_savetcp, 0); 9124 #endif 9125 if (under_pacing && 9126 (rack->use_fixed_rate == 0) && 9127 (rack->in_probe_rtt == 0) && 9128 rack->rc_gp_dyn_mul && 9129 rack->rc_always_pace) { 9130 /* Check if we are dragging bottom */ 9131 rack_check_bottom_drag(tp, rack, so, acked); 9132 } 9133 if (tp->snd_una == tp->snd_max) { 9134 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9135 if (rack->r_ctl.rc_went_idle_time == 0) 9136 rack->r_ctl.rc_went_idle_time = 1; 9137 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9138 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9139 tp->t_acktime = 0; 9140 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9141 } 9142 /* Wake up the socket if we have room to write more */ 9143 sowwakeup(so); 9144 if (sbavail(&so->so_snd)) { 9145 rack->r_wanted_output = 1; 9146 } 9147 return (1); 9148 } 9149 9150 /* 9151 * Return value of 1, the TCB is unlocked and most 9152 * likely gone, return value of 0, the TCP is still 9153 * locked. 
9154 */ 9155 static int 9156 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 9157 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9158 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9159 { 9160 int32_t ret_val = 0; 9161 int32_t todrop; 9162 int32_t ourfinisacked = 0; 9163 struct tcp_rack *rack; 9164 9165 ctf_calc_rwin(so, tp); 9166 /* 9167 * If the state is SYN_SENT: if seg contains an ACK, but not for our 9168 * SYN, drop the input. if seg contains a RST, then drop the 9169 * connection. if seg does not contain SYN, then drop it. Otherwise 9170 * this is an acceptable SYN segment initialize tp->rcv_nxt and 9171 * tp->irs if seg contains ack then advance tp->snd_una if seg 9172 * contains an ECE and ECN support is enabled, the stream is ECN 9173 * capable. if SYN has been acked change to ESTABLISHED else 9174 * SYN_RCVD state arrange for segment to be acked (eventually) 9175 * continue processing rest of data/controls. 9176 */ 9177 if ((thflags & TH_ACK) && 9178 (SEQ_LEQ(th->th_ack, tp->iss) || 9179 SEQ_GT(th->th_ack, tp->snd_max))) { 9180 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9181 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9182 return (1); 9183 } 9184 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 9185 TCP_PROBE5(connect__refused, NULL, tp, 9186 mtod(m, const char *), tp, th); 9187 tp = tcp_drop(tp, ECONNREFUSED); 9188 ctf_do_drop(m, tp); 9189 return (1); 9190 } 9191 if (thflags & TH_RST) { 9192 ctf_do_drop(m, tp); 9193 return (1); 9194 } 9195 if (!(thflags & TH_SYN)) { 9196 ctf_do_drop(m, tp); 9197 return (1); 9198 } 9199 tp->irs = th->th_seq; 9200 tcp_rcvseqinit(tp); 9201 rack = (struct tcp_rack *)tp->t_fb_ptr; 9202 if (thflags & TH_ACK) { 9203 int tfo_partial = 0; 9204 9205 KMOD_TCPSTAT_INC(tcps_connects); 9206 soisconnected(so); 9207 #ifdef MAC 9208 mac_socketpeer_set_from_mbuf(m, so); 9209 #endif 9210 /* Do window scaling on this connection? */ 9211 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9212 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9213 tp->rcv_scale = tp->request_r_scale; 9214 } 9215 tp->rcv_adv += min(tp->rcv_wnd, 9216 TCP_MAXWIN << tp->rcv_scale); 9217 /* 9218 * If not all the data that was sent in the TFO SYN 9219 * has been acked, resend the remainder right away. 9220 */ 9221 if (IS_FASTOPEN(tp->t_flags) && 9222 (tp->snd_una != tp->snd_max)) { 9223 tp->snd_nxt = th->th_ack; 9224 tfo_partial = 1; 9225 } 9226 /* 9227 * If there's data, delay ACK; if there's also a FIN ACKNOW 9228 * will be turned on later. 9229 */ 9230 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 9231 rack_timer_cancel(tp, rack, 9232 rack->r_ctl.rc_rcvtime, __LINE__); 9233 tp->t_flags |= TF_DELACK; 9234 } else { 9235 rack->r_wanted_output = 1; 9236 tp->t_flags |= TF_ACKNOW; 9237 rack->rc_dack_toggle = 0; 9238 } 9239 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 9240 (V_tcp_do_ecn == 1)) { 9241 tp->t_flags2 |= TF2_ECN_PERMIT; 9242 KMOD_TCPSTAT_INC(tcps_ecn_shs); 9243 } 9244 if (SEQ_GT(th->th_ack, tp->snd_una)) { 9245 /* 9246 * We advance snd_una for the 9247 * fast open case. If th_ack is 9248 * acknowledging data beyond 9249 * snd_una we can't just call 9250 * ack-processing since the 9251 * data stream in our send-map 9252 * will start at snd_una + 1 (one 9253 * beyond the SYN). If its just 9254 * equal we don't need to do that 9255 * and there is no send_map. 9256 */ 9257 tp->snd_una++; 9258 } 9259 /* 9260 * Received <SYN,ACK> in SYN_SENT[*] state. 
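* (SYN_SENT* denotes that the user has already closed, i.e.
* TF_NEEDFIN is set.)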
Transitions: 9261 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 9262 */ 9263 tp->t_starttime = ticks; 9264 if (tp->t_flags & TF_NEEDFIN) { 9265 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9266 tp->t_flags &= ~TF_NEEDFIN; 9267 thflags &= ~TH_SYN; 9268 } else { 9269 tcp_state_change(tp, TCPS_ESTABLISHED); 9270 TCP_PROBE5(connect__established, NULL, tp, 9271 mtod(m, const char *), tp, th); 9272 rack_cc_conn_init(tp); 9273 } 9274 } else { 9275 /* 9276 * Received initial SYN in SYN-SENT[*] state => simultaneous 9277 * open. If segment contains CC option and there is a 9278 * cached CC, apply TAO test. If it succeeds, connection is * 9279 * half-synchronized. Otherwise, do 3-way handshake: 9280 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 9281 * there was no CC option, clear cached CC value. 9282 */ 9283 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 9284 tcp_state_change(tp, TCPS_SYN_RECEIVED); 9285 } 9286 INP_WLOCK_ASSERT(tp->t_inpcb); 9287 /* 9288 * Advance th->th_seq to correspond to first data byte. If data, 9289 * trim to stay within window, dropping FIN if necessary. 9290 */ 9291 th->th_seq++; 9292 if (tlen > tp->rcv_wnd) { 9293 todrop = tlen - tp->rcv_wnd; 9294 m_adj(m, -todrop); 9295 tlen = tp->rcv_wnd; 9296 thflags &= ~TH_FIN; 9297 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 9298 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 9299 } 9300 tp->snd_wl1 = th->th_seq - 1; 9301 tp->rcv_up = th->th_seq; 9302 /* 9303 * Client side of transaction: already sent SYN and data. If the 9304 * remote host used T/TCP to validate the SYN, our data will be 9305 * ACK'd; if so, enter normal data segment processing in the middle 9306 * of step 5, ack processing. Otherwise, goto step 6. 9307 */ 9308 if (thflags & TH_ACK) { 9309 /* For syn-sent we need to possibly update the rtt */ 9310 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9311 uint32_t t; 9312 9313 t = tcp_ts_getticks() - to->to_tsecr; 9314 if (!tp->t_rttlow || tp->t_rttlow > t) 9315 tp->t_rttlow = t; 9316 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9317 tcp_rack_xmit_timer_commit(rack, tp); 9318 } 9319 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 9320 return (ret_val); 9321 /* We may have changed to FIN_WAIT_1 above */ 9322 if (tp->t_state == TCPS_FIN_WAIT_1) { 9323 /* 9324 * In FIN_WAIT_1 STATE in addition to the processing 9325 * for the ESTABLISHED state if our FIN is now 9326 * acknowledged then enter FIN_WAIT_2. 9327 */ 9328 if (ourfinisacked) { 9329 /* 9330 * If we can't receive any more data, then 9331 * closing user can proceed. Starting the 9332 * timer is contrary to the specification, 9333 * but if we don't get a FIN we'll hang 9334 * forever. 9335 * 9336 * XXXjl: we should release the tp also, and 9337 * use a compressed state. 9338 */ 9339 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9340 soisdisconnected(so); 9341 tcp_timer_activate(tp, TT_2MSL, 9342 (tcp_fast_finwait2_recycle ? 9343 tcp_finwait2_timeout : 9344 TP_MAXIDLE(tp))); 9345 } 9346 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9347 } 9348 } 9349 } 9350 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9351 tiwin, thflags, nxt_pkt)); 9352 } 9353 9354 /* 9355 * Return value of 1, the TCB is unlocked and most 9356 * likely gone, return value of 0, the TCP is still 9357 * locked. 
9358 */ 9359 static int 9360 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 9361 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9362 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9363 { 9364 struct tcp_rack *rack; 9365 int32_t ret_val = 0; 9366 int32_t ourfinisacked = 0; 9367 9368 ctf_calc_rwin(so, tp); 9369 if ((thflags & TH_ACK) && 9370 (SEQ_LEQ(th->th_ack, tp->snd_una) || 9371 SEQ_GT(th->th_ack, tp->snd_max))) { 9372 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9373 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9374 return (1); 9375 } 9376 rack = (struct tcp_rack *)tp->t_fb_ptr; 9377 if (IS_FASTOPEN(tp->t_flags)) { 9378 /* 9379 * When a TFO connection is in SYN_RECEIVED, the 9380 * only valid packets are the initial SYN, a 9381 * retransmit/copy of the initial SYN (possibly with 9382 * a subset of the original data), a valid ACK, a 9383 * FIN, or a RST. 9384 */ 9385 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9386 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9387 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9388 return (1); 9389 } else if (thflags & TH_SYN) { 9390 /* non-initial SYN is ignored */ 9391 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9392 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9393 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9394 ctf_do_drop(m, NULL); 9395 return (0); 9396 } 9397 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9398 ctf_do_drop(m, NULL); 9399 return (0); 9400 } 9401 } 9402 if ((thflags & TH_RST) || 9403 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9404 return (ctf_process_rst(m, th, so, tp)); 9405 /* 9406 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9407 * it's less than ts_recent, drop it. 9408 */ 9409 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9410 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9411 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9412 return (ret_val); 9413 } 9414 /* 9415 * In the SYN-RECEIVED state, validate that the packet belongs to 9416 * this connection before trimming the data to fit the receive 9417 * window. Check the sequence number versus IRS since we know the 9418 * sequence numbers haven't wrapped. This is a partial fix for the 9419 * "LAND" DoS attack. 9420 */ 9421 if (SEQ_LT(th->th_seq, tp->irs)) { 9422 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9423 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9424 return (1); 9425 } 9426 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9427 return (ret_val); 9428 } 9429 /* 9430 * If last ACK falls within this segment's sequence numbers, record 9431 * its timestamp. NOTE: 1) That the test incorporates suggestions 9432 * from the latest proposal of the tcplw@cray.com list (Braden 9433 * 1993/04/26). 2) That updating only on newer timestamps interferes 9434 * with our earlier PAWS tests, so this check should be solely 9435 * predicated on the sequence space of this segment. 3) That we 9436 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9437 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9438 * SEG.Len, This modified check allows us to overcome RFC1323's 9439 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9440 * p.869. In such cases, we can still calculate the RTT correctly 9441 * when RCV.NXT == Last.ACK.Sent. 
9442 */ 9443 if ((to->to_flags & TOF_TS) != 0 && 9444 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9445 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9446 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9447 tp->ts_recent_age = tcp_ts_getticks(); 9448 tp->ts_recent = to->to_tsval; 9449 } 9450 tp->snd_wnd = tiwin; 9451 /* 9452 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9453 * is on (half-synchronized state), then queue data for later 9454 * processing; else drop segment and return. 9455 */ 9456 if ((thflags & TH_ACK) == 0) { 9457 if (IS_FASTOPEN(tp->t_flags)) { 9458 rack_cc_conn_init(tp); 9459 } 9460 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9461 tiwin, thflags, nxt_pkt)); 9462 } 9463 KMOD_TCPSTAT_INC(tcps_connects); 9464 soisconnected(so); 9465 /* Do window scaling? */ 9466 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9467 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9468 tp->rcv_scale = tp->request_r_scale; 9469 } 9470 /* 9471 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9472 * FIN-WAIT-1 9473 */ 9474 tp->t_starttime = ticks; 9475 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9476 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9477 tp->t_tfo_pending = NULL; 9478 } 9479 if (tp->t_flags & TF_NEEDFIN) { 9480 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9481 tp->t_flags &= ~TF_NEEDFIN; 9482 } else { 9483 tcp_state_change(tp, TCPS_ESTABLISHED); 9484 TCP_PROBE5(accept__established, NULL, tp, 9485 mtod(m, const char *), tp, th); 9486 /* 9487 * TFO connections call cc_conn_init() during SYN 9488 * processing. Calling it again here for such connections 9489 * is not harmless as it would undo the snd_cwnd reduction 9490 * that occurs when a TFO SYN|ACK is retransmitted. 9491 */ 9492 if (!IS_FASTOPEN(tp->t_flags)) 9493 rack_cc_conn_init(tp); 9494 } 9495 /* 9496 * Account for the ACK of our SYN prior to 9497 * regular ACK processing below, except for 9498 * simultaneous SYN, which is handled later. 9499 */ 9500 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9501 tp->snd_una++; 9502 /* 9503 * If segment contains data or ACK, will call tcp_reass() later; if 9504 * not, do so now to pass queued data to user. 9505 */ 9506 if (tlen == 0 && (thflags & TH_FIN) == 0) 9507 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9508 (struct mbuf *)0); 9509 tp->snd_wl1 = th->th_seq - 1; 9510 /* For syn-recv we need to possibly update the rtt */ 9511 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9512 uint32_t t; 9513 9514 t = tcp_ts_getticks() - to->to_tsecr; 9515 if (!tp->t_rttlow || tp->t_rttlow > t) 9516 tp->t_rttlow = t; 9517 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9518 tcp_rack_xmit_timer_commit(rack, tp); 9519 } 9520 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9521 return (ret_val); 9522 } 9523 if (tp->t_state == TCPS_FIN_WAIT_1) { 9524 /* We could have went to FIN_WAIT_1 (or EST) above */ 9525 /* 9526 * In FIN_WAIT_1 STATE in addition to the processing for the 9527 * ESTABLISHED state if our FIN is now acknowledged then 9528 * enter FIN_WAIT_2. 9529 */ 9530 if (ourfinisacked) { 9531 /* 9532 * If we can't receive any more data, then closing 9533 * user can proceed. Starting the timer is contrary 9534 * to the specification, but if we don't get a FIN 9535 * we'll hang forever. 9536 * 9537 * XXXjl: we should release the tp also, and use a 9538 * compressed state. 
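* For now we just arm the 2MSL timer below, using the finwait2
* recycle timeout when tcp_fast_finwait2_recycle is set and
* TP_MAXIDLE() otherwise.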
9539 */ 9540 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9541 soisdisconnected(so); 9542 tcp_timer_activate(tp, TT_2MSL, 9543 (tcp_fast_finwait2_recycle ? 9544 tcp_finwait2_timeout : 9545 TP_MAXIDLE(tp))); 9546 } 9547 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9548 } 9549 } 9550 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9551 tiwin, thflags, nxt_pkt)); 9552 } 9553 9554 /* 9555 * Return value of 1, the TCB is unlocked and most 9556 * likely gone, return value of 0, the TCP is still 9557 * locked. 9558 */ 9559 static int 9560 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9561 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9562 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9563 { 9564 int32_t ret_val = 0; 9565 struct tcp_rack *rack; 9566 9567 /* 9568 * Header prediction: check for the two common cases of a 9569 * uni-directional data xfer. If the packet has no control flags, 9570 * is in-sequence, the window didn't change and we're not 9571 * retransmitting, it's a candidate. If the length is zero and the 9572 * ack moved forward, we're the sender side of the xfer. Just free 9573 * the data acked & wake any higher level process that was blocked 9574 * waiting for space. If the length is non-zero and the ack didn't 9575 * move, we're the receiver side. If we're getting packets in-order 9576 * (the reassembly queue is empty), add the data toc The socket 9577 * buffer and note that we need a delayed ack. Make sure that the 9578 * hidden state-flags are also off. Since we check for 9579 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 9580 */ 9581 rack = (struct tcp_rack *)tp->t_fb_ptr; 9582 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9583 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 9584 __predict_true(SEGQ_EMPTY(tp)) && 9585 __predict_true(th->th_seq == tp->rcv_nxt)) { 9586 if (tlen == 0) { 9587 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9588 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 9589 return (0); 9590 } 9591 } else { 9592 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9593 tiwin, nxt_pkt, iptos)) { 9594 return (0); 9595 } 9596 } 9597 } 9598 ctf_calc_rwin(so, tp); 9599 9600 if ((thflags & TH_RST) || 9601 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9602 return (ctf_process_rst(m, th, so, tp)); 9603 9604 /* 9605 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9606 * synchronized state. 9607 */ 9608 if (thflags & TH_SYN) { 9609 ctf_challenge_ack(m, th, tp, &ret_val); 9610 return (ret_val); 9611 } 9612 /* 9613 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9614 * it's less than ts_recent, drop it. 9615 */ 9616 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9617 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9618 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9619 return (ret_val); 9620 } 9621 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9622 return (ret_val); 9623 } 9624 /* 9625 * If last ACK falls within this segment's sequence numbers, record 9626 * its timestamp. NOTE: 1) That the test incorporates suggestions 9627 * from the latest proposal of the tcplw@cray.com list (Braden 9628 * 1993/04/26). 2) That updating only on newer timestamps interferes 9629 * with our earlier PAWS tests, so this check should be solely 9630 * predicated on the sequence space of this segment. 
3) That we 9631 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9632 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9633 * SEG.Len, This modified check allows us to overcome RFC1323's 9634 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9635 * p.869. In such cases, we can still calculate the RTT correctly 9636 * when RCV.NXT == Last.ACK.Sent. 9637 */ 9638 if ((to->to_flags & TOF_TS) != 0 && 9639 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9640 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9641 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9642 tp->ts_recent_age = tcp_ts_getticks(); 9643 tp->ts_recent = to->to_tsval; 9644 } 9645 /* 9646 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9647 * is on (half-synchronized state), then queue data for later 9648 * processing; else drop segment and return. 9649 */ 9650 if ((thflags & TH_ACK) == 0) { 9651 if (tp->t_flags & TF_NEEDSYN) { 9652 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9653 tiwin, thflags, nxt_pkt)); 9654 9655 } else if (tp->t_flags & TF_ACKNOW) { 9656 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9657 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 9658 return (ret_val); 9659 } else { 9660 ctf_do_drop(m, NULL); 9661 return (0); 9662 } 9663 } 9664 /* 9665 * Ack processing. 9666 */ 9667 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9668 return (ret_val); 9669 } 9670 if (sbavail(&so->so_snd)) { 9671 if (ctf_progress_timeout_check(tp, true)) { 9672 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 9673 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9674 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9675 return (1); 9676 } 9677 } 9678 /* State changes only happen in rack_process_data() */ 9679 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9680 tiwin, thflags, nxt_pkt)); 9681 } 9682 9683 /* 9684 * Return value of 1, the TCB is unlocked and most 9685 * likely gone, return value of 0, the TCP is still 9686 * locked. 9687 */ 9688 static int 9689 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9690 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9691 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9692 { 9693 int32_t ret_val = 0; 9694 9695 ctf_calc_rwin(so, tp); 9696 if ((thflags & TH_RST) || 9697 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9698 return (ctf_process_rst(m, th, so, tp)); 9699 /* 9700 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9701 * synchronized state. 9702 */ 9703 if (thflags & TH_SYN) { 9704 ctf_challenge_ack(m, th, tp, &ret_val); 9705 return (ret_val); 9706 } 9707 /* 9708 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9709 * it's less than ts_recent, drop it. 9710 */ 9711 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9712 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9713 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9714 return (ret_val); 9715 } 9716 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9717 return (ret_val); 9718 } 9719 /* 9720 * If last ACK falls within this segment's sequence numbers, record 9721 * its timestamp. NOTE: 1) That the test incorporates suggestions 9722 * from the latest proposal of the tcplw@cray.com list (Braden 9723 * 1993/04/26). 
2) That updating only on newer timestamps interferes 9724 * with our earlier PAWS tests, so this check should be solely 9725 * predicated on the sequence space of this segment. 3) That we 9726 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9727 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9728 * SEG.Len, This modified check allows us to overcome RFC1323's 9729 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9730 * p.869. In such cases, we can still calculate the RTT correctly 9731 * when RCV.NXT == Last.ACK.Sent. 9732 */ 9733 if ((to->to_flags & TOF_TS) != 0 && 9734 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9735 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9736 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9737 tp->ts_recent_age = tcp_ts_getticks(); 9738 tp->ts_recent = to->to_tsval; 9739 } 9740 /* 9741 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9742 * is on (half-synchronized state), then queue data for later 9743 * processing; else drop segment and return. 9744 */ 9745 if ((thflags & TH_ACK) == 0) { 9746 if (tp->t_flags & TF_NEEDSYN) { 9747 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9748 tiwin, thflags, nxt_pkt)); 9749 9750 } else if (tp->t_flags & TF_ACKNOW) { 9751 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9752 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9753 return (ret_val); 9754 } else { 9755 ctf_do_drop(m, NULL); 9756 return (0); 9757 } 9758 } 9759 /* 9760 * Ack processing. 9761 */ 9762 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9763 return (ret_val); 9764 } 9765 if (sbavail(&so->so_snd)) { 9766 if (ctf_progress_timeout_check(tp, true)) { 9767 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9768 tp, tick, PROGRESS_DROP, __LINE__); 9769 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9770 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9771 return (1); 9772 } 9773 } 9774 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9775 tiwin, thflags, nxt_pkt)); 9776 } 9777 9778 static int 9779 rack_check_data_after_close(struct mbuf *m, 9780 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 9781 { 9782 struct tcp_rack *rack; 9783 9784 rack = (struct tcp_rack *)tp->t_fb_ptr; 9785 if (rack->rc_allow_data_af_clo == 0) { 9786 close_now: 9787 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9788 /* tcp_close will kill the inp pre-log the Reset */ 9789 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9790 tp = tcp_close(tp); 9791 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9792 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9793 return (1); 9794 } 9795 if (sbavail(&so->so_snd) == 0) 9796 goto close_now; 9797 /* Ok we allow data that is ignored and a followup reset */ 9798 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9799 tp->rcv_nxt = th->th_seq + *tlen; 9800 tp->t_flags2 |= TF2_DROP_AF_DATA; 9801 rack->r_wanted_output = 1; 9802 *tlen = 0; 9803 return (0); 9804 } 9805 9806 /* 9807 * Return value of 1, the TCB is unlocked and most 9808 * likely gone, return value of 0, the TCP is still 9809 * locked. 
9810 */ 9811 static int 9812 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9813 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9814 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9815 { 9816 int32_t ret_val = 0; 9817 int32_t ourfinisacked = 0; 9818 9819 ctf_calc_rwin(so, tp); 9820 9821 if ((thflags & TH_RST) || 9822 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9823 return (ctf_process_rst(m, th, so, tp)); 9824 /* 9825 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9826 * synchronized state. 9827 */ 9828 if (thflags & TH_SYN) { 9829 ctf_challenge_ack(m, th, tp, &ret_val); 9830 return (ret_val); 9831 } 9832 /* 9833 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9834 * it's less than ts_recent, drop it. 9835 */ 9836 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9837 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9838 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9839 return (ret_val); 9840 } 9841 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9842 return (ret_val); 9843 } 9844 /* 9845 * If new data are received on a connection after the user processes 9846 * are gone, then RST the other end. 9847 */ 9848 if ((so->so_state & SS_NOFDREF) && tlen) { 9849 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9850 return (1); 9851 } 9852 /* 9853 * If last ACK falls within this segment's sequence numbers, record 9854 * its timestamp. NOTE: 1) That the test incorporates suggestions 9855 * from the latest proposal of the tcplw@cray.com list (Braden 9856 * 1993/04/26). 2) That updating only on newer timestamps interferes 9857 * with our earlier PAWS tests, so this check should be solely 9858 * predicated on the sequence space of this segment. 3) That we 9859 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9860 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9861 * SEG.Len, This modified check allows us to overcome RFC1323's 9862 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9863 * p.869. In such cases, we can still calculate the RTT correctly 9864 * when RCV.NXT == Last.ACK.Sent. 9865 */ 9866 if ((to->to_flags & TOF_TS) != 0 && 9867 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9868 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9869 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9870 tp->ts_recent_age = tcp_ts_getticks(); 9871 tp->ts_recent = to->to_tsval; 9872 } 9873 /* 9874 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9875 * is on (half-synchronized state), then queue data for later 9876 * processing; else drop segment and return. 9877 */ 9878 if ((thflags & TH_ACK) == 0) { 9879 if (tp->t_flags & TF_NEEDSYN) { 9880 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9881 tiwin, thflags, nxt_pkt)); 9882 } else if (tp->t_flags & TF_ACKNOW) { 9883 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9884 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9885 return (ret_val); 9886 } else { 9887 ctf_do_drop(m, NULL); 9888 return (0); 9889 } 9890 } 9891 /* 9892 * Ack processing. 9893 */ 9894 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9895 return (ret_val); 9896 } 9897 if (ourfinisacked) { 9898 /* 9899 * If we can't receive any more data, then closing user can 9900 * proceed. Starting the timer is contrary to the 9901 * specification, but if we don't get a FIN we'll hang 9902 * forever. 
9903 * 9904 * XXXjl: we should release the tp also, and use a 9905 * compressed state. 9906 */ 9907 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9908 soisdisconnected(so); 9909 tcp_timer_activate(tp, TT_2MSL, 9910 (tcp_fast_finwait2_recycle ? 9911 tcp_finwait2_timeout : 9912 TP_MAXIDLE(tp))); 9913 } 9914 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9915 } 9916 if (sbavail(&so->so_snd)) { 9917 if (ctf_progress_timeout_check(tp, true)) { 9918 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9919 tp, tick, PROGRESS_DROP, __LINE__); 9920 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9921 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9922 return (1); 9923 } 9924 } 9925 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9926 tiwin, thflags, nxt_pkt)); 9927 } 9928 9929 /* 9930 * Return value of 1, the TCB is unlocked and most 9931 * likely gone, return value of 0, the TCP is still 9932 * locked. 9933 */ 9934 static int 9935 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 9936 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9937 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9938 { 9939 int32_t ret_val = 0; 9940 int32_t ourfinisacked = 0; 9941 9942 ctf_calc_rwin(so, tp); 9943 9944 if ((thflags & TH_RST) || 9945 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9946 return (ctf_process_rst(m, th, so, tp)); 9947 /* 9948 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9949 * synchronized state. 9950 */ 9951 if (thflags & TH_SYN) { 9952 ctf_challenge_ack(m, th, tp, &ret_val); 9953 return (ret_val); 9954 } 9955 /* 9956 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9957 * it's less than ts_recent, drop it. 9958 */ 9959 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9960 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9961 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9962 return (ret_val); 9963 } 9964 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9965 return (ret_val); 9966 } 9967 /* 9968 * If new data are received on a connection after the user processes 9969 * are gone, then RST the other end. 9970 */ 9971 if ((so->so_state & SS_NOFDREF) && tlen) { 9972 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9973 return (1); 9974 } 9975 /* 9976 * If last ACK falls within this segment's sequence numbers, record 9977 * its timestamp. NOTE: 1) That the test incorporates suggestions 9978 * from the latest proposal of the tcplw@cray.com list (Braden 9979 * 1993/04/26). 2) That updating only on newer timestamps interferes 9980 * with our earlier PAWS tests, so this check should be solely 9981 * predicated on the sequence space of this segment. 3) That we 9982 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9983 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9984 * SEG.Len, This modified check allows us to overcome RFC1323's 9985 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9986 * p.869. In such cases, we can still calculate the RTT correctly 9987 * when RCV.NXT == Last.ACK.Sent. 
9988 */ 9989 if ((to->to_flags & TOF_TS) != 0 && 9990 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9991 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9992 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9993 tp->ts_recent_age = tcp_ts_getticks(); 9994 tp->ts_recent = to->to_tsval; 9995 } 9996 /* 9997 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9998 * is on (half-synchronized state), then queue data for later 9999 * processing; else drop segment and return. 10000 */ 10001 if ((thflags & TH_ACK) == 0) { 10002 if (tp->t_flags & TF_NEEDSYN) { 10003 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10004 tiwin, thflags, nxt_pkt)); 10005 } else if (tp->t_flags & TF_ACKNOW) { 10006 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10007 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 10008 return (ret_val); 10009 } else { 10010 ctf_do_drop(m, NULL); 10011 return (0); 10012 } 10013 } 10014 /* 10015 * Ack processing. 10016 */ 10017 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10018 return (ret_val); 10019 } 10020 if (ourfinisacked) { 10021 tcp_twstart(tp); 10022 m_freem(m); 10023 return (1); 10024 } 10025 if (sbavail(&so->so_snd)) { 10026 if (ctf_progress_timeout_check(tp, true)) { 10027 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10028 tp, tick, PROGRESS_DROP, __LINE__); 10029 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10030 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10031 return (1); 10032 } 10033 } 10034 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10035 tiwin, thflags, nxt_pkt)); 10036 } 10037 10038 /* 10039 * Return value of 1, the TCB is unlocked and most 10040 * likely gone, return value of 0, the TCP is still 10041 * locked. 10042 */ 10043 static int 10044 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10045 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10046 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10047 { 10048 int32_t ret_val = 0; 10049 int32_t ourfinisacked = 0; 10050 10051 ctf_calc_rwin(so, tp); 10052 10053 if ((thflags & TH_RST) || 10054 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10055 return (ctf_process_rst(m, th, so, tp)); 10056 /* 10057 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10058 * synchronized state. 10059 */ 10060 if (thflags & TH_SYN) { 10061 ctf_challenge_ack(m, th, tp, &ret_val); 10062 return (ret_val); 10063 } 10064 /* 10065 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10066 * it's less than ts_recent, drop it. 10067 */ 10068 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10069 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10070 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10071 return (ret_val); 10072 } 10073 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10074 return (ret_val); 10075 } 10076 /* 10077 * If new data are received on a connection after the user processes 10078 * are gone, then RST the other end. 10079 */ 10080 if ((so->so_state & SS_NOFDREF) && tlen) { 10081 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10082 return (1); 10083 } 10084 /* 10085 * If last ACK falls within this segment's sequence numbers, record 10086 * its timestamp. NOTE: 1) That the test incorporates suggestions 10087 * from the latest proposal of the tcplw@cray.com list (Braden 10088 * 1993/04/26). 
2) That updating only on newer timestamps interferes 10089 * with our earlier PAWS tests, so this check should be solely 10090 * predicated on the sequence space of this segment. 3) That we 10091 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10092 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10093 * SEG.Len, This modified check allows us to overcome RFC1323's 10094 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10095 * p.869. In such cases, we can still calculate the RTT correctly 10096 * when RCV.NXT == Last.ACK.Sent. 10097 */ 10098 if ((to->to_flags & TOF_TS) != 0 && 10099 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10100 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10101 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10102 tp->ts_recent_age = tcp_ts_getticks(); 10103 tp->ts_recent = to->to_tsval; 10104 } 10105 /* 10106 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10107 * is on (half-synchronized state), then queue data for later 10108 * processing; else drop segment and return. 10109 */ 10110 if ((thflags & TH_ACK) == 0) { 10111 if (tp->t_flags & TF_NEEDSYN) { 10112 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10113 tiwin, thflags, nxt_pkt)); 10114 } else if (tp->t_flags & TF_ACKNOW) { 10115 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10116 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10117 return (ret_val); 10118 } else { 10119 ctf_do_drop(m, NULL); 10120 return (0); 10121 } 10122 } 10123 /* 10124 * case TCPS_LAST_ACK: Ack processing. 10125 */ 10126 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10127 return (ret_val); 10128 } 10129 if (ourfinisacked) { 10130 tp = tcp_close(tp); 10131 ctf_do_drop(m, tp); 10132 return (1); 10133 } 10134 if (sbavail(&so->so_snd)) { 10135 if (ctf_progress_timeout_check(tp, true)) { 10136 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10137 tp, tick, PROGRESS_DROP, __LINE__); 10138 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10139 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10140 return (1); 10141 } 10142 } 10143 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10144 tiwin, thflags, nxt_pkt)); 10145 } 10146 10147 /* 10148 * Return value of 1, the TCB is unlocked and most 10149 * likely gone, return value of 0, the TCP is still 10150 * locked. 10151 */ 10152 static int 10153 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 10154 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10155 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10156 { 10157 int32_t ret_val = 0; 10158 int32_t ourfinisacked = 0; 10159 10160 ctf_calc_rwin(so, tp); 10161 10162 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 10163 if ((thflags & TH_RST) || 10164 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10165 return (ctf_process_rst(m, th, so, tp)); 10166 /* 10167 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10168 * synchronized state. 10169 */ 10170 if (thflags & TH_SYN) { 10171 ctf_challenge_ack(m, th, tp, &ret_val); 10172 return (ret_val); 10173 } 10174 /* 10175 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10176 * it's less than ts_recent, drop it. 
10177 */ 10178 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10179 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10180 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10181 return (ret_val); 10182 } 10183 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10184 return (ret_val); 10185 } 10186 /* 10187 * If new data are received on a connection after the user processes 10188 * are gone, then RST the other end. 10189 */ 10190 if ((so->so_state & SS_NOFDREF) && 10191 tlen) { 10192 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10193 return (1); 10194 } 10195 /* 10196 * If last ACK falls within this segment's sequence numbers, record 10197 * its timestamp. NOTE: 1) That the test incorporates suggestions 10198 * from the latest proposal of the tcplw@cray.com list (Braden 10199 * 1993/04/26). 2) That updating only on newer timestamps interferes 10200 * with our earlier PAWS tests, so this check should be solely 10201 * predicated on the sequence space of this segment. 3) That we 10202 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10203 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10204 * SEG.Len, This modified check allows us to overcome RFC1323's 10205 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10206 * p.869. In such cases, we can still calculate the RTT correctly 10207 * when RCV.NXT == Last.ACK.Sent. 10208 */ 10209 if ((to->to_flags & TOF_TS) != 0 && 10210 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10211 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10212 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10213 tp->ts_recent_age = tcp_ts_getticks(); 10214 tp->ts_recent = to->to_tsval; 10215 } 10216 /* 10217 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10218 * is on (half-synchronized state), then queue data for later 10219 * processing; else drop segment and return. 10220 */ 10221 if ((thflags & TH_ACK) == 0) { 10222 if (tp->t_flags & TF_NEEDSYN) { 10223 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10224 tiwin, thflags, nxt_pkt)); 10225 } else if (tp->t_flags & TF_ACKNOW) { 10226 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10227 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10228 return (ret_val); 10229 } else { 10230 ctf_do_drop(m, NULL); 10231 return (0); 10232 } 10233 } 10234 /* 10235 * Ack processing. 
10236 */ 10237 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10238 return (ret_val); 10239 } 10240 if (sbavail(&so->so_snd)) { 10241 if (ctf_progress_timeout_check(tp, true)) { 10242 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10243 tp, tick, PROGRESS_DROP, __LINE__); 10244 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10245 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10246 return (1); 10247 } 10248 } 10249 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10250 tiwin, thflags, nxt_pkt)); 10251 } 10252 10253 static void inline 10254 rack_clear_rate_sample(struct tcp_rack *rack) 10255 { 10256 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 10257 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 10258 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 10259 } 10260 10261 static void 10262 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) 10263 { 10264 uint64_t bw_est, rate_wanted; 10265 int chged = 0; 10266 uint32_t user_max; 10267 10268 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 10269 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 10270 chged = 1; 10271 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 10272 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 10273 if (user_max != rack->r_ctl.rc_pace_max_segs) 10274 chged = 1; 10275 } 10276 if (rack->rc_force_max_seg) { 10277 rack->r_ctl.rc_pace_max_segs = user_max; 10278 } else if (rack->use_fixed_rate) { 10279 bw_est = rack_get_bw(rack); 10280 if ((rack->r_ctl.crte == NULL) || 10281 (bw_est != rack->r_ctl.crte->rate)) { 10282 rack->r_ctl.rc_pace_max_segs = user_max; 10283 } else { 10284 /* We are pacing right at the hardware rate */ 10285 uint32_t segsiz; 10286 10287 segsiz = min(ctf_fixed_maxseg(tp), 10288 rack->r_ctl.rc_pace_min_segs); 10289 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 10290 bw_est, segsiz, 0, 10291 rack->r_ctl.crte, NULL); 10292 } 10293 } else if (rack->rc_always_pace) { 10294 if (rack->r_ctl.gp_bw || 10295 #ifdef NETFLIX_PEAKRATE 10296 rack->rc_tp->t_maxpeakrate || 10297 #endif 10298 rack->r_ctl.init_rate) { 10299 /* We have a rate of some sort set */ 10300 uint32_t orig; 10301 10302 bw_est = rack_get_bw(rack); 10303 orig = rack->r_ctl.rc_pace_max_segs; 10304 rate_wanted = rack_get_output_bw(rack, bw_est, NULL); 10305 if (rate_wanted) { 10306 /* We have something */ 10307 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 10308 rate_wanted, 10309 ctf_fixed_maxseg(rack->rc_tp)); 10310 } else 10311 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 10312 if (orig != rack->r_ctl.rc_pace_max_segs) 10313 chged = 1; 10314 } else if ((rack->r_ctl.gp_bw == 0) && 10315 (rack->r_ctl.rc_pace_max_segs == 0)) { 10316 /* 10317 * If we have nothing limit us to bursting 10318 * out IW sized pieces. 10319 */ 10320 chged = 1; 10321 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 10322 } 10323 } 10324 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 10325 chged = 1; 10326 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 10327 } 10328 if (chged) 10329 rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); 10330 } 10331 10332 static int 10333 rack_init(struct tcpcb *tp) 10334 { 10335 struct tcp_rack *rack = NULL; 10336 struct rack_sendmap *insret; 10337 uint32_t iwin, snt, us_cts; 10338 10339 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 10340 if (tp->t_fb_ptr == NULL) { 10341 /* 10342 * We need to allocate memory but cant. 
The INP and INP_INFO 10343 * locks and they are recusive (happens during setup. So a 10344 * scheme to drop the locks fails :( 10345 * 10346 */ 10347 return (ENOMEM); 10348 } 10349 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 10350 10351 rack = (struct tcp_rack *)tp->t_fb_ptr; 10352 RB_INIT(&rack->r_ctl.rc_mtree); 10353 TAILQ_INIT(&rack->r_ctl.rc_free); 10354 TAILQ_INIT(&rack->r_ctl.rc_tmap); 10355 rack->rc_tp = tp; 10356 if (tp->t_inpcb) { 10357 rack->rc_inp = tp->t_inpcb; 10358 } 10359 /* Probably not needed but lets be sure */ 10360 rack_clear_rate_sample(rack); 10361 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 10362 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 10363 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 10364 if (use_rack_rr) 10365 rack->use_rack_rr = 1; 10366 if (V_tcp_delack_enabled) 10367 tp->t_delayed_ack = 1; 10368 else 10369 tp->t_delayed_ack = 0; 10370 if (rack_enable_shared_cwnd) 10371 rack->rack_enable_scwnd = 1; 10372 rack->rc_user_set_max_segs = rack_hptsi_segments; 10373 rack->rc_force_max_seg = 0; 10374 if (rack_use_imac_dack) 10375 rack->rc_dack_mode = 1; 10376 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 10377 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 10378 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 10379 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 10380 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 10381 rack->r_ctl.rc_early_recovery = rack_early_recovery; 10382 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 10383 rack->r_ctl.rc_highest_us_rtt = 0; 10384 if (rack_disable_prr) 10385 rack->rack_no_prr = 1; 10386 if (rack_gp_no_rec_chg) 10387 rack->rc_gp_no_rec_chg = 1; 10388 rack->rc_always_pace = rack_pace_every_seg; 10389 if (rack_enable_mqueue_for_nonpaced) 10390 rack->r_mbuf_queue = 1; 10391 else 10392 rack->r_mbuf_queue = 0; 10393 if (rack->r_mbuf_queue || rack->rc_always_pace) 10394 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 10395 else 10396 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10397 rack_set_pace_segments(tp, rack, __LINE__); 10398 if (rack_limits_scwnd) 10399 rack->r_limit_scw = 1; 10400 else 10401 rack->r_limit_scw = 0; 10402 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 10403 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 10404 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 10405 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 10406 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 10407 rack->r_ctl.rc_min_to = rack_min_to; 10408 microuptime(&rack->r_ctl.act_rcv_time); 10409 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10410 rack->r_running_late = 0; 10411 rack->r_running_early = 0; 10412 rack->rc_init_win = rack_default_init_window; 10413 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 10414 if (rack_do_dyn_mul) { 10415 /* When dynamic adjustment is on CA needs to start at 100% */ 10416 rack->rc_gp_dyn_mul = 1; 10417 if (rack_do_dyn_mul >= 100) 10418 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 10419 } else 10420 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 10421 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 10422 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 10423 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 10424 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 10425 rack_probertt_filter_life); 10426 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10427 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 10428 rack->r_ctl.rc_time_of_last_probertt = us_cts; 
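	/*
	 * Note: the probe-rtt bookkeeping above uses microsecond timestamps
	 * derived from act_rcv_time; rc_gp_min_rtt is a FILTER_TYPE_MIN time
	 * filter, so it retains the smallest RTT sample observed within the
	 * configured rack_probertt_filter_life window.
	 */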
10429 rack->r_ctl.rc_time_probertt_starts = 0; 10430 /* Do we force on detection? */ 10431 #ifdef NETFLIX_EXP_DETECTION 10432 if (tcp_force_detection) 10433 rack->do_detection = 1; 10434 else 10435 #endif 10436 rack->do_detection = 0; 10437 if (rack_non_rxt_use_cr) 10438 rack->rack_rec_nonrxt_use_cr = 1; 10439 if (tp->snd_una != tp->snd_max) { 10440 /* Create a send map for the current outstanding data */ 10441 struct rack_sendmap *rsm; 10442 10443 rsm = rack_alloc(rack); 10444 if (rsm == NULL) { 10445 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10446 tp->t_fb_ptr = NULL; 10447 return (ENOMEM); 10448 } 10449 rsm->r_flags = RACK_OVERMAX; 10450 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 10451 rsm->r_rtr_cnt = 1; 10452 rsm->r_rtr_bytes = 0; 10453 rsm->r_start = tp->snd_una; 10454 if (tp->t_flags & TF_SENTFIN) { 10455 rsm->r_end = tp->snd_max - 1; 10456 rsm->r_flags |= RACK_HAS_FIN; 10457 } else { 10458 rsm->r_end = tp->snd_max; 10459 } 10460 rsm->usec_orig_send = us_cts; 10461 rsm->r_dupack = 0; 10462 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10463 #ifdef INVARIANTS 10464 if (insret != NULL) { 10465 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 10466 insret, rack, rsm); 10467 } 10468 #endif 10469 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10470 rsm->r_in_tmap = 1; 10471 } 10472 /* Cancel the GP measurement in progress */ 10473 tp->t_flags &= ~TF_GPUTINPROG; 10474 if (SEQ_GT(tp->snd_max, tp->iss)) 10475 snt = tp->snd_max - tp->iss; 10476 else 10477 snt = 0; 10478 iwin = rc_init_window(rack); 10479 if (snt < iwin) { 10480 /* We are not past the initial window 10481 * so we need to make sure cwnd is 10482 * correct. 10483 */ 10484 if (tp->snd_cwnd < iwin) 10485 tp->snd_cwnd = iwin; 10486 /* 10487 * If we are within the initial window 10488 * we want ssthresh to be unlimited. Setting 10489 * it to the rwnd (which the default stack does 10490 * and older racks) is not really a good idea 10491 * since we want to be in SS and grow both the 10492 * cwnd and the rwnd (via dynamic rwnd growth). If 10493 * we set it to the rwnd then as the peer grows its 10494 * rwnd we will be stuck in CA and never hit SS. 10495 * 10496 * Its far better to raise it up high (this takes the 10497 * risk that there as been a loss already, probably 10498 * we should have an indicator in all stacks of loss 10499 * but we don't), but considering the normal use this 10500 * is a risk worth taking. The consequences of not 10501 * hitting SS are far worse than going one more time 10502 * into it early on (before we have sent even a IW). 10503 * It is highly unlikely that we will have had a loss 10504 * before getting the IW out. 10505 */ 10506 tp->snd_ssthresh = 0xffffffff; 10507 } 10508 rack_stop_all_timers(tp); 10509 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10510 rack_log_rtt_shrinks(rack, us_cts, 0, 10511 __LINE__, RACK_RTTS_INIT); 10512 return (0); 10513 } 10514 10515 static int 10516 rack_handoff_ok(struct tcpcb *tp) 10517 { 10518 if ((tp->t_state == TCPS_CLOSED) || 10519 (tp->t_state == TCPS_LISTEN)) { 10520 /* Sure no problem though it may not stick */ 10521 return (0); 10522 } 10523 if ((tp->t_state == TCPS_SYN_SENT) || 10524 (tp->t_state == TCPS_SYN_RECEIVED)) { 10525 /* 10526 * We really don't know if you support sack, 10527 * you have to get to ESTAB or beyond to tell. 
10528 */ 10529 return (EAGAIN); 10530 } 10531 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 10532 /* 10533 * Rack will only send a FIN after all data is acknowledged. 10534 * So in this case we have more data outstanding. We can't 10535 * switch stacks until either all data and only the FIN 10536 * is left (in which case rack_init() now knows how 10537 * to deal with that) <or> all is acknowledged and we 10538 * are only left with incoming data, though why you 10539 * would want to switch to rack after all data is acknowledged 10540 * I have no idea (rrs)! 10541 */ 10542 return (EAGAIN); 10543 } 10544 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 10545 return (0); 10546 } 10547 /* 10548 * If we reach here we don't do SACK on this connection so we can 10549 * never do rack. 10550 */ 10551 return (EINVAL); 10552 } 10553 10554 static void 10555 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10556 { 10557 if (tp->t_fb_ptr) { 10558 struct tcp_rack *rack; 10559 struct rack_sendmap *rsm, *nrsm, *rm; 10560 10561 rack = (struct tcp_rack *)tp->t_fb_ptr; 10562 #ifdef NETFLIX_SHARED_CWND 10563 if (rack->r_ctl.rc_scw) { 10564 uint32_t limit; 10565 10566 if (rack->r_limit_scw) 10567 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 10568 else 10569 limit = 0; 10570 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 10571 rack->r_ctl.rc_scw_index, 10572 limit); 10573 rack->r_ctl.rc_scw = NULL; 10574 } 10575 #endif 10576 /* rack does not use force data but other stacks may clear it */ 10577 tp->t_flags &= ~TF_FORCEDATA; 10578 if (tp->t_inpcb) { 10579 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10580 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10581 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 10582 } 10583 #ifdef TCP_BLACKBOX 10584 tcp_log_flowend(tp); 10585 #endif 10586 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 10587 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10588 #ifdef INVARIANTS 10589 if (rm != rsm) { 10590 panic("At fini, rack:%p rsm:%p rm:%p", 10591 rack, rsm, rm); 10592 } 10593 #endif 10594 uma_zfree(rack_zone, rsm); 10595 } 10596 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10597 while (rsm) { 10598 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 10599 uma_zfree(rack_zone, rsm); 10600 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10601 } 10602 rack->rc_free_cnt = 0; 10603 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10604 tp->t_fb_ptr = NULL; 10605 } 10606 /* Cancel the GP measurement in progress */ 10607 tp->t_flags &= ~TF_GPUTINPROG; 10608 /* Make sure snd_nxt is correctly set */ 10609 tp->snd_nxt = tp->snd_max; 10610 } 10611 10612 static void 10613 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 10614 { 10615 switch (tp->t_state) { 10616 case TCPS_SYN_SENT: 10617 rack->r_state = TCPS_SYN_SENT; 10618 rack->r_substate = rack_do_syn_sent; 10619 break; 10620 case TCPS_SYN_RECEIVED: 10621 rack->r_state = TCPS_SYN_RECEIVED; 10622 rack->r_substate = rack_do_syn_recv; 10623 break; 10624 case TCPS_ESTABLISHED: 10625 rack_set_pace_segments(tp, rack, __LINE__); 10626 rack->r_state = TCPS_ESTABLISHED; 10627 rack->r_substate = rack_do_established; 10628 break; 10629 case TCPS_CLOSE_WAIT: 10630 rack->r_state = TCPS_CLOSE_WAIT; 10631 rack->r_substate = rack_do_close_wait; 10632 break; 10633 case TCPS_FIN_WAIT_1: 10634 rack->r_state = TCPS_FIN_WAIT_1; 10635 rack->r_substate = rack_do_fin_wait_1; 10636 break; 10637 case TCPS_CLOSING: 10638 rack->r_state = TCPS_CLOSING; 10639 rack->r_substate = rack_do_closing; 
10640 break; 10641 case TCPS_LAST_ACK: 10642 rack->r_state = TCPS_LAST_ACK; 10643 rack->r_substate = rack_do_lastack; 10644 break; 10645 case TCPS_FIN_WAIT_2: 10646 rack->r_state = TCPS_FIN_WAIT_2; 10647 rack->r_substate = rack_do_fin_wait_2; 10648 break; 10649 case TCPS_LISTEN: 10650 case TCPS_CLOSED: 10651 case TCPS_TIME_WAIT: 10652 default: 10653 break; 10654 }; 10655 } 10656 10657 static void 10658 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 10659 { 10660 /* 10661 * We received an ack, and then did not 10662 * call send or were bounced out because the 10663 * hpts was running. Now a timer is up as well; is 10664 * it the right timer? 10665 */ 10666 struct rack_sendmap *rsm; 10667 int tmr_up; 10668 10669 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 10670 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 10671 return; 10672 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10673 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 10674 (tmr_up == PACE_TMR_RXT)) { 10675 /* Should be an RXT */ 10676 return; 10677 } 10678 if (rsm == NULL) { 10679 /* Nothing outstanding? */ 10680 if (tp->t_flags & TF_DELACK) { 10681 if (tmr_up == PACE_TMR_DELACK) 10682 /* We are supposed to have the delayed ack up and we do */ 10683 return; 10684 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 10685 /* 10686 * If we hit ENOBUFS then we would expect the possibility 10687 * of nothing outstanding and the RXT up (and the hptsi timer). 10688 */ 10689 return; 10690 } else if (((V_tcp_always_keepalive || 10691 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 10692 (tp->t_state <= TCPS_CLOSING)) && 10693 (tmr_up == PACE_TMR_KEEP) && 10694 (tp->snd_max == tp->snd_una)) { 10695 /* We should have the keepalive timer up and we do */ 10696 return; 10697 } 10698 } 10699 if (SEQ_GT(tp->snd_max, tp->snd_una) && 10700 ((tmr_up == PACE_TMR_TLP) || 10701 (tmr_up == PACE_TMR_RACK) || 10702 (tmr_up == PACE_TMR_RXT))) { 10703 /* 10704 * Either a Rack, TLP or RXT is fine if we 10705 * have outstanding data. 10706 */ 10707 return; 10708 } else if (tmr_up == PACE_TMR_DELACK) { 10709 /* 10710 * If the delayed ack was going to go off 10711 * before the rtx/tlp/rack timers were going to 10712 * expire, then that would be the timer in control. 10713 * Note we don't check the time here, trusting that the 10714 * code is correct. 10715 */ 10716 return; 10717 } 10718 /* 10719 * Ok, the timer originally started is not what we want now. 10720 * We will force the hpts to be stopped, if it is running, and restart 10721 * with the slot set to what was in the saved slot.
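 * If a pacing slot (PACE_PKT_OUTPUT) was pending and has not yet
 * expired, the unused time is credited to rc_agg_early below before
 * the connection is pulled off the hpts and the timer is rebuilt.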
10722 */ 10723 if (rack->rc_inp->inp_in_hpts) { 10724 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 10725 uint32_t us_cts; 10726 10727 us_cts = tcp_get_usecs(NULL); 10728 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 10729 rack->r_early = 1; 10730 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 10731 } 10732 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 10733 } 10734 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 10735 } 10736 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10737 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10738 } 10739 10740 static int 10741 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 10742 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 10743 int32_t nxt_pkt, struct timeval *tv) 10744 { 10745 int32_t thflags, retval, did_out = 0; 10746 int32_t way_out = 0; 10747 uint32_t cts; 10748 uint32_t tiwin; 10749 struct timespec ts; 10750 struct tcpopt to; 10751 struct tcp_rack *rack; 10752 struct rack_sendmap *rsm; 10753 int32_t prev_state = 0; 10754 uint32_t us_cts; 10755 /* 10756 * tv passed from common code is from either M_TSTMP_LRO or 10757 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The 10758 * rack_pacing stack assumes tv always refers to 'now', so we overwrite 10759 * tv here to guarantee that. 10760 */ 10761 if (m->m_flags & M_TSTMP_LRO) 10762 tcp_get_usecs(tv); 10763 10764 cts = tcp_tv_to_mssectick(tv); 10765 rack = (struct tcp_rack *)tp->t_fb_ptr; 10766 10767 if ((m->m_flags & M_TSTMP) || 10768 (m->m_flags & M_TSTMP_LRO)) { 10769 mbuf_tstmp2timespec(m, &ts); 10770 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 10771 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 10772 } else 10773 rack->r_ctl.act_rcv_time = *tv; 10774 kern_prefetch(rack, &prev_state); 10775 prev_state = 0; 10776 thflags = th->th_flags; 10777 10778 NET_EPOCH_ASSERT(); 10779 INP_WLOCK_ASSERT(tp->t_inpcb); 10780 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 10781 __func__)); 10782 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 10783 __func__)); 10784 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 10785 union tcp_log_stackspecific log; 10786 struct timeval ltv; 10787 #ifdef NETFLIX_HTTP_LOGGING 10788 struct http_sendfile_track *http_req; 10789 10790 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10791 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 10792 } else { 10793 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 10794 } 10795 #endif 10796 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 10797 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 10798 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 10799 if (rack->rack_no_prr == 0) 10800 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 10801 else 10802 log.u_bbr.flex1 = 0; 10803 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 10804 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10805 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 10806 log.u_bbr.flex3 = m->m_flags; 10807 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 10808 if (m->m_flags & M_TSTMP) { 10809 /* Record the hardware timestamp if present */ 10810 mbuf_tstmp2timespec(m, &ts); 10811 ltv.tv_sec = ts.tv_sec; 10812 ltv.tv_usec = ts.tv_nsec / 1000; 10813 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv); 10814 } else if (m->m_flags & M_TSTMP_LRO) { 10815 /* Record the LRO arrival timestamp */ 10816 mbuf_tstmp2timespec(m, &ts); 10817 ltv.tv_sec = ts.tv_sec; 10818 ltv.tv_usec = ts.tv_nsec / 1000; 10819 log.u_bbr.flex5 =
tcp_tv_to_usectick(&ltv); 10820 } 10821 log.u_bbr.timeStamp = tcp_get_usecs(&ltv); 10822 /* Log the rcv time */ 10823 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 10824 #ifdef NETFLIX_HTTP_LOGGING 10825 log.u_bbr.applimited = tp->t_http_closed; 10826 log.u_bbr.applimited <<= 8; 10827 log.u_bbr.applimited |= tp->t_http_open; 10828 log.u_bbr.applimited <<= 8; 10829 log.u_bbr.applimited |= tp->t_http_req; 10830 if (http_req) { 10831 /* Copy out any client req info */ 10832 /* seconds */ 10833 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 10834 /* useconds */ 10835 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 10836 log.u_bbr.rttProp = http_req->timestamp; 10837 log.u_bbr.cur_del_rate = http_req->start; 10838 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 10839 log.u_bbr.flex8 |= 1; 10840 } else { 10841 log.u_bbr.flex8 |= 2; 10842 log.u_bbr.bw_inuse = http_req->end; 10843 } 10844 log.u_bbr.flex6 = http_req->start_seq; 10845 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 10846 log.u_bbr.flex8 |= 4; 10847 log.u_bbr.epoch = http_req->end_seq; 10848 } 10849 } 10850 #endif 10851 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 10852 tlen, &log, true, &ltv); 10853 } 10854 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 10855 way_out = 4; 10856 retval = 0; 10857 goto done_with_input; 10858 } 10859 /* 10860 * If a segment with the ACK-bit set arrives in the SYN-SENT state, 10861 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 10862 */ 10863 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 10864 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 10865 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10866 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10867 return (1); 10868 } 10869 /* 10870 * Segment received on connection. Reset idle time and keep-alive 10871 * timer. XXX: This should be done after segment validation to 10872 * ignore broken/spoofed segs. 10873 */ 10874 if (tp->t_idle_reduce && 10875 (tp->snd_max == tp->snd_una) && 10876 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 10877 counter_u64_add(rack_input_idle_reduces, 1); 10878 rack_cc_after_idle(rack, tp); 10879 } 10880 tp->t_rcvtime = ticks; 10881 /* 10882 * Unscale the window into a 32-bit value. For the SYN_SENT state 10883 * the scale is zero. 10884 */ 10885 tiwin = th->th_win << tp->snd_scale; 10886 #ifdef STATS 10887 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 10888 #endif 10889 if (tiwin > rack->r_ctl.rc_high_rwnd) 10890 rack->r_ctl.rc_high_rwnd = tiwin; 10891 /* 10892 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 10893 * this to occur after we've validated the segment. 10894 */ 10895 if (tp->t_flags2 & TF2_ECN_PERMIT) { 10896 if (thflags & TH_CWR) { 10897 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 10898 tp->t_flags |= TF_ACKNOW; 10899 } 10900 switch (iptos & IPTOS_ECN_MASK) { 10901 case IPTOS_ECN_CE: 10902 tp->t_flags2 |= TF2_ECN_SND_ECE; 10903 KMOD_TCPSTAT_INC(tcps_ecn_ce); 10904 break; 10905 case IPTOS_ECN_ECT0: 10906 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 10907 break; 10908 case IPTOS_ECN_ECT1: 10909 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 10910 break; 10911 } 10912 10913 /* Process a packet differently from RFC3168. */ 10914 cc_ecnpkt_handler(tp, th, iptos); 10915 10916 /* Congestion experienced. */ 10917 if (thflags & TH_ECE) { 10918 rack_cong_signal(tp, th, CC_ECN); 10919 } 10920 } 10921 /* 10922 * Parse options on any incoming segment.
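 * tcp_dooptions() fills 'to' from the option bytes that follow the
 * header; TO_SYN is passed only when TH_SYN is set so that the
 * SYN-only options (e.g. MSS and window scale) are accepted there.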
10923 */ 10924 tcp_dooptions(&to, (u_char *)(th + 1), 10925 (th->th_off << 2) - sizeof(struct tcphdr), 10926 (thflags & TH_SYN) ? TO_SYN : 0); 10927 10928 /* 10929 * If echoed timestamp is later than the current time, fall back to 10930 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 10931 * were used when this connection was established. 10932 */ 10933 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 10934 to.to_tsecr -= tp->ts_offset; 10935 if (TSTMP_GT(to.to_tsecr, cts)) 10936 to.to_tsecr = 0; 10937 } 10938 10939 /* 10940 * If its the first time in we need to take care of options and 10941 * verify we can do SACK for rack! 10942 */ 10943 if (rack->r_state == 0) { 10944 /* Should be init'd by rack_init() */ 10945 KASSERT(rack->rc_inp != NULL, 10946 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 10947 if (rack->rc_inp == NULL) { 10948 rack->rc_inp = tp->t_inpcb; 10949 } 10950 10951 /* 10952 * Process options only when we get SYN/ACK back. The SYN 10953 * case for incoming connections is handled in tcp_syncache. 10954 * According to RFC1323 the window field in a SYN (i.e., a 10955 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 10956 * this is traditional behavior, may need to be cleaned up. 10957 */ 10958 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 10959 /* Handle parallel SYN for ECN */ 10960 if (!(thflags & TH_ACK) && 10961 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 10962 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 10963 tp->t_flags2 |= TF2_ECN_PERMIT; 10964 tp->t_flags2 |= TF2_ECN_SND_ECE; 10965 TCPSTAT_INC(tcps_ecn_shs); 10966 } 10967 if ((to.to_flags & TOF_SCALE) && 10968 (tp->t_flags & TF_REQ_SCALE)) { 10969 tp->t_flags |= TF_RCVD_SCALE; 10970 tp->snd_scale = to.to_wscale; 10971 } else 10972 tp->t_flags &= ~TF_REQ_SCALE; 10973 /* 10974 * Initial send window. It will be updated with the 10975 * next incoming segment to the scaled value. 10976 */ 10977 tp->snd_wnd = th->th_win; 10978 if ((to.to_flags & TOF_TS) && 10979 (tp->t_flags & TF_REQ_TSTMP)) { 10980 tp->t_flags |= TF_RCVD_TSTMP; 10981 tp->ts_recent = to.to_tsval; 10982 tp->ts_recent_age = cts; 10983 } else 10984 tp->t_flags &= ~TF_REQ_TSTMP; 10985 if (to.to_flags & TOF_MSS) 10986 tcp_mss(tp, to.to_mss); 10987 if ((tp->t_flags & TF_SACK_PERMIT) && 10988 (to.to_flags & TOF_SACKPERM) == 0) 10989 tp->t_flags &= ~TF_SACK_PERMIT; 10990 if (IS_FASTOPEN(tp->t_flags)) { 10991 if (to.to_flags & TOF_FASTOPEN) { 10992 uint16_t mss; 10993 10994 if (to.to_flags & TOF_MSS) 10995 mss = to.to_mss; 10996 else 10997 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 10998 mss = TCP6_MSS; 10999 else 11000 mss = TCP_MSS; 11001 tcp_fastopen_update_cache(tp, mss, 11002 to.to_tfo_len, to.to_tfo_cookie); 11003 } else 11004 tcp_fastopen_disable_path(tp); 11005 } 11006 } 11007 /* 11008 * At this point we are at the initial call. Here we decide 11009 * if we are doing RACK or not. We do this by seeing if 11010 * TF_SACK_PERMIT is set and the sack-not-required is clear. 11011 * The code now does do dup-ack counting so if you don't 11012 * switch back you won't get rack & TLP, but you will still 11013 * get this stack. 
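 * If SACK was not negotiated (and rack_sack_not_required is not set),
 * the block below hands the connection back to the default stack via
 * tcp_switch_back_to_default() and replays this segment through that
 * stack's tfb_tcp_do_segment handler.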
11014 */ 11015 11016 if ((rack_sack_not_required == 0) && 11017 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 11018 tcp_switch_back_to_default(tp); 11019 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11020 tlen, iptos); 11021 return (1); 11022 } 11023 /* Set the flag */ 11024 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11025 tcp_set_hpts(tp->t_inpcb); 11026 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 11027 } 11028 if (thflags & TH_FIN) 11029 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11030 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11031 if ((rack->rc_gp_dyn_mul) && 11032 (rack->use_fixed_rate == 0) && 11033 (rack->rc_always_pace)) { 11034 /* Check in on probertt */ 11035 rack_check_probe_rtt(rack, us_cts); 11036 } 11037 if (rack->forced_ack) { 11038 uint32_t us_rtt; 11039 11040 /* 11041 * A persist or keep-alive was forced out, update our 11042 * min rtt time. Note we do not worry about lost 11043 * retransmissions since KEEP-ALIVES and persists 11044 * are usually way long on times of sending (though 11045 * if we were really paranoid or worried we could 11046 * at least use timestamps if available to validate). 11047 */ 11048 rack->forced_ack = 0; 11049 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 11050 if (us_rtt == 0) 11051 us_rtt = 1; 11052 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 11053 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 11054 } 11055 /* 11056 * This is the one exception case where we set the rack state 11057 * always. All other times (timers etc) we must have a rack-state 11058 * set (so we assure we have done the checks above for SACK). 11059 */ 11060 rack->r_ctl.rc_rcvtime = cts; 11061 if (rack->r_state != tp->t_state) 11062 rack_set_state(tp, rack); 11063 if (SEQ_GT(th->th_ack, tp->snd_una) && 11064 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 11065 kern_prefetch(rsm, &prev_state); 11066 prev_state = rack->r_state; 11067 rack_clear_rate_sample(rack); 11068 retval = (*rack->r_substate) (m, th, so, 11069 tp, &to, drop_hdrlen, 11070 tlen, tiwin, thflags, nxt_pkt, iptos); 11071 #ifdef INVARIANTS 11072 if ((retval == 0) && 11073 (tp->t_inpcb == NULL)) { 11074 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11075 retval, tp, prev_state); 11076 } 11077 #endif 11078 if (retval == 0) { 11079 /* 11080 * If retval is 1 the tcb is unlocked and most likely the tp 11081 * is gone. 11082 */ 11083 INP_WLOCK_ASSERT(tp->t_inpcb); 11084 if ((rack->rc_gp_dyn_mul) && 11085 (rack->rc_always_pace) && 11086 (rack->use_fixed_rate == 0) && 11087 rack->in_probe_rtt && 11088 (rack->r_ctl.rc_time_probertt_starts == 0)) { 11089 /* 11090 * If we are going for target, lets recheck before 11091 * we output. 11092 */ 11093 rack_check_probe_rtt(rack, us_cts); 11094 } 11095 if (rack->set_pacing_done_a_iw == 0) { 11096 /* How much has been acked? 
*/ 11097 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 11098 /* We have enough to set in the pacing segment size */ 11099 rack->set_pacing_done_a_iw = 1; 11100 rack_set_pace_segments(tp, rack, __LINE__); 11101 } 11102 } 11103 tcp_rack_xmit_timer_commit(rack, tp); 11104 if (nxt_pkt == 0) { 11105 if (rack->r_wanted_output != 0) { 11106 do_output_now: 11107 did_out = 1; 11108 (void)tp->t_fb->tfb_tcp_output(tp); 11109 } 11110 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 11111 } 11112 if ((nxt_pkt == 0) && 11113 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11114 (SEQ_GT(tp->snd_max, tp->snd_una) || 11115 (tp->t_flags & TF_DELACK) || 11116 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11117 (tp->t_state <= TCPS_CLOSING)))) { 11118 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 11119 if ((tp->snd_max == tp->snd_una) && 11120 ((tp->t_flags & TF_DELACK) == 0) && 11121 (rack->rc_inp->inp_in_hpts) && 11122 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11123 /* keep alive not needed if we are hptsi output yet */ 11124 ; 11125 } else { 11126 int late = 0; 11127 if (rack->rc_inp->inp_in_hpts) { 11128 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 11129 us_cts = tcp_get_usecs(NULL); 11130 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11131 rack->r_early = 1; 11132 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 11133 } else 11134 late = 1; 11135 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11136 } 11137 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11138 } 11139 if (late && (did_out == 0)) { 11140 /* 11141 * We are late in the sending 11142 * and we did not call the output 11143 * (this probably should not happen). 11144 */ 11145 goto do_output_now; 11146 } 11147 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 11148 } 11149 way_out = 1; 11150 } else if (nxt_pkt == 0) { 11151 /* Do we have the correct timer running? */ 11152 rack_timer_audit(tp, rack, &so->so_snd); 11153 way_out = 2; 11154 } 11155 done_with_input: 11156 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 11157 if (did_out) 11158 rack->r_wanted_output = 0; 11159 #ifdef INVARIANTS 11160 if (tp->t_inpcb == NULL) { 11161 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11162 did_out, 11163 retval, tp, prev_state); 11164 } 11165 #endif 11166 } 11167 return (retval); 11168 } 11169 11170 void 11171 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11172 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11173 { 11174 struct timeval tv; 11175 11176 /* First lets see if we have old packets */ 11177 if (tp->t_in_pkt) { 11178 if (ctf_do_queued_segments(so, tp, 1)) { 11179 m_freem(m); 11180 return; 11181 } 11182 } 11183 if (m->m_flags & M_TSTMP_LRO) { 11184 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 11185 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 11186 } else { 11187 /* Should not be should we kassert instead? 
*/ 11188 tcp_get_usecs(&tv); 11189 } 11190 if(rack_do_segment_nounlock(m, th, so, tp, 11191 drop_hdrlen, tlen, iptos, 0, &tv) == 0) 11192 INP_WUNLOCK(tp->t_inpcb); 11193 } 11194 11195 struct rack_sendmap * 11196 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 11197 { 11198 struct rack_sendmap *rsm = NULL; 11199 int32_t idx; 11200 uint32_t srtt = 0, thresh = 0, ts_low = 0; 11201 11202 /* Return the next guy to be re-transmitted */ 11203 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 11204 return (NULL); 11205 } 11206 if (tp->t_flags & TF_SENTFIN) { 11207 /* retran the end FIN? */ 11208 return (NULL); 11209 } 11210 /* ok lets look at this one */ 11211 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11212 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 11213 goto check_it; 11214 } 11215 rsm = rack_find_lowest_rsm(rack); 11216 if (rsm == NULL) { 11217 return (NULL); 11218 } 11219 check_it: 11220 if (rsm->r_flags & RACK_ACKED) { 11221 return (NULL); 11222 } 11223 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 11224 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 11225 /* Its not yet ready */ 11226 return (NULL); 11227 } 11228 srtt = rack_grab_rtt(tp, rack); 11229 idx = rsm->r_rtr_cnt - 1; 11230 ts_low = rsm->r_tim_lastsent[idx]; 11231 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 11232 if ((tsused == ts_low) || 11233 (TSTMP_LT(tsused, ts_low))) { 11234 /* No time since sending */ 11235 return (NULL); 11236 } 11237 if ((tsused - ts_low) < thresh) { 11238 /* It has not been long enough yet */ 11239 return (NULL); 11240 } 11241 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11242 ((rsm->r_flags & RACK_SACK_PASSED) && 11243 (rack->sack_attack_disable == 0))) { 11244 /* 11245 * We have passed the dup-ack threshold <or> 11246 * a SACK has indicated this is missing. 11247 * Note that if you are a declared attacker 11248 * it is only the dup-ack threshold that 11249 * will cause retransmits. 
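 * Reaching this point also means (tsused - ts_low) >= thresh, i.e. the
 * reordering window computed by rack_calc_thresh_rack() has already
 * elapsed for this segment.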
11250 */ 11251 /* log retransmit reason */ 11252 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 11253 return (rsm); 11254 } 11255 return (NULL); 11256 } 11257 11258 static void 11259 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 11260 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 11261 int line, struct rack_sendmap *rsm) 11262 { 11263 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11264 union tcp_log_stackspecific log; 11265 struct timeval tv; 11266 11267 memset(&log, 0, sizeof(log)); 11268 log.u_bbr.flex1 = slot; 11269 log.u_bbr.flex2 = len; 11270 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 11271 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 11272 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 11273 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 11274 log.u_bbr.use_lt_bw = rack->app_limited_needs_set; 11275 log.u_bbr.use_lt_bw <<= 1; 11276 log.u_bbr.use_lt_bw = rack->rc_gp_filled; 11277 log.u_bbr.use_lt_bw <<= 1; 11278 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 11279 log.u_bbr.use_lt_bw <<= 1; 11280 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 11281 log.u_bbr.pkt_epoch = line; 11282 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 11283 log.u_bbr.bw_inuse = bw_est; 11284 log.u_bbr.delRate = bw; 11285 if (rack->r_ctl.gp_bw == 0) 11286 log.u_bbr.cur_del_rate = 0; 11287 else 11288 log.u_bbr.cur_del_rate = rack_get_bw(rack); 11289 log.u_bbr.rttProp = len_time; 11290 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 11291 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 11292 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 11293 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 11294 /* We are in slow start */ 11295 log.u_bbr.flex7 = 1; 11296 } else { 11297 /* we are on congestion avoidance */ 11298 log.u_bbr.flex7 = 0; 11299 } 11300 log.u_bbr.flex8 = method; 11301 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11302 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11303 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 11304 log.u_bbr.cwnd_gain <<= 1; 11305 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 11306 log.u_bbr.cwnd_gain <<= 1; 11307 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 11308 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11309 &rack->rc_inp->inp_socket->so_rcv, 11310 &rack->rc_inp->inp_socket->so_snd, 11311 BBR_LOG_HPTSI_CALC, 0, 11312 0, &log, false, &tv); 11313 } 11314 } 11315 11316 static uint32_t 11317 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 11318 { 11319 uint32_t new_tso, user_max; 11320 11321 user_max = rack->rc_user_set_max_segs * mss; 11322 if (rack->rc_force_max_seg) { 11323 return (user_max); 11324 } 11325 if (rack->use_fixed_rate && 11326 ((rack->r_ctl.crte == NULL) || 11327 (bw != rack->r_ctl.crte->rate))) { 11328 /* Use the user mss since we are not exactly matched */ 11329 return (user_max); 11330 } 11331 new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 11332 if (new_tso > user_max) 11333 new_tso = user_max; 11334 return(new_tso); 11335 } 11336 11337 static void 11338 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, 11339 uint64_t rate, uint64_t hw_rate, int line, 11340 int error) 11341 { 11342 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11343 union tcp_log_stackspecific log; 11344 struct timeval tv; 11345 11346 memset(&log, 0, sizeof(log)); 11347 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 11348 log.u_bbr.flex2 = (hw_rate & 
0x00000000ffffffff); 11349 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 11350 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 11351 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11352 log.u_bbr.bw_inuse = rate; 11353 log.u_bbr.flex5 = line; 11354 log.u_bbr.flex6 = error; 11355 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 11356 log.u_bbr.flex8 = rack->use_fixed_rate; 11357 log.u_bbr.flex8 <<= 1; 11358 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 11359 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 11360 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11361 &rack->rc_inp->inp_socket->so_rcv, 11362 &rack->rc_inp->inp_socket->so_snd, 11363 BBR_LOG_HDWR_PACE, 0, 11364 0, &log, false, &tv); 11365 } 11366 } 11367 11368 static int32_t 11369 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) 11370 { 11371 uint64_t lentim, fill_bw; 11372 11373 /* Let's first see if we are full; if so, continue with the normal rate */ 11374 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 11375 return (slot); 11376 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 11377 return (slot); 11378 if (rack->r_ctl.rc_last_us_rtt == 0) 11379 return (slot); 11380 if (rack->rc_pace_fill_if_rttin_range && 11381 (rack->r_ctl.rc_last_us_rtt >= 11382 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 11383 /* The rtt is huge, N * smallest, let's not fill */ 11384 return (slot); 11385 } 11386 /* 11387 * First let's calculate the b/w based on the last us-rtt 11388 * and the sndwnd. 11389 */ 11390 fill_bw = rack->r_ctl.cwnd_to_use; 11391 /* Take the rwnd if it's smaller */ 11392 if (fill_bw > rack->rc_tp->snd_wnd) 11393 fill_bw = rack->rc_tp->snd_wnd; 11394 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 11395 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 11396 /* We are below the min b/w */ 11397 if (fill_bw < RACK_MIN_BW) 11398 return (slot); 11399 /* 11400 * Ok, fill_bw holds our mythical b/w to fill the cwnd 11401 * in an rtt; what does that equate to time-wise? 11402 */ 11403 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 11404 lentim /= fill_bw; 11405 if (lentim < slot) { 11406 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 11407 0, lentim, 12, __LINE__, NULL); 11408 return ((int32_t)lentim); 11409 } else 11410 return (slot); 11411 } 11412 11413 static int32_t 11414 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 11415 { 11416 struct rack_sendmap *lrsm; 11417 int32_t slot = 0; 11418 int err; 11419 11420 if (rack->rc_always_pace == 0) { 11421 /* 11422 * We use the most optimistic possible cwnd/srtt for 11423 * sending calculations. This will make our 11424 * calculation anticipate getting more through 11425 * quicker than possible. But that's ok; we don't want 11426 * the peer to have a gap in data sending. 11427 */ 11428 uint32_t srtt, cwnd, tr_perms = 0; 11429 int32_t reduce = 0; 11430 11431 old_method: 11432 /* 11433 * We keep no precise pacing with the old method; 11434 * instead we use the pacer to mitigate bursts.
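 * Below, tr_perms is roughly the bytes permitted per millisecond
 * (cwnd divided by srtt in ms), so slot = len / tr_perms is the drain
 * time in ms, later converted to usec via HPTS_USEC_IN_MSEC.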
11435 */ 11436 rack->r_ctl.rc_agg_delayed = 0; 11437 rack->r_early = 0; 11438 rack->r_late = 0; 11439 rack->r_ctl.rc_agg_early = 0; 11440 if (rack->r_ctl.rc_rack_min_rtt) 11441 srtt = rack->r_ctl.rc_rack_min_rtt; 11442 else 11443 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 11444 if (rack->r_ctl.rc_rack_largest_cwnd) 11445 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 11446 else 11447 cwnd = rack->r_ctl.cwnd_to_use; 11448 tr_perms = cwnd / srtt; 11449 if (tr_perms == 0) { 11450 tr_perms = ctf_fixed_maxseg(tp); 11451 } 11452 /* 11453 * Calculate how long this will take to drain, if 11454 * the calculation comes out to zero, thats ok we 11455 * will use send_a_lot to possibly spin around for 11456 * more increasing tot_len_this_send to the point 11457 * that its going to require a pace, or we hit the 11458 * cwnd. Which in that case we are just waiting for 11459 * a ACK. 11460 */ 11461 slot = len / tr_perms; 11462 /* Now do we reduce the time so we don't run dry? */ 11463 if (slot && rack_slot_reduction) { 11464 reduce = (slot / rack_slot_reduction); 11465 if (reduce < slot) { 11466 slot -= reduce; 11467 } else 11468 slot = 0; 11469 } 11470 slot *= HPTS_USEC_IN_MSEC; 11471 if (rsm == NULL) { 11472 /* 11473 * We always consider ourselves app limited with old style 11474 * that are not retransmits. This could be the initial 11475 * measurement, but thats ok its all setup and specially 11476 * handled. If another send leaks out, then that too will 11477 * be mark app-limited. 11478 */ 11479 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11480 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { 11481 rack->r_ctl.rc_first_appl = lrsm; 11482 lrsm->r_flags |= RACK_APP_LIMITED; 11483 rack->r_ctl.rc_app_limited_cnt++; 11484 } 11485 } 11486 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); 11487 } else { 11488 uint64_t bw_est, res, lentim, rate_wanted; 11489 uint32_t orig_val, srtt, segs, oh; 11490 11491 if ((rack->r_rr_config == 1) && rsm) { 11492 return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); 11493 } 11494 if (rack->use_fixed_rate) { 11495 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 11496 } else if ((rack->r_ctl.init_rate == 0) && 11497 #ifdef NETFLIX_PEAKRATE 11498 (rack->rc_tp->t_maxpeakrate == 0) && 11499 #endif 11500 (rack->r_ctl.gp_bw == 0)) { 11501 /* no way to yet do an estimate */ 11502 bw_est = rate_wanted = 0; 11503 } else { 11504 bw_est = rack_get_bw(rack); 11505 rate_wanted = rack_get_output_bw(rack, bw_est, rsm); 11506 } 11507 if ((bw_est == 0) || (rate_wanted == 0)) { 11508 /* 11509 * No way yet to make a b/w estimate or 11510 * our raise is set incorrectly. 11511 */ 11512 goto old_method; 11513 } 11514 /* We need to account for all the overheads */ 11515 segs = (len + segsiz - 1) / segsiz; 11516 /* 11517 * We need the diff between 1514 bytes (e-mtu with e-hdr) 11518 * and how much data we put in each packet. Yes this 11519 * means we may be off if we are larger than 1500 bytes 11520 * or smaller. But this just makes us more conservative. 
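 * For example (illustrative numbers only): with a 1448-byte segsiz the
 * per-packet overhead charged is 1514 - 1448 = 66 bytes, so ten such
 * segments add 660 bytes to the length used in the rate division below.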
11521 */ 11522 if (ETHERNET_SEGMENT_SIZE > segsiz) 11523 oh = ETHERNET_SEGMENT_SIZE - segsiz; 11524 else 11525 oh = 0; 11526 segs *= oh; 11527 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 11528 res = lentim / rate_wanted; 11529 slot = (uint32_t)res; 11530 orig_val = rack->r_ctl.rc_pace_max_segs; 11531 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11532 /* Did we change the TSO size, if so log it */ 11533 if (rack->r_ctl.rc_pace_max_segs != orig_val) 11534 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 11535 if ((rack->rc_pace_to_cwnd) && 11536 (rack->in_probe_rtt == 0) && 11537 (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 11538 /* 11539 * We want to pace at our rate *or* faster to 11540 * fill the cwnd to the max if its not full. 11541 */ 11542 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); 11543 } 11544 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 11545 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 11546 if ((rack->rack_hdw_pace_ena) && 11547 (rack->rack_hdrw_pacing == 0) && 11548 (rack->rack_attempt_hdwr_pace == 0)) { 11549 /* 11550 * Lets attempt to turn on hardware pacing 11551 * if we can. 11552 */ 11553 rack->rack_attempt_hdwr_pace = 1; 11554 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 11555 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11556 rate_wanted, 11557 RS_PACING_GEQ, 11558 &err); 11559 if (rack->r_ctl.crte) { 11560 rack->rack_hdrw_pacing = 1; 11561 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz, 11562 0, rack->r_ctl.crte, 11563 NULL); 11564 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11565 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11566 err); 11567 } 11568 } else if (rack->rack_hdrw_pacing && 11569 (rack->r_ctl.crte->rate != rate_wanted)) { 11570 /* Do we need to adjust our rate? */ 11571 const struct tcp_hwrate_limit_table *nrte; 11572 11573 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 11574 rack->rc_tp, 11575 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11576 rate_wanted, 11577 RS_PACING_GEQ, 11578 &err); 11579 if (nrte == NULL) { 11580 /* Lost the rate */ 11581 rack->rack_hdrw_pacing = 0; 11582 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11583 } else if (nrte != rack->r_ctl.crte) { 11584 rack->r_ctl.crte = nrte; 11585 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, 11586 segsiz, 0, 11587 rack->r_ctl.crte, 11588 NULL); 11589 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11590 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11591 err); 11592 } 11593 } 11594 } 11595 if (rack_limit_time_with_srtt && 11596 (rack->use_fixed_rate == 0) && 11597 #ifdef NETFLIX_PEAKRATE 11598 (rack->rc_tp->t_maxpeakrate == 0) && 11599 #endif 11600 (rack->rack_hdrw_pacing == 0)) { 11601 /* 11602 * Sanity check, we do not allow the pacing delay 11603 * to be longer than the SRTT of the path. If it is 11604 * a slow path, then adding a packet should increase 11605 * the RTT and compensate for this i.e. the srtt will 11606 * be greater so the allowed pacing time will be greater. 11607 * 11608 * Note this restriction is not for where a peak rate 11609 * is set, we are doing fixed pacing or hardware pacing. 
11610 */ 11611 if (rack->rc_tp->t_srtt) 11612 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 11613 else 11614 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 11615 if (srtt < slot) { 11616 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 11617 slot = srtt; 11618 } 11619 } 11620 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 11621 } 11622 if (slot) 11623 counter_u64_add(rack_calc_nonzero, 1); 11624 else 11625 counter_u64_add(rack_calc_zero, 1); 11626 return (slot); 11627 } 11628 11629 static void 11630 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 11631 tcp_seq startseq, uint32_t sb_offset) 11632 { 11633 struct rack_sendmap *my_rsm = NULL; 11634 struct rack_sendmap fe; 11635 11636 if (tp->t_state < TCPS_ESTABLISHED) { 11637 /* 11638 * We don't start any measurements if we are 11639 * not at least established. 11640 */ 11641 return; 11642 } 11643 tp->t_flags |= TF_GPUTINPROG; 11644 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 11645 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 11646 tp->gput_seq = startseq; 11647 rack->app_limited_needs_set = 0; 11648 if (rack->in_probe_rtt) 11649 rack->measure_saw_probe_rtt = 1; 11650 else if ((rack->measure_saw_probe_rtt) && 11651 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 11652 rack->measure_saw_probe_rtt = 0; 11653 if (rack->rc_gp_filled) 11654 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11655 else { 11656 /* Special case initial measurement */ 11657 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); 11658 } 11659 /* 11660 * We take a guess out into the future, 11661 * if we have no measurement and no 11662 * initial rate, we measure the first 11663 * initial-windows worth of data to 11664 * speed up getting some GP measurement and 11665 * thus start pacing. 11666 */ 11667 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 11668 rack->app_limited_needs_set = 1; 11669 tp->gput_ack = startseq + max(rc_init_window(rack), 11670 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 11671 rack_log_pacing_delay_calc(rack, 11672 tp->gput_seq, 11673 tp->gput_ack, 11674 0, 11675 tp->gput_ts, 11676 rack->r_ctl.rc_app_limited_cnt, 11677 9, 11678 __LINE__, NULL); 11679 return; 11680 } 11681 if (sb_offset) { 11682 /* 11683 * We are out somewhere in the sb 11684 * can we use the already outstanding data? 11685 */ 11686 11687 if (rack->r_ctl.rc_app_limited_cnt == 0) { 11688 /* 11689 * Yes first one is good and in this case 11690 * the tp->gput_ts is correctly set based on 11691 * the last ack that arrived (no need to 11692 * set things up when an ack comes in). 11693 */ 11694 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11695 if ((my_rsm == NULL) || 11696 (my_rsm->r_rtr_cnt != 1)) { 11697 /* retransmission? */ 11698 goto use_latest; 11699 } 11700 } else { 11701 if (rack->r_ctl.rc_first_appl == NULL) { 11702 /* 11703 * If rc_first_appl is NULL 11704 * then the cnt should be 0. 11705 * This is probably an error, maybe 11706 * a KASSERT would be approprate. 11707 */ 11708 goto use_latest; 11709 } 11710 /* 11711 * If we have a marker pointer to the last one that is 11712 * app limited we can use that, but we need to set 11713 * things up so that when it gets ack'ed we record 11714 * the ack time (if its not already acked). 11715 */ 11716 rack->app_limited_needs_set = 1; 11717 /* 11718 * We want to get to the rsm that is either 11719 * next with space i.e. 
over 1 MSS or the one 11720 * after that (after the app-limited). 11721 */ 11722 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11723 rack->r_ctl.rc_first_appl); 11724 if (my_rsm) { 11725 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 11726 /* Have to use the next one */ 11727 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11728 my_rsm); 11729 else { 11730 /* Use after the first MSS of it is acked */ 11731 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 11732 goto start_set; 11733 } 11734 } 11735 if ((my_rsm == NULL) || 11736 (my_rsm->r_rtr_cnt != 1)) { 11737 /* 11738 * Either its a retransmit or 11739 * the last is the app-limited one. 11740 */ 11741 goto use_latest; 11742 } 11743 } 11744 tp->gput_seq = my_rsm->r_start; 11745 start_set: 11746 if (my_rsm->r_flags & RACK_ACKED) { 11747 /* 11748 * This one has been acked use the arrival ack time 11749 */ 11750 tp->gput_ts = my_rsm->r_ack_arrival; 11751 rack->app_limited_needs_set = 0; 11752 } 11753 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11754 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 11755 rack_log_pacing_delay_calc(rack, 11756 tp->gput_seq, 11757 tp->gput_ack, 11758 (uint64_t)my_rsm, 11759 tp->gput_ts, 11760 rack->r_ctl.rc_app_limited_cnt, 11761 9, 11762 __LINE__, NULL); 11763 return; 11764 } 11765 11766 use_latest: 11767 /* 11768 * We don't know how long we may have been 11769 * idle or if this is the first-send. Lets 11770 * setup the flag so we will trim off 11771 * the first ack'd data so we get a true 11772 * measurement. 11773 */ 11774 rack->app_limited_needs_set = 1; 11775 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 11776 /* Find this guy so we can pull the send time */ 11777 fe.r_start = startseq; 11778 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 11779 if (my_rsm) { 11780 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11781 if (my_rsm->r_flags & RACK_ACKED) { 11782 /* 11783 * Unlikely since its probably what was 11784 * just transmitted (but I am paranoid). 11785 */ 11786 tp->gput_ts = my_rsm->r_ack_arrival; 11787 rack->app_limited_needs_set = 0; 11788 } 11789 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 11790 /* This also is unlikely */ 11791 tp->gput_seq = my_rsm->r_start; 11792 } 11793 } else { 11794 /* 11795 * TSNH unless we have some send-map limit, 11796 * and even at that it should not be hitting 11797 * that limit (we should have stopped sending). 11798 */ 11799 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 11800 } 11801 rack_log_pacing_delay_calc(rack, 11802 tp->gput_seq, 11803 tp->gput_ack, 11804 (uint64_t)my_rsm, 11805 tp->gput_ts, 11806 rack->r_ctl.rc_app_limited_cnt, 11807 9, __LINE__, NULL); 11808 } 11809 11810 static inline uint32_t 11811 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 11812 uint32_t avail, int32_t sb_offset) 11813 { 11814 uint32_t len; 11815 uint32_t sendwin; 11816 11817 if (tp->snd_wnd > cwnd_to_use) 11818 sendwin = cwnd_to_use; 11819 else 11820 sendwin = tp->snd_wnd; 11821 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11822 /* We never want to go over our peers rcv-window */ 11823 len = 0; 11824 } else { 11825 uint32_t flight; 11826 11827 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 11828 if (flight >= sendwin) { 11829 /* 11830 * We have in flight what we are allowed by cwnd (if 11831 * it was rwnd blocking it would have hit above out 11832 * >= tp->snd_wnd). 
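 * Otherwise we may send the smaller of the two windows minus what is
 * already in flight, clipped further below so that we never exceed the
 * peer's rwnd or the data actually available in the socket buffer.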
11833 */ 11834 return (0); 11835 } 11836 len = sendwin - flight; 11837 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11838 /* We would send too much (beyond the rwnd) */ 11839 len = tp->snd_wnd - ctf_outstanding(tp); 11840 } 11841 if ((len + sb_offset) > avail) { 11842 /* 11843 * We don't have that much in the SB, how much is 11844 * there? 11845 */ 11846 len = avail - sb_offset; 11847 } 11848 } 11849 return (len); 11850 } 11851 11852 static int 11853 rack_output(struct tcpcb *tp) 11854 { 11855 struct socket *so; 11856 uint32_t recwin; 11857 uint32_t sb_offset; 11858 int32_t len, flags, error = 0; 11859 struct mbuf *m; 11860 struct mbuf *mb; 11861 uint32_t if_hw_tsomaxsegcount = 0; 11862 uint32_t if_hw_tsomaxsegsize; 11863 int32_t segsiz, minseg; 11864 long tot_len_this_send = 0; 11865 struct ip *ip = NULL; 11866 #ifdef TCPDEBUG 11867 struct ipovly *ipov = NULL; 11868 #endif 11869 struct udphdr *udp = NULL; 11870 struct tcp_rack *rack; 11871 struct tcphdr *th; 11872 uint8_t pass = 0; 11873 uint8_t mark = 0; 11874 uint8_t wanted_cookie = 0; 11875 u_char opt[TCP_MAXOLEN]; 11876 unsigned ipoptlen, optlen, hdrlen, ulen=0; 11877 uint32_t rack_seq; 11878 11879 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 11880 unsigned ipsec_optlen = 0; 11881 11882 #endif 11883 int32_t idle, sendalot; 11884 int32_t sub_from_prr = 0; 11885 volatile int32_t sack_rxmit; 11886 struct rack_sendmap *rsm = NULL; 11887 int32_t tso, mtu; 11888 struct tcpopt to; 11889 int32_t slot = 0; 11890 int32_t sup_rack = 0; 11891 uint32_t cts, us_cts, delayed, early; 11892 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 11893 uint32_t cwnd_to_use; 11894 int32_t do_a_prefetch; 11895 int32_t prefetch_rsm = 0; 11896 int32_t orig_len; 11897 struct timeval tv; 11898 int32_t prefetch_so_done = 0; 11899 struct tcp_log_buffer *lgb = NULL; 11900 struct inpcb *inp; 11901 struct sockbuf *sb; 11902 #ifdef INET6 11903 struct ip6_hdr *ip6 = NULL; 11904 int32_t isipv6; 11905 #endif 11906 uint8_t filled_all = 0; 11907 bool hw_tls = false; 11908 11909 /* setup and take the cache hits here */ 11910 rack = (struct tcp_rack *)tp->t_fb_ptr; 11911 inp = rack->rc_inp; 11912 so = inp->inp_socket; 11913 sb = &so->so_snd; 11914 kern_prefetch(sb, &do_a_prefetch); 11915 do_a_prefetch = 1; 11916 hpts_calling = inp->inp_hpts_calls; 11917 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 11918 11919 NET_EPOCH_ASSERT(); 11920 INP_WLOCK_ASSERT(inp); 11921 #ifdef TCP_OFFLOAD 11922 if (tp->t_flags & TF_TOE) 11923 return (tcp_offload_output(tp)); 11924 #endif 11925 /* 11926 * For TFO connections in SYN_RECEIVED, only allow the initial 11927 * SYN|ACK and those sent by the retransmit timer. 11928 */ 11929 if (IS_FASTOPEN(tp->t_flags) && 11930 (tp->t_state == TCPS_SYN_RECEIVED) && 11931 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 11932 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 11933 return (0); 11934 #ifdef INET6 11935 if (rack->r_state) { 11936 /* Use the cache line loaded if possible */ 11937 isipv6 = rack->r_is_v6; 11938 } else { 11939 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 11940 } 11941 #endif 11942 early = 0; 11943 us_cts = tcp_get_usecs(&tv); 11944 cts = tcp_tv_to_mssectick(&tv); 11945 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 11946 inp->inp_in_hpts) { 11947 /* 11948 * We are on the hpts for some timer but not hptsi output. 11949 * Remove from the hpts unconditionally. 11950 */ 11951 rack_timer_cancel(tp, rack, cts, __LINE__); 11952 } 11953 /* Are we pacing and late? 
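 *
 * rc_last_output_to holds the pacer's target output time for
 * this connection. If that time has already passed, the
 * overshoot is recorded here as "delayed" (rc_agg_delayed,
 * r_late); if instead we were called before the slot expired,
 * the shortfall is recorded a little further down as "early"
 * (rc_agg_early, r_early) once the timers have been processed.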
*/ 11954 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 11955 TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { 11956 /* We are delayed */ 11957 delayed = us_cts - rack->r_ctl.rc_last_output_to; 11958 } else { 11959 delayed = 0; 11960 } 11961 /* Do the timers, which may override the pacer */ 11962 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 11963 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 11964 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 11965 return (0); 11966 } 11967 } 11968 if ((rack->r_timer_override) || 11969 (delayed) || 11970 (tp->t_state < TCPS_ESTABLISHED)) { 11971 if (tp->t_inpcb->inp_in_hpts) 11972 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11973 } else if (tp->t_inpcb->inp_in_hpts) { 11974 /* 11975 * On the hpts you can't pass even if ACKNOW is on, we will 11976 * when the hpts fires. 11977 */ 11978 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 11979 return (0); 11980 } 11981 inp->inp_hpts_calls = 0; 11982 /* Finish out both pacing early and late accounting */ 11983 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 11984 TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11985 early = rack->r_ctl.rc_last_output_to - us_cts; 11986 } else 11987 early = 0; 11988 if (delayed) { 11989 rack->r_ctl.rc_agg_delayed += delayed; 11990 rack->r_late = 1; 11991 } else if (early) { 11992 rack->r_ctl.rc_agg_early += early; 11993 rack->r_early = 1; 11994 } 11995 /* Now that early/late accounting is done turn off the flag */ 11996 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11997 rack->r_wanted_output = 0; 11998 rack->r_timer_override = 0; 11999 /* 12000 * For TFO connections in SYN_SENT or SYN_RECEIVED, 12001 * only allow the initial SYN or SYN|ACK and those sent 12002 * by the retransmit timer. 12003 */ 12004 if (IS_FASTOPEN(tp->t_flags) && 12005 ((tp->t_state == TCPS_SYN_RECEIVED) || 12006 (tp->t_state == TCPS_SYN_SENT)) && 12007 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12008 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12009 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12010 goto just_return_nolock; 12011 } 12012 /* 12013 * Determine length of data that should be transmitted, and flags 12014 * that will be used. If there is some data or critical controls 12015 * (SYN, RST) to send, then transmit; otherwise, investigate 12016 * further. 12017 */ 12018 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 12019 if (tp->t_idle_reduce) { 12020 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 12021 rack_cc_after_idle(rack, tp); 12022 } 12023 tp->t_flags &= ~TF_LASTIDLE; 12024 if (idle) { 12025 if (tp->t_flags & TF_MORETOCOME) { 12026 tp->t_flags |= TF_LASTIDLE; 12027 idle = 0; 12028 } 12029 } 12030 if ((tp->snd_una == tp->snd_max) && 12031 rack->r_ctl.rc_went_idle_time && 12032 TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { 12033 idle = us_cts - rack->r_ctl.rc_went_idle_time; 12034 if (idle > rack_min_probertt_hold) { 12035 /* Count as a probe rtt */ 12036 if (rack->in_probe_rtt == 0) { 12037 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12038 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 12039 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 12040 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 12041 } else { 12042 rack_exit_probertt(rack, us_cts); 12043 } 12044 } 12045 idle = 0; 12046 } 12047 again: 12048 /* 12049 * If we've recently taken a timeout, snd_max will be greater than 12050 * snd_nxt. 
There may be SACK information that allows us to avoid 12051 * resending already delivered data. Adjust snd_nxt accordingly. 12052 */ 12053 sendalot = 0; 12054 us_cts = tcp_get_usecs(&tv); 12055 cts = tcp_tv_to_mssectick(&tv); 12056 tso = 0; 12057 mtu = 0; 12058 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12059 minseg = segsiz; 12060 sb_offset = tp->snd_max - tp->snd_una; 12061 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12062 #ifdef NETFLIX_SHARED_CWND 12063 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 12064 rack->rack_enable_scwnd) { 12065 /* We are doing cwnd sharing */ 12066 if (rack->rc_gp_filled && 12067 (rack->rack_attempted_scwnd == 0) && 12068 (rack->r_ctl.rc_scw == NULL) && 12069 tp->t_lib) { 12070 /* The pcbid is in, lets make an attempt */ 12071 counter_u64_add(rack_try_scwnd, 1); 12072 rack->rack_attempted_scwnd = 1; 12073 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 12074 &rack->r_ctl.rc_scw_index, 12075 segsiz); 12076 } 12077 if (rack->r_ctl.rc_scw && 12078 (rack->rack_scwnd_is_idle == 1) && 12079 (rack->rc_in_persist == 0) && 12080 sbavail(sb)) { 12081 /* we are no longer out of data */ 12082 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12083 rack->rack_scwnd_is_idle = 0; 12084 } 12085 if (rack->r_ctl.rc_scw) { 12086 /* First lets update and get the cwnd */ 12087 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 12088 rack->r_ctl.rc_scw_index, 12089 tp->snd_cwnd, tp->snd_wnd, segsiz); 12090 } 12091 } 12092 #endif 12093 flags = tcp_outflags[tp->t_state]; 12094 while (rack->rc_free_cnt < rack_free_cache) { 12095 rsm = rack_alloc(rack); 12096 if (rsm == NULL) { 12097 if (inp->inp_hpts_calls) 12098 /* Retry in a ms */ 12099 slot = (1 * HPTS_USEC_IN_MSEC); 12100 goto just_return_nolock; 12101 } 12102 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 12103 rack->rc_free_cnt++; 12104 rsm = NULL; 12105 } 12106 if (inp->inp_hpts_calls) 12107 inp->inp_hpts_calls = 0; 12108 sack_rxmit = 0; 12109 len = 0; 12110 rsm = NULL; 12111 if (flags & TH_RST) { 12112 SOCKBUF_LOCK(sb); 12113 goto send; 12114 } 12115 if (rack->r_ctl.rc_resend) { 12116 /* Retransmit timer */ 12117 rsm = rack->r_ctl.rc_resend; 12118 rack->r_ctl.rc_resend = NULL; 12119 rsm->r_flags &= ~RACK_TLP; 12120 len = rsm->r_end - rsm->r_start; 12121 sack_rxmit = 1; 12122 sendalot = 0; 12123 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12124 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12125 __func__, __LINE__, 12126 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12127 sb_offset = rsm->r_start - tp->snd_una; 12128 if (len >= segsiz) 12129 len = segsiz; 12130 } else if ((rack->rc_in_persist == 0) && 12131 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 12132 /* We have a retransmit that takes precedence */ 12133 rsm->r_flags &= ~RACK_TLP; 12134 if ((!IN_RECOVERY(tp->t_flags)) && 12135 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 12136 /* Enter recovery if not induced by a time-out */ 12137 rack->r_ctl.rc_rsm_start = rsm->r_start; 12138 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 12139 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 12140 rack_cong_signal(tp, NULL, CC_NDUPACK); 12141 /* 12142 * When we enter recovery we need to assure we send 12143 * one packet. 
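 *
 * With PRR active (rack_no_prr == 0) that is done just below by
 * seeding rc_prr_sndcnt with one segsiz worth of credit, so the
 * proportional rate reduction accounting cannot starve the very
 * first retransmission of this recovery episode.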
12144 */ 12145 if (rack->rack_no_prr == 0) { 12146 rack->r_ctl.rc_prr_sndcnt = segsiz; 12147 rack_log_to_prr(rack, 13, 0); 12148 } 12149 } 12150 #ifdef INVARIANTS 12151 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 12152 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 12153 tp, rack, rsm, rsm->r_start, tp->snd_una); 12154 } 12155 #endif 12156 len = rsm->r_end - rsm->r_start; 12157 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12158 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12159 __func__, __LINE__, 12160 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12161 sb_offset = rsm->r_start - tp->snd_una; 12162 /* Can we send it within the PRR boundary? */ 12163 if (rack->rack_no_prr == 0) { 12164 if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 12165 /* It does not fit */ 12166 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 12167 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12168 /* 12169 * prr is less than a segment, we 12170 * have more acks due in besides 12171 * what we need to resend. Lets not send 12172 * to avoid sending small pieces of 12173 * what we need to retransmit. 12174 */ 12175 len = 0; 12176 goto just_return_nolock; 12177 } 12178 len = rack->r_ctl.rc_prr_sndcnt; 12179 } 12180 } 12181 sendalot = 0; 12182 if (len >= segsiz) 12183 len = segsiz; 12184 if (len > 0) { 12185 sub_from_prr = 1; 12186 sack_rxmit = 1; 12187 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12188 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12189 min(len, segsiz)); 12190 counter_u64_add(rack_rtm_prr_retran, 1); 12191 } 12192 } else if (rack->r_ctl.rc_tlpsend) { 12193 /* Tail loss probe */ 12194 long cwin; 12195 long tlen; 12196 12197 doing_tlp = 1; 12198 /* 12199 * Check if we can do a TLP with a RACK'd packet 12200 * this can happen if we are not doing the rack 12201 * cheat and we skipped to a TLP and it 12202 * went off. 12203 */ 12204 rsm = rack->r_ctl.rc_tlpsend; 12205 rsm->r_flags |= RACK_TLP; 12206 rack->r_ctl.rc_tlpsend = NULL; 12207 sack_rxmit = 1; 12208 tlen = rsm->r_end - rsm->r_start; 12209 if (tlen > segsiz) 12210 tlen = segsiz; 12211 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12212 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12213 __func__, __LINE__, 12214 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12215 sb_offset = rsm->r_start - tp->snd_una; 12216 cwin = min(tp->snd_wnd, tlen); 12217 len = cwin; 12218 } 12219 /* 12220 * Enforce a connection sendmap count limit if set 12221 * as long as we are not retransmiting. 12222 */ 12223 if ((rsm == NULL) && 12224 (rack->do_detection == 0) && 12225 (V_tcp_map_entries_limit > 0) && 12226 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12227 counter_u64_add(rack_to_alloc_limited, 1); 12228 if (!rack->alloc_limit_reported) { 12229 rack->alloc_limit_reported = 1; 12230 counter_u64_add(rack_alloc_limited_conns, 1); 12231 } 12232 goto just_return_nolock; 12233 } 12234 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 12235 /* we are retransmitting the fin */ 12236 len--; 12237 if (len) { 12238 /* 12239 * When retransmitting data do *not* include the 12240 * FIN. This could happen from a TLP probe. 12241 */ 12242 flags &= ~TH_FIN; 12243 } 12244 } 12245 #ifdef INVARIANTS 12246 /* For debugging */ 12247 rack->r_ctl.rc_rsm_at_retran = rsm; 12248 #endif 12249 /* 12250 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12251 * state flags. 
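 *
 * The 'hidden' flags are TF_NEEDFIN and TF_NEEDSYN: a FIN or
 * SYN that the state machine still owes the peer but that is
 * not implied by the tcp_outflags[] entry for the current state.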
12252 */ 12253 if (tp->t_flags & TF_NEEDFIN) 12254 flags |= TH_FIN; 12255 if (tp->t_flags & TF_NEEDSYN) 12256 flags |= TH_SYN; 12257 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12258 void *end_rsm; 12259 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 12260 if (end_rsm) 12261 kern_prefetch(end_rsm, &prefetch_rsm); 12262 prefetch_rsm = 1; 12263 } 12264 SOCKBUF_LOCK(sb); 12265 /* 12266 * If snd_nxt == snd_max and we have transmitted a FIN, the 12267 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12268 * negative length. This can also occur when TCP opens up its 12269 * congestion window while receiving additional duplicate acks after 12270 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12271 * the fast-retransmit. 12272 * 12273 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12274 * set to snd_una, the sb_offset will be 0, and the length may wind 12275 * up 0. 12276 * 12277 * If sack_rxmit is true we are retransmitting from the scoreboard 12278 * in which case len is already set. 12279 */ 12280 if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 12281 uint32_t avail; 12282 12283 avail = sbavail(sb); 12284 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 12285 sb_offset = tp->snd_nxt - tp->snd_una; 12286 else 12287 sb_offset = 0; 12288 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 12289 if (rack->r_ctl.rc_tlp_new_data) { 12290 /* TLP is forcing out new data */ 12291 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 12292 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 12293 } 12294 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 12295 len = tp->snd_wnd; 12296 else 12297 len = rack->r_ctl.rc_tlp_new_data; 12298 rack->r_ctl.rc_tlp_new_data = 0; 12299 new_data_tlp = doing_tlp = 1; 12300 } else 12301 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 12302 if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { 12303 /* 12304 * For prr=off, we need to send only 1 MSS 12305 * at a time. We do this because another sack could 12306 * be arriving that causes us to send retransmits and 12307 * we don't want to be on a long pace due to a larger send 12308 * that keeps us from sending out the retransmit. 12309 */ 12310 len = segsiz; 12311 } 12312 } else { 12313 uint32_t outstanding; 12314 12315 /* 12316 * We are inside of a SACK recovery episode and are 12317 * sending new data, having retransmitted all the 12318 * data possible so far in the scoreboard. 12319 */ 12320 outstanding = tp->snd_max - tp->snd_una; 12321 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 12322 if (tp->snd_wnd > outstanding) { 12323 len = tp->snd_wnd - outstanding; 12324 /* Check to see if we have the data */ 12325 if ((sb_offset + len) > avail) { 12326 /* It does not all fit */ 12327 if (avail > sb_offset) 12328 len = avail - sb_offset; 12329 else 12330 len = 0; 12331 } 12332 } else 12333 len = 0; 12334 } else if (avail > sb_offset) 12335 len = avail - sb_offset; 12336 else 12337 len = 0; 12338 if (len > 0) { 12339 if (len > rack->r_ctl.rc_prr_sndcnt) 12340 len = rack->r_ctl.rc_prr_sndcnt; 12341 if (len > 0) { 12342 sub_from_prr = 1; 12343 counter_u64_add(rack_rtm_prr_newdata, 1); 12344 } 12345 } 12346 if (len > segsiz) { 12347 /* 12348 * We should never send more than a MSS when 12349 * retransmitting or sending new data in prr 12350 * mode unless the override flag is on. 
Most 12351 * likely the PRR algorithm is not going to 12352 * let us send a lot as well :-) 12353 */ 12354 if (rack->r_ctl.rc_prr_sendalot == 0) 12355 len = segsiz; 12356 } else if (len < segsiz) { 12357 /* 12358 * Do we send any? The idea here is if the 12359 * send empty's the socket buffer we want to 12360 * do it. However if not then lets just wait 12361 * for our prr_sndcnt to get bigger. 12362 */ 12363 long leftinsb; 12364 12365 leftinsb = sbavail(sb) - sb_offset; 12366 if (leftinsb > len) { 12367 /* This send does not empty the sb */ 12368 len = 0; 12369 } 12370 } 12371 } 12372 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 12373 /* 12374 * If you have not established 12375 * and are not doing FAST OPEN 12376 * no data please. 12377 */ 12378 if ((sack_rxmit == 0) && 12379 (!IS_FASTOPEN(tp->t_flags))){ 12380 len = 0; 12381 sb_offset = 0; 12382 } 12383 } 12384 if (prefetch_so_done == 0) { 12385 kern_prefetch(so, &prefetch_so_done); 12386 prefetch_so_done = 1; 12387 } 12388 /* 12389 * Lop off SYN bit if it has already been sent. However, if this is 12390 * SYN-SENT state and if segment contains data and if we don't know 12391 * that foreign host supports TAO, suppress sending segment. 12392 */ 12393 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 12394 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 12395 /* 12396 * When sending additional segments following a TFO SYN|ACK, 12397 * do not include the SYN bit. 12398 */ 12399 if (IS_FASTOPEN(tp->t_flags) && 12400 (tp->t_state == TCPS_SYN_RECEIVED)) 12401 flags &= ~TH_SYN; 12402 } 12403 /* 12404 * Be careful not to send data and/or FIN on SYN segments. This 12405 * measure is needed to prevent interoperability problems with not 12406 * fully conformant TCP implementations. 12407 */ 12408 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12409 len = 0; 12410 flags &= ~TH_FIN; 12411 } 12412 /* 12413 * On TFO sockets, ensure no data is sent in the following cases: 12414 * 12415 * - When retransmitting SYN|ACK on a passively-created socket 12416 * 12417 * - When retransmitting SYN on an actively created socket 12418 * 12419 * - When sending a zero-length cookie (cookie request) on an 12420 * actively created socket 12421 * 12422 * - When the socket is in the CLOSED state (RST is being sent) 12423 */ 12424 if (IS_FASTOPEN(tp->t_flags) && 12425 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12426 ((tp->t_state == TCPS_SYN_SENT) && 12427 (tp->t_tfo_client_cookie_len == 0)) || 12428 (flags & TH_RST))) { 12429 sack_rxmit = 0; 12430 len = 0; 12431 } 12432 /* Without fast-open there should never be data sent on a SYN */ 12433 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 12434 tp->snd_nxt = tp->iss; 12435 len = 0; 12436 } 12437 orig_len = len; 12438 if (len <= 0) { 12439 /* 12440 * If FIN has been sent but not acked, but we haven't been 12441 * called to retransmit, len will be < 0. Otherwise, window 12442 * shrank after we sent into it. If window shrank to 0, 12443 * cancel pending retransmit, pull snd_nxt back to (closed) 12444 * window, and set the persist timer if it isn't already 12445 * going. If the window didn't close completely, just wait 12446 * for an ACK. 12447 * 12448 * We also do a general check here to ensure that we will 12449 * set the persist timer when we have data to send, but a 12450 * 0-byte window. This makes sure the persist timer is set 12451 * even if the packet hits one of the "goto send" lines 12452 * below. 
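 *
 * Concretely, the test below fires when the peer's window is
 * zero, the connection is established, nothing is outstanding
 * (snd_una == snd_max) and unsent data is still queued in the
 * socket buffer; in that case snd_nxt is pulled back to snd_una
 * and rack_enter_persist() is called.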
12453 */ 12454 len = 0; 12455 if ((tp->snd_wnd == 0) && 12456 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12457 (tp->snd_una == tp->snd_max) && 12458 (sb_offset < (int)sbavail(sb))) { 12459 tp->snd_nxt = tp->snd_una; 12460 rack_enter_persist(tp, rack, cts); 12461 } 12462 } else if ((rsm == NULL) && 12463 ((doing_tlp == 0) || (new_data_tlp == 1)) && 12464 (len < rack->r_ctl.rc_pace_max_segs)) { 12465 /* 12466 * We are not sending a maximum sized segment for 12467 * some reason. Should we not send anything (think 12468 * sws or persists)? 12469 */ 12470 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && 12471 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12472 (len < minseg) && 12473 (len < (int)(sbavail(sb) - sb_offset))) { 12474 /* 12475 * Here the rwnd is less than 12476 * the minimum pacing size, this is not a retransmit, 12477 * we are established and 12478 * the send is not the last in the socket buffer 12479 * we send nothing, and we may enter persists 12480 * if nothing is outstanding. 12481 */ 12482 len = 0; 12483 if (tp->snd_max == tp->snd_una) { 12484 /* 12485 * Nothing out we can 12486 * go into persists. 12487 */ 12488 rack_enter_persist(tp, rack, cts); 12489 tp->snd_nxt = tp->snd_una; 12490 } 12491 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 12492 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12493 (len < (int)(sbavail(sb) - sb_offset)) && 12494 (len < minseg)) { 12495 /* 12496 * Here we are not retransmitting, and 12497 * the cwnd is not so small that we could 12498 * not send at least a min size (rxt timer 12499 * not having gone off), We have 2 segments or 12500 * more already in flight, its not the tail end 12501 * of the socket buffer and the cwnd is blocking 12502 * us from sending out a minimum pacing segment size. 12503 * Lets not send anything. 12504 */ 12505 len = 0; 12506 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 12507 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 12508 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12509 (len < (int)(sbavail(sb) - sb_offset)) && 12510 (TCPS_HAVEESTABLISHED(tp->t_state))) { 12511 /* 12512 * Here we have a send window but we have 12513 * filled it up and we can't send another pacing segment. 12514 * We also have in flight more than 2 segments 12515 * and we are not completing the sb i.e. we allow 12516 * the last bytes of the sb to go out even if 12517 * its not a full pacing segment. 12518 */ 12519 len = 0; 12520 } 12521 } 12522 /* len will be >= 0 after this point. */ 12523 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 12524 tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); 12525 /* 12526 * Decide if we can use TCP Segmentation Offloading (if supported by 12527 * hardware). 12528 * 12529 * TSO may only be used if we are in a pure bulk sending state. The 12530 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 12531 * options prevent using TSO. With TSO the TCP header is the same 12532 * (except for the sequence number) for all generated packets. This 12533 * makes it impossible to transmit any options which vary per 12534 * generated segment or packet. 12535 * 12536 * IPv4 handling has a clear separation of ip options and ip header 12537 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 12538 * the right thing below to provide length of just ip options and thus 12539 * checking for ipoptlen is enough to decide if ip options are present. 
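 *
 * The net effect for this stack: TSO is only enabled when both
 * TF_TSO and V_tcp_do_tso are set, the send is larger than one
 * segment, no UDP tunneling port is in use, TCP-MD5 signatures
 * are off, there are no SACK blocks to advertise or retransmit,
 * and ipoptlen (including any IPsec overhead) works out to zero.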
12540 */ 12541 12542 #ifdef INET6 12543 if (isipv6) 12544 ipoptlen = ip6_optlen(tp->t_inpcb); 12545 else 12546 #endif 12547 if (tp->t_inpcb->inp_options) 12548 ipoptlen = tp->t_inpcb->inp_options->m_len - 12549 offsetof(struct ipoption, ipopt_list); 12550 else 12551 ipoptlen = 0; 12552 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12553 /* 12554 * Pre-calculate here as we save another lookup into the darknesses 12555 * of IPsec that way and can actually decide if TSO is ok. 12556 */ 12557 #ifdef INET6 12558 if (isipv6 && IPSEC_ENABLED(ipv6)) 12559 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 12560 #ifdef INET 12561 else 12562 #endif 12563 #endif /* INET6 */ 12564 #ifdef INET 12565 if (IPSEC_ENABLED(ipv4)) 12566 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 12567 #endif /* INET */ 12568 #endif 12569 12570 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12571 ipoptlen += ipsec_optlen; 12572 #endif 12573 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 12574 (tp->t_port == 0) && 12575 ((tp->t_flags & TF_SIGNATURE) == 0) && 12576 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 12577 ipoptlen == 0) 12578 tso = 1; 12579 { 12580 uint32_t outstanding; 12581 12582 outstanding = tp->snd_max - tp->snd_una; 12583 if (tp->t_flags & TF_SENTFIN) { 12584 /* 12585 * If we sent a fin, snd_max is 1 higher than 12586 * snd_una 12587 */ 12588 outstanding--; 12589 } 12590 if (sack_rxmit) { 12591 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 12592 flags &= ~TH_FIN; 12593 } else { 12594 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 12595 sbused(sb))) 12596 flags &= ~TH_FIN; 12597 } 12598 } 12599 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 12600 (long)TCP_MAXWIN << tp->rcv_scale); 12601 12602 /* 12603 * Sender silly window avoidance. We transmit under the following 12604 * conditions when len is non-zero: 12605 * 12606 * - We have a full segment (or more with TSO) - This is the last 12607 * buffer in a write()/send() and we are either idle or running 12608 * NODELAY - we've timed out (e.g. persist timer) - we have more 12609 * then 1/2 the maximum send window's worth of data (receiver may be 12610 * limited the window size) - we need to retransmit 12611 */ 12612 if (len) { 12613 if (len >= segsiz) { 12614 goto send; 12615 } 12616 /* 12617 * NOTE! on localhost connections an 'ack' from the remote 12618 * end may occur synchronously with the output and cause us 12619 * to flush a buffer queued with moretocome. XXX 12620 * 12621 */ 12622 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 12623 (idle || (tp->t_flags & TF_NODELAY)) && 12624 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12625 (tp->t_flags & TF_NOPUSH) == 0) { 12626 pass = 2; 12627 goto send; 12628 } 12629 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 12630 pass = 22; 12631 goto send; 12632 } 12633 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 12634 pass = 4; 12635 goto send; 12636 } 12637 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 12638 pass = 5; 12639 goto send; 12640 } 12641 if (sack_rxmit) { 12642 pass = 6; 12643 goto send; 12644 } 12645 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 12646 (ctf_outstanding(tp) < (segsiz * 2))) { 12647 /* 12648 * We have less than two MSS outstanding (delayed ack) 12649 * and our rwnd will not let us send a full sized 12650 * MSS. Lets go ahead and let this small segment 12651 * out because we want to try to have at least two 12652 * packets inflight to not be caught by delayed ack. 
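 *
 * Illustrative numbers: with segsiz = 1448, one full segment
 * already outstanding and only 1000 bytes of send window left,
 * both (snd_wnd - outstanding) < segsiz and
 * outstanding < 2 * segsiz hold, so the sub-MSS segment is
 * pushed out now rather than waiting on the peer's delayed-ACK
 * timer to open the window back up.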
12653 */ 12654 pass = 12; 12655 goto send; 12656 } 12657 } 12658 /* 12659 * Sending of standalone window updates. 12660 * 12661 * Window updates are important when we close our window due to a 12662 * full socket buffer and are opening it again after the application 12663 * reads data from it. Once the window has opened again and the 12664 * remote end starts to send again the ACK clock takes over and 12665 * provides the most current window information. 12666 * 12667 * We must avoid the silly window syndrome whereas every read from 12668 * the receive buffer, no matter how small, causes a window update 12669 * to be sent. We also should avoid sending a flurry of window 12670 * updates when the socket buffer had queued a lot of data and the 12671 * application is doing small reads. 12672 * 12673 * Prevent a flurry of pointless window updates by only sending an 12674 * update when we can increase the advertized window by more than 12675 * 1/4th of the socket buffer capacity. When the buffer is getting 12676 * full or is very small be more aggressive and send an update 12677 * whenever we can increase by two mss sized segments. In all other 12678 * situations the ACK's to new incoming data will carry further 12679 * window increases. 12680 * 12681 * Don't send an independent window update if a delayed ACK is 12682 * pending (it will get piggy-backed on it) or the remote side 12683 * already has done a half-close and won't send more data. Skip 12684 * this if the connection is in T/TCP half-open state. 12685 */ 12686 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 12687 !(tp->t_flags & TF_DELACK) && 12688 !TCPS_HAVERCVDFIN(tp->t_state)) { 12689 /* 12690 * "adv" is the amount we could increase the window, taking 12691 * into account that we are limited by TCP_MAXWIN << 12692 * tp->rcv_scale. 12693 */ 12694 int32_t adv; 12695 int oldwin; 12696 12697 adv = recwin; 12698 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 12699 oldwin = (tp->rcv_adv - tp->rcv_nxt); 12700 if (adv > oldwin) 12701 adv -= oldwin; 12702 else { 12703 /* We can't increase the window */ 12704 adv = 0; 12705 } 12706 } else 12707 oldwin = 0; 12708 12709 /* 12710 * If the new window size ends up being the same as or less 12711 * than the old size when it is scaled, then don't force 12712 * a window update. 12713 */ 12714 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 12715 goto dontupdate; 12716 12717 if (adv >= (int32_t)(2 * segsiz) && 12718 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 12719 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 12720 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 12721 pass = 7; 12722 goto send; 12723 } 12724 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 12725 pass = 23; 12726 goto send; 12727 } 12728 } 12729 dontupdate: 12730 12731 /* 12732 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 12733 * is also a catch-all for the retransmit timer timeout case. 12734 */ 12735 if (tp->t_flags & TF_ACKNOW) { 12736 pass = 8; 12737 goto send; 12738 } 12739 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 12740 pass = 9; 12741 goto send; 12742 } 12743 /* 12744 * If our state indicates that FIN should be sent and we have not 12745 * yet done so, then we need to send. 12746 */ 12747 if ((flags & TH_FIN) && 12748 (tp->snd_nxt == tp->snd_una)) { 12749 pass = 11; 12750 goto send; 12751 } 12752 /* 12753 * No reason to send a segment, just return. 
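 *
 * The just_return path that follows still does useful work: it
 * classifies why nothing was sent (rwnd, app, cwnd, persists,
 * PRR, or the catch-all "assessing" case), may close the
 * goodput measurement window for the limited cases, and finally
 * arms the hpts timer via rack_start_hpts_timer().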
12754 */ 12755 just_return: 12756 SOCKBUF_UNLOCK(sb); 12757 just_return_nolock: 12758 { 12759 int app_limited = CTF_JR_SENT_DATA; 12760 12761 if (tot_len_this_send > 0) { 12762 /* Make sure snd_nxt is up to max */ 12763 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 12764 tp->snd_nxt = tp->snd_max; 12765 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 12766 } else { 12767 int end_window = 0; 12768 uint32_t seq = tp->gput_ack; 12769 12770 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12771 if (rsm) { 12772 /* 12773 * Mark the last sent that we just-returned (hinting 12774 * that delayed ack may play a role in any rtt measurement). 12775 */ 12776 rsm->r_just_ret = 1; 12777 } 12778 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12779 rack->r_ctl.rc_agg_delayed = 0; 12780 rack->r_early = 0; 12781 rack->r_late = 0; 12782 rack->r_ctl.rc_agg_early = 0; 12783 if ((ctf_outstanding(tp) + 12784 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 12785 minseg)) >= tp->snd_wnd) { 12786 /* We are limited by the rwnd */ 12787 app_limited = CTF_JR_RWND_LIMITED; 12788 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12789 /* We are limited by whats available -- app limited */ 12790 app_limited = CTF_JR_APP_LIMITED; 12791 } else if ((idle == 0) && 12792 ((tp->t_flags & TF_NODELAY) == 0) && 12793 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12794 (len < segsiz)) { 12795 /* 12796 * No delay is not on and the 12797 * user is sending less than 1MSS. This 12798 * brings out SWS avoidance so we 12799 * don't send. Another app-limited case. 12800 */ 12801 app_limited = CTF_JR_APP_LIMITED; 12802 } else if (tp->t_flags & TF_NOPUSH) { 12803 /* 12804 * The user has requested no push of 12805 * the last segment and we are 12806 * at the last segment. Another app 12807 * limited case. 12808 */ 12809 app_limited = CTF_JR_APP_LIMITED; 12810 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 12811 /* Its the cwnd */ 12812 app_limited = CTF_JR_CWND_LIMITED; 12813 } else if (rack->rc_in_persist == 1) { 12814 /* We are in persists */ 12815 app_limited = CTF_JR_PERSISTS; 12816 } else if (IN_RECOVERY(tp->t_flags) && 12817 (rack->rack_no_prr == 0) && 12818 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12819 app_limited = CTF_JR_PRR; 12820 } else { 12821 /* Now why here are we not sending? */ 12822 #ifdef NOW 12823 #ifdef INVARIANTS 12824 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 12825 #endif 12826 #endif 12827 app_limited = CTF_JR_ASSESSING; 12828 } 12829 /* 12830 * App limited in some fashion, for our pacing GP 12831 * measurements we don't want any gap (even cwnd). 12832 * Close down the measurement window. 12833 */ 12834 if (rack_cwnd_block_ends_measure && 12835 ((app_limited == CTF_JR_CWND_LIMITED) || 12836 (app_limited == CTF_JR_PRR))) { 12837 /* 12838 * The reason we are not sending is 12839 * the cwnd (or prr). We have been configured 12840 * to end the measurement window in 12841 * this case. 12842 */ 12843 end_window = 1; 12844 } else if (app_limited == CTF_JR_PERSISTS) { 12845 /* 12846 * We never end the measurement window 12847 * in persists, though in theory we 12848 * should be only entering after everything 12849 * is acknowledged (so we will probably 12850 * never come here). 12851 */ 12852 end_window = 0; 12853 } else if (rack_rwnd_block_ends_measure && 12854 (app_limited == CTF_JR_RWND_LIMITED)) { 12855 /* 12856 * We are rwnd limited and have been 12857 * configured to end the measurement 12858 * window in this case. 
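 *
 * (rack_rwnd_block_ends_measure is the knob that selects this
 * behaviour, the analogue of rack_cwnd_block_ends_measure used
 * for the cwnd/PRR-limited branch above.)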
12859 */ 12860 end_window = 1; 12861 } else if (app_limited == CTF_JR_APP_LIMITED) { 12862 /* 12863 * A true application limited period, we have 12864 * ran out of data. 12865 */ 12866 end_window = 1; 12867 } else if (app_limited == CTF_JR_ASSESSING) { 12868 /* 12869 * In the assessing case we hit the end of 12870 * the if/else and had no known reason 12871 * This will panic us under invariants.. 12872 * 12873 * If we get this out in logs we need to 12874 * investagate which reason we missed. 12875 */ 12876 end_window = 1; 12877 } 12878 if (end_window) { 12879 uint8_t log = 0; 12880 12881 if ((tp->t_flags & TF_GPUTINPROG) && 12882 SEQ_GT(tp->gput_ack, tp->snd_max)) { 12883 /* Mark the last packet has app limited */ 12884 tp->gput_ack = tp->snd_max; 12885 log = 1; 12886 } 12887 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12888 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 12889 if (rack->r_ctl.rc_app_limited_cnt == 0) 12890 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 12891 else { 12892 /* 12893 * Go out to the end app limited and mark 12894 * this new one as next and move the end_appl up 12895 * to this guy. 12896 */ 12897 if (rack->r_ctl.rc_end_appl) 12898 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 12899 rack->r_ctl.rc_end_appl = rsm; 12900 } 12901 rsm->r_flags |= RACK_APP_LIMITED; 12902 rack->r_ctl.rc_app_limited_cnt++; 12903 } 12904 if (log) 12905 rack_log_pacing_delay_calc(rack, 12906 rack->r_ctl.rc_app_limited_cnt, seq, 12907 tp->gput_ack, 0, 0, 4, __LINE__, NULL); 12908 } 12909 } 12910 if (slot) { 12911 /* set the rack tcb into the slot N */ 12912 counter_u64_add(rack_paced_segments, 1); 12913 } else if (tot_len_this_send) { 12914 counter_u64_add(rack_unpaced_segments, 1); 12915 } 12916 /* Check if we need to go into persists or not */ 12917 if ((rack->rc_in_persist == 0) && 12918 (tp->snd_max == tp->snd_una) && 12919 TCPS_HAVEESTABLISHED(tp->t_state) && 12920 sbavail(sb) && 12921 (sbavail(sb) > tp->snd_wnd) && 12922 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 12923 /* Yes lets make sure to move to persist before timer-start */ 12924 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12925 } 12926 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 12927 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 12928 } 12929 #ifdef NETFLIX_SHARED_CWND 12930 if ((sbavail(sb) == 0) && 12931 rack->r_ctl.rc_scw) { 12932 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12933 rack->rack_scwnd_is_idle = 1; 12934 } 12935 #endif 12936 return (0); 12937 12938 send: 12939 if ((flags & TH_FIN) && 12940 sbavail(sb)) { 12941 /* 12942 * We do not transmit a FIN 12943 * with data outstanding. We 12944 * need to make it so all data 12945 * is acked first. 12946 */ 12947 flags &= ~TH_FIN; 12948 } 12949 /* Enforce stack imposed max seg size if we have one */ 12950 if (rack->r_ctl.rc_pace_max_segs && 12951 (len > rack->r_ctl.rc_pace_max_segs)) { 12952 mark = 1; 12953 len = rack->r_ctl.rc_pace_max_segs; 12954 } 12955 SOCKBUF_LOCK_ASSERT(sb); 12956 if (len > 0) { 12957 if (len >= segsiz) 12958 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 12959 else 12960 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 12961 } 12962 /* 12963 * Before ESTABLISHED, force sending of initial options unless TCP 12964 * set not to do any options. NOTE: we assume that the IP/TCP header 12965 * plus TCP options always fit in a single mbuf, leaving room for a 12966 * maximum link header, i.e. 
max_linkhdr + sizeof (struct tcpiphdr) 12967 * + optlen <= MCLBYTES 12968 */ 12969 optlen = 0; 12970 #ifdef INET6 12971 if (isipv6) 12972 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12973 else 12974 #endif 12975 hdrlen = sizeof(struct tcpiphdr); 12976 12977 /* 12978 * Compute options for segment. We only have to care about SYN and 12979 * established connection segments. Options for SYN-ACK segments 12980 * are handled in TCP syncache. 12981 */ 12982 to.to_flags = 0; 12983 if ((tp->t_flags & TF_NOOPT) == 0) { 12984 /* Maximum segment size. */ 12985 if (flags & TH_SYN) { 12986 tp->snd_nxt = tp->iss; 12987 to.to_mss = tcp_mssopt(&inp->inp_inc); 12988 #ifdef NETFLIX_TCPOUDP 12989 if (tp->t_port) 12990 to.to_mss -= V_tcp_udp_tunneling_overhead; 12991 #endif 12992 to.to_flags |= TOF_MSS; 12993 12994 /* 12995 * On SYN or SYN|ACK transmits on TFO connections, 12996 * only include the TFO option if it is not a 12997 * retransmit, as the presence of the TFO option may 12998 * have caused the original SYN or SYN|ACK to have 12999 * been dropped by a middlebox. 13000 */ 13001 if (IS_FASTOPEN(tp->t_flags) && 13002 (tp->t_rxtshift == 0)) { 13003 if (tp->t_state == TCPS_SYN_RECEIVED) { 13004 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 13005 to.to_tfo_cookie = 13006 (u_int8_t *)&tp->t_tfo_cookie.server; 13007 to.to_flags |= TOF_FASTOPEN; 13008 wanted_cookie = 1; 13009 } else if (tp->t_state == TCPS_SYN_SENT) { 13010 to.to_tfo_len = 13011 tp->t_tfo_client_cookie_len; 13012 to.to_tfo_cookie = 13013 tp->t_tfo_cookie.client; 13014 to.to_flags |= TOF_FASTOPEN; 13015 wanted_cookie = 1; 13016 /* 13017 * If we wind up having more data to 13018 * send with the SYN than can fit in 13019 * one segment, don't send any more 13020 * until the SYN|ACK comes back from 13021 * the other end. 13022 */ 13023 sendalot = 0; 13024 } 13025 } 13026 } 13027 /* Window scaling. */ 13028 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 13029 to.to_wscale = tp->request_r_scale; 13030 to.to_flags |= TOF_SCALE; 13031 } 13032 /* Timestamps. */ 13033 if ((tp->t_flags & TF_RCVD_TSTMP) || 13034 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 13035 to.to_tsval = cts + tp->ts_offset; 13036 to.to_tsecr = tp->ts_recent; 13037 to.to_flags |= TOF_TS; 13038 } 13039 /* Set receive buffer autosizing timestamp. */ 13040 if (tp->rfbuf_ts == 0 && 13041 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 13042 tp->rfbuf_ts = tcp_ts_getticks(); 13043 /* Selective ACK's. */ 13044 if (flags & TH_SYN) 13045 to.to_flags |= TOF_SACKPERM; 13046 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 13047 tp->rcv_numsacks > 0) { 13048 to.to_flags |= TOF_SACK; 13049 to.to_nsacks = tp->rcv_numsacks; 13050 to.to_sacks = (u_char *)tp->sackblks; 13051 } 13052 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13053 /* TCP-MD5 (RFC2385). */ 13054 if (tp->t_flags & TF_SIGNATURE) 13055 to.to_flags |= TOF_SIGNATURE; 13056 #endif /* TCP_SIGNATURE */ 13057 13058 /* Processing the options. */ 13059 hdrlen += optlen = tcp_addoptions(&to, opt); 13060 /* 13061 * If we wanted a TFO option to be added, but it was unable 13062 * to fit, ensure no data is sent. 13063 */ 13064 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 13065 !(to.to_flags & TOF_FASTOPEN)) 13066 len = 0; 13067 } 13068 #ifdef NETFLIX_TCPOUDP 13069 if (tp->t_port) { 13070 if (V_tcp_udp_tunneling_port == 0) { 13071 /* The port was removed?? 
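 *
 * tp->t_port says this connection expects UDP encapsulation,
 * but the global V_tcp_udp_tunneling_port has since been
 * cleared, so there is no port left to encapsulate to; give up
 * on the segment and return EHOSTUNREACH below.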
*/ 13072 SOCKBUF_UNLOCK(&so->so_snd); 13073 return (EHOSTUNREACH); 13074 } 13075 hdrlen += sizeof(struct udphdr); 13076 } 13077 #endif 13078 #ifdef INET6 13079 if (isipv6) 13080 ipoptlen = ip6_optlen(tp->t_inpcb); 13081 else 13082 #endif 13083 if (tp->t_inpcb->inp_options) 13084 ipoptlen = tp->t_inpcb->inp_options->m_len - 13085 offsetof(struct ipoption, ipopt_list); 13086 else 13087 ipoptlen = 0; 13088 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13089 ipoptlen += ipsec_optlen; 13090 #endif 13091 13092 /* 13093 * Adjust data length if insertion of options will bump the packet 13094 * length beyond the t_maxseg length. Clear the FIN bit because we 13095 * cut off the tail of the segment. 13096 */ 13097 if (len + optlen + ipoptlen > tp->t_maxseg) { 13098 if (tso) { 13099 uint32_t if_hw_tsomax; 13100 uint32_t moff; 13101 int32_t max_len; 13102 13103 /* extract TSO information */ 13104 if_hw_tsomax = tp->t_tsomax; 13105 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13106 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13107 KASSERT(ipoptlen == 0, 13108 ("%s: TSO can't do IP options", __func__)); 13109 13110 /* 13111 * Check if we should limit by maximum payload 13112 * length: 13113 */ 13114 if (if_hw_tsomax != 0) { 13115 /* compute maximum TSO length */ 13116 max_len = (if_hw_tsomax - hdrlen - 13117 max_linkhdr); 13118 if (max_len <= 0) { 13119 len = 0; 13120 } else if (len > max_len) { 13121 sendalot = 1; 13122 len = max_len; 13123 mark = 2; 13124 } 13125 } 13126 /* 13127 * Prevent the last segment from being fractional 13128 * unless the send sockbuf can be emptied: 13129 */ 13130 max_len = (tp->t_maxseg - optlen); 13131 if ((sb_offset + len) < sbavail(sb)) { 13132 moff = len % (u_int)max_len; 13133 if (moff != 0) { 13134 mark = 3; 13135 len -= moff; 13136 } 13137 } 13138 /* 13139 * In case there are too many small fragments don't 13140 * use TSO: 13141 */ 13142 if (len <= segsiz) { 13143 mark = 4; 13144 tso = 0; 13145 } 13146 /* 13147 * Send the FIN in a separate segment after the bulk 13148 * sending is done. We don't trust the TSO 13149 * implementations to clear the FIN flag on all but 13150 * the last segment. 13151 */ 13152 if (tp->t_flags & TF_NEEDFIN) { 13153 sendalot = 4; 13154 } 13155 } else { 13156 mark = 5; 13157 if (optlen + ipoptlen >= tp->t_maxseg) { 13158 /* 13159 * Since we don't have enough space to put 13160 * the IP header chain and the TCP header in 13161 * one packet as required by RFC 7112, don't 13162 * send it. Also ensure that at least one 13163 * byte of the payload can be put into the 13164 * TCP segment. 13165 */ 13166 SOCKBUF_UNLOCK(&so->so_snd); 13167 error = EMSGSIZE; 13168 sack_rxmit = 0; 13169 goto out; 13170 } 13171 len = tp->t_maxseg - optlen - ipoptlen; 13172 sendalot = 5; 13173 } 13174 } else { 13175 tso = 0; 13176 mark = 6; 13177 } 13178 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13179 ("%s: len > IP_MAXPACKET", __func__)); 13180 #ifdef DIAGNOSTIC 13181 #ifdef INET6 13182 if (max_linkhdr + hdrlen > MCLBYTES) 13183 #else 13184 if (max_linkhdr + hdrlen > MHLEN) 13185 #endif 13186 panic("tcphdr too big"); 13187 #endif 13188 13189 /* 13190 * This KASSERT is here to catch edge cases at a well defined place. 13191 * Before, those had triggered (random) panic conditions further 13192 * down. 13193 */ 13194 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13195 if ((len == 0) && 13196 (flags & TH_FIN) && 13197 (sbused(sb))) { 13198 /* 13199 * We have outstanding data, don't send a fin by itself!. 
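 *
 * The FIN is not lost; it will be reconsidered on a later call
 * once the data ahead of it has drained from the socket buffer
 * (see the matching check right after the send: label).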
13200 */ 13201 goto just_return; 13202 } 13203 /* 13204 * Grab a header mbuf, attaching a copy of data to be transmitted, 13205 * and initialize the header from the template for sends on this 13206 * connection. 13207 */ 13208 if (len) { 13209 uint32_t max_val; 13210 uint32_t moff; 13211 13212 if (rack->r_ctl.rc_pace_max_segs) 13213 max_val = rack->r_ctl.rc_pace_max_segs; 13214 else if (rack->rc_user_set_max_segs) 13215 max_val = rack->rc_user_set_max_segs * segsiz; 13216 else 13217 max_val = len; 13218 /* 13219 * We allow a limit on sending with hptsi. 13220 */ 13221 if (len > max_val) { 13222 mark = 7; 13223 len = max_val; 13224 } 13225 #ifdef INET6 13226 if (MHLEN < hdrlen + max_linkhdr) 13227 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13228 else 13229 #endif 13230 m = m_gethdr(M_NOWAIT, MT_DATA); 13231 13232 if (m == NULL) { 13233 SOCKBUF_UNLOCK(sb); 13234 error = ENOBUFS; 13235 sack_rxmit = 0; 13236 goto out; 13237 } 13238 m->m_data += max_linkhdr; 13239 m->m_len = hdrlen; 13240 13241 /* 13242 * Start the m_copy functions from the closest mbuf to the 13243 * sb_offset in the socket buffer chain. 13244 */ 13245 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13246 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13247 m_copydata(mb, moff, (int)len, 13248 mtod(m, caddr_t)+hdrlen); 13249 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13250 sbsndptr_adv(sb, mb, len); 13251 m->m_len += len; 13252 } else { 13253 struct sockbuf *msb; 13254 13255 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13256 msb = NULL; 13257 else 13258 msb = sb; 13259 m->m_next = tcp_m_copym( 13260 mb, moff, &len, 13261 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 13262 ((rsm == NULL) ? hw_tls : 0) 13263 #ifdef NETFLIX_COPY_ARGS 13264 , &filled_all 13265 #endif 13266 ); 13267 if (len <= (tp->t_maxseg - optlen)) { 13268 /* 13269 * Must have ran out of mbufs for the copy 13270 * shorten it to no longer need tso. Lets 13271 * not put on sendalot since we are low on 13272 * mbufs. 13273 */ 13274 tso = 0; 13275 } 13276 if (m->m_next == NULL) { 13277 SOCKBUF_UNLOCK(sb); 13278 (void)m_free(m); 13279 error = ENOBUFS; 13280 sack_rxmit = 0; 13281 goto out; 13282 } 13283 } 13284 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 13285 if (rsm && (rsm->r_flags & RACK_TLP)) { 13286 /* 13287 * TLP should not count in retran count, but 13288 * in its own bin 13289 */ 13290 counter_u64_add(rack_tlp_retran, 1); 13291 counter_u64_add(rack_tlp_retran_bytes, len); 13292 } else { 13293 tp->t_sndrexmitpack++; 13294 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 13295 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 13296 } 13297 #ifdef STATS 13298 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 13299 len); 13300 #endif 13301 } else { 13302 KMOD_TCPSTAT_INC(tcps_sndpack); 13303 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 13304 #ifdef STATS 13305 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 13306 len); 13307 #endif 13308 } 13309 /* 13310 * If we're sending everything we've got, set PUSH. (This 13311 * will keep happy those implementations which only give 13312 * data to the user when a buffer fills or a PUSH comes in.) 
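 *
 * "Everything we've got" is tested below as sb_offset + len
 * reaching the end of the socket buffer (sbused(sb)); segments
 * carrying a SYN never get PSH set here.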
13313 */ 13314 if (sb_offset + len == sbused(sb) && 13315 sbused(sb) && 13316 !(flags & TH_SYN)) 13317 flags |= TH_PUSH; 13318 13319 SOCKBUF_UNLOCK(sb); 13320 } else { 13321 SOCKBUF_UNLOCK(sb); 13322 if (tp->t_flags & TF_ACKNOW) 13323 KMOD_TCPSTAT_INC(tcps_sndacks); 13324 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13325 KMOD_TCPSTAT_INC(tcps_sndctrl); 13326 else 13327 KMOD_TCPSTAT_INC(tcps_sndwinup); 13328 13329 m = m_gethdr(M_NOWAIT, MT_DATA); 13330 if (m == NULL) { 13331 error = ENOBUFS; 13332 sack_rxmit = 0; 13333 goto out; 13334 } 13335 #ifdef INET6 13336 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13337 MHLEN >= hdrlen) { 13338 M_ALIGN(m, hdrlen); 13339 } else 13340 #endif 13341 m->m_data += max_linkhdr; 13342 m->m_len = hdrlen; 13343 } 13344 SOCKBUF_UNLOCK_ASSERT(sb); 13345 m->m_pkthdr.rcvif = (struct ifnet *)0; 13346 #ifdef MAC 13347 mac_inpcb_create_mbuf(inp, m); 13348 #endif 13349 #ifdef INET6 13350 if (isipv6) { 13351 ip6 = mtod(m, struct ip6_hdr *); 13352 #ifdef NETFLIX_TCPOUDP 13353 if (tp->t_port) { 13354 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 13355 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13356 udp->uh_dport = tp->t_port; 13357 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13358 udp->uh_ulen = htons(ulen); 13359 th = (struct tcphdr *)(udp + 1); 13360 } else 13361 #endif 13362 th = (struct tcphdr *)(ip6 + 1); 13363 tcpip_fillheaders(inp, 13364 #ifdef NETFLIX_TCPOUDP 13365 tp->t_port, 13366 #endif 13367 ip6, th); 13368 } else 13369 #endif /* INET6 */ 13370 { 13371 ip = mtod(m, struct ip *); 13372 #ifdef TCPDEBUG 13373 ipov = (struct ipovly *)ip; 13374 #endif 13375 #ifdef NETFLIX_TCPOUDP 13376 if (tp->t_port) { 13377 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 13378 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13379 udp->uh_dport = tp->t_port; 13380 ulen = hdrlen + len - sizeof(struct ip); 13381 udp->uh_ulen = htons(ulen); 13382 th = (struct tcphdr *)(udp + 1); 13383 } else 13384 #endif 13385 th = (struct tcphdr *)(ip + 1); 13386 tcpip_fillheaders(inp, 13387 #ifdef NETFLIX_TCPOUDP 13388 tp->t_port, 13389 #endif 13390 ip, th); 13391 } 13392 /* 13393 * Fill in fields, remembering maximum advertised window for use in 13394 * delaying messages about window sizes. If resending a FIN, be sure 13395 * not to use a new sequence number. 13396 */ 13397 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 13398 tp->snd_nxt == tp->snd_max) 13399 tp->snd_nxt--; 13400 /* 13401 * If we are starting a connection, send ECN setup SYN packet. If we 13402 * are on a retransmit, we may resend those bits a number of times 13403 * as per RFC 3168. 13404 */ 13405 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 13406 if (tp->t_rxtshift >= 1) { 13407 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 13408 flags |= TH_ECE | TH_CWR; 13409 } else 13410 flags |= TH_ECE | TH_CWR; 13411 } 13412 /* Handle parallel SYN for ECN */ 13413 if ((tp->t_state == TCPS_SYN_RECEIVED) && 13414 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 13415 flags |= TH_ECE; 13416 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13417 } 13418 if (tp->t_state == TCPS_ESTABLISHED && 13419 (tp->t_flags2 & TF2_ECN_PERMIT)) { 13420 /* 13421 * If the peer has ECN, mark data packets with ECN capable 13422 * transmission (ECT). Ignore pure ack packets, 13423 * retransmissions. 
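 *
 * "New data" here means len > 0 with snd_nxt caught up to
 * snd_max and no scoreboard retransmit (sack_rxmit == 0); such
 * segments get ECT(0) in the IP/IPv6 header, and TH_CWR is set
 * on one of them while TF2_ECN_SND_CWR is pending, after which
 * that flag is cleared.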
13424 */ 13425 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 13426 (sack_rxmit == 0)) { 13427 #ifdef INET6 13428 if (isipv6) 13429 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 13430 else 13431 #endif 13432 ip->ip_tos |= IPTOS_ECN_ECT0; 13433 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13434 /* 13435 * Reply with proper ECN notifications. 13436 * Only set CWR on new data segments. 13437 */ 13438 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 13439 flags |= TH_CWR; 13440 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 13441 } 13442 } 13443 if (tp->t_flags2 & TF2_ECN_SND_ECE) 13444 flags |= TH_ECE; 13445 } 13446 /* 13447 * If we are doing retransmissions, then snd_nxt will not reflect 13448 * the first unsent octet. For ACK only packets, we do not want the 13449 * sequence number of the retransmitted packet, we want the sequence 13450 * number of the next unsent octet. So, if there is no data (and no 13451 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13452 * ti_seq. But if we are in persist state, snd_max might reflect 13453 * one byte beyond the right edge of the window, so use snd_nxt in 13454 * that case, since we know we aren't doing a retransmission. 13455 * (retransmit and persist are mutually exclusive...) 13456 */ 13457 if (sack_rxmit == 0) { 13458 if (len || (flags & (TH_SYN | TH_FIN)) || 13459 rack->rc_in_persist) { 13460 th->th_seq = htonl(tp->snd_nxt); 13461 rack_seq = tp->snd_nxt; 13462 } else if (flags & TH_RST) { 13463 /* 13464 * For a Reset send the last cum ack in sequence 13465 * (this like any other choice may still generate a 13466 * challenge ack, if a ack-update packet is in 13467 * flight). 13468 */ 13469 th->th_seq = htonl(tp->snd_una); 13470 rack_seq = tp->snd_una; 13471 } else { 13472 th->th_seq = htonl(tp->snd_max); 13473 rack_seq = tp->snd_max; 13474 } 13475 } else { 13476 th->th_seq = htonl(rsm->r_start); 13477 rack_seq = rsm->r_start; 13478 } 13479 th->th_ack = htonl(tp->rcv_nxt); 13480 if (optlen) { 13481 bcopy(opt, th + 1, optlen); 13482 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13483 } 13484 th->th_flags = flags; 13485 /* 13486 * Calculate receive window. Don't shrink window, but avoid silly 13487 * window syndrome. 13488 * If a RST segment is sent, advertise a window of zero. 13489 */ 13490 if (flags & TH_RST) { 13491 recwin = 0; 13492 } else { 13493 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 13494 recwin < (long)segsiz) 13495 recwin = 0; 13496 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13497 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 13498 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 13499 } 13500 13501 /* 13502 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13503 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13504 * handled in syncache. 13505 */ 13506 if (flags & TH_SYN) 13507 th->th_win = htons((u_short) 13508 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13509 else { 13510 /* Avoid shrinking window with window scaling. */ 13511 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13512 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13513 } 13514 /* 13515 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13516 * window. This may cause the remote transmitter to stall. This 13517 * flag tells soreceive() to disable delayed acknowledgements when 13518 * draining the buffer. This can occur if the receiver is 13519 * attempting to read more data than can be buffered prior to 13520 * transmitting on the connection. 
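 *
 * t_sndzerowin below counts how often this connection has had
 * to advertise a zero window; TF_RXWIN0SENT is cleared again as
 * soon as a non-zero window goes out.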
13521 */ 13522 if (th->th_win == 0) { 13523 tp->t_sndzerowin++; 13524 tp->t_flags |= TF_RXWIN0SENT; 13525 } else 13526 tp->t_flags &= ~TF_RXWIN0SENT; 13527 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 13528 13529 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13530 if (to.to_flags & TOF_SIGNATURE) { 13531 /* 13532 * Calculate MD5 signature and put it into the place 13533 * determined before. 13534 * NOTE: since TCP options buffer doesn't point into 13535 * mbuf's data, calculate offset and use it. 13536 */ 13537 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13538 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13539 /* 13540 * Do not send segment if the calculation of MD5 13541 * digest has failed. 13542 */ 13543 goto out; 13544 } 13545 } 13546 #endif 13547 13548 /* 13549 * Put TCP length in extended header, and then checksum extended 13550 * header and data. 13551 */ 13552 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13553 #ifdef INET6 13554 if (isipv6) { 13555 /* 13556 * ip6_plen is not need to be filled now, and will be filled 13557 * in ip6_output. 13558 */ 13559 if (tp->t_port) { 13560 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13561 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13562 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13563 th->th_sum = htons(0); 13564 UDPSTAT_INC(udps_opackets); 13565 } else { 13566 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13567 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13568 th->th_sum = in6_cksum_pseudo(ip6, 13569 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 13570 0); 13571 } 13572 } 13573 #endif 13574 #if defined(INET6) && defined(INET) 13575 else 13576 #endif 13577 #ifdef INET 13578 { 13579 if (tp->t_port) { 13580 m->m_pkthdr.csum_flags = CSUM_UDP; 13581 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13582 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13583 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13584 th->th_sum = htons(0); 13585 UDPSTAT_INC(udps_opackets); 13586 } else { 13587 m->m_pkthdr.csum_flags = CSUM_TCP; 13588 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13589 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13590 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13591 IPPROTO_TCP + len + optlen)); 13592 } 13593 /* IP version must be set here for ipv4/ipv6 checking later */ 13594 KASSERT(ip->ip_v == IPVERSION, 13595 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13596 } 13597 #endif 13598 /* 13599 * Enable TSO and specify the size of the segments. The TCP pseudo 13600 * header checksum is always provided. XXX: Fixme: This is currently 13601 * not the case for IPv6. 13602 */ 13603 if (tso) { 13604 KASSERT(len > tp->t_maxseg - optlen, 13605 ("%s: len <= tso_segsz", __func__)); 13606 m->m_pkthdr.csum_flags |= CSUM_TSO; 13607 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 13608 } 13609 KASSERT(len + hdrlen == m_length(m, NULL), 13610 ("%s: mbuf chain different than expected: %d + %u != %u", 13611 __func__, len, hdrlen, m_length(m, NULL))); 13612 13613 #ifdef TCP_HHOOK 13614 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 13615 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13616 #endif 13617 #ifdef TCPDEBUG 13618 /* 13619 * Trace. 
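 *
 * This block only runs when the kernel is built with TCPDEBUG
 * and the socket has SO_DEBUG set; ipov->ih_len is patched to
 * the full packet length just for tcp_trace() and restored
 * right afterwards.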
13620 */ 13621 if (so->so_options & SO_DEBUG) { 13622 u_short save = 0; 13623 13624 #ifdef INET6 13625 if (!isipv6) 13626 #endif 13627 { 13628 save = ipov->ih_len; 13629 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13630 * (th->th_off << 2) */ ); 13631 } 13632 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13633 #ifdef INET6 13634 if (!isipv6) 13635 #endif 13636 ipov->ih_len = save; 13637 } 13638 #endif /* TCPDEBUG */ 13639 13640 /* We're getting ready to send; log now. */ 13641 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13642 union tcp_log_stackspecific log; 13643 struct timeval tv; 13644 13645 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13646 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13647 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13648 if (rack->rack_no_prr) 13649 log.u_bbr.flex1 = 0; 13650 else 13651 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13652 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 13653 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 13654 log.u_bbr.flex4 = orig_len; 13655 if (filled_all) 13656 log.u_bbr.flex5 = 0x80000000; 13657 else 13658 log.u_bbr.flex5 = 0; 13659 /* Save off the early/late values */ 13660 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 13661 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 13662 log.u_bbr.bw_inuse = rack_get_bw(rack); 13663 if (rsm || sack_rxmit) { 13664 if (doing_tlp) 13665 log.u_bbr.flex8 = 2; 13666 else 13667 log.u_bbr.flex8 = 1; 13668 } else { 13669 log.u_bbr.flex8 = 0; 13670 } 13671 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 13672 log.u_bbr.flex7 = mark; 13673 log.u_bbr.pkts_out = tp->t_maxseg; 13674 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13675 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13676 log.u_bbr.lt_epoch = cwnd_to_use; 13677 log.u_bbr.delivered = sendalot; 13678 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13679 len, &log, false, NULL, NULL, 0, &tv); 13680 } else 13681 lgb = NULL; 13682 13683 /* 13684 * Fill in IP length and desired time to live and send to IP level. 13685 * There should be a better way to handle ttl and tos; we could keep 13686 * them in the template, but need a way to checksum without them. 13687 */ 13688 /* 13689 * m->m_pkthdr.len should have been set before cksum calcuration, 13690 * because in6_cksum() need it. 13691 */ 13692 #ifdef INET6 13693 if (isipv6) { 13694 /* 13695 * we separately set hoplimit for every segment, since the 13696 * user might want to change the value via setsockopt. Also, 13697 * desired default hop limit might be changed via Neighbor 13698 * Discovery. 13699 */ 13700 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13701 13702 /* 13703 * Set the packet size here for the benefit of DTrace 13704 * probes. ip6_output() will set it properly; it's supposed 13705 * to include the option header lengths as well. 13706 */ 13707 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13708 13709 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 13710 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13711 else 13712 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13713 13714 if (tp->t_state == TCPS_SYN_SENT) 13715 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13716 13717 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13718 /* TODO: IPv6 IP6TOS_ECT bit on */ 13719 error = ip6_output(m, inp->in6p_outputopts, 13720 &inp->inp_route6, 13721 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13722 NULL, NULL, inp); 13723 13724 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13725 mtu = inp->inp_route6.ro_nh->nh_mtu; 13726 } 13727 #endif /* INET6 */ 13728 #if defined(INET) && defined(INET6) 13729 else 13730 #endif 13731 #ifdef INET 13732 { 13733 ip->ip_len = htons(m->m_pkthdr.len); 13734 #ifdef INET6 13735 if (inp->inp_vflag & INP_IPV6PROTO) 13736 ip->ip_ttl = in6_selecthlim(inp, NULL); 13737 #endif /* INET6 */ 13738 /* 13739 * If we do path MTU discovery, then we set DF on every 13740 * packet. This might not be the best thing to do according 13741 * to RFC3390 Section 2. However the tcp hostcache migitates 13742 * the problem so it affects only the first tcp connection 13743 * with a host. 13744 * 13745 * NB: Don't set DF on small MTU/MSS to have a safe 13746 * fallback. 13747 */ 13748 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13749 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13750 if (tp->t_port == 0 || len < V_tcp_minmss) { 13751 ip->ip_off |= htons(IP_DF); 13752 } 13753 } else { 13754 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13755 } 13756 13757 if (tp->t_state == TCPS_SYN_SENT) 13758 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13759 13760 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13761 13762 error = ip_output(m, inp->inp_options, &inp->inp_route, 13763 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13764 inp); 13765 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13766 mtu = inp->inp_route.ro_nh->nh_mtu; 13767 } 13768 #endif /* INET */ 13769 13770 out: 13771 if (lgb) { 13772 lgb->tlb_errno = error; 13773 lgb = NULL; 13774 } 13775 /* 13776 * In transmit state, time the transmission and arrange for the 13777 * retransmit. In persist state, just set snd_max. 13778 */ 13779 if (error == 0) { 13780 rack->forced_ack = 0; /* If we send something zap the FA flag */ 13781 if (rsm && (doing_tlp == 0)) { 13782 /* Set we retransmitted */ 13783 rack->rc_gp_saw_rec = 1; 13784 } else { 13785 if (cwnd_to_use > tp->snd_ssthresh) { 13786 /* Set we sent in CA */ 13787 rack->rc_gp_saw_ca = 1; 13788 } else { 13789 /* Set we sent in SS */ 13790 rack->rc_gp_saw_ss = 1; 13791 } 13792 } 13793 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13794 (tp->t_flags & TF_SACK_PERMIT) && 13795 tp->rcv_numsacks > 0) 13796 tcp_clean_dsack_blocks(tp); 13797 tot_len_this_send += len; 13798 if (len == 0) 13799 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 13800 else if (len == 1) { 13801 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 13802 } else if (len > 1) { 13803 int idx; 13804 13805 idx = (len / segsiz) + 3; 13806 if (idx >= TCP_MSS_ACCT_ATIMER) 13807 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 13808 else 13809 counter_u64_add(rack_out_size[idx], 1); 13810 } 13811 } 13812 if (rack->rack_no_prr == 0) { 13813 if (sub_from_prr && (error == 0)) { 13814 if (rack->r_ctl.rc_prr_sndcnt >= len) 13815 rack->r_ctl.rc_prr_sndcnt -= len; 13816 else 13817 rack->r_ctl.rc_prr_sndcnt = 0; 13818 } 13819 } 13820 sub_from_prr = 0; 13821 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 13822 pass, rsm, us_cts); 13823 if ((error == 0) && 13824 (len > 0) && 13825 (tp->snd_una == tp->snd_max)) 13826 rack->r_ctl.rc_tlp_rxt_last_time = cts; 13827 /* Now are we in persists? 
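* If not, snd_nxt (and possibly snd_max) is advanced below; in persist mode only snd_max is moved.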
*/ 13828 if (rack->rc_in_persist == 0) { 13829 tcp_seq startseq = tp->snd_nxt; 13830 13831 /* Track our lost count */ 13832 if (rsm && (doing_tlp == 0)) 13833 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 13834 /* 13835 * Advance snd_nxt over sequence space of this segment. 13836 */ 13837 if (error) 13838 /* We don't log or do anything with errors */ 13839 goto nomore; 13840 if (doing_tlp == 0) { 13841 if (rsm == NULL) { 13842 /* 13843 * Not a retransmission of some 13844 * sort, new data is going out so 13845 * clear our TLP count and flag. 13846 */ 13847 rack->rc_tlp_in_progress = 0; 13848 rack->r_ctl.rc_tlp_cnt_out = 0; 13849 } 13850 } else { 13851 /* 13852 * We have just sent a TLP, mark that it is true 13853 * and make sure our in progress is set so we 13854 * continue to check the count. 13855 */ 13856 rack->rc_tlp_in_progress = 1; 13857 rack->r_ctl.rc_tlp_cnt_out++; 13858 } 13859 if (flags & (TH_SYN | TH_FIN)) { 13860 if (flags & TH_SYN) 13861 tp->snd_nxt++; 13862 if (flags & TH_FIN) { 13863 tp->snd_nxt++; 13864 tp->t_flags |= TF_SENTFIN; 13865 } 13866 } 13867 /* In the ENOBUFS case we do *not* update snd_max */ 13868 if (sack_rxmit) 13869 goto nomore; 13870 13871 tp->snd_nxt += len; 13872 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 13873 if (tp->snd_una == tp->snd_max) { 13874 /* 13875 * Update the time we just added data since 13876 * none was outstanding. 13877 */ 13878 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13879 tp->t_acktime = ticks; 13880 } 13881 tp->snd_max = tp->snd_nxt; 13882 /* 13883 * Time this transmission if not a retransmission and 13884 * not currently timing anything. 13885 * This is only relevant in case of switching back to 13886 * the base stack. 13887 */ 13888 if (tp->t_rtttime == 0) { 13889 tp->t_rtttime = ticks; 13890 tp->t_rtseq = startseq; 13891 KMOD_TCPSTAT_INC(tcps_segstimed); 13892 } 13893 if (len && 13894 ((tp->t_flags & TF_GPUTINPROG) == 0)) 13895 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 13896 } 13897 } else { 13898 /* 13899 * Persist case, update snd_max but since we are in persist 13900 * mode (no window) we do not update snd_nxt. 13901 */ 13902 int32_t xlen = len; 13903 13904 if (error) 13905 goto nomore; 13906 13907 if (flags & TH_SYN) 13908 ++xlen; 13909 if (flags & TH_FIN) { 13910 ++xlen; 13911 tp->t_flags |= TF_SENTFIN; 13912 } 13913 /* In the ENOBUFS case we do *not* update snd_max */ 13914 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 13915 if (tp->snd_una == tp->snd_max) { 13916 /* 13917 * Update the time we just added data since 13918 * none was outstanding. 13919 */ 13920 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13921 tp->t_acktime = ticks; 13922 } 13923 tp->snd_max = tp->snd_nxt + len; 13924 } 13925 } 13926 nomore: 13927 if (error) { 13928 rack->r_ctl.rc_agg_delayed = 0; 13929 rack->r_early = 0; 13930 rack->r_late = 0; 13931 rack->r_ctl.rc_agg_early = 0; 13932 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 13933 /* 13934 * Failures do not advance the seq counter above. For the 13935 * case of ENOBUFS we will fall out and retry in 1ms with 13936 * the hpts. Everything else will just have to retransmit 13937 * with the timer. 13938 * 13939 * In any case, we do not want to loop around for another 13940 * send without a good reason. 
13941 */ 13942 sendalot = 0; 13943 switch (error) { 13944 case EPERM: 13945 tp->t_softerror = error; 13946 return (error); 13947 case ENOBUFS: 13948 if (slot == 0) { 13949 /* 13950 * Pace us right away to retry in some 13951 * time 13952 */ 13953 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 13954 if (rack->rc_enobuf < 126) 13955 rack->rc_enobuf++; 13956 if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { 13957 slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; 13958 } 13959 if (slot < (10 * HPTS_USEC_IN_MSEC)) 13960 slot = 10 * HPTS_USEC_IN_MSEC; 13961 } 13962 counter_u64_add(rack_saw_enobuf, 1); 13963 error = 0; 13964 goto enobufs; 13965 case EMSGSIZE: 13966 /* 13967 * For some reason the interface we used initially 13968 * to send segments changed to another or lowered 13969 * its MTU. If TSO was active we either got an 13970 * interface without TSO capabilities or TSO was 13971 * turned off. If we obtained mtu from ip_output() 13972 * then update it and try again. 13973 */ 13974 if (tso) 13975 tp->t_flags &= ~TF_TSO; 13976 if (mtu != 0) { 13977 tcp_mss_update(tp, -1, mtu, NULL, NULL); 13978 goto again; 13979 } 13980 slot = 10 * HPTS_USEC_IN_MSEC; 13981 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 13982 return (error); 13983 case ENETUNREACH: 13984 counter_u64_add(rack_saw_enetunreach, 1); 13985 case EHOSTDOWN: 13986 case EHOSTUNREACH: 13987 case ENETDOWN: 13988 if (TCPS_HAVERCVDSYN(tp->t_state)) { 13989 tp->t_softerror = error; 13990 } 13991 /* FALLTHROUGH */ 13992 default: 13993 slot = 10 * HPTS_USEC_IN_MSEC; 13994 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 13995 return (error); 13996 } 13997 } else { 13998 rack->rc_enobuf = 0; 13999 } 14000 KMOD_TCPSTAT_INC(tcps_sndtotal); 14001 14002 /* 14003 * Data sent (as far as we can tell). If this advertises a larger 14004 * window than any other segment, then remember the size of the 14005 * advertised window. Any pending ACK has now been sent. 14006 */ 14007 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 14008 tp->rcv_adv = tp->rcv_nxt + recwin; 14009 tp->last_ack_sent = tp->rcv_nxt; 14010 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 14011 enobufs: 14012 /* Assure when we leave that snd_nxt will point to top */ 14013 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 14014 tp->snd_nxt = tp->snd_max; 14015 if (sendalot) { 14016 /* Do we need to turn off sendalot? */ 14017 if (rack->r_ctl.rc_pace_max_segs && 14018 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 14019 /* We hit our max. */ 14020 sendalot = 0; 14021 } else if ((rack->rc_user_set_max_segs) && 14022 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 14023 /* We hit the user defined max */ 14024 sendalot = 0; 14025 } 14026 } 14027 if ((error == 0) && (flags & TH_FIN)) 14028 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 14029 if (flags & TH_RST) { 14030 /* 14031 * We don't send again after sending a RST. 14032 */ 14033 slot = 0; 14034 sendalot = 0; 14035 if (error == 0) 14036 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 14037 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 14038 /* 14039 * Get our pacing rate, if an error 14040 * occurred in sending (ENOBUFS) we would 14041 * hit the else if with slot preset. Other 14042 * errors return. 14043 */ 14044 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 14045 } 14046 if (rsm && 14047 rack->use_rack_rr) { 14048 /* It's a retransmit and we use the rack cheat?
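* If so, and no pacing delay was chosen (or pacing is disabled, or r_rr_config selects the legacy behaviour), fall back to the fixed rc_min_to pacing below.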
*/ 14049 if ((slot == 0) || 14050 (rack->rc_always_pace == 0) || 14051 (rack->r_rr_config == 1)) { 14052 /* 14053 * We have no pacing set or we 14054 * are using old-style rack or 14055 * we are overriden to use the old 1ms pacing. 14056 */ 14057 slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; 14058 } 14059 } 14060 if (slot) { 14061 /* set the rack tcb into the slot N */ 14062 counter_u64_add(rack_paced_segments, 1); 14063 } else if (sendalot) { 14064 if (len) 14065 counter_u64_add(rack_unpaced_segments, 1); 14066 sack_rxmit = 0; 14067 goto again; 14068 } else if (len) { 14069 counter_u64_add(rack_unpaced_segments, 1); 14070 } 14071 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 14072 return (error); 14073 } 14074 14075 static void 14076 rack_update_seg(struct tcp_rack *rack) 14077 { 14078 uint32_t orig_val; 14079 14080 orig_val = rack->r_ctl.rc_pace_max_segs; 14081 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 14082 if (orig_val != rack->r_ctl.rc_pace_max_segs) 14083 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 14084 } 14085 14086 /* 14087 * rack_ctloutput() must drop the inpcb lock before performing copyin on 14088 * socket option arguments. When it re-acquires the lock after the copy, it 14089 * has to revalidate that the connection is still valid for the socket 14090 * option. 14091 */ 14092 static int 14093 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 14094 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14095 { 14096 struct epoch_tracker et; 14097 uint64_t val; 14098 int32_t error = 0, optval; 14099 uint16_t ca, ss; 14100 14101 switch (sopt->sopt_name) { 14102 case TCP_RACK_PROP_RATE: /* URL:prop_rate */ 14103 case TCP_RACK_PROP : /* URL:prop */ 14104 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 14105 case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ 14106 case TCP_RACK_PACE_REDUCE: /* Not used */ 14107 /* Pacing related ones */ 14108 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 14109 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 14110 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 14111 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 14112 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 14113 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 14114 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 14115 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 14116 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 14117 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 14118 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 14119 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 14120 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 14121 /* End pacing related */ 14122 case TCP_DELACK: 14123 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 14124 case TCP_RACK_MIN_TO: /* URL:min_to */ 14125 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 14126 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 14127 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 14128 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 14129 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 14130 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 14131 case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ 14132 case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ 14133 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 14134 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 14135 case TCP_RACK_DO_DETECTION: /* URL:detect */ 14136 case TCP_NO_PRR: /* URL:noprr */ 14137 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 14138 case TCP_DATA_AFTER_CLOSE: 14139 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 
14140 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 14141 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 14142 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 14143 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 14144 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 14145 case TCP_RACK_PROFILE: /* URL:profile */ 14146 break; 14147 default: 14148 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14149 break; 14150 } 14151 INP_WUNLOCK(inp); 14152 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14153 if (error) 14154 return (error); 14155 INP_WLOCK(inp); 14156 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 14157 INP_WUNLOCK(inp); 14158 return (ECONNRESET); 14159 } 14160 tp = intotcpcb(inp); 14161 rack = (struct tcp_rack *)tp->t_fb_ptr; 14162 switch (sopt->sopt_name) { 14163 case TCP_RACK_PROFILE: 14164 RACK_OPTS_INC(tcp_profile); 14165 if (optval == 1) { 14166 /* pace_always=1 */ 14167 rack->rc_always_pace = 1; 14168 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14169 /* scwnd=1 */ 14170 rack->rack_enable_scwnd = 1; 14171 /* dynamic=100 */ 14172 rack->rc_gp_dyn_mul = 1; 14173 rack->r_ctl.rack_per_of_gp_ca = 100; 14174 /* rrr_conf=3 */ 14175 rack->r_rr_config = 3; 14176 /* npush=2 */ 14177 rack->r_ctl.rc_no_push_at_mrtt = 2; 14178 /* fillcw=1 */ 14179 rack->rc_pace_to_cwnd = 1; 14180 rack->rc_pace_fill_if_rttin_range = 0; 14181 rack->rtt_limit_mul = 0; 14182 /* noprr=1 */ 14183 rack->rack_no_prr = 1; 14184 /* lscwnd=1 */ 14185 rack->r_limit_scw = 1; 14186 } else if (optval == 2) { 14187 /* pace_always=1 */ 14188 rack->rc_always_pace = 1; 14189 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14190 /* scwnd=1 */ 14191 rack->rack_enable_scwnd = 1; 14192 /* dynamic=100 */ 14193 rack->rc_gp_dyn_mul = 1; 14194 rack->r_ctl.rack_per_of_gp_ca = 100; 14195 /* rrr_conf=3 */ 14196 rack->r_rr_config = 3; 14197 /* npush=2 */ 14198 rack->r_ctl.rc_no_push_at_mrtt = 2; 14199 /* fillcw=1 */ 14200 rack->rc_pace_to_cwnd = 1; 14201 rack->rc_pace_fill_if_rttin_range = 0; 14202 rack->rtt_limit_mul = 0; 14203 /* noprr=1 */ 14204 rack->rack_no_prr = 1; 14205 /* lscwnd=0 */ 14206 rack->r_limit_scw = 0; 14207 } 14208 break; 14209 case TCP_SHARED_CWND_TIME_LIMIT: 14210 RACK_OPTS_INC(tcp_lscwnd); 14211 if (optval) 14212 rack->r_limit_scw = 1; 14213 else 14214 rack->r_limit_scw = 0; 14215 break; 14216 case TCP_RACK_PACE_TO_FILL: 14217 RACK_OPTS_INC(tcp_fillcw); 14218 if (optval == 0) 14219 rack->rc_pace_to_cwnd = 0; 14220 else 14221 rack->rc_pace_to_cwnd = 1; 14222 if ((optval >= rack_gp_rtt_maxmul) && 14223 rack_gp_rtt_maxmul && 14224 (optval < 0xf)) { 14225 rack->rc_pace_fill_if_rttin_range = 1; 14226 rack->rtt_limit_mul = optval; 14227 } else { 14228 rack->rc_pace_fill_if_rttin_range = 0; 14229 rack->rtt_limit_mul = 0; 14230 } 14231 break; 14232 case TCP_RACK_NO_PUSH_AT_MAX: 14233 RACK_OPTS_INC(tcp_npush); 14234 if (optval == 0) 14235 rack->r_ctl.rc_no_push_at_mrtt = 0; 14236 else if (optval < 0xff) 14237 rack->r_ctl.rc_no_push_at_mrtt = optval; 14238 else 14239 error = EINVAL; 14240 break; 14241 case TCP_SHARED_CWND_ENABLE: 14242 RACK_OPTS_INC(tcp_rack_scwnd); 14243 if (optval == 0) 14244 rack->rack_enable_scwnd = 0; 14245 else 14246 rack->rack_enable_scwnd = 1; 14247 break; 14248 case TCP_RACK_MBUF_QUEUE: 14249 /* Now do we use the LRO mbuf-queue feature */ 14250 RACK_OPTS_INC(tcp_rack_mbufq); 14251 if (optval) 14252 rack->r_mbuf_queue = 1; 14253 else 14254 rack->r_mbuf_queue = 0; 14255 if (rack->r_mbuf_queue || rack->rc_always_pace) 14256 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14257 else 
14258 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14259 break; 14260 case TCP_RACK_NONRXT_CFG_RATE: 14261 RACK_OPTS_INC(tcp_rack_cfg_rate); 14262 if (optval == 0) 14263 rack->rack_rec_nonrxt_use_cr = 0; 14264 else 14265 rack->rack_rec_nonrxt_use_cr = 1; 14266 break; 14267 case TCP_NO_PRR: 14268 RACK_OPTS_INC(tcp_rack_noprr); 14269 if (optval == 0) 14270 rack->rack_no_prr = 0; 14271 else 14272 rack->rack_no_prr = 1; 14273 break; 14274 case TCP_TIMELY_DYN_ADJ: 14275 RACK_OPTS_INC(tcp_timely_dyn); 14276 if (optval == 0) 14277 rack->rc_gp_dyn_mul = 0; 14278 else { 14279 rack->rc_gp_dyn_mul = 1; 14280 if (optval >= 100) { 14281 /* 14282 * If the user sets something 100 or more 14283 * its the gp_ca value. 14284 */ 14285 rack->r_ctl.rack_per_of_gp_ca = optval; 14286 } 14287 } 14288 break; 14289 case TCP_RACK_DO_DETECTION: 14290 RACK_OPTS_INC(tcp_rack_do_detection); 14291 if (optval == 0) 14292 rack->do_detection = 0; 14293 else 14294 rack->do_detection = 1; 14295 break; 14296 case TCP_RACK_PROP_RATE: 14297 if ((optval <= 0) || (optval >= 100)) { 14298 error = EINVAL; 14299 break; 14300 } 14301 RACK_OPTS_INC(tcp_rack_prop_rate); 14302 rack->r_ctl.rc_prop_rate = optval; 14303 break; 14304 case TCP_RACK_TLP_USE: 14305 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 14306 error = EINVAL; 14307 break; 14308 } 14309 RACK_OPTS_INC(tcp_tlp_use); 14310 rack->rack_tlp_threshold_use = optval; 14311 break; 14312 case TCP_RACK_PROP: 14313 /* RACK proportional rate reduction (bool) */ 14314 RACK_OPTS_INC(tcp_rack_prop); 14315 rack->r_ctl.rc_prop_reduce = optval; 14316 break; 14317 case TCP_RACK_TLP_REDUCE: 14318 /* RACK TLP cwnd reduction (bool) */ 14319 RACK_OPTS_INC(tcp_rack_tlp_reduce); 14320 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 14321 break; 14322 case TCP_RACK_EARLY_RECOV: 14323 /* Should recovery happen early (bool) */ 14324 RACK_OPTS_INC(tcp_rack_early_recov); 14325 rack->r_ctl.rc_early_recovery = optval; 14326 break; 14327 14328 /* Pacing related ones */ 14329 case TCP_RACK_PACE_ALWAYS: 14330 /* 14331 * zero is old rack method, 1 is new 14332 * method using a pacing rate. 14333 */ 14334 RACK_OPTS_INC(tcp_rack_pace_always); 14335 if (optval > 0) 14336 rack->rc_always_pace = 1; 14337 else 14338 rack->rc_always_pace = 0; 14339 if (rack->r_mbuf_queue || rack->rc_always_pace) 14340 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14341 else 14342 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14343 /* A rate may be set irate or other, if so set seg size */ 14344 rack_update_seg(rack); 14345 break; 14346 case TCP_BBR_RACK_INIT_RATE: 14347 RACK_OPTS_INC(tcp_initial_rate); 14348 val = optval; 14349 /* Change from kbits per second to bytes per second */ 14350 val *= 1000; 14351 val /= 8; 14352 rack->r_ctl.init_rate = val; 14353 if (rack->rc_init_win != rack_default_init_window) { 14354 uint32_t win, snt; 14355 14356 /* 14357 * Options don't always get applied 14358 * in the order you think. So in order 14359 * to assure we update a cwnd we need 14360 * to check and see if we are still 14361 * where we should raise the cwnd. 
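* Only raise it when less than an initial window has been sent and the current cwnd is still below the configured initial window.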
14362 */ 14363 win = rc_init_window(rack); 14364 if (SEQ_GT(tp->snd_max, tp->iss)) 14365 snt = tp->snd_max - tp->iss; 14366 else 14367 snt = 0; 14368 if ((snt < win) && 14369 (tp->snd_cwnd < win)) 14370 tp->snd_cwnd = win; 14371 } 14372 if (rack->rc_always_pace) 14373 rack_update_seg(rack); 14374 break; 14375 case TCP_BBR_IWINTSO: 14376 RACK_OPTS_INC(tcp_initial_win); 14377 if (optval && (optval <= 0xff)) { 14378 uint32_t win, snt; 14379 14380 rack->rc_init_win = optval; 14381 win = rc_init_window(rack); 14382 if (SEQ_GT(tp->snd_max, tp->iss)) 14383 snt = tp->snd_max - tp->iss; 14384 else 14385 snt = 0; 14386 if ((snt < win) && 14387 (tp->t_srtt | 14388 #ifdef NETFLIX_PEAKRATE 14389 tp->t_maxpeakrate | 14390 #endif 14391 rack->r_ctl.init_rate)) { 14392 /* 14393 * We are not past the initial window 14394 * and we have some bases for pacing, 14395 * so we need to possibly adjust up 14396 * the cwnd. Note even if we don't set 14397 * the cwnd, its still ok to raise the rc_init_win 14398 * which can be used coming out of idle when we 14399 * would have a rate. 14400 */ 14401 if (tp->snd_cwnd < win) 14402 tp->snd_cwnd = win; 14403 } 14404 if (rack->rc_always_pace) 14405 rack_update_seg(rack); 14406 } else 14407 error = EINVAL; 14408 break; 14409 case TCP_RACK_FORCE_MSEG: 14410 RACK_OPTS_INC(tcp_rack_force_max_seg); 14411 if (optval) 14412 rack->rc_force_max_seg = 1; 14413 else 14414 rack->rc_force_max_seg = 0; 14415 break; 14416 case TCP_RACK_PACE_MAX_SEG: 14417 /* Max segments size in a pace in bytes */ 14418 RACK_OPTS_INC(tcp_rack_max_seg); 14419 rack->rc_user_set_max_segs = optval; 14420 rack_set_pace_segments(tp, rack, __LINE__); 14421 break; 14422 case TCP_RACK_PACE_RATE_REC: 14423 /* Set the fixed pacing rate in Bytes per second ca */ 14424 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 14425 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14426 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14427 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14428 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14429 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14430 rack->use_fixed_rate = 1; 14431 rack_log_pacing_delay_calc(rack, 14432 rack->r_ctl.rc_fixed_pacing_rate_ss, 14433 rack->r_ctl.rc_fixed_pacing_rate_ca, 14434 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14435 __LINE__, NULL); 14436 break; 14437 14438 case TCP_RACK_PACE_RATE_SS: 14439 /* Set the fixed pacing rate in Bytes per second ca */ 14440 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 14441 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14442 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14443 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14444 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14445 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14446 rack->use_fixed_rate = 1; 14447 rack_log_pacing_delay_calc(rack, 14448 rack->r_ctl.rc_fixed_pacing_rate_ss, 14449 rack->r_ctl.rc_fixed_pacing_rate_ca, 14450 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14451 __LINE__, NULL); 14452 break; 14453 14454 case TCP_RACK_PACE_RATE_CA: 14455 /* Set the fixed pacing rate in Bytes per second ca */ 14456 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 14457 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14458 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14459 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14460 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14461 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14462 rack->use_fixed_rate = 1; 14463 rack_log_pacing_delay_calc(rack, 14464 rack->r_ctl.rc_fixed_pacing_rate_ss, 14465 rack->r_ctl.rc_fixed_pacing_rate_ca, 14466 
rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14467 __LINE__, NULL); 14468 break; 14469 case TCP_RACK_GP_INCREASE_REC: 14470 RACK_OPTS_INC(tcp_gp_inc_rec); 14471 rack->r_ctl.rack_per_of_gp_rec = optval; 14472 rack_log_pacing_delay_calc(rack, 14473 rack->r_ctl.rack_per_of_gp_ss, 14474 rack->r_ctl.rack_per_of_gp_ca, 14475 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14476 __LINE__, NULL); 14477 break; 14478 case TCP_RACK_GP_INCREASE_CA: 14479 RACK_OPTS_INC(tcp_gp_inc_ca); 14480 ca = optval; 14481 if (ca < 100) { 14482 /* 14483 * We don't allow any reduction 14484 * over the GP b/w. 14485 */ 14486 error = EINVAL; 14487 break; 14488 } 14489 rack->r_ctl.rack_per_of_gp_ca = ca; 14490 rack_log_pacing_delay_calc(rack, 14491 rack->r_ctl.rack_per_of_gp_ss, 14492 rack->r_ctl.rack_per_of_gp_ca, 14493 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14494 __LINE__, NULL); 14495 break; 14496 case TCP_RACK_GP_INCREASE_SS: 14497 RACK_OPTS_INC(tcp_gp_inc_ss); 14498 ss = optval; 14499 if (ss < 100) { 14500 /* 14501 * We don't allow any reduction 14502 * over the GP b/w. 14503 */ 14504 error = EINVAL; 14505 break; 14506 } 14507 rack->r_ctl.rack_per_of_gp_ss = ss; 14508 rack_log_pacing_delay_calc(rack, 14509 rack->r_ctl.rack_per_of_gp_ss, 14510 rack->r_ctl.rack_per_of_gp_ca, 14511 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14512 __LINE__, NULL); 14513 break; 14514 case TCP_RACK_RR_CONF: 14515 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 14516 if (optval && optval <= 3) 14517 rack->r_rr_config = optval; 14518 else 14519 rack->r_rr_config = 0; 14520 break; 14521 case TCP_BBR_HDWR_PACE: 14522 RACK_OPTS_INC(tcp_hdwr_pacing); 14523 if (optval){ 14524 if (rack->rack_hdrw_pacing == 0) { 14525 rack->rack_hdw_pace_ena = 1; 14526 rack->rack_attempt_hdwr_pace = 0; 14527 } else 14528 error = EALREADY; 14529 } else { 14530 rack->rack_hdw_pace_ena = 0; 14531 #ifdef RATELIMIT 14532 if (rack->rack_hdrw_pacing) { 14533 rack->rack_hdrw_pacing = 0; 14534 in_pcbdetach_txrtlmt(rack->rc_inp); 14535 } 14536 #endif 14537 } 14538 break; 14539 /* End Pacing related ones */ 14540 case TCP_RACK_PRR_SENDALOT: 14541 /* Allow PRR to send more than one seg */ 14542 RACK_OPTS_INC(tcp_rack_prr_sendalot); 14543 rack->r_ctl.rc_prr_sendalot = optval; 14544 break; 14545 case TCP_RACK_MIN_TO: 14546 /* Minimum time between rack t-o's in ms */ 14547 RACK_OPTS_INC(tcp_rack_min_to); 14548 rack->r_ctl.rc_min_to = optval; 14549 break; 14550 case TCP_RACK_EARLY_SEG: 14551 /* If early recovery max segments */ 14552 RACK_OPTS_INC(tcp_rack_early_seg); 14553 rack->r_ctl.rc_early_recovery_segs = optval; 14554 break; 14555 case TCP_RACK_REORD_THRESH: 14556 /* RACK reorder threshold (shift amount) */ 14557 RACK_OPTS_INC(tcp_rack_reord_thresh); 14558 if ((optval > 0) && (optval < 31)) 14559 rack->r_ctl.rc_reorder_shift = optval; 14560 else 14561 error = EINVAL; 14562 break; 14563 case TCP_RACK_REORD_FADE: 14564 /* Does reordering fade after ms time */ 14565 RACK_OPTS_INC(tcp_rack_reord_fade); 14566 rack->r_ctl.rc_reorder_fade = optval; 14567 break; 14568 case TCP_RACK_TLP_THRESH: 14569 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14570 RACK_OPTS_INC(tcp_rack_tlp_thresh); 14571 if (optval) 14572 rack->r_ctl.rc_tlp_threshold = optval; 14573 else 14574 error = EINVAL; 14575 break; 14576 case TCP_BBR_USE_RACK_RR: 14577 RACK_OPTS_INC(tcp_rack_rr); 14578 if (optval) 14579 rack->use_rack_rr = 1; 14580 else 14581 rack->use_rack_rr = 0; 14582 break; 14583 case TCP_RACK_PKT_DELAY: 14584 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14585 RACK_OPTS_INC(tcp_rack_pkt_delay); 14586 rack->r_ctl.rc_pkt_delay = optval; 14587 break; 14588 case TCP_RACK_TLP_INC_VAR: 14589 /* Does TLP include rtt variance in t-o */ 14590 error = EINVAL; 14591 break; 14592 case TCP_RACK_IDLE_REDUCE_HIGH: 14593 error = EINVAL; 14594 break; 14595 case TCP_DELACK: 14596 if (optval == 0) 14597 tp->t_delayed_ack = 0; 14598 else 14599 tp->t_delayed_ack = 1; 14600 if (tp->t_flags & TF_DELACK) { 14601 tp->t_flags &= ~TF_DELACK; 14602 tp->t_flags |= TF_ACKNOW; 14603 NET_EPOCH_ENTER(et); 14604 rack_output(tp); 14605 NET_EPOCH_EXIT(et); 14606 } 14607 break; 14608 14609 case TCP_BBR_RACK_RTT_USE: 14610 if ((optval != USE_RTT_HIGH) && 14611 (optval != USE_RTT_LOW) && 14612 (optval != USE_RTT_AVG)) 14613 error = EINVAL; 14614 else 14615 rack->r_ctl.rc_rate_sample_method = optval; 14616 break; 14617 case TCP_DATA_AFTER_CLOSE: 14618 if (optval) 14619 rack->rc_allow_data_af_clo = 1; 14620 else 14621 rack->rc_allow_data_af_clo = 0; 14622 break; 14623 case TCP_RACK_PACE_REDUCE: 14624 /* sysctl only now */ 14625 error = EINVAL; 14626 break; 14627 default: 14628 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14629 break; 14630 } 14631 #ifdef NETFLIX_STATS 14632 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 14633 #endif 14634 INP_WUNLOCK(inp); 14635 return (error); 14636 } 14637 14638 static int 14639 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 14640 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14641 { 14642 int32_t error, optval; 14643 uint64_t val; 14644 /* 14645 * Because all our options are either boolean or an int, we can just 14646 * pull everything into optval and then unlock and copy. If we ever 14647 * add an option that is not an int, then this will have quite an 14648 impact on this routine.
14649 */ 14650 error = 0; 14651 switch (sopt->sopt_name) { 14652 case TCP_RACK_PROFILE: 14653 /* You cannot retrieve a profile, its write only */ 14654 error = EINVAL; 14655 break; 14656 case TCP_RACK_PACE_TO_FILL: 14657 optval = rack->rc_pace_to_cwnd; 14658 break; 14659 case TCP_RACK_NO_PUSH_AT_MAX: 14660 optval = rack->r_ctl.rc_no_push_at_mrtt; 14661 break; 14662 case TCP_SHARED_CWND_ENABLE: 14663 optval = rack->rack_enable_scwnd; 14664 break; 14665 case TCP_RACK_NONRXT_CFG_RATE: 14666 optval = rack->rack_rec_nonrxt_use_cr; 14667 break; 14668 case TCP_NO_PRR: 14669 optval = rack->rack_no_prr; 14670 break; 14671 case TCP_RACK_DO_DETECTION: 14672 optval = rack->do_detection; 14673 break; 14674 case TCP_RACK_MBUF_QUEUE: 14675 /* Now do we use the LRO mbuf-queue feature */ 14676 optval = rack->r_mbuf_queue; 14677 break; 14678 case TCP_TIMELY_DYN_ADJ: 14679 optval = rack->rc_gp_dyn_mul; 14680 break; 14681 case TCP_BBR_IWINTSO: 14682 optval = rack->rc_init_win; 14683 break; 14684 case TCP_RACK_PROP_RATE: 14685 optval = rack->r_ctl.rc_prop_rate; 14686 break; 14687 case TCP_RACK_PROP: 14688 /* RACK proportional rate reduction (bool) */ 14689 optval = rack->r_ctl.rc_prop_reduce; 14690 break; 14691 case TCP_RACK_TLP_REDUCE: 14692 /* RACK TLP cwnd reduction (bool) */ 14693 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 14694 break; 14695 case TCP_RACK_EARLY_RECOV: 14696 /* Should recovery happen early (bool) */ 14697 optval = rack->r_ctl.rc_early_recovery; 14698 break; 14699 case TCP_RACK_PACE_REDUCE: 14700 /* RACK Hptsi reduction factor (divisor) */ 14701 error = EINVAL; 14702 break; 14703 case TCP_BBR_RACK_INIT_RATE: 14704 val = rack->r_ctl.init_rate; 14705 /* convert to kbits per sec */ 14706 val *= 8; 14707 val /= 1000; 14708 optval = (uint32_t)val; 14709 break; 14710 case TCP_RACK_FORCE_MSEG: 14711 optval = rack->rc_force_max_seg; 14712 break; 14713 case TCP_RACK_PACE_MAX_SEG: 14714 /* Max segments in a pace */ 14715 optval = rack->rc_user_set_max_segs; 14716 break; 14717 case TCP_RACK_PACE_ALWAYS: 14718 /* Use the always pace method */ 14719 optval = rack->rc_always_pace; 14720 break; 14721 case TCP_RACK_PRR_SENDALOT: 14722 /* Allow PRR to send more than one seg */ 14723 optval = rack->r_ctl.rc_prr_sendalot; 14724 break; 14725 case TCP_RACK_MIN_TO: 14726 /* Minimum time between rack t-o's in ms */ 14727 optval = rack->r_ctl.rc_min_to; 14728 break; 14729 case TCP_RACK_EARLY_SEG: 14730 /* If early recovery max segments */ 14731 optval = rack->r_ctl.rc_early_recovery_segs; 14732 break; 14733 case TCP_RACK_REORD_THRESH: 14734 /* RACK reorder threshold (shift amount) */ 14735 optval = rack->r_ctl.rc_reorder_shift; 14736 break; 14737 case TCP_RACK_REORD_FADE: 14738 /* Does reordering fade after ms time */ 14739 optval = rack->r_ctl.rc_reorder_fade; 14740 break; 14741 case TCP_BBR_USE_RACK_RR: 14742 /* Do we use the rack cheat for rxt */ 14743 optval = rack->use_rack_rr; 14744 break; 14745 case TCP_RACK_RR_CONF: 14746 optval = rack->r_rr_config; 14747 break; 14748 case TCP_BBR_HDWR_PACE: 14749 optval = rack->rack_hdw_pace_ena; 14750 break; 14751 case TCP_RACK_TLP_THRESH: 14752 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14753 optval = rack->r_ctl.rc_tlp_threshold; 14754 break; 14755 case TCP_RACK_PKT_DELAY: 14756 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14757 optval = rack->r_ctl.rc_pkt_delay; 14758 break; 14759 case TCP_RACK_TLP_USE: 14760 optval = rack->rack_tlp_threshold_use; 14761 break; 14762 case TCP_RACK_TLP_INC_VAR: 14763 /* Does TLP include rtt variance in t-o */ 14764 error = EINVAL; 14765 break; 14766 case TCP_RACK_IDLE_REDUCE_HIGH: 14767 error = EINVAL; 14768 break; 14769 case TCP_RACK_PACE_RATE_CA: 14770 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 14771 break; 14772 case TCP_RACK_PACE_RATE_SS: 14773 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 14774 break; 14775 case TCP_RACK_PACE_RATE_REC: 14776 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 14777 break; 14778 case TCP_RACK_GP_INCREASE_SS: 14779 optval = rack->r_ctl.rack_per_of_gp_ca; 14780 break; 14781 case TCP_RACK_GP_INCREASE_CA: 14782 optval = rack->r_ctl.rack_per_of_gp_ss; 14783 break; 14784 case TCP_BBR_RACK_RTT_USE: 14785 optval = rack->r_ctl.rc_rate_sample_method; 14786 break; 14787 case TCP_DELACK: 14788 optval = tp->t_delayed_ack; 14789 break; 14790 case TCP_DATA_AFTER_CLOSE: 14791 optval = rack->rc_allow_data_af_clo; 14792 break; 14793 case TCP_SHARED_CWND_TIME_LIMIT: 14794 optval = rack->r_limit_scw; 14795 break; 14796 default: 14797 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14798 break; 14799 } 14800 INP_WUNLOCK(inp); 14801 if (error == 0) { 14802 error = sooptcopyout(sopt, &optval, sizeof optval); 14803 } 14804 return (error); 14805 } 14806 14807 static int 14808 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 14809 { 14810 int32_t error = EINVAL; 14811 struct tcp_rack *rack; 14812 14813 rack = (struct tcp_rack *)tp->t_fb_ptr; 14814 if (rack == NULL) { 14815 /* Huh? */ 14816 goto out; 14817 } 14818 if (sopt->sopt_dir == SOPT_SET) { 14819 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 14820 } else if (sopt->sopt_dir == SOPT_GET) { 14821 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 14822 } 14823 out: 14824 INP_WUNLOCK(inp); 14825 return (error); 14826 } 14827 14828 static int 14829 rack_pru_options(struct tcpcb *tp, int flags) 14830 { 14831 if (flags & PRUS_OOB) 14832 return (EOPNOTSUPP); 14833 return (0); 14834 } 14835 14836 static struct tcp_function_block __tcp_rack = { 14837 .tfb_tcp_block_name = __XSTRING(STACKNAME), 14838 .tfb_tcp_output = rack_output, 14839 .tfb_do_queued_segments = ctf_do_queued_segments, 14840 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 14841 .tfb_tcp_do_segment = rack_do_segment, 14842 .tfb_tcp_ctloutput = rack_ctloutput, 14843 .tfb_tcp_fb_init = rack_init, 14844 .tfb_tcp_fb_fini = rack_fini, 14845 .tfb_tcp_timer_stop_all = rack_stopall, 14846 .tfb_tcp_timer_activate = rack_timer_activate, 14847 .tfb_tcp_timer_active = rack_timer_active, 14848 .tfb_tcp_timer_stop = rack_timer_stop, 14849 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 14850 .tfb_tcp_handoff_ok = rack_handoff_ok, 14851 .tfb_pru_options = rack_pru_options, 14852 }; 14853 14854 static const char *rack_stack_names[] = { 14855 __XSTRING(STACKNAME), 14856 #ifdef STACKALIAS 14857 __XSTRING(STACKALIAS), 14858 #endif 14859 }; 14860 14861 static int 14862 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 14863 { 14864 memset(mem, 0, size); 14865 return (0); 14866 } 14867 14868 static void 14869 rack_dtor(void *mem, int32_t size, void *arg) 14870 { 14871 14872 } 14873 14874 static bool rack_mod_inited = false; 14875 14876 static int 14877 tcp_addrack(module_t mod, int32_t type, void *data) 14878 { 14879 int32_t err = 0; 14880 int num_stacks; 14881 14882 switch (type) { 14883 case 
MOD_LOAD: 14884 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 14885 sizeof(struct rack_sendmap), 14886 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 14887 14888 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 14889 sizeof(struct tcp_rack), 14890 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 14891 14892 sysctl_ctx_init(&rack_sysctl_ctx); 14893 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 14894 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 14895 OID_AUTO, 14896 #ifdef STACKALIAS 14897 __XSTRING(STACKALIAS), 14898 #else 14899 __XSTRING(STACKNAME), 14900 #endif 14901 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 14902 ""); 14903 if (rack_sysctl_root == NULL) { 14904 printf("Failed to add sysctl node\n"); 14905 err = EFAULT; 14906 goto free_uma; 14907 } 14908 rack_init_sysctls(); 14909 num_stacks = nitems(rack_stack_names); 14910 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 14911 rack_stack_names, &num_stacks); 14912 if (err) { 14913 printf("Failed to register %s stack name for " 14914 "%s module\n", rack_stack_names[num_stacks], 14915 __XSTRING(MODNAME)); 14916 sysctl_ctx_free(&rack_sysctl_ctx); 14917 free_uma: 14918 uma_zdestroy(rack_zone); 14919 uma_zdestroy(rack_pcb_zone); 14920 rack_counter_destroy(); 14921 printf("Failed to register rack module -- err:%d\n", err); 14922 return (err); 14923 } 14924 tcp_lro_reg_mbufq(); 14925 rack_mod_inited = true; 14926 break; 14927 case MOD_QUIESCE: 14928 err = deregister_tcp_functions(&__tcp_rack, true, false); 14929 break; 14930 case MOD_UNLOAD: 14931 err = deregister_tcp_functions(&__tcp_rack, false, true); 14932 if (err == EBUSY) 14933 break; 14934 if (rack_mod_inited) { 14935 uma_zdestroy(rack_zone); 14936 uma_zdestroy(rack_pcb_zone); 14937 sysctl_ctx_free(&rack_sysctl_ctx); 14938 rack_counter_destroy(); 14939 rack_mod_inited = false; 14940 } 14941 tcp_lro_dereg_mbufq(); 14942 err = 0; 14943 break; 14944 default: 14945 return (EOPNOTSUPP); 14946 } 14947 return (err); 14948 } 14949 14950 static moduledata_t tcp_rack = { 14951 .name = __XSTRING(MODNAME), 14952 .evhand = tcp_addrack, 14953 .priv = 0 14954 }; 14955 14956 MODULE_VERSION(MODNAME, 1); 14957 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 14958 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 14959
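
/*
 * Illustrative sketch (editor's addition; not part of the original stack):
 * the fragment below shows how a userland program might attach a socket to
 * this stack and exercise a few of the options handled by rack_set_sockopt()
 * and rack_get_sockopt() above.  It assumes the module has been loaded (for
 * example with kldload) and registered under the stack name "rack", and that
 * TCP_FUNCTION_BLK plus the TCP_RACK_ and TCP_BBR_ option constants are
 * exposed through <netinet/tcp.h> on the target system.  Error handling is
 * minimal.
 */
#if 0	/* example only, never compiled into the kernel */
#include <sys/socket.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <err.h>
#include <stdio.h>
#include <string.h>

static void
switch_to_rack_and_tune(int s)
{
	struct tcp_function_set tfs;
	socklen_t len;
	int optval;

	/* Hand the connection to the "rack" TCP stack. */
	memset(&tfs, 0, sizeof(tfs));
	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs,
	    sizeof(tfs)) == -1)
		err(1, "TCP_FUNCTION_BLK");

	/* Profile 1 enables pacing, shared cwnd, fillcw, no-PRR, etc. */
	optval = 1;
	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_PROFILE, &optval,
	    sizeof(optval)) == -1)
		err(1, "TCP_RACK_PROFILE");

	/*
	 * The initial pacing rate is given in kilobits per second; the
	 * handler above stores it as bytes per second, e.g. 2500 kbit/s
	 * becomes 2500 * 1000 / 8 = 312500 B/s.
	 */
	optval = 2500;
	if (setsockopt(s, IPPROTO_TCP, TCP_BBR_RACK_INIT_RATE, &optval,
	    sizeof(optval)) == -1)
		err(1, "TCP_BBR_RACK_INIT_RATE");

	/* Every readable option comes back as a plain int. */
	len = sizeof(optval);
	if (getsockopt(s, IPPROTO_TCP, TCP_BBR_RACK_INIT_RATE, &optval,
	    &len) == -1)
		err(1, "getsockopt");
	printf("initial rate: %d kbit/s\n", optval);
}
#endif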