1 /*- 2 * Copyright (c) 2016-2020 Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_ipsec.h" 33 #include "opt_tcpdebug.h" 34 #include "opt_ratelimit.h" 35 #include <sys/param.h> 36 #include <sys/arb.h> 37 #include <sys/module.h> 38 #include <sys/kernel.h> 39 #ifdef TCP_HHOOK 40 #include <sys/hhook.h> 41 #endif 42 #include <sys/lock.h> 43 #include <sys/malloc.h> 44 #include <sys/lock.h> 45 #include <sys/mutex.h> 46 #include <sys/mbuf.h> 47 #include <sys/proc.h> /* for proc0 declaration */ 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sysctl.h> 51 #include <sys/systm.h> 52 #ifdef STATS 53 #include <sys/qmath.h> 54 #include <sys/tree.h> 55 #include <sys/stats.h> /* Must come after qmath.h and tree.h */ 56 #else 57 #include <sys/tree.h> 58 #endif 59 #include <sys/refcount.h> 60 #include <sys/queue.h> 61 #include <sys/tim_filter.h> 62 #include <sys/smp.h> 63 #include <sys/kthread.h> 64 #include <sys/kern_prefetch.h> 65 #include <sys/protosw.h> 66 67 #include <vm/uma.h> 68 69 #include <net/route.h> 70 #include <net/route/nhop.h> 71 #include <net/vnet.h> 72 73 #define TCPSTATES /* for logging */ 74 75 #include <netinet/in.h> 76 #include <netinet/in_kdtrace.h> 77 #include <netinet/in_pcb.h> 78 #include <netinet/ip.h> 79 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 80 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 81 #include <netinet/ip_var.h> 82 #include <netinet/ip6.h> 83 #include <netinet6/in6_pcb.h> 84 #include <netinet6/ip6_var.h> 85 #include <netinet/tcp.h> 86 #define TCPOUTFLAGS 87 #include <netinet/tcp_fsm.h> 88 #include <netinet/tcp_log_buf.h> 89 #include <netinet/tcp_seq.h> 90 #include <netinet/tcp_timer.h> 91 #include <netinet/tcp_var.h> 92 #include <netinet/tcp_hpts.h> 93 #include <netinet/tcp_ratelimit.h> 94 #include <netinet/tcpip.h> 95 #include <netinet/cc/cc.h> 96 #include <netinet/tcp_fastopen.h> 97 #include <netinet/tcp_lro.h> 98 #ifdef NETFLIX_SHARED_CWND 99 #include <netinet/tcp_shared_cwnd.h> 100 #endif 101 #ifdef TCPDEBUG 102 #include <netinet/tcp_debug.h> 103 #endif /* TCPDEBUG */ 104 #ifdef TCP_OFFLOAD 105 #include <netinet/tcp_offload.h> 
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *    the congestion window so that the ack clock can
 *    be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *    will stop us using the number of dup acks and instead
 *    use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *    of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function.
 * Each state is simplified due to the fact that the original
 * do_segment has been decomposed and we *know* what state we are in
 * (no switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
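/*
 * Illustrative sketch (an editor's example, not code taken from this
 * file): after the decomposition described above, rack_do_segment()
 * does not switch on tp->t_state; it validates SACK support, makes
 * sure the RACK state tracks the connection state (rack_set_state()),
 * and then hands the segment to the per-state handler, conceptually:
 *
 *	retval = (*per_state_handler)(m, th, so, tp, &to, drop_hdrlen,
 *	    tlen, tiwin, thflags, nxt_pkt, iptos);
 *
 * where per_state_handler is a stand-in name for a pointer at one of
 * rack_do_established(), rack_do_syn_sent(), rack_do_fin_wait_1(),
 * etc., whose prototypes are declared below.
 */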
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000
                                           * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1; /* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 0;
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250; /* 250ms */
static int32_t rack_persist_max = 2000; /* 2 seconds */
static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_hw_pace_adjust = 0;
/*
 * Currently regular tcp has a rto_min of 30ms;
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000; /* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200; /* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8; /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130; /* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2; /* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in us */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 200000; /* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250; /* Must move at least 250 useconds to count as a lowering */
static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combine these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */
static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */
static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
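/*
 * Worked example (editor's illustration using the defaults above):
 * with a lowest observed rtt of 20ms, the rtt_max threshold works
 * out to rack_gp_rtt_maxmul * 20ms = 60ms, while the "rtt is lower"
 * check is 20ms + (20ms * rack_gp_rtt_minmul / rack_gp_rtt_mindiv)
 * = 20ms + 5ms = 25ms.  The 2%/4% up/down multipliers above likewise
 * bound the band of b/w change that Timely treats as "no change".
 */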
static int32_t rack_gp_decrease_per = 20; /* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2; /* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */
static int32_t rack_timely_max_push_drop = 3; /* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)?
*/ 286 static int32_t rack_timely_no_stopping = 0; 287 static int32_t rack_down_raise_thresh = 100; 288 static int32_t rack_req_segs = 1; 289 290 /* Weird delayed ack mode */ 291 static int32_t rack_use_imac_dack = 0; 292 /* Rack specific counters */ 293 counter_u64_t rack_badfr; 294 counter_u64_t rack_badfr_bytes; 295 counter_u64_t rack_rtm_prr_retran; 296 counter_u64_t rack_rtm_prr_newdata; 297 counter_u64_t rack_timestamp_mismatch; 298 counter_u64_t rack_reorder_seen; 299 counter_u64_t rack_paced_segments; 300 counter_u64_t rack_unpaced_segments; 301 counter_u64_t rack_calc_zero; 302 counter_u64_t rack_calc_nonzero; 303 counter_u64_t rack_saw_enobuf; 304 counter_u64_t rack_saw_enetunreach; 305 counter_u64_t rack_per_timer_hole; 306 307 /* Tail loss probe counters */ 308 counter_u64_t rack_tlp_tot; 309 counter_u64_t rack_tlp_newdata; 310 counter_u64_t rack_tlp_retran; 311 counter_u64_t rack_tlp_retran_bytes; 312 counter_u64_t rack_tlp_retran_fail; 313 counter_u64_t rack_to_tot; 314 counter_u64_t rack_to_arm_rack; 315 counter_u64_t rack_to_arm_tlp; 316 counter_u64_t rack_to_alloc; 317 counter_u64_t rack_to_alloc_hard; 318 counter_u64_t rack_to_alloc_emerg; 319 counter_u64_t rack_to_alloc_limited; 320 counter_u64_t rack_alloc_limited_conns; 321 counter_u64_t rack_split_limited; 322 323 counter_u64_t rack_sack_proc_all; 324 counter_u64_t rack_sack_proc_short; 325 counter_u64_t rack_sack_proc_restart; 326 counter_u64_t rack_sack_attacks_detected; 327 counter_u64_t rack_sack_attacks_reversed; 328 counter_u64_t rack_sack_used_next_merge; 329 counter_u64_t rack_sack_splits; 330 counter_u64_t rack_sack_used_prev_merge; 331 counter_u64_t rack_sack_skipped_acked; 332 counter_u64_t rack_ack_total; 333 counter_u64_t rack_express_sack; 334 counter_u64_t rack_sack_total; 335 counter_u64_t rack_move_none; 336 counter_u64_t rack_move_some; 337 338 counter_u64_t rack_used_tlpmethod; 339 counter_u64_t rack_used_tlpmethod2; 340 counter_u64_t rack_enter_tlp_calc; 341 counter_u64_t rack_input_idle_reduces; 342 counter_u64_t rack_collapsed_win; 343 counter_u64_t rack_tlp_does_nada; 344 counter_u64_t rack_try_scwnd; 345 346 /* Temp CPU counters */ 347 counter_u64_t rack_find_high; 348 349 counter_u64_t rack_progress_drops; 350 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 351 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 352 353 static void 354 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 355 356 static int 357 rack_process_ack(struct mbuf *m, struct tcphdr *th, 358 struct socket *so, struct tcpcb *tp, struct tcpopt *to, 359 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 360 static int 361 rack_process_data(struct mbuf *m, struct tcphdr *th, 362 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 363 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 364 static void 365 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 366 struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); 367 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 368 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 369 uint8_t limit_type); 370 static struct rack_sendmap * 371 rack_check_recovery_mode(struct tcpcb *tp, 372 uint32_t tsused); 373 static void 374 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, 375 uint32_t type); 376 static void rack_counter_destroy(void); 377 static int 378 rack_ctloutput(struct socket *so, struct sockopt *sopt, 379 struct inpcb *inp, 
struct tcpcb *tp); 380 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 381 static void 382 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line); 383 static void 384 rack_do_segment(struct mbuf *m, struct tcphdr *th, 385 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 386 uint8_t iptos); 387 static void rack_dtor(void *mem, int32_t size, void *arg); 388 static void 389 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 390 uint32_t t, uint32_t cts); 391 static void 392 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 393 uint32_t flex1, uint32_t flex2, 394 uint32_t flex3, uint32_t flex4, 395 uint32_t flex5, uint32_t flex6, 396 uint16_t flex7, uint8_t mod); 397 static void 398 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 399 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm); 400 static struct rack_sendmap * 401 rack_find_high_nonack(struct tcp_rack *rack, 402 struct rack_sendmap *rsm); 403 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 404 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 405 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 406 static int 407 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 408 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 409 static void 410 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 411 tcp_seq th_ack, int line); 412 static uint32_t 413 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 414 static int32_t rack_handoff_ok(struct tcpcb *tp); 415 static int32_t rack_init(struct tcpcb *tp); 416 static void rack_init_sysctls(void); 417 static void 418 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 419 struct tcphdr *th); 420 static void 421 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 422 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 423 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts); 424 static void 425 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 426 struct rack_sendmap *rsm); 427 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 428 static int32_t rack_output(struct tcpcb *tp); 429 430 static uint32_t 431 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 432 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 433 uint32_t cts, int *moved_two); 434 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); 435 static void rack_remxt_tmr(struct tcpcb *tp); 436 static int 437 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 438 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 439 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 440 static int32_t rack_stopall(struct tcpcb *tp); 441 static void 442 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, 443 uint32_t delta); 444 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); 445 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 446 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); 447 static uint32_t 448 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 449 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); 450 static void 451 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 452 struct rack_sendmap *rsm, 
uint32_t ts); 453 static int 454 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 455 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 456 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 457 static int 458 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 459 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 460 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 461 static int 462 rack_do_closing(struct mbuf *m, struct tcphdr *th, 463 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 464 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 465 static int 466 rack_do_established(struct mbuf *m, struct tcphdr *th, 467 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 468 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 469 static int 470 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 471 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 472 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 473 static int 474 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 475 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 476 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 477 static int 478 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 479 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 480 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 481 static int 482 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 483 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 484 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 485 static int 486 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 487 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 488 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 489 static int 490 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 491 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 492 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 493 struct rack_sendmap * 494 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 495 uint32_t tsused); 496 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 497 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 498 static void 499 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); 500 501 int32_t rack_clear_counter=0; 502 503 static int 504 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 505 { 506 uint32_t stat; 507 int32_t error; 508 509 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 510 if (error || req->newptr == NULL) 511 return error; 512 513 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 514 if (error) 515 return (error); 516 if (stat == 1) { 517 #ifdef INVARIANTS 518 printf("Clearing RACK counters\n"); 519 #endif 520 counter_u64_zero(rack_badfr); 521 counter_u64_zero(rack_badfr_bytes); 522 counter_u64_zero(rack_rtm_prr_retran); 523 counter_u64_zero(rack_rtm_prr_newdata); 524 counter_u64_zero(rack_timestamp_mismatch); 525 counter_u64_zero(rack_reorder_seen); 526 counter_u64_zero(rack_tlp_tot); 527 counter_u64_zero(rack_tlp_newdata); 528 counter_u64_zero(rack_tlp_retran); 
529 counter_u64_zero(rack_tlp_retran_bytes); 530 counter_u64_zero(rack_tlp_retran_fail); 531 counter_u64_zero(rack_to_tot); 532 counter_u64_zero(rack_to_arm_rack); 533 counter_u64_zero(rack_to_arm_tlp); 534 counter_u64_zero(rack_paced_segments); 535 counter_u64_zero(rack_calc_zero); 536 counter_u64_zero(rack_calc_nonzero); 537 counter_u64_zero(rack_unpaced_segments); 538 counter_u64_zero(rack_saw_enobuf); 539 counter_u64_zero(rack_saw_enetunreach); 540 counter_u64_zero(rack_per_timer_hole); 541 counter_u64_zero(rack_to_alloc_hard); 542 counter_u64_zero(rack_to_alloc_emerg); 543 counter_u64_zero(rack_sack_proc_all); 544 counter_u64_zero(rack_sack_proc_short); 545 counter_u64_zero(rack_sack_proc_restart); 546 counter_u64_zero(rack_to_alloc); 547 counter_u64_zero(rack_to_alloc_limited); 548 counter_u64_zero(rack_alloc_limited_conns); 549 counter_u64_zero(rack_split_limited); 550 counter_u64_zero(rack_find_high); 551 counter_u64_zero(rack_sack_attacks_detected); 552 counter_u64_zero(rack_sack_attacks_reversed); 553 counter_u64_zero(rack_sack_used_next_merge); 554 counter_u64_zero(rack_sack_used_prev_merge); 555 counter_u64_zero(rack_sack_splits); 556 counter_u64_zero(rack_sack_skipped_acked); 557 counter_u64_zero(rack_ack_total); 558 counter_u64_zero(rack_express_sack); 559 counter_u64_zero(rack_sack_total); 560 counter_u64_zero(rack_move_none); 561 counter_u64_zero(rack_move_some); 562 counter_u64_zero(rack_used_tlpmethod); 563 counter_u64_zero(rack_used_tlpmethod2); 564 counter_u64_zero(rack_enter_tlp_calc); 565 counter_u64_zero(rack_progress_drops); 566 counter_u64_zero(rack_tlp_does_nada); 567 counter_u64_zero(rack_try_scwnd); 568 counter_u64_zero(rack_collapsed_win); 569 } 570 rack_clear_counter = 0; 571 return (0); 572 } 573 574 static void 575 rack_init_sysctls(void) 576 { 577 struct sysctl_oid *rack_counters; 578 struct sysctl_oid *rack_attack; 579 struct sysctl_oid *rack_pacing; 580 struct sysctl_oid *rack_timely; 581 struct sysctl_oid *rack_timers; 582 struct sysctl_oid *rack_tlp; 583 struct sysctl_oid *rack_misc; 584 struct sysctl_oid *rack_measure; 585 struct sysctl_oid *rack_probertt; 586 587 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 588 SYSCTL_CHILDREN(rack_sysctl_root), 589 OID_AUTO, 590 "sack_attack", 591 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 592 "Rack Sack Attack Counters and Controls"); 593 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 594 SYSCTL_CHILDREN(rack_sysctl_root), 595 OID_AUTO, 596 "stats", 597 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 598 "Rack Counters"); 599 SYSCTL_ADD_S32(&rack_sysctl_ctx, 600 SYSCTL_CHILDREN(rack_sysctl_root), 601 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 602 &rack_rate_sample_method , USE_RTT_LOW, 603 "What method should we use for rate sampling 0=high, 1=low "); 604 /* Probe rtt related controls */ 605 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 606 SYSCTL_CHILDREN(rack_sysctl_root), 607 OID_AUTO, 608 "probertt", 609 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 610 "ProbeRTT related Controls"); 611 SYSCTL_ADD_U16(&rack_sysctl_ctx, 612 SYSCTL_CHILDREN(rack_probertt), 613 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 614 &rack_atexit_prtt_hbp, 130, 615 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 616 SYSCTL_ADD_U16(&rack_sysctl_ctx, 617 SYSCTL_CHILDREN(rack_probertt), 618 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 619 &rack_atexit_prtt, 130, 620 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 621 SYSCTL_ADD_U16(&rack_sysctl_ctx, 622 SYSCTL_CHILDREN(rack_probertt), 623 OID_AUTO, 
"gp_per_mul", CTLFLAG_RW, 624 &rack_per_of_gp_probertt, 60, 625 "What percentage of goodput do we pace at in probertt"); 626 SYSCTL_ADD_U16(&rack_sysctl_ctx, 627 SYSCTL_CHILDREN(rack_probertt), 628 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 629 &rack_per_of_gp_probertt_reduce, 10, 630 "What percentage of goodput do we reduce every gp_srtt"); 631 SYSCTL_ADD_U16(&rack_sysctl_ctx, 632 SYSCTL_CHILDREN(rack_probertt), 633 OID_AUTO, "gp_per_low", CTLFLAG_RW, 634 &rack_per_of_gp_lowthresh, 40, 635 "What percentage of goodput do we allow the multiplier to fall to"); 636 SYSCTL_ADD_U32(&rack_sysctl_ctx, 637 SYSCTL_CHILDREN(rack_probertt), 638 OID_AUTO, "time_between", CTLFLAG_RW, 639 & rack_time_between_probertt, 96000000, 640 "How many useconds between the lowest rtt falling must past before we enter probertt"); 641 SYSCTL_ADD_U32(&rack_sysctl_ctx, 642 SYSCTL_CHILDREN(rack_probertt), 643 OID_AUTO, "safety", CTLFLAG_RW, 644 &rack_probe_rtt_safety_val, 2000000, 645 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 646 SYSCTL_ADD_U32(&rack_sysctl_ctx, 647 SYSCTL_CHILDREN(rack_probertt), 648 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 649 &rack_probe_rtt_sets_cwnd, 0, 650 "Do we set the cwnd too (if always_lower is on)"); 651 SYSCTL_ADD_U32(&rack_sysctl_ctx, 652 SYSCTL_CHILDREN(rack_probertt), 653 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 654 &rack_max_drain_wait, 2, 655 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 656 SYSCTL_ADD_U32(&rack_sysctl_ctx, 657 SYSCTL_CHILDREN(rack_probertt), 658 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 659 &rack_must_drain, 1, 660 "We must drain this many gp_srtt's waiting for flight to reach goal"); 661 SYSCTL_ADD_U32(&rack_sysctl_ctx, 662 SYSCTL_CHILDREN(rack_probertt), 663 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 664 &rack_probertt_use_min_rtt_entry, 1, 665 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 666 SYSCTL_ADD_U32(&rack_sysctl_ctx, 667 SYSCTL_CHILDREN(rack_probertt), 668 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 669 &rack_probertt_use_min_rtt_exit, 0, 670 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 671 SYSCTL_ADD_U32(&rack_sysctl_ctx, 672 SYSCTL_CHILDREN(rack_probertt), 673 OID_AUTO, "length_div", CTLFLAG_RW, 674 &rack_probertt_gpsrtt_cnt_div, 0, 675 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 676 SYSCTL_ADD_U32(&rack_sysctl_ctx, 677 SYSCTL_CHILDREN(rack_probertt), 678 OID_AUTO, "length_mul", CTLFLAG_RW, 679 &rack_probertt_gpsrtt_cnt_mul, 0, 680 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 681 SYSCTL_ADD_U32(&rack_sysctl_ctx, 682 SYSCTL_CHILDREN(rack_probertt), 683 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 684 &rack_min_probertt_hold, 200000, 685 "What is the minimum time we hold probertt at target"); 686 SYSCTL_ADD_U32(&rack_sysctl_ctx, 687 SYSCTL_CHILDREN(rack_probertt), 688 OID_AUTO, "filter_life", CTLFLAG_RW, 689 &rack_probertt_filter_life, 10000000, 690 "What is the time for the filters life in useconds"); 691 SYSCTL_ADD_U32(&rack_sysctl_ctx, 692 SYSCTL_CHILDREN(rack_probertt), 693 OID_AUTO, "lower_within", CTLFLAG_RW, 694 &rack_probertt_lower_within, 10, 695 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 696 SYSCTL_ADD_U32(&rack_sysctl_ctx, 697 SYSCTL_CHILDREN(rack_probertt), 698 OID_AUTO, "must_move", CTLFLAG_RW, 699 &rack_min_rtt_movement, 250, 700 "How much is the minimum 
movement in rtt to count as a drop for probertt purposes"); 701 SYSCTL_ADD_U32(&rack_sysctl_ctx, 702 SYSCTL_CHILDREN(rack_probertt), 703 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 704 &rack_probertt_clear_is, 1, 705 "Do we clear I/S counts on exiting probe-rtt"); 706 SYSCTL_ADD_S32(&rack_sysctl_ctx, 707 SYSCTL_CHILDREN(rack_probertt), 708 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 709 &rack_max_drain_hbp, 1, 710 "How many extra drain gpsrtt's do we get in highly buffered paths"); 711 SYSCTL_ADD_S32(&rack_sysctl_ctx, 712 SYSCTL_CHILDREN(rack_probertt), 713 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 714 &rack_hbp_thresh, 3, 715 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 716 /* Pacing related sysctls */ 717 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 718 SYSCTL_CHILDREN(rack_sysctl_root), 719 OID_AUTO, 720 "pacing", 721 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 722 "Pacing related Controls"); 723 SYSCTL_ADD_S32(&rack_sysctl_ctx, 724 SYSCTL_CHILDREN(rack_pacing), 725 OID_AUTO, "max_pace_over", CTLFLAG_RW, 726 &rack_max_per_above, 30, 727 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 728 SYSCTL_ADD_S32(&rack_sysctl_ctx, 729 SYSCTL_CHILDREN(rack_pacing), 730 OID_AUTO, "pace_to_one", CTLFLAG_RW, 731 &rack_pace_one_seg, 0, 732 "Do we allow low b/w pacing of 1MSS instead of two"); 733 SYSCTL_ADD_S32(&rack_sysctl_ctx, 734 SYSCTL_CHILDREN(rack_pacing), 735 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 736 &rack_limit_time_with_srtt, 0, 737 "Do we limit pacing time based on srtt"); 738 SYSCTL_ADD_S32(&rack_sysctl_ctx, 739 SYSCTL_CHILDREN(rack_pacing), 740 OID_AUTO, "init_win", CTLFLAG_RW, 741 &rack_default_init_window, 0, 742 "Do we have a rack initial window 0 = system default"); 743 SYSCTL_ADD_U32(&rack_sysctl_ctx, 744 SYSCTL_CHILDREN(rack_pacing), 745 OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW, 746 &rack_hw_pace_adjust, 0, 747 "What percentage do we raise the MSS by (11 = 1.1%)"); 748 SYSCTL_ADD_U16(&rack_sysctl_ctx, 749 SYSCTL_CHILDREN(rack_pacing), 750 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 751 &rack_per_of_gp_ss, 250, 752 "If non zero, what percentage of goodput to pace at in slow start"); 753 SYSCTL_ADD_U16(&rack_sysctl_ctx, 754 SYSCTL_CHILDREN(rack_pacing), 755 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 756 &rack_per_of_gp_ca, 150, 757 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 758 SYSCTL_ADD_U16(&rack_sysctl_ctx, 759 SYSCTL_CHILDREN(rack_pacing), 760 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 761 &rack_per_of_gp_rec, 200, 762 "If non zero, what percentage of goodput to pace at in recovery"); 763 SYSCTL_ADD_S32(&rack_sysctl_ctx, 764 SYSCTL_CHILDREN(rack_pacing), 765 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 766 &rack_hptsi_segments, 40, 767 "What size is the max for TSO segments in pacing and burst mitigation"); 768 SYSCTL_ADD_S32(&rack_sysctl_ctx, 769 SYSCTL_CHILDREN(rack_pacing), 770 OID_AUTO, "burst_reduces", CTLFLAG_RW, 771 &rack_slot_reduction, 4, 772 "When doing only burst mitigation what is the reduce divisor"); 773 SYSCTL_ADD_S32(&rack_sysctl_ctx, 774 SYSCTL_CHILDREN(rack_sysctl_root), 775 OID_AUTO, "use_pacing", CTLFLAG_RW, 776 &rack_pace_every_seg, 0, 777 "If set we use pacing, if clear we use only the original burst mitigation"); 778 779 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 780 SYSCTL_CHILDREN(rack_sysctl_root), 781 OID_AUTO, 782 "timely", 783 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 784 "Rack Timely RTT Controls"); 785 /* Timely based GP dynmics */ 786 SYSCTL_ADD_S32(&rack_sysctl_ctx, 787 
SYSCTL_CHILDREN(rack_timely), 788 OID_AUTO, "upper", CTLFLAG_RW, 789 &rack_gp_per_bw_mul_up, 2, 790 "Rack timely upper range for equal b/w (in percentage)"); 791 SYSCTL_ADD_S32(&rack_sysctl_ctx, 792 SYSCTL_CHILDREN(rack_timely), 793 OID_AUTO, "lower", CTLFLAG_RW, 794 &rack_gp_per_bw_mul_down, 4, 795 "Rack timely lower range for equal b/w (in percentage)"); 796 SYSCTL_ADD_S32(&rack_sysctl_ctx, 797 SYSCTL_CHILDREN(rack_timely), 798 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 799 &rack_gp_rtt_maxmul, 3, 800 "Rack timely multipler of lowest rtt for rtt_max"); 801 SYSCTL_ADD_S32(&rack_sysctl_ctx, 802 SYSCTL_CHILDREN(rack_timely), 803 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 804 &rack_gp_rtt_mindiv, 4, 805 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 806 SYSCTL_ADD_S32(&rack_sysctl_ctx, 807 SYSCTL_CHILDREN(rack_timely), 808 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 809 &rack_gp_rtt_minmul, 1, 810 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); 811 SYSCTL_ADD_S32(&rack_sysctl_ctx, 812 SYSCTL_CHILDREN(rack_timely), 813 OID_AUTO, "decrease", CTLFLAG_RW, 814 &rack_gp_decrease_per, 20, 815 "Rack timely decrease percentage of our GP multiplication factor"); 816 SYSCTL_ADD_S32(&rack_sysctl_ctx, 817 SYSCTL_CHILDREN(rack_timely), 818 OID_AUTO, "increase", CTLFLAG_RW, 819 &rack_gp_increase_per, 2, 820 "Rack timely increase perentage of our GP multiplication factor"); 821 SYSCTL_ADD_S32(&rack_sysctl_ctx, 822 SYSCTL_CHILDREN(rack_timely), 823 OID_AUTO, "lowerbound", CTLFLAG_RW, 824 &rack_per_lower_bound, 50, 825 "Rack timely lowest percentage we allow GP multiplier to fall to"); 826 SYSCTL_ADD_S32(&rack_sysctl_ctx, 827 SYSCTL_CHILDREN(rack_timely), 828 OID_AUTO, "upperboundss", CTLFLAG_RW, 829 &rack_per_upper_bound_ss, 0, 830 "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 831 SYSCTL_ADD_S32(&rack_sysctl_ctx, 832 SYSCTL_CHILDREN(rack_timely), 833 OID_AUTO, "upperboundca", CTLFLAG_RW, 834 &rack_per_upper_bound_ca, 0, 835 "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 836 SYSCTL_ADD_S32(&rack_sysctl_ctx, 837 SYSCTL_CHILDREN(rack_timely), 838 OID_AUTO, "dynamicgp", CTLFLAG_RW, 839 &rack_do_dyn_mul, 0, 840 "Rack timely do we enable dynmaic timely goodput by default"); 841 SYSCTL_ADD_S32(&rack_sysctl_ctx, 842 SYSCTL_CHILDREN(rack_timely), 843 OID_AUTO, "no_rec_red", CTLFLAG_RW, 844 &rack_gp_no_rec_chg, 1, 845 "Rack timely do we prohibit the recovery multiplier from being lowered"); 846 SYSCTL_ADD_S32(&rack_sysctl_ctx, 847 SYSCTL_CHILDREN(rack_timely), 848 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 849 &rack_timely_dec_clear, 6, 850 "Rack timely what threshold do we count to before another boost during b/w decent"); 851 SYSCTL_ADD_S32(&rack_sysctl_ctx, 852 SYSCTL_CHILDREN(rack_timely), 853 OID_AUTO, "max_push_rise", CTLFLAG_RW, 854 &rack_timely_max_push_rise, 3, 855 "Rack timely how many times do we push up with b/w increase"); 856 SYSCTL_ADD_S32(&rack_sysctl_ctx, 857 SYSCTL_CHILDREN(rack_timely), 858 OID_AUTO, "max_push_drop", CTLFLAG_RW, 859 &rack_timely_max_push_drop, 3, 860 "Rack timely how many times do we push back on b/w decent"); 861 SYSCTL_ADD_S32(&rack_sysctl_ctx, 862 SYSCTL_CHILDREN(rack_timely), 863 OID_AUTO, "min_segs", CTLFLAG_RW, 864 &rack_timely_min_segs, 4, 865 "Rack timely when setting the cwnd what is the min num segments"); 866 SYSCTL_ADD_S32(&rack_sysctl_ctx, 867 SYSCTL_CHILDREN(rack_timely), 868 OID_AUTO, "noback_max", CTLFLAG_RW, 869 
&rack_use_max_for_nobackoff, 0, 870 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 871 SYSCTL_ADD_S32(&rack_sysctl_ctx, 872 SYSCTL_CHILDREN(rack_timely), 873 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 874 &rack_timely_int_timely_only, 0, 875 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 876 SYSCTL_ADD_S32(&rack_sysctl_ctx, 877 SYSCTL_CHILDREN(rack_timely), 878 OID_AUTO, "nonstop", CTLFLAG_RW, 879 &rack_timely_no_stopping, 0, 880 "Rack timely don't stop increase"); 881 SYSCTL_ADD_S32(&rack_sysctl_ctx, 882 SYSCTL_CHILDREN(rack_timely), 883 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 884 &rack_down_raise_thresh, 100, 885 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 886 SYSCTL_ADD_S32(&rack_sysctl_ctx, 887 SYSCTL_CHILDREN(rack_timely), 888 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 889 &rack_req_segs, 1, 890 "Bottom dragging if not these many segments outstanding and room"); 891 892 /* TLP and Rack related parameters */ 893 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 894 SYSCTL_CHILDREN(rack_sysctl_root), 895 OID_AUTO, 896 "tlp", 897 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 898 "TLP and Rack related Controls"); 899 SYSCTL_ADD_S32(&rack_sysctl_ctx, 900 SYSCTL_CHILDREN(rack_tlp), 901 OID_AUTO, "use_rrr", CTLFLAG_RW, 902 &use_rack_rr, 1, 903 "Do we use Rack Rapid Recovery"); 904 SYSCTL_ADD_S32(&rack_sysctl_ctx, 905 SYSCTL_CHILDREN(rack_tlp), 906 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 907 &rack_non_rxt_use_cr, 0, 908 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 909 SYSCTL_ADD_S32(&rack_sysctl_ctx, 910 SYSCTL_CHILDREN(rack_tlp), 911 OID_AUTO, "tlpmethod", CTLFLAG_RW, 912 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 913 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 914 SYSCTL_ADD_S32(&rack_sysctl_ctx, 915 SYSCTL_CHILDREN(rack_tlp), 916 OID_AUTO, "limit", CTLFLAG_RW, 917 &rack_tlp_limit, 2, 918 "How many TLP's can be sent without sending new data"); 919 SYSCTL_ADD_S32(&rack_sysctl_ctx, 920 SYSCTL_CHILDREN(rack_tlp), 921 OID_AUTO, "use_greater", CTLFLAG_RW, 922 &rack_tlp_use_greater, 1, 923 "Should we use the rack_rtt time if its greater than srtt"); 924 SYSCTL_ADD_S32(&rack_sysctl_ctx, 925 SYSCTL_CHILDREN(rack_tlp), 926 OID_AUTO, "tlpminto", CTLFLAG_RW, 927 &rack_tlp_min, 10, 928 "TLP minimum timeout per the specification (10ms)"); 929 SYSCTL_ADD_S32(&rack_sysctl_ctx, 930 SYSCTL_CHILDREN(rack_tlp), 931 OID_AUTO, "send_oldest", CTLFLAG_RW, 932 &rack_always_send_oldest, 0, 933 "Should we always send the oldest TLP and RACK-TLP"); 934 SYSCTL_ADD_S32(&rack_sysctl_ctx, 935 SYSCTL_CHILDREN(rack_tlp), 936 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 937 &rack_limited_retran, 0, 938 "How many times can a rack timeout drive out sends"); 939 SYSCTL_ADD_S32(&rack_sysctl_ctx, 940 SYSCTL_CHILDREN(rack_tlp), 941 OID_AUTO, "tlp_retry", CTLFLAG_RW, 942 &rack_tlp_max_resend, 2, 943 "How many times does TLP retry a single segment or multiple with no ACK"); 944 SYSCTL_ADD_S32(&rack_sysctl_ctx, 945 SYSCTL_CHILDREN(rack_tlp), 946 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 947 &rack_lower_cwnd_at_tlp, 0, 948 "When a TLP completes a retran should we enter recovery"); 949 SYSCTL_ADD_S32(&rack_sysctl_ctx, 950 SYSCTL_CHILDREN(rack_tlp), 951 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 952 &rack_reorder_thresh, 2, 953 "What factor for rack will be added when seeing reordering (shift right)"); 954 SYSCTL_ADD_S32(&rack_sysctl_ctx, 955 SYSCTL_CHILDREN(rack_tlp), 956 
OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 957 &rack_tlp_thresh, 1, 958 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 959 SYSCTL_ADD_S32(&rack_sysctl_ctx, 960 SYSCTL_CHILDREN(rack_tlp), 961 OID_AUTO, "reorder_fade", CTLFLAG_RW, 962 &rack_reorder_fade, 0, 963 "Does reorder detection fade, if so how many ms (0 means never)"); 964 SYSCTL_ADD_S32(&rack_sysctl_ctx, 965 SYSCTL_CHILDREN(rack_tlp), 966 OID_AUTO, "pktdelay", CTLFLAG_RW, 967 &rack_pkt_delay, 1, 968 "Extra RACK time (in ms) besides reordering thresh"); 969 970 /* Timer related controls */ 971 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 972 SYSCTL_CHILDREN(rack_sysctl_root), 973 OID_AUTO, 974 "timers", 975 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 976 "Timer related controls"); 977 SYSCTL_ADD_U32(&rack_sysctl_ctx, 978 SYSCTL_CHILDREN(rack_timers), 979 OID_AUTO, "persmin", CTLFLAG_RW, 980 &rack_persist_min, 250, 981 "What is the minimum time in milliseconds between persists"); 982 SYSCTL_ADD_U32(&rack_sysctl_ctx, 983 SYSCTL_CHILDREN(rack_timers), 984 OID_AUTO, "persmax", CTLFLAG_RW, 985 &rack_persist_max, 2000, 986 "What is the largest delay in milliseconds between persists"); 987 SYSCTL_ADD_S32(&rack_sysctl_ctx, 988 SYSCTL_CHILDREN(rack_timers), 989 OID_AUTO, "delayed_ack", CTLFLAG_RW, 990 &rack_delayed_ack_time, 200, 991 "Delayed ack time (200ms)"); 992 SYSCTL_ADD_S32(&rack_sysctl_ctx, 993 SYSCTL_CHILDREN(rack_timers), 994 OID_AUTO, "minrto", CTLFLAG_RW, 995 &rack_rto_min, 0, 996 "Minimum RTO in ms -- set with caution below 1000 due to TLP"); 997 SYSCTL_ADD_S32(&rack_sysctl_ctx, 998 SYSCTL_CHILDREN(rack_timers), 999 OID_AUTO, "maxrto", CTLFLAG_RW, 1000 &rack_rto_max, 0, 1001 "Maxiumum RTO in ms -- should be at least as large as min_rto"); 1002 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1003 SYSCTL_CHILDREN(rack_timers), 1004 OID_AUTO, "minto", CTLFLAG_RW, 1005 &rack_min_to, 1, 1006 "Minimum rack timeout in milliseconds"); 1007 /* Measure controls */ 1008 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1009 SYSCTL_CHILDREN(rack_sysctl_root), 1010 OID_AUTO, 1011 "measure", 1012 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1013 "Measure related controls"); 1014 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1015 SYSCTL_CHILDREN(rack_measure), 1016 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1017 &rack_wma_divisor, 8, 1018 "When doing b/w calculation what is the divisor for the WMA"); 1019 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1020 SYSCTL_CHILDREN(rack_measure), 1021 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1022 &rack_cwnd_block_ends_measure, 0, 1023 "Does a cwnd just-return end the measurement window (app limited)"); 1024 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1025 SYSCTL_CHILDREN(rack_measure), 1026 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1027 &rack_rwnd_block_ends_measure, 0, 1028 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1029 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1030 SYSCTL_CHILDREN(rack_measure), 1031 OID_AUTO, "min_target", CTLFLAG_RW, 1032 &rack_def_data_window, 20, 1033 "What is the minimum target window (in mss) for a GP measurements"); 1034 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1035 SYSCTL_CHILDREN(rack_measure), 1036 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1037 &rack_goal_bdp, 2, 1038 "What is the goal BDP to measure"); 1039 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1040 SYSCTL_CHILDREN(rack_measure), 1041 OID_AUTO, "min_srtts", CTLFLAG_RW, 1042 &rack_min_srtts, 1, 1043 "What is the goal BDP to measure"); 1044 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1045 SYSCTL_CHILDREN(rack_measure), 1046 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1047 &rack_min_measure_usec, 0, 
1048 "What is the Minimum time time for a measurement if 0, this is off"); 1049 /* Misc rack controls */ 1050 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1051 SYSCTL_CHILDREN(rack_sysctl_root), 1052 OID_AUTO, 1053 "misc", 1054 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1055 "Misc related controls"); 1056 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1057 SYSCTL_CHILDREN(rack_misc), 1058 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1059 &rack_enable_shared_cwnd, 0, 1060 "Should RACK try to use the shared cwnd on connections where allowed"); 1061 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1062 SYSCTL_CHILDREN(rack_misc), 1063 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1064 &rack_limits_scwnd, 1, 1065 "Should RACK place low end time limits on the shared cwnd feature"); 1066 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1067 SYSCTL_CHILDREN(rack_misc), 1068 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1069 &rack_enable_mqueue_for_nonpaced, 0, 1070 "Should RACK use mbuf queuing for non-paced connections"); 1071 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1072 SYSCTL_CHILDREN(rack_misc), 1073 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1074 &rack_use_imac_dack, 0, 1075 "Should RACK try to emulate iMac delayed ack"); 1076 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1077 SYSCTL_CHILDREN(rack_misc), 1078 OID_AUTO, "no_prr", CTLFLAG_RW, 1079 &rack_disable_prr, 0, 1080 "Should RACK not use prr and only pace (must have pacing on)"); 1081 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1082 SYSCTL_CHILDREN(rack_misc), 1083 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1084 &rack_verbose_logging, 0, 1085 "Should RACK black box logging be verbose"); 1086 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1087 SYSCTL_CHILDREN(rack_misc), 1088 OID_AUTO, "data_after_close", CTLFLAG_RW, 1089 &rack_ignore_data_after_close, 1, 1090 "Do we hold off sending a RST until all pending data is ack'd"); 1091 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1092 SYSCTL_CHILDREN(rack_misc), 1093 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1094 &rack_sack_not_required, 0, 1095 "Do we allow rack to run on connections not supporting SACK"); 1096 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1097 SYSCTL_CHILDREN(rack_misc), 1098 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, 1099 &rack_use_proportional_reduce, 0, 1100 "Should we proportionaly reduce cwnd based on the number of losses "); 1101 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1102 SYSCTL_CHILDREN(rack_misc), 1103 OID_AUTO, "recovery_prop", CTLFLAG_RW, 1104 &rack_proportional_rate, 10, 1105 "What percent reduction per loss"); 1106 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1107 SYSCTL_CHILDREN(rack_misc), 1108 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1109 &rack_send_a_lot_in_prr, 1, 1110 "Send a lot in prr"); 1111 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1112 SYSCTL_CHILDREN(rack_misc), 1113 OID_AUTO, "earlyrecovery", CTLFLAG_RW, 1114 &rack_early_recovery, 1, 1115 "Do we do early recovery with rack"); 1116 /* Sack Attacker detection stuff */ 1117 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1118 SYSCTL_CHILDREN(rack_attack), 1119 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1120 &rack_highest_sack_thresh_seen, 0, 1121 "Highest sack to ack ratio seen"); 1122 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1123 SYSCTL_CHILDREN(rack_attack), 1124 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1125 &rack_highest_move_thresh_seen, 0, 1126 "Highest move to non-move ratio seen"); 1127 rack_ack_total = counter_u64_alloc(M_WAITOK); 1128 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1129 SYSCTL_CHILDREN(rack_attack), 1130 OID_AUTO, "acktotal", CTLFLAG_RD, 1131 &rack_ack_total, 1132 "Total number of Ack's"); 1133 rack_express_sack = counter_u64_alloc(M_WAITOK); 1134 
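	/*
	 * Usage note (editor's example; the exact OID prefix depends on
	 * where rack_sysctl_root is attached, commonly net.inet.tcp.rack):
	 * the counters allocated in this function are exported read-only
	 * under the "stats" and "sack_attack" nodes, and all of them can
	 * be reset from userland through the "clear" proc handler that is
	 * registered at the end of this function, e.g.:
	 *
	 *	sysctl net.inet.tcp.rack.sack_attack.acktotal
	 *	sysctl net.inet.tcp.rack.clear=1
	 */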
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1135 SYSCTL_CHILDREN(rack_attack), 1136 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1137 &rack_express_sack, 1138 "Total expresss number of Sack's"); 1139 rack_sack_total = counter_u64_alloc(M_WAITOK); 1140 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1141 SYSCTL_CHILDREN(rack_attack), 1142 OID_AUTO, "sacktotal", CTLFLAG_RD, 1143 &rack_sack_total, 1144 "Total number of SACKs"); 1145 rack_move_none = counter_u64_alloc(M_WAITOK); 1146 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1147 SYSCTL_CHILDREN(rack_attack), 1148 OID_AUTO, "move_none", CTLFLAG_RD, 1149 &rack_move_none, 1150 "Total number of SACK index reuse of postions under threshold"); 1151 rack_move_some = counter_u64_alloc(M_WAITOK); 1152 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1153 SYSCTL_CHILDREN(rack_attack), 1154 OID_AUTO, "move_some", CTLFLAG_RD, 1155 &rack_move_some, 1156 "Total number of SACK index reuse of postions over threshold"); 1157 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1158 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1159 SYSCTL_CHILDREN(rack_attack), 1160 OID_AUTO, "attacks", CTLFLAG_RD, 1161 &rack_sack_attacks_detected, 1162 "Total number of SACK attackers that had sack disabled"); 1163 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1164 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1165 SYSCTL_CHILDREN(rack_attack), 1166 OID_AUTO, "reversed", CTLFLAG_RD, 1167 &rack_sack_attacks_reversed, 1168 "Total number of SACK attackers that were later determined false positive"); 1169 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1170 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1171 SYSCTL_CHILDREN(rack_attack), 1172 OID_AUTO, "nextmerge", CTLFLAG_RD, 1173 &rack_sack_used_next_merge, 1174 "Total number of times we used the next merge"); 1175 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1176 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1177 SYSCTL_CHILDREN(rack_attack), 1178 OID_AUTO, "prevmerge", CTLFLAG_RD, 1179 &rack_sack_used_prev_merge, 1180 "Total number of times we used the prev merge"); 1181 /* Counters */ 1182 rack_badfr = counter_u64_alloc(M_WAITOK); 1183 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1184 SYSCTL_CHILDREN(rack_counters), 1185 OID_AUTO, "badfr", CTLFLAG_RD, 1186 &rack_badfr, "Total number of bad FRs"); 1187 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 1188 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1189 SYSCTL_CHILDREN(rack_counters), 1190 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 1191 &rack_badfr_bytes, "Total number of bad FRs"); 1192 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 1193 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1194 SYSCTL_CHILDREN(rack_counters), 1195 OID_AUTO, "prrsndret", CTLFLAG_RD, 1196 &rack_rtm_prr_retran, 1197 "Total number of prr based retransmits"); 1198 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 1199 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1200 SYSCTL_CHILDREN(rack_counters), 1201 OID_AUTO, "prrsndnew", CTLFLAG_RD, 1202 &rack_rtm_prr_newdata, 1203 "Total number of prr based new transmits"); 1204 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 1205 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1206 SYSCTL_CHILDREN(rack_counters), 1207 OID_AUTO, "tsnf", CTLFLAG_RD, 1208 &rack_timestamp_mismatch, 1209 "Total number of timestamps that we could not find the reported ts"); 1210 rack_find_high = counter_u64_alloc(M_WAITOK); 1211 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1212 SYSCTL_CHILDREN(rack_counters), 1213 OID_AUTO, "findhigh", CTLFLAG_RD, 1214 &rack_find_high, 1215 "Total number of FIN causing 
find-high"); 1216 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 1217 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1218 SYSCTL_CHILDREN(rack_counters), 1219 OID_AUTO, "reordering", CTLFLAG_RD, 1220 &rack_reorder_seen, 1221 "Total number of times we added delay due to reordering"); 1222 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1223 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1224 SYSCTL_CHILDREN(rack_counters), 1225 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1226 &rack_tlp_tot, 1227 "Total number of tail loss probe expirations"); 1228 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1229 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1230 SYSCTL_CHILDREN(rack_counters), 1231 OID_AUTO, "tlp_new", CTLFLAG_RD, 1232 &rack_tlp_newdata, 1233 "Total number of tail loss probe sending new data"); 1234 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1235 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1236 SYSCTL_CHILDREN(rack_counters), 1237 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1238 &rack_tlp_retran, 1239 "Total number of tail loss probe sending retransmitted data"); 1240 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1241 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1242 SYSCTL_CHILDREN(rack_counters), 1243 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1244 &rack_tlp_retran_bytes, 1245 "Total bytes of tail loss probe sending retransmitted data"); 1246 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 1247 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1248 SYSCTL_CHILDREN(rack_counters), 1249 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 1250 &rack_tlp_retran_fail, 1251 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 1252 rack_to_tot = counter_u64_alloc(M_WAITOK); 1253 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1254 SYSCTL_CHILDREN(rack_counters), 1255 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1256 &rack_to_tot, 1257 "Total number of times the rack to expired"); 1258 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 1259 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1260 SYSCTL_CHILDREN(rack_counters), 1261 OID_AUTO, "arm_rack", CTLFLAG_RD, 1262 &rack_to_arm_rack, 1263 "Total number of times the rack timer armed"); 1264 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 1265 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1266 SYSCTL_CHILDREN(rack_counters), 1267 OID_AUTO, "arm_tlp", CTLFLAG_RD, 1268 &rack_to_arm_tlp, 1269 "Total number of times the tlp timer armed"); 1270 rack_calc_zero = counter_u64_alloc(M_WAITOK); 1271 rack_calc_nonzero = counter_u64_alloc(M_WAITOK); 1272 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1273 SYSCTL_CHILDREN(rack_counters), 1274 OID_AUTO, "calc_zero", CTLFLAG_RD, 1275 &rack_calc_zero, 1276 "Total number of times pacing time worked out to zero"); 1277 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1278 SYSCTL_CHILDREN(rack_counters), 1279 OID_AUTO, "calc_nonzero", CTLFLAG_RD, 1280 &rack_calc_nonzero, 1281 "Total number of times pacing time worked out to non-zero"); 1282 rack_paced_segments = counter_u64_alloc(M_WAITOK); 1283 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1284 SYSCTL_CHILDREN(rack_counters), 1285 OID_AUTO, "paced", CTLFLAG_RD, 1286 &rack_paced_segments, 1287 "Total number of times a segment send caused hptsi"); 1288 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 1289 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1290 SYSCTL_CHILDREN(rack_counters), 1291 OID_AUTO, "unpaced", CTLFLAG_RD, 1292 &rack_unpaced_segments, 1293 "Total number of times a segment did not cause hptsi"); 1294 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1295 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1296 
SYSCTL_CHILDREN(rack_counters), 1297 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1298 &rack_saw_enobuf, 1299 "Total number of times a segment did not cause hptsi"); 1300 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1301 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1302 SYSCTL_CHILDREN(rack_counters), 1303 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1304 &rack_saw_enetunreach, 1305 "Total number of times a segment did not cause hptsi"); 1306 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1307 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1308 SYSCTL_CHILDREN(rack_counters), 1309 OID_AUTO, "allocs", CTLFLAG_RD, 1310 &rack_to_alloc, 1311 "Total allocations of tracking structures"); 1312 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1313 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1314 SYSCTL_CHILDREN(rack_counters), 1315 OID_AUTO, "allochard", CTLFLAG_RD, 1316 &rack_to_alloc_hard, 1317 "Total allocations done with sleeping the hard way"); 1318 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1319 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1320 SYSCTL_CHILDREN(rack_counters), 1321 OID_AUTO, "allocemerg", CTLFLAG_RD, 1322 &rack_to_alloc_emerg, 1323 "Total allocations done from emergency cache"); 1324 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1325 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1326 SYSCTL_CHILDREN(rack_counters), 1327 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1328 &rack_to_alloc_limited, 1329 "Total allocations dropped due to limit"); 1330 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1331 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1332 SYSCTL_CHILDREN(rack_counters), 1333 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1334 &rack_alloc_limited_conns, 1335 "Connections with allocations dropped due to limit"); 1336 rack_split_limited = counter_u64_alloc(M_WAITOK); 1337 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1338 SYSCTL_CHILDREN(rack_counters), 1339 OID_AUTO, "split_limited", CTLFLAG_RD, 1340 &rack_split_limited, 1341 "Split allocations dropped due to limit"); 1342 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1343 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1344 SYSCTL_CHILDREN(rack_counters), 1345 OID_AUTO, "sack_long", CTLFLAG_RD, 1346 &rack_sack_proc_all, 1347 "Total times we had to walk whole list for sack processing"); 1348 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1349 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1350 SYSCTL_CHILDREN(rack_counters), 1351 OID_AUTO, "sack_restart", CTLFLAG_RD, 1352 &rack_sack_proc_restart, 1353 "Total times we had to walk whole list due to a restart"); 1354 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1355 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1356 SYSCTL_CHILDREN(rack_counters), 1357 OID_AUTO, "sack_short", CTLFLAG_RD, 1358 &rack_sack_proc_short, 1359 "Total times we took shortcut for sack processing"); 1360 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 1361 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1362 SYSCTL_CHILDREN(rack_counters), 1363 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 1364 &rack_enter_tlp_calc, 1365 "Total times we called calc-tlp"); 1366 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 1367 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1368 SYSCTL_CHILDREN(rack_counters), 1369 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 1370 &rack_used_tlpmethod, 1371 "Total number of runt sacks"); 1372 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 1373 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1374 SYSCTL_CHILDREN(rack_counters), 1375 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 1376 &rack_used_tlpmethod2, 1377 "Total number of 
times we hit TLP method 2"); 1378 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1379 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1380 SYSCTL_CHILDREN(rack_attack), 1381 OID_AUTO, "skipacked", CTLFLAG_RD, 1382 &rack_sack_skipped_acked, 1383 "Total number of times we skipped previously sacked"); 1384 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1385 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1386 SYSCTL_CHILDREN(rack_attack), 1387 OID_AUTO, "ofsplit", CTLFLAG_RD, 1388 &rack_sack_splits, 1389 "Total number of times we did the old fashion tree split"); 1390 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1391 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1392 SYSCTL_CHILDREN(rack_counters), 1393 OID_AUTO, "prog_drops", CTLFLAG_RD, 1394 &rack_progress_drops, 1395 "Total number of progress drops"); 1396 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1397 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1398 SYSCTL_CHILDREN(rack_counters), 1399 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1400 &rack_input_idle_reduces, 1401 "Total number of idle reductions on input"); 1402 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1403 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1404 SYSCTL_CHILDREN(rack_counters), 1405 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1406 &rack_collapsed_win, 1407 "Total number of collapsed windows"); 1408 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1409 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1410 SYSCTL_CHILDREN(rack_counters), 1411 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1412 &rack_tlp_does_nada, 1413 "Total number of nada tlp calls"); 1414 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1415 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1416 SYSCTL_CHILDREN(rack_counters), 1417 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1418 &rack_try_scwnd, 1419 "Total number of scwnd attempts"); 1420 1421 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1422 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1423 SYSCTL_CHILDREN(rack_counters), 1424 OID_AUTO, "timer_hole", CTLFLAG_RD, 1425 &rack_per_timer_hole, 1426 "Total persists start in timer hole"); 1427 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1428 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1429 OID_AUTO, "outsize", CTLFLAG_RD, 1430 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1431 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1432 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1433 OID_AUTO, "opts", CTLFLAG_RD, 1434 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1435 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1436 SYSCTL_CHILDREN(rack_sysctl_root), 1437 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1438 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1439 } 1440 1441 static __inline int 1442 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1443 { 1444 if (SEQ_GEQ(b->r_start, a->r_start) && 1445 SEQ_LT(b->r_start, a->r_end)) { 1446 /* 1447 * The entry b is within the 1448 * block a. i.e.: 1449 * a -- |-------------| 1450 * b -- |----| 1451 * <or> 1452 * b -- |------| 1453 * <or> 1454 * b -- |-----------| 1455 */ 1456 return (0); 1457 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1458 /* 1459 * b falls as either the next 1460 * sequence block after a so a 1461 * is said to be smaller than b. 1462 * i.e: 1463 * a -- |------| 1464 * b -- |--------| 1465 * or 1466 * b -- |-----| 1467 */ 1468 return (1); 1469 } 1470 /* 1471 * Whats left is where a is 1472 * larger than b. 
i.e: 1473 * a -- |-------| 1474 * b -- |---| 1475 * or even possibly 1476 * b -- |--------------| 1477 */ 1478 return (-1); 1479 } 1480 1481 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1482 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1483 1484 static uint32_t 1485 rc_init_window(struct tcp_rack *rack) 1486 { 1487 uint32_t win; 1488 1489 if (rack->rc_init_win == 0) { 1490 /* 1491 * Nothing set by the user, use the system stack 1492 * default. 1493 */ 1494 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1495 } 1496 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1497 return(win); 1498 } 1499 1500 static uint64_t 1501 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1502 { 1503 if (IN_RECOVERY(rack->rc_tp->t_flags)) 1504 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1505 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1506 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1507 else 1508 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1509 } 1510 1511 static uint64_t 1512 rack_get_bw(struct tcp_rack *rack) 1513 { 1514 if (rack->use_fixed_rate) { 1515 /* Return the fixed pacing rate */ 1516 return (rack_get_fixed_pacing_bw(rack)); 1517 } 1518 if (rack->r_ctl.gp_bw == 0) { 1519 /* 1520 * We have yet no b/w measurement, 1521 * if we have a user set initial bw 1522 * return it. If we don't have that and 1523 * we have an srtt, use the tcp IW (10) to 1524 * calculate a fictional b/w over the SRTT 1525 * which is more or less a guess. Note 1526 * we don't use our IW from rack on purpose 1527 * so if we have like IW=30, we are not 1528 * calculating a "huge" b/w. 1529 */ 1530 uint64_t bw, srtt; 1531 if (rack->r_ctl.init_rate) 1532 return (rack->r_ctl.init_rate); 1533 1534 /* Has the user set a max peak rate? */ 1535 #ifdef NETFLIX_PEAKRATE 1536 if (rack->rc_tp->t_maxpeakrate) 1537 return (rack->rc_tp->t_maxpeakrate); 1538 #endif 1539 /* Ok lets come up with the IW guess, if we have a srtt */ 1540 if (rack->rc_tp->t_srtt == 0) { 1541 /* 1542 * Go with old pacing method 1543 * i.e. burst mitigation only. 1544 */ 1545 return (0); 1546 } 1547 /* Ok lets get the initial TCP win (not racks) */ 1548 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1549 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 1550 bw *= (uint64_t)USECS_IN_SECOND; 1551 bw /= srtt; 1552 return (bw); 1553 } else { 1554 uint64_t bw; 1555 1556 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { 1557 /* Averaging is done, we can return the value */ 1558 bw = rack->r_ctl.gp_bw; 1559 } else { 1560 /* Still doing initial average must calculate */ 1561 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; 1562 } 1563 #ifdef NETFLIX_PEAKRATE 1564 if ((rack->rc_tp->t_maxpeakrate) && 1565 (bw > rack->rc_tp->t_maxpeakrate)) { 1566 /* The user has set a peak rate to pace at 1567 * don't allow us to pace faster than that. 
1568 */ 1569 return (rack->rc_tp->t_maxpeakrate); 1570 } 1571 #endif 1572 return (bw); 1573 } 1574 } 1575 1576 static uint16_t 1577 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1578 { 1579 if (rack->use_fixed_rate) { 1580 return (100); 1581 } else if (rack->in_probe_rtt && (rsm == NULL)) 1582 return(rack->r_ctl.rack_per_of_gp_probertt); 1583 else if ((IN_RECOVERY(rack->rc_tp->t_flags) && 1584 rack->r_ctl.rack_per_of_gp_rec)) { 1585 if (rsm) { 1586 /* a retransmission always use the recovery rate */ 1587 return(rack->r_ctl.rack_per_of_gp_rec); 1588 } else if (rack->rack_rec_nonrxt_use_cr) { 1589 /* Directed to use the configured rate */ 1590 goto configured_rate; 1591 } else if (rack->rack_no_prr && 1592 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1593 /* No PRR, lets just use the b/w estimate only */ 1594 return(100); 1595 } else { 1596 /* 1597 * Here we may have a non-retransmit but we 1598 * have no overrides, so just use the recovery 1599 * rate (prr is in effect). 1600 */ 1601 return(rack->r_ctl.rack_per_of_gp_rec); 1602 } 1603 } 1604 configured_rate: 1605 /* For the configured rate we look at our cwnd vs the ssthresh */ 1606 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1607 return (rack->r_ctl.rack_per_of_gp_ss); 1608 else 1609 return(rack->r_ctl.rack_per_of_gp_ca); 1610 } 1611 1612 static uint64_t 1613 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) 1614 { 1615 /* 1616 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 1617 */ 1618 uint64_t bw_est; 1619 uint64_t gain; 1620 1621 gain = (uint64_t)rack_get_output_gain(rack, rsm); 1622 bw_est = bw * gain; 1623 bw_est /= (uint64_t)100; 1624 /* Never fall below the minimum (def 64kbps) */ 1625 if (bw_est < RACK_MIN_BW) 1626 bw_est = RACK_MIN_BW; 1627 return (bw_est); 1628 } 1629 1630 static void 1631 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 1632 { 1633 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1634 union tcp_log_stackspecific log; 1635 struct timeval tv; 1636 1637 if ((mod != 1) && (rack_verbose_logging == 0)) { 1638 /* 1639 * We get 3 values currently for mod 1640 * 1 - We are retransmitting and this tells the reason. 1641 * 2 - We are clearing a dup-ack count. 1642 * 3 - We are incrementing a dup-ack count. 1643 * 1644 * The clear/increment are only logged 1645 * if you have BBverbose on. 
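			 * (i.e. only when the rack_verbose_logging sysctl is set).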
1646 */ 1647 return; 1648 } 1649 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1650 log.u_bbr.flex1 = tsused; 1651 log.u_bbr.flex2 = thresh; 1652 log.u_bbr.flex3 = rsm->r_flags; 1653 log.u_bbr.flex4 = rsm->r_dupack; 1654 log.u_bbr.flex5 = rsm->r_start; 1655 log.u_bbr.flex6 = rsm->r_end; 1656 log.u_bbr.flex8 = mod; 1657 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1658 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1659 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1660 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1661 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1662 &rack->rc_inp->inp_socket->so_rcv, 1663 &rack->rc_inp->inp_socket->so_snd, 1664 BBR_LOG_SETTINGS_CHG, 0, 1665 0, &log, false, &tv); 1666 } 1667 } 1668 1669 static void 1670 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 1671 { 1672 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1673 union tcp_log_stackspecific log; 1674 struct timeval tv; 1675 1676 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1677 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 1678 log.u_bbr.flex2 = to * 1000; 1679 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1680 log.u_bbr.flex4 = slot; 1681 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 1682 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1683 log.u_bbr.flex7 = rack->rc_in_persist; 1684 log.u_bbr.flex8 = which; 1685 if (rack->rack_no_prr) 1686 log.u_bbr.pkts_out = 0; 1687 else 1688 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1689 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1690 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1691 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1692 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1693 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1694 &rack->rc_inp->inp_socket->so_rcv, 1695 &rack->rc_inp->inp_socket->so_snd, 1696 BBR_LOG_TIMERSTAR, 0, 1697 0, &log, false, &tv); 1698 } 1699 } 1700 1701 static void 1702 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 1703 { 1704 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1705 union tcp_log_stackspecific log; 1706 struct timeval tv; 1707 1708 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1709 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1710 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1711 log.u_bbr.flex8 = to_num; 1712 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 1713 log.u_bbr.flex2 = rack->rc_rack_rtt; 1714 if (rsm == NULL) 1715 log.u_bbr.flex3 = 0; 1716 else 1717 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 1718 if (rack->rack_no_prr) 1719 log.u_bbr.flex5 = 0; 1720 else 1721 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1722 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1723 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1724 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1725 &rack->rc_inp->inp_socket->so_rcv, 1726 &rack->rc_inp->inp_socket->so_snd, 1727 BBR_LOG_RTO, 0, 1728 0, &log, false, &tv); 1729 } 1730 } 1731 1732 static void 1733 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 1734 struct rack_sendmap *rsm, int conf) 1735 { 1736 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1737 union tcp_log_stackspecific log; 1738 struct timeval tv; 1739 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1740 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1741 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1742 log.u_bbr.flex1 = t; 1743 log.u_bbr.flex2 = len; 1744 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; 1745 log.u_bbr.flex4 = 
rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; 1746 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; 1747 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 1748 log.u_bbr.flex7 = conf; 1749 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; 1750 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 1751 if (rack->rack_no_prr) 1752 log.u_bbr.pkts_out = 0; 1753 else 1754 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1755 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1756 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; 1757 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 1758 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1759 if (rsm) { 1760 log.u_bbr.pkt_epoch = rsm->r_start; 1761 log.u_bbr.lost = rsm->r_end; 1762 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 1763 } else { 1764 /* Its a SYN */ 1765 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 1766 log.u_bbr.lost = 0; 1767 log.u_bbr.cwnd_gain = 0; 1768 } 1769 /* Write out general bits of interest rrs here */ 1770 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 1771 log.u_bbr.use_lt_bw <<= 1; 1772 log.u_bbr.use_lt_bw |= rack->forced_ack; 1773 log.u_bbr.use_lt_bw <<= 1; 1774 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 1775 log.u_bbr.use_lt_bw <<= 1; 1776 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 1777 log.u_bbr.use_lt_bw <<= 1; 1778 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 1779 log.u_bbr.use_lt_bw <<= 1; 1780 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 1781 log.u_bbr.use_lt_bw <<= 1; 1782 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 1783 log.u_bbr.use_lt_bw <<= 1; 1784 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 1785 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 1786 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 1787 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 1788 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 1789 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 1790 TCP_LOG_EVENTP(tp, NULL, 1791 &rack->rc_inp->inp_socket->so_rcv, 1792 &rack->rc_inp->inp_socket->so_snd, 1793 BBR_LOG_BBRRTT, 0, 1794 0, &log, false, &tv); 1795 } 1796 } 1797 1798 static void 1799 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 1800 { 1801 /* 1802 * Log the rtt sample we are 1803 * applying to the srtt algorithm in 1804 * useconds. 
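	 * The rtt passed in is in milliseconds; it is converted to
	 * microseconds (flex1 = rtt * 1000) before it is logged.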
1805 */ 1806 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1807 union tcp_log_stackspecific log; 1808 struct timeval tv; 1809 1810 /* Convert our ms to a microsecond */ 1811 memset(&log, 0, sizeof(log)); 1812 log.u_bbr.flex1 = rtt * 1000; 1813 log.u_bbr.flex2 = rack->r_ctl.ack_count; 1814 log.u_bbr.flex3 = rack->r_ctl.sack_count; 1815 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 1816 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 1817 log.u_bbr.flex8 = rack->sack_attack_disable; 1818 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1819 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1820 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1821 &rack->rc_inp->inp_socket->so_rcv, 1822 &rack->rc_inp->inp_socket->so_snd, 1823 TCP_LOG_RTT, 0, 1824 0, &log, false, &tv); 1825 } 1826 } 1827 1828 static inline void 1829 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 1830 { 1831 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 1832 union tcp_log_stackspecific log; 1833 struct timeval tv; 1834 1835 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1836 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1837 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1838 log.u_bbr.flex1 = line; 1839 log.u_bbr.flex2 = tick; 1840 log.u_bbr.flex3 = tp->t_maxunacktime; 1841 log.u_bbr.flex4 = tp->t_acktime; 1842 log.u_bbr.flex8 = event; 1843 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1844 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1845 TCP_LOG_EVENTP(tp, NULL, 1846 &rack->rc_inp->inp_socket->so_rcv, 1847 &rack->rc_inp->inp_socket->so_snd, 1848 BBR_LOG_PROGRESS, 0, 1849 0, &log, false, &tv); 1850 } 1851 } 1852 1853 static void 1854 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 1855 { 1856 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1857 union tcp_log_stackspecific log; 1858 1859 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1860 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1861 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1862 log.u_bbr.flex1 = slot; 1863 if (rack->rack_no_prr) 1864 log.u_bbr.flex2 = 0; 1865 else 1866 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 1867 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 1868 log.u_bbr.flex8 = rack->rc_in_persist; 1869 log.u_bbr.timeStamp = cts; 1870 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1871 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1872 &rack->rc_inp->inp_socket->so_rcv, 1873 &rack->rc_inp->inp_socket->so_snd, 1874 BBR_LOG_BBRSND, 0, 1875 0, &log, false, tv); 1876 } 1877 } 1878 1879 static void 1880 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1881 { 1882 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1883 union tcp_log_stackspecific log; 1884 struct timeval tv; 1885 1886 memset(&log, 0, sizeof(log)); 1887 log.u_bbr.flex1 = did_out; 1888 log.u_bbr.flex2 = nxt_pkt; 1889 log.u_bbr.flex3 = way_out; 1890 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1891 if (rack->rack_no_prr) 1892 log.u_bbr.flex5 = 0; 1893 else 1894 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1895 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 1896 log.u_bbr.flex7 = rack->r_wanted_output; 1897 log.u_bbr.flex8 = rack->rc_in_persist; 1898 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1899 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1900 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1901 
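		/* All fields are filled in; emit the BBR_LOG_DOSEG_DONE record. */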
TCP_LOG_EVENTP(rack->rc_tp, NULL, 1902 &rack->rc_inp->inp_socket->so_rcv, 1903 &rack->rc_inp->inp_socket->so_snd, 1904 BBR_LOG_DOSEG_DONE, 0, 1905 0, &log, false, &tv); 1906 } 1907 } 1908 1909 static void 1910 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) 1911 { 1912 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1913 union tcp_log_stackspecific log; 1914 struct timeval tv; 1915 uint32_t cts; 1916 1917 memset(&log, 0, sizeof(log)); 1918 cts = tcp_get_usecs(&tv); 1919 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 1920 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 1921 log.u_bbr.flex4 = len; 1922 log.u_bbr.flex5 = orig_len; 1923 log.u_bbr.flex6 = rack->r_ctl.rc_sacked; 1924 log.u_bbr.flex7 = mod; 1925 log.u_bbr.flex8 = frm; 1926 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1927 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1928 TCP_LOG_EVENTP(tp, NULL, 1929 &tp->t_inpcb->inp_socket->so_rcv, 1930 &tp->t_inpcb->inp_socket->so_snd, 1931 TCP_HDWR_PACE_SIZE, 0, 1932 0, &log, false, &tv); 1933 } 1934 } 1935 1936 static void 1937 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 1938 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 1939 { 1940 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1941 union tcp_log_stackspecific log; 1942 struct timeval tv; 1943 1944 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1945 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1946 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1947 log.u_bbr.flex1 = slot; 1948 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1949 log.u_bbr.flex4 = reason; 1950 if (rack->rack_no_prr) 1951 log.u_bbr.flex5 = 0; 1952 else 1953 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1954 log.u_bbr.flex7 = hpts_calling; 1955 log.u_bbr.flex8 = rack->rc_in_persist; 1956 log.u_bbr.lt_epoch = cwnd_to_use; 1957 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1958 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1959 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1960 &rack->rc_inp->inp_socket->so_rcv, 1961 &rack->rc_inp->inp_socket->so_snd, 1962 BBR_LOG_JUSTRET, 0, 1963 tlen, &log, false, &tv); 1964 } 1965 } 1966 1967 static void 1968 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 1969 struct timeval *tv, uint32_t flags_on_entry) 1970 { 1971 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1972 union tcp_log_stackspecific log; 1973 1974 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1975 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1976 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1977 log.u_bbr.flex1 = line; 1978 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 1979 log.u_bbr.flex3 = flags_on_entry; 1980 log.u_bbr.flex4 = us_cts; 1981 if (rack->rack_no_prr) 1982 log.u_bbr.flex5 = 0; 1983 else 1984 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1985 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1986 log.u_bbr.flex7 = hpts_removed; 1987 log.u_bbr.flex8 = 1; 1988 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 1989 log.u_bbr.timeStamp = us_cts; 1990 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1991 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1992 &rack->rc_inp->inp_socket->so_rcv, 1993 &rack->rc_inp->inp_socket->so_snd, 1994 BBR_LOG_TIMERCANC, 0, 1995 0, &log, false, tv); 1996 } 1997 } 1998 1999 static void 2000 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2001 uint32_t flex1, uint32_t flex2, 2002 uint32_t flex3, uint32_t flex4, 2003 uint32_t flex5, uint32_t 
flex6, 2004 uint16_t flex7, uint8_t mod) 2005 { 2006 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2007 union tcp_log_stackspecific log; 2008 struct timeval tv; 2009 2010 if (mod == 1) { 2011 /* No you can't use 1, its for the real to cancel */ 2012 return; 2013 } 2014 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2015 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2016 log.u_bbr.flex1 = flex1; 2017 log.u_bbr.flex2 = flex2; 2018 log.u_bbr.flex3 = flex3; 2019 log.u_bbr.flex4 = flex4; 2020 log.u_bbr.flex5 = flex5; 2021 log.u_bbr.flex6 = flex6; 2022 log.u_bbr.flex7 = flex7; 2023 log.u_bbr.flex8 = mod; 2024 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2025 &rack->rc_inp->inp_socket->so_rcv, 2026 &rack->rc_inp->inp_socket->so_snd, 2027 BBR_LOG_TIMERCANC, 0, 2028 0, &log, false, &tv); 2029 } 2030 } 2031 2032 static void 2033 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2034 { 2035 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2036 union tcp_log_stackspecific log; 2037 struct timeval tv; 2038 2039 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2040 log.u_bbr.flex1 = timers; 2041 log.u_bbr.flex2 = ret; 2042 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2043 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2044 log.u_bbr.flex5 = cts; 2045 if (rack->rack_no_prr) 2046 log.u_bbr.flex6 = 0; 2047 else 2048 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2049 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2050 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2051 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2052 &rack->rc_inp->inp_socket->so_rcv, 2053 &rack->rc_inp->inp_socket->so_snd, 2054 BBR_LOG_TO_PROCESS, 0, 2055 0, &log, false, &tv); 2056 } 2057 } 2058 2059 static void 2060 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2061 { 2062 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2063 union tcp_log_stackspecific log; 2064 struct timeval tv; 2065 2066 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2067 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2068 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2069 if (rack->rack_no_prr) 2070 log.u_bbr.flex3 = 0; 2071 else 2072 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2073 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2074 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2075 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2076 log.u_bbr.flex8 = frm; 2077 log.u_bbr.pkts_out = orig_cwnd; 2078 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2079 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2080 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2081 &rack->rc_inp->inp_socket->so_rcv, 2082 &rack->rc_inp->inp_socket->so_snd, 2083 BBR_LOG_BBRUPD, 0, 2084 0, &log, false, &tv); 2085 } 2086 } 2087 2088 #ifdef NETFLIX_EXP_DETECTION 2089 static void 2090 rack_log_sad(struct tcp_rack *rack, int event) 2091 { 2092 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2093 union tcp_log_stackspecific log; 2094 struct timeval tv; 2095 2096 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2097 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2098 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2099 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2100 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2101 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2102 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2103 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2104 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2105 log.u_bbr.lt_epoch |= rack->do_detection; 2106 log.u_bbr.applimited = tcp_map_minimum; 2107 log.u_bbr.flex7 = rack->sack_attack_disable; 2108 
log.u_bbr.flex8 = event; 2109 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2110 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2111 log.u_bbr.delivered = tcp_sad_decay_val; 2112 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2113 &rack->rc_inp->inp_socket->so_rcv, 2114 &rack->rc_inp->inp_socket->so_snd, 2115 TCP_SAD_DETECTION, 0, 2116 0, &log, false, &tv); 2117 } 2118 } 2119 #endif 2120 2121 static void 2122 rack_counter_destroy(void) 2123 { 2124 counter_u64_free(rack_ack_total); 2125 counter_u64_free(rack_express_sack); 2126 counter_u64_free(rack_sack_total); 2127 counter_u64_free(rack_move_none); 2128 counter_u64_free(rack_move_some); 2129 counter_u64_free(rack_sack_attacks_detected); 2130 counter_u64_free(rack_sack_attacks_reversed); 2131 counter_u64_free(rack_sack_used_next_merge); 2132 counter_u64_free(rack_sack_used_prev_merge); 2133 counter_u64_free(rack_badfr); 2134 counter_u64_free(rack_badfr_bytes); 2135 counter_u64_free(rack_rtm_prr_retran); 2136 counter_u64_free(rack_rtm_prr_newdata); 2137 counter_u64_free(rack_timestamp_mismatch); 2138 counter_u64_free(rack_find_high); 2139 counter_u64_free(rack_reorder_seen); 2140 counter_u64_free(rack_tlp_tot); 2141 counter_u64_free(rack_tlp_newdata); 2142 counter_u64_free(rack_tlp_retran); 2143 counter_u64_free(rack_tlp_retran_bytes); 2144 counter_u64_free(rack_tlp_retran_fail); 2145 counter_u64_free(rack_to_tot); 2146 counter_u64_free(rack_to_arm_rack); 2147 counter_u64_free(rack_to_arm_tlp); 2148 counter_u64_free(rack_calc_zero); 2149 counter_u64_free(rack_calc_nonzero); 2150 counter_u64_free(rack_paced_segments); 2151 counter_u64_free(rack_unpaced_segments); 2152 counter_u64_free(rack_saw_enobuf); 2153 counter_u64_free(rack_saw_enetunreach); 2154 counter_u64_free(rack_to_alloc); 2155 counter_u64_free(rack_to_alloc_hard); 2156 counter_u64_free(rack_to_alloc_emerg); 2157 counter_u64_free(rack_to_alloc_limited); 2158 counter_u64_free(rack_alloc_limited_conns); 2159 counter_u64_free(rack_split_limited); 2160 counter_u64_free(rack_sack_proc_all); 2161 counter_u64_free(rack_sack_proc_restart); 2162 counter_u64_free(rack_sack_proc_short); 2163 counter_u64_free(rack_enter_tlp_calc); 2164 counter_u64_free(rack_used_tlpmethod); 2165 counter_u64_free(rack_used_tlpmethod2); 2166 counter_u64_free(rack_sack_skipped_acked); 2167 counter_u64_free(rack_sack_splits); 2168 counter_u64_free(rack_progress_drops); 2169 counter_u64_free(rack_input_idle_reduces); 2170 counter_u64_free(rack_collapsed_win); 2171 counter_u64_free(rack_tlp_does_nada); 2172 counter_u64_free(rack_try_scwnd); 2173 counter_u64_free(rack_per_timer_hole); 2174 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2175 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2176 } 2177 2178 static struct rack_sendmap * 2179 rack_alloc(struct tcp_rack *rack) 2180 { 2181 struct rack_sendmap *rsm; 2182 2183 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2184 if (rsm) { 2185 rack->r_ctl.rc_num_maps_alloced++; 2186 counter_u64_add(rack_to_alloc, 1); 2187 return (rsm); 2188 } 2189 if (rack->rc_free_cnt) { 2190 counter_u64_add(rack_to_alloc_emerg, 1); 2191 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2192 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2193 rack->rc_free_cnt--; 2194 return (rsm); 2195 } 2196 return (NULL); 2197 } 2198 2199 static struct rack_sendmap * 2200 rack_alloc_full_limit(struct tcp_rack *rack) 2201 { 2202 if ((V_tcp_map_entries_limit > 0) && 2203 (rack->do_detection == 0) && 2204 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2205 
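		/*
		 * The global sendmap entry limit has been reached; refuse the
		 * allocation and count the event (the connection counter is
		 * only bumped once per connection).
		 */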
counter_u64_add(rack_to_alloc_limited, 1); 2206 if (!rack->alloc_limit_reported) { 2207 rack->alloc_limit_reported = 1; 2208 counter_u64_add(rack_alloc_limited_conns, 1); 2209 } 2210 return (NULL); 2211 } 2212 return (rack_alloc(rack)); 2213 } 2214 2215 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2216 static struct rack_sendmap * 2217 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2218 { 2219 struct rack_sendmap *rsm; 2220 2221 if (limit_type) { 2222 /* currently there is only one limit type */ 2223 if (V_tcp_map_split_limit > 0 && 2224 (rack->do_detection == 0) && 2225 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2226 counter_u64_add(rack_split_limited, 1); 2227 if (!rack->alloc_limit_reported) { 2228 rack->alloc_limit_reported = 1; 2229 counter_u64_add(rack_alloc_limited_conns, 1); 2230 } 2231 return (NULL); 2232 } 2233 } 2234 2235 /* allocate and mark in the limit type, if set */ 2236 rsm = rack_alloc(rack); 2237 if (rsm != NULL && limit_type) { 2238 rsm->r_limit_type = limit_type; 2239 rack->r_ctl.rc_num_split_allocs++; 2240 } 2241 return (rsm); 2242 } 2243 2244 static void 2245 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2246 { 2247 if (rsm->r_flags & RACK_APP_LIMITED) { 2248 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2249 rack->r_ctl.rc_app_limited_cnt--; 2250 } 2251 } 2252 if (rsm->r_limit_type) { 2253 /* currently there is only one limit type */ 2254 rack->r_ctl.rc_num_split_allocs--; 2255 } 2256 if (rsm == rack->r_ctl.rc_first_appl) { 2257 if (rack->r_ctl.rc_app_limited_cnt == 0) 2258 rack->r_ctl.rc_first_appl = NULL; 2259 else { 2260 /* Follow the next one out */ 2261 struct rack_sendmap fe; 2262 2263 fe.r_start = rsm->r_nseq_appl; 2264 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2265 } 2266 } 2267 if (rsm == rack->r_ctl.rc_resend) 2268 rack->r_ctl.rc_resend = NULL; 2269 if (rsm == rack->r_ctl.rc_rsm_at_retran) 2270 rack->r_ctl.rc_rsm_at_retran = NULL; 2271 if (rsm == rack->r_ctl.rc_end_appl) 2272 rack->r_ctl.rc_end_appl = NULL; 2273 if (rack->r_ctl.rc_tlpsend == rsm) 2274 rack->r_ctl.rc_tlpsend = NULL; 2275 if (rack->r_ctl.rc_sacklast == rsm) 2276 rack->r_ctl.rc_sacklast = NULL; 2277 if (rack->rc_free_cnt < rack_free_cache) { 2278 memset(rsm, 0, sizeof(struct rack_sendmap)); 2279 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 2280 rsm->r_limit_type = 0; 2281 rack->rc_free_cnt++; 2282 return; 2283 } 2284 rack->r_ctl.rc_num_maps_alloced--; 2285 uma_zfree(rack_zone, rsm); 2286 } 2287 2288 static uint32_t 2289 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 2290 { 2291 uint64_t srtt, bw, len, tim; 2292 uint32_t segsiz, def_len, minl; 2293 2294 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2295 def_len = rack_def_data_window * segsiz; 2296 if (rack->rc_gp_filled == 0) { 2297 /* 2298 * We have no measurement (IW is in flight?) so 2299 * we can only guess using our data_window sysctl 2300 * value (usually 100MSS). 2301 */ 2302 return (def_len); 2303 } 2304 /* 2305 * Now we have a number of factors to consider. 2306 * 2307 * 1) We have a desired BDP which is usually 2308 * at least 2. 2309 * 2) We have a minimum number of rtt's usually 1 SRTT 2310 * but we allow it too to be more. 2311 * 3) We want to make sure a measurement last N useconds (if 2312 * we have set rack_min_measure_usec. 2313 * 2314 * We handle the first concern here by trying to create a data 2315 * window of max(rack_def_data_window, DesiredBDP). 
The 2316 * second concern we handle in not letting the measurement 2317 * window end normally until at least the required SRTT's 2318 * have gone by which is done further below in 2319 * rack_enough_for_measurement(). Finally the third concern 2320 * we also handle here by calculating how long that time 2321 * would take at the current BW and then return the 2322 * max of our first calculation and that length. Note 2323 * that if rack_min_measure_usec is 0, we don't deal 2324 * with concern 3. Also for both Concern 1 and 3 an 2325 * application limited period could end the measurement 2326 * earlier. 2327 * 2328 * So lets calculate the BDP with the "known" b/w using 2329 * the SRTT has our rtt and then multiply it by the 2330 * goal. 2331 */ 2332 bw = rack_get_bw(rack); 2333 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 2334 len = bw * srtt; 2335 len /= (uint64_t)HPTS_USEC_IN_SEC; 2336 len *= max(1, rack_goal_bdp); 2337 /* Now we need to round up to the nearest MSS */ 2338 len = roundup(len, segsiz); 2339 if (rack_min_measure_usec) { 2340 /* Now calculate our min length for this b/w */ 2341 tim = rack_min_measure_usec; 2342 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 2343 if (minl == 0) 2344 minl = 1; 2345 minl = roundup(minl, segsiz); 2346 if (len < minl) 2347 len = minl; 2348 } 2349 /* 2350 * Now if we have a very small window we want 2351 * to attempt to get the window that is 2352 * as small as possible. This happens on 2353 * low b/w connections and we don't want to 2354 * span huge numbers of rtt's between measurements. 2355 * 2356 * We basically include 2 over our "MIN window" so 2357 * that the measurement can be shortened (possibly) by 2358 * an ack'ed packet. 2359 */ 2360 if (len < def_len) 2361 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 2362 else 2363 return (max((uint32_t)len, def_len)); 2364 2365 } 2366 2367 static int 2368 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) 2369 { 2370 uint32_t tim, srtts, segsiz; 2371 2372 /* 2373 * Has enough time passed for the GP measurement to be valid? 2374 */ 2375 if ((tp->snd_max == tp->snd_una) || 2376 (th_ack == tp->snd_max)){ 2377 /* All is acked */ 2378 return (1); 2379 } 2380 if (SEQ_LT(th_ack, tp->gput_seq)) { 2381 /* Not enough bytes yet */ 2382 return (0); 2383 } 2384 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2385 if (SEQ_LT(th_ack, tp->gput_ack) && 2386 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 2387 /* Not enough bytes yet */ 2388 return (0); 2389 } 2390 if (rack->r_ctl.rc_first_appl && 2391 (rack->r_ctl.rc_first_appl->r_start == th_ack)) { 2392 /* 2393 * We are up to the app limited point 2394 * we have to measure irrespective of the time.. 2395 */ 2396 return (1); 2397 } 2398 /* Now what about time? 
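	 * We need at least rack_min_srtts worth of the current gp_srtt
	 * to have elapsed since the goodput measurement began.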
	 */
	srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
	tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
	if (tim >= srtts) {
		return (1);
	}
	/* Nope, not even a full SRTT has passed */
	return (0);
}

/*
 * Log the current state of the Timely-style pacing gain machinery
 * (TCP_TIMELY_WORK record).
 */
static void
rack_log_timely(struct tcp_rack *rack,
    uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
    uint64_t up_bnd, int line, uint8_t method)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = logged;
		log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_incr;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_bwred;
		log.u_bbr.flex3 = rack->rc_gp_incr;
		log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
		log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
		log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
		log.u_bbr.flex7 = rack->rc_gp_bwred;
		log.u_bbr.flex8 = method;
		log.u_bbr.cur_del_rate = cur_bw;
		log.u_bbr.delRate = low_bnd;
		log.u_bbr.bw_inuse = up_bnd;
		log.u_bbr.rttProp = rack_get_bw(rack);
		log.u_bbr.pkt_epoch = line;
		log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
		log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
		log.u_bbr.lost = rack->r_ctl.rc_loss_count;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_TIMELY_WORK, 0,
		    0, &log, false, &tv);
	}
}

static int
rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
{
	/*
	 * Before we increase we need to know if
	 * the estimate just made was less than
	 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
	 *
	 * If we are already pacing at a fast enough
	 * rate to push us faster, there is no sense in
	 * increasing.
	 *
	 * We first calculate our actual pacing rate (ss or ca multiplier
	 * times our cur_bw).
	 *
	 * Then we take the last measured rate and multiply by our
	 * maximum pacing overage to give us a max allowable rate.
	 *
	 * If our act_rate is smaller than our max_allowable rate
	 * then we should increase. Else we should hold steady.
	 *
	 */
	uint64_t act_rate, max_allow_rate;

	if (rack_timely_no_stopping)
		return (1);

	if ((cur_bw == 0) || (last_bw_est == 0)) {
		/*
		 * Initial startup case or
		 * everything is acked case.
		 */
		rack_log_timely(rack, mult, cur_bw, 0, 0,
		    __LINE__, 9);
		return (1);
	}
	if (mult <= 100) {
		/*
		 * We can always pace at or slightly above our rate.
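		 * A gain of 100% or less never pushes the pacing rate past
		 * the measured b/w, so raising is always allowed here.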
2497 */ 2498 rack_log_timely(rack, mult, cur_bw, 0, 0, 2499 __LINE__, 9); 2500 return (1); 2501 } 2502 act_rate = cur_bw * (uint64_t)mult; 2503 act_rate /= 100; 2504 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 2505 max_allow_rate /= 100; 2506 if (act_rate < max_allow_rate) { 2507 /* 2508 * Here the rate we are actually pacing at 2509 * is smaller than 10% above our last measurement. 2510 * This means we are pacing below what we would 2511 * like to try to achieve (plus some wiggle room). 2512 */ 2513 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2514 __LINE__, 9); 2515 return (1); 2516 } else { 2517 /* 2518 * Here we are already pacing at least rack_max_per_above(10%) 2519 * what we are getting back. This indicates most likely 2520 * that we are being limited (cwnd/rwnd/app) and can't 2521 * get any more b/w. There is no sense of trying to 2522 * raise up the pacing rate its not speeding us up 2523 * and we already are pacing faster than we are getting. 2524 */ 2525 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2526 __LINE__, 8); 2527 return (0); 2528 } 2529 } 2530 2531 static void 2532 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 2533 { 2534 /* 2535 * When we drag bottom, we want to assure 2536 * that no multiplier is below 1.0, if so 2537 * we want to restore it to at least that. 2538 */ 2539 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 2540 /* This is unlikely we usually do not touch recovery */ 2541 rack->r_ctl.rack_per_of_gp_rec = 100; 2542 } 2543 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 2544 rack->r_ctl.rack_per_of_gp_ca = 100; 2545 } 2546 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 2547 rack->r_ctl.rack_per_of_gp_ss = 100; 2548 } 2549 } 2550 2551 static void 2552 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 2553 { 2554 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 2555 rack->r_ctl.rack_per_of_gp_ca = 100; 2556 } 2557 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 2558 rack->r_ctl.rack_per_of_gp_ss = 100; 2559 } 2560 } 2561 2562 static void 2563 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 2564 { 2565 int32_t calc, logged, plus; 2566 2567 logged = 0; 2568 2569 if (override) { 2570 /* 2571 * override is passed when we are 2572 * loosing b/w and making one last 2573 * gasp at trying to not loose out 2574 * to a new-reno flow. 2575 */ 2576 goto extra_boost; 2577 } 2578 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 2579 if (rack->rc_gp_incr && 2580 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 2581 /* 2582 * Reset and get 5 strokes more before the boost. Note 2583 * that the count is 0 based so we have to add one. 
2584 */ 2585 extra_boost: 2586 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 2587 rack->rc_gp_timely_inc_cnt = 0; 2588 } else 2589 plus = (uint32_t)rack_gp_increase_per; 2590 /* Must be at least 1% increase for true timely increases */ 2591 if ((plus < 1) && 2592 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 2593 plus = 1; 2594 if (rack->rc_gp_saw_rec && 2595 (rack->rc_gp_no_rec_chg == 0) && 2596 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2597 rack->r_ctl.rack_per_of_gp_rec)) { 2598 /* We have been in recovery ding it too */ 2599 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 2600 if (calc > 0xffff) 2601 calc = 0xffff; 2602 logged |= 1; 2603 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 2604 if (rack_per_upper_bound_ss && 2605 (rack->rc_dragged_bottom == 0) && 2606 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 2607 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 2608 } 2609 if (rack->rc_gp_saw_ca && 2610 (rack->rc_gp_saw_ss == 0) && 2611 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2612 rack->r_ctl.rack_per_of_gp_ca)) { 2613 /* In CA */ 2614 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 2615 if (calc > 0xffff) 2616 calc = 0xffff; 2617 logged |= 2; 2618 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 2619 if (rack_per_upper_bound_ca && 2620 (rack->rc_dragged_bottom == 0) && 2621 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 2622 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 2623 } 2624 if (rack->rc_gp_saw_ss && 2625 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2626 rack->r_ctl.rack_per_of_gp_ss)) { 2627 /* In SS */ 2628 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 2629 if (calc > 0xffff) 2630 calc = 0xffff; 2631 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 2632 if (rack_per_upper_bound_ss && 2633 (rack->rc_dragged_bottom == 0) && 2634 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 2635 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 2636 logged |= 4; 2637 } 2638 if (logged && 2639 (rack->rc_gp_incr == 0)){ 2640 /* Go into increment mode */ 2641 rack->rc_gp_incr = 1; 2642 rack->rc_gp_timely_inc_cnt = 0; 2643 } 2644 if (rack->rc_gp_incr && 2645 logged && 2646 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 2647 rack->rc_gp_timely_inc_cnt++; 2648 } 2649 rack_log_timely(rack, logged, plus, 0, 0, 2650 __LINE__, 1); 2651 } 2652 2653 static uint32_t 2654 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 2655 { 2656 /* 2657 * norm_grad = rtt_diff / minrtt; 2658 * new_per = curper * (1 - B * norm_grad) 2659 * 2660 * B = rack_gp_decrease_per (default 10%) 2661 * rtt_dif = input var current rtt-diff 2662 * curper = input var current percentage 2663 * minrtt = from rack filter 2664 * 2665 */ 2666 uint64_t perf; 2667 2668 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2669 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 2670 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 2671 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 2672 (uint64_t)1000000)) / 2673 (uint64_t)1000000); 2674 if (perf > curper) { 2675 /* TSNH */ 2676 perf = curper - 1; 2677 } 2678 return ((uint32_t)perf); 2679 } 2680 2681 static uint32_t 2682 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 2683 { 2684 /* 2685 * highrttthresh 2686 * result = curper * (1 - (B * ( 1 - ------ )) 2687 * gp_srtt 2688 * 2689 * B = rack_gp_decrease_per (default 10%) 2690 * highrttthresh = filter_min * rack_gp_rtt_maxmul 2691 */ 2692 uint64_t perf; 2693 uint32_t 
highrttthresh; 2694 2695 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 2696 2697 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2698 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 2699 ((uint64_t)highrttthresh * (uint64_t)1000000) / 2700 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 2701 return (perf); 2702 } 2703 2704 static void 2705 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 2706 { 2707 uint64_t logvar, logvar2, logvar3; 2708 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 2709 2710 if (rack->rc_gp_incr) { 2711 /* Turn off increment counting */ 2712 rack->rc_gp_incr = 0; 2713 rack->rc_gp_timely_inc_cnt = 0; 2714 } 2715 ss_red = ca_red = rec_red = 0; 2716 logged = 0; 2717 /* Calculate the reduction value */ 2718 if (rtt_diff < 0) { 2719 rtt_diff *= -1; 2720 } 2721 /* Must be at least 1% reduction */ 2722 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 2723 /* We have been in recovery ding it too */ 2724 if (timely_says == 2) { 2725 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 2726 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2727 if (alt < new_per) 2728 val = alt; 2729 else 2730 val = new_per; 2731 } else 2732 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2733 if (rack->r_ctl.rack_per_of_gp_rec > val) { 2734 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 2735 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 2736 } else { 2737 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2738 rec_red = 0; 2739 } 2740 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 2741 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2742 logged |= 1; 2743 } 2744 if (rack->rc_gp_saw_ss) { 2745 /* Sent in SS */ 2746 if (timely_says == 2) { 2747 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 2748 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2749 if (alt < new_per) 2750 val = alt; 2751 else 2752 val = new_per; 2753 } else 2754 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 2755 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 2756 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 2757 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 2758 } else { 2759 ss_red = new_per; 2760 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2761 logvar = new_per; 2762 logvar <<= 32; 2763 logvar |= alt; 2764 logvar2 = (uint32_t)rtt; 2765 logvar2 <<= 32; 2766 logvar2 |= (uint32_t)rtt_diff; 2767 logvar3 = rack_gp_rtt_maxmul; 2768 logvar3 <<= 32; 2769 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2770 rack_log_timely(rack, timely_says, 2771 logvar2, logvar3, 2772 logvar, __LINE__, 10); 2773 } 2774 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 2775 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2776 logged |= 4; 2777 } else if (rack->rc_gp_saw_ca) { 2778 /* Sent in CA */ 2779 if (timely_says == 2) { 2780 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 2781 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2782 if (alt < new_per) 2783 val = alt; 2784 else 2785 val = new_per; 2786 } else 2787 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 2788 if (rack->r_ctl.rack_per_of_gp_ca > val) { 2789 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 2790 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 2791 } else { 2792 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2793 ca_red = 0; 2794 logvar = new_per; 2795 logvar <<= 32; 2796 logvar |= alt; 2797 logvar2 = (uint32_t)rtt; 2798 logvar2 <<= 32; 2799 logvar2 |= (uint32_t)rtt_diff; 2800 logvar3 = rack_gp_rtt_maxmul; 2801 logvar3 <<= 32; 2802 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2803 rack_log_timely(rack, timely_says, 2804 logvar2, logvar3, 2805 logvar, __LINE__, 10); 2806 } 2807 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 2808 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2809 logged |= 2; 2810 } 2811 if (rack->rc_gp_timely_dec_cnt < 0x7) { 2812 rack->rc_gp_timely_dec_cnt++; 2813 if (rack_timely_dec_clear && 2814 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 2815 rack->rc_gp_timely_dec_cnt = 0; 2816 } 2817 logvar = ss_red; 2818 logvar <<= 32; 2819 logvar |= ca_red; 2820 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 2821 __LINE__, 2); 2822 } 2823 2824 static void 2825 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 2826 uint32_t rtt, uint32_t line, uint8_t reas) 2827 { 2828 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2829 union tcp_log_stackspecific log; 2830 struct timeval tv; 2831 2832 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2833 log.u_bbr.flex1 = line; 2834 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 2835 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 2836 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2837 log.u_bbr.flex5 = rtt; 2838 log.u_bbr.flex6 = rack->rc_highly_buffered; 2839 log.u_bbr.flex6 <<= 1; 2840 log.u_bbr.flex6 |= rack->forced_ack; 2841 log.u_bbr.flex6 <<= 1; 2842 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 2843 log.u_bbr.flex6 <<= 1; 2844 log.u_bbr.flex6 |= rack->in_probe_rtt; 2845 log.u_bbr.flex6 <<= 1; 2846 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 2847 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 2848 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 2849 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 2850 log.u_bbr.flex8 = reas; 2851 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2852 log.u_bbr.delRate = rack_get_bw(rack); 2853 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 2854 log.u_bbr.cur_del_rate <<= 32; 2855 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 2856 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 2857 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2858 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2859 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2860 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2861 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 2862 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 2863 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2864 log.u_bbr.rttProp = us_cts; 2865 log.u_bbr.rttProp <<= 32; 2866 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 2867 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2868 &rack->rc_inp->inp_socket->so_rcv, 2869 &rack->rc_inp->inp_socket->so_snd, 2870 BBR_LOG_RTT_SHRINKS, 0, 2871 0, &log, false, &rack->r_ctl.act_rcv_time); 2872 } 2873 } 2874 2875 static void 2876 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 2877 { 2878 uint64_t bwdp; 2879 2880 bwdp = rack_get_bw(rack); 2881 bwdp *= (uint64_t)rtt; 2882 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 2883 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 2884 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 2885 /* 2886 * A window protocol must be able to have 4 packets 2887 * outstanding as the floor in order to function 2888 * (especially considering delayed ack :D). 2889 */ 2890 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 2891 } 2892 } 2893 2894 static void 2895 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 2896 { 2897 /** 2898 * ProbeRTT is a bit different in rack_pacing than in 2899 * BBR. It is like BBR in that it uses the lowering of 2900 * the RTT as a signal that we saw something new and 2901 * counts from there for how long between. But it is 2902 * different in that its quite simple. It does not 2903 * play with the cwnd and wait until we get down 2904 * to N segments outstanding and hold that for 2905 * 200ms. Instead it just sets the pacing reduction 2906 * rate to a set percentage (70 by default) and hold 2907 * that for a number of recent GP Srtt's. 2908 */ 2909 uint32_t segsiz; 2910 2911 if (rack->rc_gp_dyn_mul == 0) 2912 return; 2913 2914 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 2915 /* We are idle */ 2916 return; 2917 } 2918 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 2919 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 2920 /* 2921 * Stop the goodput now, the idea here is 2922 * that future measurements with in_probe_rtt 2923 * won't register if they are not greater so 2924 * we want to get what info (if any) is available 2925 * now. 2926 */ 2927 rack_do_goodput_measurement(rack->rc_tp, rack, 2928 rack->rc_tp->snd_una, __LINE__); 2929 } 2930 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 2931 rack->r_ctl.rc_time_probertt_entered = us_cts; 2932 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 2933 rack->r_ctl.rc_pace_min_segs); 2934 rack->in_probe_rtt = 1; 2935 rack->measure_saw_probe_rtt = 1; 2936 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 2937 rack->r_ctl.rc_time_probertt_starts = 0; 2938 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 2939 if (rack_probertt_use_min_rtt_entry) 2940 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 2941 else 2942 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 2943 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 2944 __LINE__, RACK_RTTS_ENTERPROBE); 2945 } 2946 2947 static void 2948 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 2949 { 2950 struct rack_sendmap *rsm; 2951 uint32_t segsiz; 2952 2953 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 2954 rack->r_ctl.rc_pace_min_segs); 2955 rack->in_probe_rtt = 0; 2956 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 2957 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 2958 /* 2959 * Stop the goodput now, the idea here is 2960 * that future measurements with in_probe_rtt 2961 * won't register if they are not greater so 2962 * we want to get what info (if any) is available 2963 * now. 2964 */ 2965 rack_do_goodput_measurement(rack->rc_tp, rack, 2966 rack->rc_tp->snd_una, __LINE__); 2967 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 2968 /* 2969 * We don't have enough data to make a measurement. 2970 * So lets just stop and start here after exiting 2971 * probe-rtt. We probably are not interested in 2972 * the results anyway. 2973 */ 2974 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 2975 } 2976 /* 2977 * Measurements through the current snd_max are going 2978 * to be limited by the slower pacing rate. 2979 * 2980 * We need to mark these as app-limited so we 2981 * don't collapse the b/w. 
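	 * The newest sendmap entry is tagged RACK_APP_LIMITED and linked
	 * onto the app-limited chain (via r_nseq_appl) below.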
2982 */ 2983 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 2984 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 2985 if (rack->r_ctl.rc_app_limited_cnt == 0) 2986 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 2987 else { 2988 /* 2989 * Go out to the end app limited and mark 2990 * this new one as next and move the end_appl up 2991 * to this guy. 2992 */ 2993 if (rack->r_ctl.rc_end_appl) 2994 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 2995 rack->r_ctl.rc_end_appl = rsm; 2996 } 2997 rsm->r_flags |= RACK_APP_LIMITED; 2998 rack->r_ctl.rc_app_limited_cnt++; 2999 } 3000 /* 3001 * Now, we need to examine our pacing rate multipliers. 3002 * If its under 100%, we need to kick it back up to 3003 * 100%. We also don't let it be over our "max" above 3004 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3005 * Note setting clamp_atexit_prtt to 0 has the effect 3006 * of setting CA/SS to 100% always at exit (which is 3007 * the default behavior). 3008 */ 3009 if (rack_probertt_clear_is) { 3010 rack->rc_gp_incr = 0; 3011 rack->rc_gp_bwred = 0; 3012 rack->rc_gp_timely_inc_cnt = 0; 3013 rack->rc_gp_timely_dec_cnt = 0; 3014 } 3015 /* Do we do any clamping at exit? */ 3016 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3017 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3018 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3019 } 3020 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3021 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3022 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3023 } 3024 /* 3025 * Lets set rtt_diff to 0, so that we will get a "boost" 3026 * after exiting. 3027 */ 3028 rack->r_ctl.rc_rtt_diff = 0; 3029 3030 /* Clear all flags so we start fresh */ 3031 rack->rc_tp->t_bytes_acked = 0; 3032 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3033 /* 3034 * If configured to, set the cwnd and ssthresh to 3035 * our targets. 3036 */ 3037 if (rack_probe_rtt_sets_cwnd) { 3038 uint64_t ebdp; 3039 uint32_t setto; 3040 3041 /* Set ssthresh so we get into CA once we hit our target */ 3042 if (rack_probertt_use_min_rtt_exit == 1) { 3043 /* Set to min rtt */ 3044 rack_set_prtt_target(rack, segsiz, 3045 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3046 } else if (rack_probertt_use_min_rtt_exit == 2) { 3047 /* Set to current gp rtt */ 3048 rack_set_prtt_target(rack, segsiz, 3049 rack->r_ctl.rc_gp_srtt); 3050 } else if (rack_probertt_use_min_rtt_exit == 3) { 3051 /* Set to entry gp rtt */ 3052 rack_set_prtt_target(rack, segsiz, 3053 rack->r_ctl.rc_entry_gp_rtt); 3054 } else { 3055 uint64_t sum; 3056 uint32_t setval; 3057 3058 sum = rack->r_ctl.rc_entry_gp_rtt; 3059 sum *= 10; 3060 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3061 if (sum >= 20) { 3062 /* 3063 * A highly buffered path needs 3064 * cwnd space for timely to work. 3065 * Lets set things up as if 3066 * we are heading back here again. 3067 */ 3068 setval = rack->r_ctl.rc_entry_gp_rtt; 3069 } else if (sum >= 15) { 3070 /* 3071 * Lets take the smaller of the 3072 * two since we are just somewhat 3073 * buffered. 3074 */ 3075 setval = rack->r_ctl.rc_gp_srtt; 3076 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3077 setval = rack->r_ctl.rc_entry_gp_rtt; 3078 } else { 3079 /* 3080 * Here we are not highly buffered 3081 * and should pick the min we can to 3082 * keep from causing loss. 
3083 */ 3084 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3085 } 3086 rack_set_prtt_target(rack, segsiz, 3087 setval); 3088 } 3089 if (rack_probe_rtt_sets_cwnd > 1) { 3090 /* There is a percentage here to boost */ 3091 ebdp = rack->r_ctl.rc_target_probertt_flight; 3092 ebdp *= rack_probe_rtt_sets_cwnd; 3093 ebdp /= 100; 3094 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3095 } else 3096 setto = rack->r_ctl.rc_target_probertt_flight; 3097 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3098 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3099 /* Enforce a min */ 3100 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3101 } 3102 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3103 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3104 } 3105 rack_log_rtt_shrinks(rack, us_cts, 3106 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3107 __LINE__, RACK_RTTS_EXITPROBE); 3108 /* Clear times last so log has all the info */ 3109 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3110 rack->r_ctl.rc_time_probertt_entered = us_cts; 3111 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3112 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3113 } 3114 3115 static void 3116 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3117 { 3118 /* Check in on probe-rtt */ 3119 if (rack->rc_gp_filled == 0) { 3120 /* We do not do p-rtt unless we have gp measurements */ 3121 return; 3122 } 3123 if (rack->in_probe_rtt) { 3124 uint64_t no_overflow; 3125 uint32_t endtime, must_stay; 3126 3127 if (rack->r_ctl.rc_went_idle_time && 3128 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3129 /* 3130 * We went idle during prtt, just exit now. 3131 */ 3132 rack_exit_probertt(rack, us_cts); 3133 } else if (rack_probe_rtt_safety_val && 3134 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3135 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3136 /* 3137 * Probe RTT safety value triggered! 3138 */ 3139 rack_log_rtt_shrinks(rack, us_cts, 3140 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3141 __LINE__, RACK_RTTS_SAFETY); 3142 rack_exit_probertt(rack, us_cts); 3143 } 3144 /* Calculate the max we will wait */ 3145 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3146 if (rack->rc_highly_buffered) 3147 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3148 /* Calculate the min we must wait */ 3149 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3150 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3151 TSTMP_LT(us_cts, endtime)) { 3152 uint32_t calc; 3153 /* Do we lower more? 
*/ 3154 no_exit: 3155 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3156 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3157 else 3158 calc = 0; 3159 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3160 if (calc) { 3161 /* Maybe */ 3162 calc *= rack_per_of_gp_probertt_reduce; 3163 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3164 /* Limit it too */ 3165 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3166 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3167 } 3168 /* We must reach target or the time set */ 3169 return; 3170 } 3171 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3172 if ((TSTMP_LT(us_cts, must_stay) && 3173 rack->rc_highly_buffered) || 3174 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3175 rack->r_ctl.rc_target_probertt_flight)) { 3176 /* We are not past the must_stay time */ 3177 goto no_exit; 3178 } 3179 rack_log_rtt_shrinks(rack, us_cts, 3180 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3181 __LINE__, RACK_RTTS_REACHTARGET); 3182 rack->r_ctl.rc_time_probertt_starts = us_cts; 3183 if (rack->r_ctl.rc_time_probertt_starts == 0) 3184 rack->r_ctl.rc_time_probertt_starts = 1; 3185 /* Restore back to our rate we want to pace at in prtt */ 3186 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3187 } 3188 /* 3189 * Setup our end time, some number of gp_srtts plus 200ms. 3190 */ 3191 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3192 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3193 if (rack_probertt_gpsrtt_cnt_div) 3194 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3195 else 3196 endtime = 0; 3197 endtime += rack_min_probertt_hold; 3198 endtime += rack->r_ctl.rc_time_probertt_starts; 3199 if (TSTMP_GEQ(us_cts, endtime)) { 3200 /* yes, exit probertt */ 3201 rack_exit_probertt(rack, us_cts); 3202 } 3203 3204 } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3205 /* Go into probertt, its been too long since we went lower */ 3206 rack_enter_probertt(rack, us_cts); 3207 } 3208 } 3209 3210 static void 3211 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3212 uint32_t rtt, int32_t rtt_diff) 3213 { 3214 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3215 uint32_t losses; 3216 3217 if ((rack->rc_gp_dyn_mul == 0) || 3218 (rack->use_fixed_rate) || 3219 (rack->in_probe_rtt) || 3220 (rack->rc_always_pace == 0)) { 3221 /* No dynamic GP multipler in play */ 3222 return; 3223 } 3224 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3225 cur_bw = rack_get_bw(rack); 3226 /* Calculate our up and down range */ 3227 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3228 up_bnd /= 100; 3229 up_bnd += rack->r_ctl.last_gp_comp_bw; 3230 3231 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3232 subfr /= 100; 3233 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3234 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3235 /* 3236 * This is the case where our RTT is above 3237 * the max target and we have been configured 3238 * to just do timely no bonus up stuff in that case. 3239 * 3240 * There are two configurations, set to 1, and we 3241 * just do timely if we are over our max. If its 3242 * set above 1 then we slam the multipliers down 3243 * to 100 and then decrement per timely. 
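 * (In code terms: rc_no_push_at_mrtt == 1 just applies the timely
 * decrease, while a value above 1 first clamps the CA/SS
 * multipliers to no more than 100 via
 * rack_validate_multipliers_at_or_below_100().)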
3244 */ 3245 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3246 __LINE__, 3); 3247 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3248 rack_validate_multipliers_at_or_below_100(rack); 3249 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3250 } else if ((last_bw_est < low_bnd) && !losses) { 3251 /* 3252 * We are decreasing this is a bit complicated this 3253 * means we are loosing ground. This could be 3254 * because another flow entered and we are competing 3255 * for b/w with it. This will push the RTT up which 3256 * makes timely unusable unless we want to get shoved 3257 * into a corner and just be backed off (the age 3258 * old problem with delay based CC). 3259 * 3260 * On the other hand if it was a route change we 3261 * would like to stay somewhat contained and not 3262 * blow out the buffers. 3263 */ 3264 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3265 __LINE__, 3); 3266 rack->r_ctl.last_gp_comp_bw = cur_bw; 3267 if (rack->rc_gp_bwred == 0) { 3268 /* Go into reduction counting */ 3269 rack->rc_gp_bwred = 1; 3270 rack->rc_gp_timely_dec_cnt = 0; 3271 } 3272 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3273 (timely_says == 0)) { 3274 /* 3275 * Push another time with a faster pacing 3276 * to try to gain back (we include override to 3277 * get a full raise factor). 3278 */ 3279 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3280 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3281 (timely_says == 0) || 3282 (rack_down_raise_thresh == 0)) { 3283 /* 3284 * Do an override up in b/w if we were 3285 * below the threshold or if the threshold 3286 * is zero we always do the raise. 3287 */ 3288 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3289 } else { 3290 /* Log it stays the same */ 3291 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3292 __LINE__, 11); 3293 } 3294 rack->rc_gp_timely_dec_cnt++; 3295 /* We are not incrementing really no-count */ 3296 rack->rc_gp_incr = 0; 3297 rack->rc_gp_timely_inc_cnt = 0; 3298 } else { 3299 /* 3300 * Lets just use the RTT 3301 * information and give up 3302 * pushing. 3303 */ 3304 goto use_timely; 3305 } 3306 } else if ((timely_says != 2) && 3307 !losses && 3308 (last_bw_est > up_bnd)) { 3309 /* 3310 * We are increasing b/w lets keep going, updating 3311 * our b/w and ignoring any timely input, unless 3312 * of course we are at our max raise (if there is one). 3313 */ 3314 3315 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3316 __LINE__, 3); 3317 rack->r_ctl.last_gp_comp_bw = cur_bw; 3318 if (rack->rc_gp_saw_ss && 3319 rack_per_upper_bound_ss && 3320 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3321 /* 3322 * In cases where we can't go higher 3323 * we should just use timely. 3324 */ 3325 goto use_timely; 3326 } 3327 if (rack->rc_gp_saw_ca && 3328 rack_per_upper_bound_ca && 3329 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3330 /* 3331 * In cases where we can't go higher 3332 * we should just use timely. 
3333 */ 3334 goto use_timely; 3335 } 3336 rack->rc_gp_bwred = 0; 3337 rack->rc_gp_timely_dec_cnt = 0; 3338 /* You get a set number of pushes if timely is trying to reduce */ 3339 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 3340 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3341 } else { 3342 /* Log it stays the same */ 3343 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 3344 __LINE__, 12); 3345 } 3346 return; 3347 } else { 3348 /* 3349 * We are staying between the lower and upper range bounds 3350 * so use timely to decide. 3351 */ 3352 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3353 __LINE__, 3); 3354 use_timely: 3355 if (timely_says) { 3356 rack->rc_gp_incr = 0; 3357 rack->rc_gp_timely_inc_cnt = 0; 3358 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 3359 !losses && 3360 (last_bw_est < low_bnd)) { 3361 /* We are loosing ground */ 3362 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3363 rack->rc_gp_timely_dec_cnt++; 3364 /* We are not incrementing really no-count */ 3365 rack->rc_gp_incr = 0; 3366 rack->rc_gp_timely_inc_cnt = 0; 3367 } else 3368 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3369 } else { 3370 rack->rc_gp_bwred = 0; 3371 rack->rc_gp_timely_dec_cnt = 0; 3372 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3373 } 3374 } 3375 } 3376 3377 static int32_t 3378 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 3379 { 3380 int32_t timely_says; 3381 uint64_t log_mult, log_rtt_a_diff; 3382 3383 log_rtt_a_diff = rtt; 3384 log_rtt_a_diff <<= 32; 3385 log_rtt_a_diff |= (uint32_t)rtt_diff; 3386 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 3387 rack_gp_rtt_maxmul)) { 3388 /* Reduce the b/w multipler */ 3389 timely_says = 2; 3390 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3391 log_mult <<= 32; 3392 log_mult |= prev_rtt; 3393 rack_log_timely(rack, timely_says, log_mult, 3394 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3395 log_rtt_a_diff, __LINE__, 4); 3396 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 3397 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 3398 max(rack_gp_rtt_mindiv , 1)))) { 3399 /* Increase the b/w multipler */ 3400 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 3401 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 3402 max(rack_gp_rtt_mindiv , 1)); 3403 log_mult <<= 32; 3404 log_mult |= prev_rtt; 3405 timely_says = 0; 3406 rack_log_timely(rack, timely_says, log_mult , 3407 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3408 log_rtt_a_diff, __LINE__, 5); 3409 } else { 3410 /* 3411 * Use a gradient to find it the timely gradient 3412 * is: 3413 * grad = rc_rtt_diff / min_rtt; 3414 * 3415 * anything below or equal to 0 will be 3416 * a increase indication. Anything above 3417 * zero is a decrease. Note we take care 3418 * of the actual gradient calculation 3419 * in the reduction (its not needed for 3420 * increase). 
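 * To recap the two fixed thresholds checked above: an rtt at or
 * above min_rtt * rack_gp_rtt_maxmul always says "decrease" (2),
 * an rtt at or below min_rtt + (min_rtt * rack_gp_rtt_minmul /
 * rack_gp_rtt_mindiv) always says "increase" (0), and only the
 * band in between falls through to this rtt_diff sign test.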
3421 */ 3422 log_mult = prev_rtt; 3423 if (rtt_diff <= 0) { 3424 /* 3425 * Rttdiff is less than zero, increase the 3426 * b/w multipler (its 0 or negative) 3427 */ 3428 timely_says = 0; 3429 rack_log_timely(rack, timely_says, log_mult, 3430 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 3431 } else { 3432 /* Reduce the b/w multipler */ 3433 timely_says = 1; 3434 rack_log_timely(rack, timely_says, log_mult, 3435 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 3436 } 3437 } 3438 return (timely_says); 3439 } 3440 3441 static void 3442 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 3443 tcp_seq th_ack, int line) 3444 { 3445 uint64_t tim, bytes_ps, ltim, stim, utim; 3446 uint32_t segsiz, bytes, reqbytes, us_cts; 3447 int32_t gput, new_rtt_diff, timely_says; 3448 3449 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3450 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3451 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 3452 tim = us_cts - tp->gput_ts; 3453 else 3454 tim = 0; 3455 3456 if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) 3457 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 3458 else 3459 stim = 0; 3460 /* 3461 * Use the larger of the send time or ack time. This prevents us 3462 * from being influenced by ack artifacts to come up with too 3463 * high of measurement. Note that since we are spanning over many more 3464 * bytes in most of our measurements hopefully that is less likely to 3465 * occur. 3466 */ 3467 if (tim > stim) 3468 utim = max(tim, 1); 3469 else 3470 utim = max(stim, 1); 3471 /* Lets validate utim */ 3472 ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); 3473 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 3474 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 3475 if ((tim == 0) && (stim == 0)) { 3476 /* 3477 * Invalid measurement time, maybe 3478 * all on one ack/one send? 3479 */ 3480 bytes = 0; 3481 bytes_ps = 0; 3482 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3483 0, 0, 0, 10, __LINE__, NULL); 3484 goto skip_measurement; 3485 } 3486 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 3487 /* We never made a us_rtt measurement? */ 3488 bytes = 0; 3489 bytes_ps = 0; 3490 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3491 0, 0, 0, 10, __LINE__, NULL); 3492 goto skip_measurement; 3493 } 3494 /* 3495 * Calculate the maximum possible b/w this connection 3496 * could have. We base our calculation on the lowest 3497 * rtt we have seen during the measurement and the 3498 * largest rwnd the client has given us in that time. This 3499 * forms a BDP that is the maximum that we could ever 3500 * get to the client. Anything larger is not valid. 3501 * 3502 * I originally had code here that rejected measurements 3503 * where the time was less than 1/2 the latest us_rtt. 3504 * But after thinking on that I realized its wrong since 3505 * say you had a 150Mbps or even 1Gbps link, and you 3506 * were a long way away.. example I am in Europe (100ms rtt) 3507 * talking to my 1Gbps link in S.C. Now measuring say 150,000 3508 * bytes my time would be 1.2ms, and yet my rtt would say 3509 * the measurement was invalid the time was < 50ms. The 3510 * same thing is true for 150Mb (8ms of time). 3511 * 3512 * A better way I realized is to look at what the maximum 3513 * the connection could possibly do. This is gated on 3514 * the lowest RTT we have seen and the highest rwnd. 
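 * (Illustration with made-up numbers: a peak rwnd of 64KB observed
 * over a 10ms low rtt caps last_max_bw at roughly 64KB / 0.010s,
 * i.e. about 6.5 MB/s or ~52 Mbit/s.)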
3515 * We should in theory never exceed that, if we are 3516 * then something on the path is storing up packets 3517 * and then feeding them all at once to our endpoint 3518 * messing up our measurement. 3519 */ 3520 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 3521 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 3522 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 3523 if (SEQ_LT(th_ack, tp->gput_seq)) { 3524 /* No measurement can be made */ 3525 bytes = 0; 3526 bytes_ps = 0; 3527 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3528 0, 0, 0, 10, __LINE__, NULL); 3529 goto skip_measurement; 3530 } else 3531 bytes = (th_ack - tp->gput_seq); 3532 bytes_ps = (uint64_t)bytes; 3533 /* 3534 * Don't measure a b/w for pacing unless we have gotten at least 3535 * an initial windows worth of data in this measurement interval. 3536 * 3537 * Small numbers of bytes get badly influenced by delayed ack and 3538 * other artifacts. Note we take the initial window or our 3539 * defined minimum GP (defaulting to 10 which hopefully is the 3540 * IW). 3541 */ 3542 if (rack->rc_gp_filled == 0) { 3543 /* 3544 * The initial estimate is special. We 3545 * have blasted out an IW worth of packets 3546 * without a real valid ack ts results. We 3547 * then setup the app_limited_needs_set flag, 3548 * this should get the first ack in (probably 2 3549 * MSS worth) to be recorded as the timestamp. 3550 * We thus allow a smaller number of bytes i.e. 3551 * IW - 2MSS. 3552 */ 3553 reqbytes -= (2 * segsiz); 3554 /* Also lets fill previous for our first measurement to be neutral */ 3555 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3556 } 3557 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 3558 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3559 rack->r_ctl.rc_app_limited_cnt, 3560 0, 0, 10, __LINE__, NULL); 3561 goto skip_measurement; 3562 } 3563 /* 3564 * We now need to calculate the Timely like status so 3565 * we can update (possibly) the b/w multipliers. 3566 */ 3567 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 3568 if (rack->rc_gp_filled == 0) { 3569 /* No previous reading */ 3570 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 3571 } else { 3572 if (rack->measure_saw_probe_rtt == 0) { 3573 /* 3574 * We don't want a probertt to be counted 3575 * since it will be negative incorrectly. We 3576 * expect to be reducing the RTT when we 3577 * pace at a slower rate. 3578 */ 3579 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 3580 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 3581 } 3582 } 3583 timely_says = rack_make_timely_judgement(rack, 3584 rack->r_ctl.rc_gp_srtt, 3585 rack->r_ctl.rc_rtt_diff, 3586 rack->r_ctl.rc_prev_gp_srtt 3587 ); 3588 bytes_ps *= HPTS_USEC_IN_SEC; 3589 bytes_ps /= utim; 3590 if (bytes_ps > rack->r_ctl.last_max_bw) { 3591 /* 3592 * Something is on path playing 3593 * since this b/w is not possible based 3594 * on our BDP (highest rwnd and lowest rtt 3595 * we saw in the measurement window). 3596 * 3597 * Another option here would be to 3598 * instead skip the measurement. 
3599 */ 3600 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 3601 bytes_ps, rack->r_ctl.last_max_bw, 0, 3602 11, __LINE__, NULL); 3603 bytes_ps = rack->r_ctl.last_max_bw; 3604 } 3605 /* We store gp for b/w in bytes per second */ 3606 if (rack->rc_gp_filled == 0) { 3607 /* Initial measurment */ 3608 if (bytes_ps) { 3609 rack->r_ctl.gp_bw = bytes_ps; 3610 rack->rc_gp_filled = 1; 3611 rack->r_ctl.num_avg = 1; 3612 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 3613 } else { 3614 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3615 rack->r_ctl.rc_app_limited_cnt, 3616 0, 0, 10, __LINE__, NULL); 3617 } 3618 if (rack->rc_inp->inp_in_hpts && 3619 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 3620 /* 3621 * Ok we can't trust the pacer in this case 3622 * where we transition from un-paced to paced. 3623 * Or for that matter when the burst mitigation 3624 * was making a wild guess and got it wrong. 3625 * Stop the pacer and clear up all the aggregate 3626 * delays etc. 3627 */ 3628 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3629 rack->r_ctl.rc_hpts_flags = 0; 3630 rack->r_ctl.rc_last_output_to = 0; 3631 } 3632 } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) { 3633 /* Still a small number run an average */ 3634 rack->r_ctl.gp_bw += bytes_ps; 3635 rack->r_ctl.num_avg++; 3636 if (rack->r_ctl.num_avg >= RACK_REQ_AVG) { 3637 /* We have collected enought to move forward */ 3638 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg; 3639 } 3640 } else { 3641 /* 3642 * We want to take 1/wma of the goodput and add in to 7/8th 3643 * of the old value weighted by the srtt. So if your measurement 3644 * period is say 2 SRTT's long you would get 1/4 as the 3645 * value, if it was like 1/2 SRTT then you would get 1/16th. 3646 * 3647 * But we must be careful not to take too much i.e. if the 3648 * srtt is say 20ms and the measurement is taken over 3649 * 400ms our weight would be 400/20 i.e. 20. On the 3650 * other hand if we get a measurement over 1ms with a 3651 * 10ms rtt we only want to take a much smaller portion. 3652 */ 3653 uint64_t resid_bw, subpart, addpart, srtt; 3654 3655 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 3656 if (srtt == 0) { 3657 /* 3658 * Strange why did t_srtt go back to zero? 3659 */ 3660 if (rack->r_ctl.rc_rack_min_rtt) 3661 srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC); 3662 else 3663 srtt = HPTS_USEC_IN_MSEC; 3664 } 3665 /* 3666 * XXXrrs: Note for reviewers, in playing with 3667 * dynamic pacing I discovered this GP calculation 3668 * as done originally leads to some undesired results. 3669 * Basically you can get longer measurements contributing 3670 * too much to the WMA. Thus I changed it if you are doing 3671 * dynamic adjustments to only do the aportioned adjustment 3672 * if we have a very small (time wise) measurement. Longer 3673 * measurements just get there weight (defaulting to 1/8) 3674 * add to the WMA. We may want to think about changing 3675 * this to always do that for both sides i.e. dynamic 3676 * and non-dynamic... but considering lots of folks 3677 * were playing with this I did not want to change the 3678 * calculation per.se. without your thoughts.. Lawerence? 3679 * Peter?? 3680 */ 3681 if (rack->rc_gp_dyn_mul == 0) { 3682 subpart = rack->r_ctl.gp_bw * utim; 3683 subpart /= (srtt * 8); 3684 if (subpart < (rack->r_ctl.gp_bw / 2)) { 3685 /* 3686 * The b/w update takes no more 3687 * away then 1/2 our running total 3688 * so factor it in. 
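 * (Worked example with made-up numbers: srtt = 20ms and a
 * measurement spanning utim = 10ms gives subpart = gp_bw/16 and
 * addpart = bytes_ps/16, i.e. the new sample gets a 1/16 weight;
 * only once utim reaches 4 * srtt would subpart hit the gp_bw/2
 * cap handled in the else branch below.)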
3689 */ 3690 addpart = bytes_ps * utim; 3691 addpart /= (srtt * 8); 3692 } else { 3693 /* 3694 * Don't allow a single measurement 3695 * to account for more than 1/2 of the 3696 * WMA. This could happen on a retransmission 3697 * where utim becomes huge compared to 3698 * srtt (multiple retransmissions when using 3699 * the sending rate which factors in all the 3700 * transmissions from the first one). 3701 */ 3702 subpart = rack->r_ctl.gp_bw / 2; 3703 addpart = bytes_ps / 2; 3704 } 3705 resid_bw = rack->r_ctl.gp_bw - subpart; 3706 rack->r_ctl.gp_bw = resid_bw + addpart; 3707 } else { 3708 if ((utim / srtt) <= 1) { 3709 /* 3710 * The b/w update was over a small period 3711 * of time. The idea here is to prevent a small 3712 * measurement time period from counting 3713 * too much. So we scale it based on the 3714 * time so it attributes less than 1/rack_wma_divisor 3715 * of its measurement. 3716 */ 3717 subpart = rack->r_ctl.gp_bw * utim; 3718 subpart /= (srtt * rack_wma_divisor); 3719 addpart = bytes_ps * utim; 3720 addpart /= (srtt * rack_wma_divisor); 3721 } else { 3722 /* 3723 * The scaled measurement was long 3724 * enough so lets just add in the 3725 * portion of the measurment i.e. 1/rack_wma_divisor 3726 */ 3727 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 3728 addpart = bytes_ps / rack_wma_divisor; 3729 } 3730 if ((rack->measure_saw_probe_rtt == 0) || 3731 (bytes_ps > rack->r_ctl.gp_bw)) { 3732 /* 3733 * For probe-rtt we only add it in 3734 * if its larger, all others we just 3735 * add in. 3736 */ 3737 resid_bw = rack->r_ctl.gp_bw - subpart; 3738 rack->r_ctl.gp_bw = resid_bw + addpart; 3739 } 3740 } 3741 } 3742 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 3743 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 3744 rack_update_multiplier(rack, timely_says, bytes_ps, 3745 rack->r_ctl.rc_gp_srtt, 3746 rack->r_ctl.rc_rtt_diff); 3747 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 3748 rack_get_bw(rack), 3, line, NULL); 3749 /* reset the gp srtt and setup the new prev */ 3750 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3751 /* Record the lost count for the next measurement */ 3752 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 3753 /* 3754 * We restart our diffs based on the gpsrtt in the 3755 * measurement window. 3756 */ 3757 rack->rc_gp_rtt_set = 0; 3758 rack->rc_gp_saw_rec = 0; 3759 rack->rc_gp_saw_ca = 0; 3760 rack->rc_gp_saw_ss = 0; 3761 rack->rc_dragged_bottom = 0; 3762 skip_measurement: 3763 3764 #ifdef STATS 3765 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3766 gput); 3767 /* 3768 * XXXLAS: This is a temporary hack, and should be 3769 * chained off VOI_TCP_GPUT when stats(9) grows an 3770 * API to deal with chained VOIs. 3771 */ 3772 if (tp->t_stats_gput_prev > 0) 3773 stats_voi_update_abs_s32(tp->t_stats, 3774 VOI_TCP_GPUT_ND, 3775 ((gput - tp->t_stats_gput_prev) * 100) / 3776 tp->t_stats_gput_prev); 3777 #endif 3778 tp->t_flags &= ~TF_GPUTINPROG; 3779 tp->t_stats_gput_prev = gput; 3780 /* 3781 * Now are we app limited now and there is space from where we 3782 * were to where we want to go? 3783 * 3784 * We don't do the other case i.e. non-applimited here since 3785 * the next send will trigger us picking up the missing data. 
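 * (In other words, the block below only re-arms TF_GPUTINPROG when
 * the gap between th_ack and the first app-limited point is larger
 * than an initial window (or MIN_GP_WIN segments, whichever is
 * bigger), so the app-limited stretch still gets measured without
 * waiting for a new send.)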
3786 */ 3787 if (rack->r_ctl.rc_first_appl && 3788 TCPS_HAVEESTABLISHED(tp->t_state) && 3789 rack->r_ctl.rc_app_limited_cnt && 3790 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 3791 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 3792 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3793 /* 3794 * Yep there is enough outstanding to make a measurement here. 3795 */ 3796 struct rack_sendmap *rsm, fe; 3797 3798 tp->t_flags |= TF_GPUTINPROG; 3799 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 3800 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 3801 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3802 rack->app_limited_needs_set = 0; 3803 tp->gput_seq = th_ack; 3804 if (rack->in_probe_rtt) 3805 rack->measure_saw_probe_rtt = 1; 3806 else if ((rack->measure_saw_probe_rtt) && 3807 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 3808 rack->measure_saw_probe_rtt = 0; 3809 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 3810 /* There is a full window to gain info from */ 3811 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 3812 } else { 3813 /* We can only measure up to the applimited point */ 3814 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 3815 } 3816 /* 3817 * Now we need to find the timestamp of the send at tp->gput_seq 3818 * for the send based measurement. 3819 */ 3820 fe.r_start = tp->gput_seq; 3821 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3822 if (rsm) { 3823 /* Ok send-based limit is set */ 3824 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 3825 /* 3826 * Move back to include the earlier part 3827 * so our ack time lines up right (this may 3828 * make an overlapping measurement but thats 3829 * ok). 3830 */ 3831 tp->gput_seq = rsm->r_start; 3832 } 3833 if (rsm->r_flags & RACK_ACKED) 3834 tp->gput_ts = rsm->r_ack_arrival; 3835 else 3836 rack->app_limited_needs_set = 1; 3837 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 3838 } else { 3839 /* 3840 * If we don't find the rsm due to some 3841 * send-limit set the current time, which 3842 * basically disables the send-limit. 
3843 */ 3844 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 3845 } 3846 rack_log_pacing_delay_calc(rack, 3847 tp->gput_seq, 3848 tp->gput_ack, 3849 (uint64_t)rsm, 3850 tp->gput_ts, 3851 rack->r_ctl.rc_app_limited_cnt, 3852 9, 3853 __LINE__, NULL); 3854 } 3855 } 3856 3857 /* 3858 * CC wrapper hook functions 3859 */ 3860 static void 3861 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 3862 uint16_t type, int32_t recovery) 3863 { 3864 INP_WLOCK_ASSERT(tp->t_inpcb); 3865 tp->ccv->nsegs = nsegs; 3866 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 3867 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 3868 uint32_t max; 3869 3870 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 3871 if (tp->ccv->bytes_this_ack > max) { 3872 tp->ccv->bytes_this_ack = max; 3873 } 3874 } 3875 if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) 3876 tp->ccv->flags |= CCF_CWND_LIMITED; 3877 else 3878 tp->ccv->flags &= ~CCF_CWND_LIMITED; 3879 #ifdef STATS 3880 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 3881 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 3882 #endif 3883 if ((tp->t_flags & TF_GPUTINPROG) && 3884 rack_enough_for_measurement(tp, rack, th->th_ack)) { 3885 /* Measure the Goodput */ 3886 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); 3887 #ifdef NETFLIX_PEAKRATE 3888 if ((type == CC_ACK) && 3889 (tp->t_maxpeakrate)) { 3890 /* 3891 * We update t_peakrate_thr. This gives us roughly 3892 * one update per round trip time. Note 3893 * it will only be used if pace_always is off i.e 3894 * we don't do this for paced flows. 3895 */ 3896 tcp_update_peakrate_thr(tp); 3897 } 3898 #endif 3899 } 3900 if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { 3901 tp->t_bytes_acked += tp->ccv->bytes_this_ack; 3902 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 3903 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 3904 tp->ccv->flags |= CCF_ABC_SENTAWND; 3905 } 3906 } else { 3907 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3908 tp->t_bytes_acked = 0; 3909 } 3910 if (CC_ALGO(tp)->ack_received != NULL) { 3911 /* XXXLAS: Find a way to live without this */ 3912 tp->ccv->curack = th->th_ack; 3913 CC_ALGO(tp)->ack_received(tp->ccv, type); 3914 } 3915 #ifdef STATS 3916 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 3917 #endif 3918 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 3919 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 3920 } 3921 #ifdef NETFLIX_PEAKRATE 3922 /* we enforce max peak rate if it is set and we are not pacing */ 3923 if ((rack->rc_always_pace == 0) && 3924 tp->t_peakrate_thr && 3925 (tp->snd_cwnd > tp->t_peakrate_thr)) { 3926 tp->snd_cwnd = tp->t_peakrate_thr; 3927 } 3928 #endif 3929 } 3930 3931 static void 3932 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 3933 { 3934 struct tcp_rack *rack; 3935 3936 rack = (struct tcp_rack *)tp->t_fb_ptr; 3937 INP_WLOCK_ASSERT(tp->t_inpcb); 3938 /* 3939 * If we are doing PRR and have enough 3940 * room to send <or> we are pacing and prr 3941 * is disabled we will want to see if we 3942 * can send data (by setting r_wanted_output to 3943 * true). 
3944 */ 3945 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 3946 rack->rack_no_prr) 3947 rack->r_wanted_output = 1; 3948 } 3949 3950 static void 3951 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 3952 { 3953 struct tcp_rack *rack; 3954 uint32_t orig_cwnd; 3955 3956 orig_cwnd = tp->snd_cwnd; 3957 INP_WLOCK_ASSERT(tp->t_inpcb); 3958 rack = (struct tcp_rack *)tp->t_fb_ptr; 3959 if (rack->rc_not_backing_off == 0) { 3960 /* only alert CC if we alerted when we entered */ 3961 if (CC_ALGO(tp)->post_recovery != NULL) { 3962 tp->ccv->curack = th->th_ack; 3963 CC_ALGO(tp)->post_recovery(tp->ccv); 3964 } 3965 if (tp->snd_cwnd > tp->snd_ssthresh) { 3966 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 3967 tp->snd_cwnd = tp->snd_ssthresh; 3968 } 3969 } 3970 if ((rack->rack_no_prr == 0) && 3971 (rack->r_ctl.rc_prr_sndcnt > 0)) { 3972 /* Suck the next prr cnt back into cwnd */ 3973 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 3974 rack->r_ctl.rc_prr_sndcnt = 0; 3975 rack_log_to_prr(rack, 1, 0); 3976 } 3977 rack_log_to_prr(rack, 14, orig_cwnd); 3978 tp->snd_recover = tp->snd_una; 3979 EXIT_RECOVERY(tp->t_flags); 3980 } 3981 3982 static void 3983 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 3984 { 3985 struct tcp_rack *rack; 3986 3987 INP_WLOCK_ASSERT(tp->t_inpcb); 3988 3989 rack = (struct tcp_rack *)tp->t_fb_ptr; 3990 switch (type) { 3991 case CC_NDUPACK: 3992 tp->t_flags &= ~TF_WASFRECOVERY; 3993 tp->t_flags &= ~TF_WASCRECOVERY; 3994 if (!IN_FASTRECOVERY(tp->t_flags)) { 3995 rack->r_ctl.rc_prr_delivered = 0; 3996 rack->r_ctl.rc_prr_out = 0; 3997 if (rack->rack_no_prr == 0) { 3998 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 3999 rack_log_to_prr(rack, 2, 0); 4000 } 4001 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4002 tp->snd_recover = tp->snd_max; 4003 if (tp->t_flags2 & TF2_ECN_PERMIT) 4004 tp->t_flags2 |= TF2_ECN_SND_CWR; 4005 } 4006 break; 4007 case CC_ECN: 4008 if (!IN_CONGRECOVERY(tp->t_flags) || 4009 /* 4010 * Allow ECN reaction on ACK to CWR, if 4011 * that data segment was also CE marked. 4012 */ 4013 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 4014 EXIT_CONGRECOVERY(tp->t_flags); 4015 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4016 tp->snd_recover = tp->snd_max + 1; 4017 if (tp->t_flags2 & TF2_ECN_PERMIT) 4018 tp->t_flags2 |= TF2_ECN_SND_CWR; 4019 } 4020 break; 4021 case CC_RTO: 4022 tp->t_dupacks = 0; 4023 tp->t_bytes_acked = 0; 4024 EXIT_RECOVERY(tp->t_flags); 4025 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4026 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4027 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4028 if (tp->t_flags2 & TF2_ECN_PERMIT) 4029 tp->t_flags2 |= TF2_ECN_SND_CWR; 4030 break; 4031 case CC_RTO_ERR: 4032 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4033 /* RTO was unnecessary, so reset everything. */ 4034 tp->snd_cwnd = tp->snd_cwnd_prev; 4035 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4036 tp->snd_recover = tp->snd_recover_prev; 4037 if (tp->t_flags & TF_WASFRECOVERY) { 4038 ENTER_FASTRECOVERY(tp->t_flags); 4039 tp->t_flags &= ~TF_WASFRECOVERY; 4040 } 4041 if (tp->t_flags & TF_WASCRECOVERY) { 4042 ENTER_CONGRECOVERY(tp->t_flags); 4043 tp->t_flags &= ~TF_WASCRECOVERY; 4044 } 4045 tp->snd_nxt = tp->snd_max; 4046 tp->t_badrxtwin = 0; 4047 break; 4048 } 4049 /* 4050 * If we are below our max rtt, don't 4051 * signal the CC control to change things. 4052 * instead set it up so that we are in 4053 * recovery but not going to back off. 
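 * For example (illustrative values only): on a highly buffered path
 * with rack_use_max_for_nobackoff off and a lowest us rtt of 10ms,
 * the cutoff below is 10ms + (10ms * rack_gp_rtt_minmul /
 * rack_gp_rtt_mindiv); a gp srtt at or under that enters recovery
 * via ENTER_RECOVERY() but skips the CC module's cong_signal().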
4054 */ 4055 4056 if (rack->rc_highly_buffered) { 4057 /* 4058 * Do we use the higher rtt for 4059 * our threshold to not backoff (like CDG)? 4060 */ 4061 uint32_t rtt_mul, rtt_div; 4062 4063 if (rack_use_max_for_nobackoff) { 4064 rtt_mul = (rack_gp_rtt_maxmul - 1); 4065 rtt_div = 1; 4066 } else { 4067 rtt_mul = rack_gp_rtt_minmul; 4068 rtt_div = max(rack_gp_rtt_mindiv , 1); 4069 } 4070 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + 4071 ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / 4072 rtt_div))) { 4073 /* below our min threshold */ 4074 rack->rc_not_backing_off = 1; 4075 ENTER_RECOVERY(rack->rc_tp->t_flags); 4076 rack_log_rtt_shrinks(rack, 0, 4077 rtt_mul, 4078 rtt_div, 4079 RACK_RTTS_NOBACKOFF); 4080 return; 4081 } 4082 } 4083 rack->rc_not_backing_off = 0; 4084 if (CC_ALGO(tp)->cong_signal != NULL) { 4085 if (th != NULL) 4086 tp->ccv->curack = th->th_ack; 4087 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4088 } 4089 } 4090 4091 static inline void 4092 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4093 { 4094 uint32_t i_cwnd; 4095 4096 INP_WLOCK_ASSERT(tp->t_inpcb); 4097 4098 #ifdef NETFLIX_STATS 4099 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4100 if (tp->t_state == TCPS_ESTABLISHED) 4101 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4102 #endif 4103 if (CC_ALGO(tp)->after_idle != NULL) 4104 CC_ALGO(tp)->after_idle(tp->ccv); 4105 4106 if (tp->snd_cwnd == 1) 4107 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4108 else 4109 i_cwnd = rc_init_window(rack); 4110 4111 /* 4112 * Being idle is no differnt than the initial window. If the cc 4113 * clamps it down below the initial window raise it to the initial 4114 * window. 4115 */ 4116 if (tp->snd_cwnd < i_cwnd) { 4117 tp->snd_cwnd = i_cwnd; 4118 } 4119 } 4120 4121 /* 4122 * Indicate whether this ack should be delayed. We can delay the ack if 4123 * following conditions are met: 4124 * - There is no delayed ack timer in progress. 4125 * - Our last ack wasn't a 0-sized window. We never want to delay 4126 * the ack that opens up a 0-sized window. 4127 * - LRO wasn't used for this segment. We make sure by checking that the 4128 * segment size is not larger than the MSS. 4129 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4130 * connection. 4131 */ 4132 #define DELAY_ACK(tp, tlen) \ 4133 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4134 ((tp->t_flags & TF_DELACK) == 0) && \ 4135 (tlen <= tp->t_maxseg) && \ 4136 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4137 4138 static struct rack_sendmap * 4139 rack_find_lowest_rsm(struct tcp_rack *rack) 4140 { 4141 struct rack_sendmap *rsm; 4142 4143 /* 4144 * Walk the time-order transmitted list looking for an rsm that is 4145 * not acked. This will be the one that was sent the longest time 4146 * ago that is still outstanding. 4147 */ 4148 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4149 if (rsm->r_flags & RACK_ACKED) { 4150 continue; 4151 } 4152 goto finish; 4153 } 4154 finish: 4155 return (rsm); 4156 } 4157 4158 static struct rack_sendmap * 4159 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 4160 { 4161 struct rack_sendmap *prsm; 4162 4163 /* 4164 * Walk the sequence order list backward until we hit and arrive at 4165 * the highest seq not acked. In theory when this is called it 4166 * should be the last segment (which it was not). 
4167 */ 4168 counter_u64_add(rack_find_high, 1); 4169 prsm = rsm; 4170 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 4171 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 4172 continue; 4173 } 4174 return (prsm); 4175 } 4176 return (NULL); 4177 } 4178 4179 static uint32_t 4180 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 4181 { 4182 int32_t lro; 4183 uint32_t thresh; 4184 4185 /* 4186 * lro is the flag we use to determine if we have seen reordering. 4187 * If it gets set we have seen reordering. The reorder logic either 4188 * works in one of two ways: 4189 * 4190 * If reorder-fade is configured, then we track the last time we saw 4191 * re-ordering occur. If we reach the point where enough time as 4192 * passed we no longer consider reordering has occuring. 4193 * 4194 * Or if reorder-face is 0, then once we see reordering we consider 4195 * the connection to alway be subject to reordering and just set lro 4196 * to 1. 4197 * 4198 * In the end if lro is non-zero we add the extra time for 4199 * reordering in. 4200 */ 4201 if (srtt == 0) 4202 srtt = 1; 4203 if (rack->r_ctl.rc_reorder_ts) { 4204 if (rack->r_ctl.rc_reorder_fade) { 4205 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 4206 lro = cts - rack->r_ctl.rc_reorder_ts; 4207 if (lro == 0) { 4208 /* 4209 * No time as passed since the last 4210 * reorder, mark it as reordering. 4211 */ 4212 lro = 1; 4213 } 4214 } else { 4215 /* Negative time? */ 4216 lro = 0; 4217 } 4218 if (lro > rack->r_ctl.rc_reorder_fade) { 4219 /* Turn off reordering seen too */ 4220 rack->r_ctl.rc_reorder_ts = 0; 4221 lro = 0; 4222 } 4223 } else { 4224 /* Reodering does not fade */ 4225 lro = 1; 4226 } 4227 } else { 4228 lro = 0; 4229 } 4230 thresh = srtt + rack->r_ctl.rc_pkt_delay; 4231 if (lro) { 4232 /* It must be set, if not you get 1/4 rtt */ 4233 if (rack->r_ctl.rc_reorder_shift) 4234 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 4235 else 4236 thresh += (srtt >> 2); 4237 } else { 4238 thresh += 1; 4239 } 4240 /* We don't let the rack timeout be above a RTO */ 4241 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 4242 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 4243 } 4244 /* And we don't want it above the RTO max either */ 4245 if (thresh > rack_rto_max) { 4246 thresh = rack_rto_max; 4247 } 4248 return (thresh); 4249 } 4250 4251 static uint32_t 4252 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 4253 struct rack_sendmap *rsm, uint32_t srtt) 4254 { 4255 struct rack_sendmap *prsm; 4256 uint32_t thresh, len; 4257 int segsiz; 4258 4259 if (srtt == 0) 4260 srtt = 1; 4261 if (rack->r_ctl.rc_tlp_threshold) 4262 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 4263 else 4264 thresh = (srtt * 2); 4265 4266 /* Get the previous sent packet, if any */ 4267 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4268 counter_u64_add(rack_enter_tlp_calc, 1); 4269 len = rsm->r_end - rsm->r_start; 4270 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 4271 /* Exactly like the ID */ 4272 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 4273 uint32_t alt_thresh; 4274 /* 4275 * Compensate for delayed-ack with the d-ack time. 
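 * (e.g. with an srtt of 40ms this works out to 40 + 20 +
 * rack_delayed_ack_time, and it only replaces thresh when it is
 * the larger of the two.)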
4276 */ 4277 counter_u64_add(rack_used_tlpmethod, 1); 4278 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4279 if (alt_thresh > thresh) 4280 thresh = alt_thresh; 4281 } 4282 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 4283 /* 2.1 behavior */ 4284 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 4285 if (prsm && (len <= segsiz)) { 4286 /* 4287 * Two packets outstanding, thresh should be (2*srtt) + 4288 * possible inter-packet delay (if any). 4289 */ 4290 uint32_t inter_gap = 0; 4291 int idx, nidx; 4292 4293 counter_u64_add(rack_used_tlpmethod, 1); 4294 idx = rsm->r_rtr_cnt - 1; 4295 nidx = prsm->r_rtr_cnt - 1; 4296 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4297 /* Yes it was sent later (or at the same time) */ 4298 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4299 } 4300 thresh += inter_gap; 4301 } else if (len <= segsiz) { 4302 /* 4303 * Possibly compensate for delayed-ack. 4304 */ 4305 uint32_t alt_thresh; 4306 4307 counter_u64_add(rack_used_tlpmethod2, 1); 4308 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4309 if (alt_thresh > thresh) 4310 thresh = alt_thresh; 4311 } 4312 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 4313 /* 2.2 behavior */ 4314 if (len <= segsiz) { 4315 uint32_t alt_thresh; 4316 /* 4317 * Compensate for delayed-ack with the d-ack time. 4318 */ 4319 counter_u64_add(rack_used_tlpmethod, 1); 4320 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4321 if (alt_thresh > thresh) 4322 thresh = alt_thresh; 4323 } 4324 } 4325 /* Not above an RTO */ 4326 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 4327 thresh = TICKS_2_MSEC(tp->t_rxtcur); 4328 } 4329 /* Not above a RTO max */ 4330 if (thresh > rack_rto_max) { 4331 thresh = rack_rto_max; 4332 } 4333 /* Apply user supplied min TLP */ 4334 if (thresh < rack_tlp_min) { 4335 thresh = rack_tlp_min; 4336 } 4337 return (thresh); 4338 } 4339 4340 static uint32_t 4341 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 4342 { 4343 /* 4344 * We want the rack_rtt which is the 4345 * last rtt we measured. However if that 4346 * does not exist we fallback to the srtt (which 4347 * we probably will never do) and then as a last 4348 * resort we use RACK_INITIAL_RTO if no srtt is 4349 * yet set. 4350 */ 4351 if (rack->rc_rack_rtt) 4352 return(rack->rc_rack_rtt); 4353 else if (tp->t_srtt == 0) 4354 return(RACK_INITIAL_RTO); 4355 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 4356 } 4357 4358 static struct rack_sendmap * 4359 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 4360 { 4361 /* 4362 * Check to see that we don't need to fall into recovery. We will 4363 * need to do so if our oldest transmit is past the time we should 4364 * have had an ack. 
4365 */
4366 struct tcp_rack *rack;
4367 struct rack_sendmap *rsm;
4368 int32_t idx;
4369 uint32_t srtt, thresh;
4370
4371 rack = (struct tcp_rack *)tp->t_fb_ptr;
4372 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) {
4373 return (NULL);
4374 }
4375 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
4376 if (rsm == NULL)
4377 return (NULL);
4378
4379 if (rsm->r_flags & RACK_ACKED) {
4380 rsm = rack_find_lowest_rsm(rack);
4381 if (rsm == NULL)
4382 return (NULL);
4383 }
4384 idx = rsm->r_rtr_cnt - 1;
4385 srtt = rack_grab_rtt(tp, rack);
4386 thresh = rack_calc_thresh_rack(rack, srtt, tsused);
4387 if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) {
4388 return (NULL);
4389 }
4390 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
4391 return (NULL);
4392 }
4393 /* Ok, if we reach here we are overdue and this guy can be sent */
4394 if (IN_RECOVERY(tp->t_flags) == 0) {
4395 /*
4396 * For the entry that puts us into recovery, record undo
4397 * info.
4398 */
4399 rack->r_ctl.rc_rsm_start = rsm->r_start;
4400 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4401 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4402 }
4403 rack_cong_signal(tp, NULL, CC_NDUPACK);
4404 return (rsm);
4405 }
4406
4407 static uint32_t
4408 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
4409 {
4410 int32_t t;
4411 int32_t tt;
4412 uint32_t ret_val;
4413
4414 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
4415 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
4416 rack_persist_min, rack_persist_max);
4417 if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
4418 tp->t_rxtshift++;
4419 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
4420 ret_val = (uint32_t)tt;
4421 return (ret_val);
4422 }
4423
4424 static uint32_t
4425 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
4426 {
4427 /*
4428 * Start the FR timer; we do this based on the first entry in
4429 * the rc_tmap. Note that if it is NULL we must stop the timer. In all
4430 * events we need to stop the running timer (if it is running) before
4431 * starting the new one.
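 * Rough precedence as implemented below: a persist value when
 * rc_in_persist is set, a plain RXT when nothing usable is on the
 * tmap (or SACK is off, or sup_rack is set), a RACK timer when the
 * oldest outstanding entry has been SACK-passed or dup-acked, and
 * otherwise a TLP, which itself falls back to RXT if it would
 * exceed TCPTV_REXMTMAX.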
4432 */ 4433 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 4434 uint32_t srtt_cur; 4435 int32_t idx; 4436 int32_t is_tlp_timer = 0; 4437 struct rack_sendmap *rsm; 4438 4439 if (rack->t_timers_stopped) { 4440 /* All timers have been stopped none are to run */ 4441 return (0); 4442 } 4443 if (rack->rc_in_persist) { 4444 /* We can't start any timer in persists */ 4445 return (rack_get_persists_timer_val(tp, rack)); 4446 } 4447 rack->rc_on_min_to = 0; 4448 if ((tp->t_state < TCPS_ESTABLISHED) || 4449 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 4450 goto activate_rxt; 4451 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4452 if ((rsm == NULL) || sup_rack) { 4453 /* Nothing on the send map */ 4454 activate_rxt: 4455 time_since_sent = 0; 4456 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4457 if (rsm) { 4458 idx = rsm->r_rtr_cnt - 1; 4459 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4460 tstmp_touse = rsm->r_tim_lastsent[idx]; 4461 else 4462 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4463 if (TSTMP_GT(cts, tstmp_touse)) 4464 time_since_sent = cts - tstmp_touse; 4465 } 4466 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4467 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 4468 to = TICKS_2_MSEC(tp->t_rxtcur); 4469 if (to > time_since_sent) 4470 to -= time_since_sent; 4471 else 4472 to = rack->r_ctl.rc_min_to; 4473 if (to == 0) 4474 to = 1; 4475 return (to); 4476 } 4477 return (0); 4478 } 4479 if (rsm->r_flags & RACK_ACKED) { 4480 rsm = rack_find_lowest_rsm(rack); 4481 if (rsm == NULL) { 4482 /* No lowest? */ 4483 goto activate_rxt; 4484 } 4485 } 4486 if (rack->sack_attack_disable) { 4487 /* 4488 * We don't want to do 4489 * any TLP's if you are an attacker. 4490 * Though if you are doing what 4491 * is expected you may still have 4492 * SACK-PASSED marks. 4493 */ 4494 goto activate_rxt; 4495 } 4496 /* Convert from ms to usecs */ 4497 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 4498 if ((tp->t_flags & TF_SENTFIN) && 4499 ((tp->snd_max - tp->snd_una) == 1) && 4500 (rsm->r_flags & RACK_HAS_FIN)) { 4501 /* 4502 * We don't start a rack timer if all we have is a 4503 * FIN outstanding. 4504 */ 4505 goto activate_rxt; 4506 } 4507 if ((rack->use_rack_rr == 0) && 4508 (IN_RECOVERY(tp->t_flags)) && 4509 (rack->rack_no_prr == 0) && 4510 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 4511 /* 4512 * We are not cheating, in recovery and 4513 * not enough ack's to yet get our next 4514 * retransmission out. 4515 * 4516 * Note that classified attackers do not 4517 * get to use the rack-cheat. 4518 */ 4519 goto activate_tlp; 4520 } 4521 srtt = rack_grab_rtt(tp, rack); 4522 thresh = rack_calc_thresh_rack(rack, srtt, cts); 4523 idx = rsm->r_rtr_cnt - 1; 4524 exp = rsm->r_tim_lastsent[idx] + thresh; 4525 if (SEQ_GEQ(exp, cts)) { 4526 to = exp - cts; 4527 if (to < rack->r_ctl.rc_min_to) { 4528 to = rack->r_ctl.rc_min_to; 4529 if (rack->r_rr_config == 3) 4530 rack->rc_on_min_to = 1; 4531 } 4532 } else { 4533 to = rack->r_ctl.rc_min_to; 4534 if (rack->r_rr_config == 3) 4535 rack->rc_on_min_to = 1; 4536 } 4537 } else { 4538 /* Ok we need to do a TLP not RACK */ 4539 activate_tlp: 4540 if ((rack->rc_tlp_in_progress != 0) && 4541 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 4542 /* 4543 * The previous send was a TLP and we have sent 4544 * N TLP's without sending new data. 
4545 */ 4546 goto activate_rxt; 4547 } 4548 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 4549 if (rsm == NULL) { 4550 /* We found no rsm to TLP with. */ 4551 goto activate_rxt; 4552 } 4553 if (rsm->r_flags & RACK_HAS_FIN) { 4554 /* If its a FIN we dont do TLP */ 4555 rsm = NULL; 4556 goto activate_rxt; 4557 } 4558 idx = rsm->r_rtr_cnt - 1; 4559 time_since_sent = 0; 4560 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4561 tstmp_touse = rsm->r_tim_lastsent[idx]; 4562 else 4563 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4564 if (TSTMP_GT(cts, tstmp_touse)) 4565 time_since_sent = cts - tstmp_touse; 4566 is_tlp_timer = 1; 4567 if (tp->t_srtt) { 4568 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 4569 srtt = TICKS_2_MSEC(srtt_cur); 4570 } else 4571 srtt = RACK_INITIAL_RTO; 4572 /* 4573 * If the SRTT is not keeping up and the 4574 * rack RTT has spiked we want to use 4575 * the last RTT not the smoothed one. 4576 */ 4577 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) 4578 srtt = rack_grab_rtt(tp, rack); 4579 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 4580 if (thresh > time_since_sent) 4581 to = thresh - time_since_sent; 4582 else { 4583 to = rack->r_ctl.rc_min_to; 4584 rack_log_alt_to_to_cancel(rack, 4585 thresh, /* flex1 */ 4586 time_since_sent, /* flex2 */ 4587 tstmp_touse, /* flex3 */ 4588 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 4589 rsm->r_tim_lastsent[idx], 4590 srtt, 4591 idx, 99); 4592 } 4593 if (to > TCPTV_REXMTMAX) { 4594 /* 4595 * If the TLP time works out to larger than the max 4596 * RTO lets not do TLP.. just RTO. 4597 */ 4598 goto activate_rxt; 4599 } 4600 } 4601 if (is_tlp_timer == 0) { 4602 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 4603 } else { 4604 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 4605 } 4606 if (to == 0) 4607 to = 1; 4608 return (to); 4609 } 4610 4611 static void 4612 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4613 { 4614 if (rack->rc_in_persist == 0) { 4615 if (tp->t_flags & TF_GPUTINPROG) { 4616 /* 4617 * Stop the goodput now, the calling of the 4618 * measurement function clears the flag. 4619 */ 4620 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 4621 } 4622 #ifdef NETFLIX_SHARED_CWND 4623 if (rack->r_ctl.rc_scw) { 4624 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4625 rack->rack_scwnd_is_idle = 1; 4626 } 4627 #endif 4628 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 4629 if (rack->r_ctl.rc_went_idle_time == 0) 4630 rack->r_ctl.rc_went_idle_time = 1; 4631 rack_timer_cancel(tp, rack, cts, __LINE__); 4632 tp->t_rxtshift = 0; 4633 rack->rc_in_persist = 1; 4634 } 4635 } 4636 4637 static void 4638 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4639 { 4640 if (rack->rc_inp->inp_in_hpts) { 4641 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4642 rack->r_ctl.rc_hpts_flags = 0; 4643 } 4644 #ifdef NETFLIX_SHARED_CWND 4645 if (rack->r_ctl.rc_scw) { 4646 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4647 rack->rack_scwnd_is_idle = 0; 4648 } 4649 #endif 4650 if (rack->rc_gp_dyn_mul && 4651 (rack->use_fixed_rate == 0) && 4652 (rack->rc_always_pace)) { 4653 /* 4654 * Do we count this as if a probe-rtt just 4655 * finished? 
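 * The test below treats the idle period as a probe-rtt if it lasted
 * at least rack_min_probertt_hold plus, when
 * rack_probertt_gpsrtt_cnt_div is non-zero, gp_srtt * cnt_mul /
 * cnt_div (e.g. an extra 80ms for a 20ms gp_srtt with a 4/1
 * mul/div ratio, illustrative values only); in that case we either
 * reset the probe-rtt clocks (if we were not in probe-rtt) or exit
 * probe-rtt outright.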
4656 */ 4657 uint32_t time_idle, idle_min; 4658 4659 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 4660 idle_min = rack_min_probertt_hold; 4661 if (rack_probertt_gpsrtt_cnt_div) { 4662 uint64_t extra; 4663 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 4664 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 4665 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 4666 idle_min += (uint32_t)extra; 4667 } 4668 if (time_idle >= idle_min) { 4669 /* Yes, we count it as a probe-rtt. */ 4670 uint32_t us_cts; 4671 4672 us_cts = tcp_get_usecs(NULL); 4673 if (rack->in_probe_rtt == 0) { 4674 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4675 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 4676 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 4677 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 4678 } else { 4679 rack_exit_probertt(rack, us_cts); 4680 } 4681 } 4682 } 4683 rack->rc_in_persist = 0; 4684 rack->r_ctl.rc_went_idle_time = 0; 4685 tp->t_rxtshift = 0; 4686 rack->r_ctl.rc_agg_delayed = 0; 4687 rack->r_early = 0; 4688 rack->r_late = 0; 4689 rack->r_ctl.rc_agg_early = 0; 4690 } 4691 4692 static void 4693 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 4694 struct hpts_diag *diag, struct timeval *tv) 4695 { 4696 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 4697 union tcp_log_stackspecific log; 4698 4699 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4700 log.u_bbr.flex1 = diag->p_nxt_slot; 4701 log.u_bbr.flex2 = diag->p_cur_slot; 4702 log.u_bbr.flex3 = diag->slot_req; 4703 log.u_bbr.flex4 = diag->inp_hptsslot; 4704 log.u_bbr.flex5 = diag->slot_remaining; 4705 log.u_bbr.flex6 = diag->need_new_to; 4706 log.u_bbr.flex7 = diag->p_hpts_active; 4707 log.u_bbr.flex8 = diag->p_on_min_sleep; 4708 /* Hijack other fields as needed */ 4709 log.u_bbr.epoch = diag->have_slept; 4710 log.u_bbr.lt_epoch = diag->yet_to_sleep; 4711 log.u_bbr.pkts_out = diag->co_ret; 4712 log.u_bbr.applimited = diag->hpts_sleep_time; 4713 log.u_bbr.delivered = diag->p_prev_slot; 4714 log.u_bbr.inflight = diag->p_runningtick; 4715 log.u_bbr.bw_inuse = diag->wheel_tick; 4716 log.u_bbr.rttProp = diag->wheel_cts; 4717 log.u_bbr.timeStamp = cts; 4718 log.u_bbr.delRate = diag->maxticks; 4719 log.u_bbr.cur_del_rate = diag->p_curtick; 4720 log.u_bbr.cur_del_rate <<= 32; 4721 log.u_bbr.cur_del_rate |= diag->p_lasttick; 4722 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4723 &rack->rc_inp->inp_socket->so_rcv, 4724 &rack->rc_inp->inp_socket->so_snd, 4725 BBR_LOG_HPTSDIAG, 0, 4726 0, &log, false, tv); 4727 } 4728 4729 } 4730 4731 static void 4732 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 4733 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 4734 { 4735 struct hpts_diag diag; 4736 struct inpcb *inp; 4737 struct timeval tv; 4738 uint32_t delayed_ack = 0; 4739 uint32_t hpts_timeout; 4740 uint8_t stopped; 4741 uint32_t left = 0; 4742 uint32_t us_cts; 4743 4744 inp = tp->t_inpcb; 4745 if ((tp->t_state == TCPS_CLOSED) || 4746 (tp->t_state == TCPS_LISTEN)) { 4747 return; 4748 } 4749 if (inp->inp_in_hpts) { 4750 /* Already on the pacer */ 4751 return; 4752 } 4753 stopped = rack->rc_tmr_stopped; 4754 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 4755 left = rack->r_ctl.rc_timer_exp - cts; 4756 } 4757 rack->r_ctl.rc_timer_exp = 0; 4758 rack->r_ctl.rc_hpts_flags = 0; 4759 us_cts = tcp_get_usecs(&tv); 4760 /* Now early/late accounting */ 4761 if (rack->r_early) { 4762 /* 4763 * We have a early carry over set, 4764 * 
we can always add more time so we 4765 * can always make this compensation. 4766 */ 4767 slot += rack->r_ctl.rc_agg_early; 4768 rack->r_early = 0; 4769 rack->r_ctl.rc_agg_early = 0; 4770 } 4771 if (rack->r_late) { 4772 /* 4773 * This is harder, we can 4774 * compensate some but it 4775 * really depends on what 4776 * the current pacing time is. 4777 */ 4778 if (rack->r_ctl.rc_agg_delayed >= slot) { 4779 /* 4780 * We can't compensate for it all. 4781 * And we have to have some time 4782 * on the clock. We always have a min 4783 * 10 slots (10 x 10 i.e. 100 usecs). 4784 */ 4785 if (slot <= HPTS_TICKS_PER_USEC) { 4786 /* We gain delay */ 4787 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 4788 slot = HPTS_TICKS_PER_USEC; 4789 } else { 4790 /* We take off some */ 4791 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 4792 slot = HPTS_TICKS_PER_USEC; 4793 } 4794 } else { 4795 slot -= rack->r_ctl.rc_agg_delayed; 4796 rack->r_ctl.rc_agg_delayed = 0; 4797 /* Make sure we have 100 useconds at minimum */ 4798 if (slot < HPTS_TICKS_PER_USEC) { 4799 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 4800 slot = HPTS_TICKS_PER_USEC; 4801 } 4802 if (rack->r_ctl.rc_agg_delayed == 0) 4803 rack->r_late = 0; 4804 } 4805 } 4806 if (slot) { 4807 /* We are pacing too */ 4808 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 4809 } 4810 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 4811 #ifdef NETFLIX_EXP_DETECTION 4812 if (rack->sack_attack_disable && 4813 (slot < tcp_sad_pacing_interval)) { 4814 /* 4815 * We have a potential attacker on 4816 * the line. We have possibly some 4817 * (or now) pacing time set. We want to 4818 * slow down the processing of sacks by some 4819 * amount (if it is an attacker). Set the default 4820 * slot for attackers in place (unless the orginal 4821 * interval is longer). Its stored in 4822 * micro-seconds, so lets convert to msecs. 4823 */ 4824 slot = tcp_sad_pacing_interval; 4825 } 4826 #endif 4827 if (tp->t_flags & TF_DELACK) { 4828 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 4829 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 4830 } 4831 if (delayed_ack && ((hpts_timeout == 0) || 4832 (delayed_ack < hpts_timeout))) 4833 hpts_timeout = delayed_ack; 4834 else 4835 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4836 /* 4837 * If no timers are going to run and we will fall off the hptsi 4838 * wheel, we resort to a keep-alive timer if its configured. 4839 */ 4840 if ((hpts_timeout == 0) && 4841 (slot == 0)) { 4842 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4843 (tp->t_state <= TCPS_CLOSING)) { 4844 /* 4845 * Ok we have no timer (persists, rack, tlp, rxt or 4846 * del-ack), we don't have segments being paced. So 4847 * all that is left is the keepalive timer. 4848 */ 4849 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 4850 /* Get the established keep-alive time */ 4851 hpts_timeout = TP_KEEPIDLE(tp); 4852 } else { 4853 /* Get the initial setup keep-alive time */ 4854 hpts_timeout = TP_KEEPINIT(tp); 4855 } 4856 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 4857 if (rack->in_probe_rtt) { 4858 /* 4859 * We want to instead not wake up a long time from 4860 * now but to wake up about the time we would 4861 * exit probe-rtt and initiate a keep-alive ack. 4862 * This will get us out of probe-rtt and update 4863 * our min-rtt. 
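 * (rack_min_probertt_hold is kept in microseconds, hence the
 * HPTS_USEC_IN_MSEC division below to get a millisecond value
 * like the other hpts_timeout settings here.)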
4864 */ 4865 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); 4866 } 4867 } 4868 } 4869 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 4870 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 4871 /* 4872 * RACK, TLP, persists and RXT timers all are restartable 4873 * based on actions input .. i.e we received a packet (ack 4874 * or sack) and that changes things (rw, or snd_una etc). 4875 * Thus we can restart them with a new value. For 4876 * keep-alive, delayed_ack we keep track of what was left 4877 * and restart the timer with a smaller value. 4878 */ 4879 if (left < hpts_timeout) 4880 hpts_timeout = left; 4881 } 4882 if (hpts_timeout) { 4883 /* 4884 * Hack alert for now we can't time-out over 2,147,483 4885 * seconds (a bit more than 596 hours), which is probably ok 4886 * :). 4887 */ 4888 if (hpts_timeout > 0x7ffffffe) 4889 hpts_timeout = 0x7ffffffe; 4890 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 4891 } 4892 if ((rack->rc_gp_filled == 0) && 4893 (hpts_timeout < slot) && 4894 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 4895 /* 4896 * We have no good estimate yet for the 4897 * old clunky burst mitigation or the 4898 * real pacing. And the tlp or rxt is smaller 4899 * than the pacing calculation. Lets not 4900 * pace that long since we know the calculation 4901 * so far is not accurate. 4902 */ 4903 slot = hpts_timeout; 4904 } 4905 rack->r_ctl.last_pacing_time = slot; 4906 if (slot) { 4907 rack->r_ctl.rc_last_output_to = us_cts + slot; 4908 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4909 if ((rack->rc_gp_filled == 0) || 4910 rack->pacing_longer_than_rtt) { 4911 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 4912 } else { 4913 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4914 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 4915 (rack->r_rr_config != 3)) 4916 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4917 else 4918 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4919 } 4920 } 4921 if ((rack->use_rack_rr) && 4922 (rack->r_rr_config < 2) && 4923 ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { 4924 /* 4925 * Arrange for the hpts to kick back in after the 4926 * t-o if the t-o does not cause a send. 
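* Note the unit mix in this block: hpts_timeout is carried in
* millisecond ticks while slot is in microseconds, which is why the
* guard above scales hpts_timeout by HPTS_USEC_IN_MSEC and why the
* inserts below use HPTS_MS_TO_SLOTS() for the timeout but
* HPTS_USEC_TO_SLOTS() for the pacing slot.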
4927 */ 4928 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4929 __LINE__, &diag); 4930 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4931 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4932 } else { 4933 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 4934 __LINE__, &diag); 4935 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4936 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 4937 } 4938 } else if (hpts_timeout) { 4939 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4940 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 4941 /* For a rack timer, don't wake us */ 4942 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4943 if (rack->r_rr_config != 3) 4944 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4945 else 4946 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4947 } else { 4948 /* All other timers wake us up */ 4949 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 4950 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4951 } 4952 } 4953 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4954 __LINE__, &diag); 4955 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4956 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4957 } else { 4958 /* No timer starting */ 4959 #ifdef INVARIANTS 4960 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 4961 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 4962 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 4963 } 4964 #endif 4965 } 4966 rack->rc_tmr_stopped = 0; 4967 if (slot) 4968 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 4969 } 4970 4971 /* 4972 * RACK Timer, here we simply do logging and house keeping. 4973 * the normal rack_output() function will call the 4974 * appropriate thing to check if we need to do a RACK retransmit. 4975 * We return 1, saying don't proceed with rack_output only 4976 * when all timers have been stopped (destroyed PCB?). 4977 */ 4978 static int 4979 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4980 { 4981 /* 4982 * This timer simply provides an internal trigger to send out data. 4983 * The check_recovery_mode call will see if there are needed 4984 * retransmissions, if so we will enter fast-recovery. The output 4985 * call may or may not do the same thing depending on sysctl 4986 * settings. 4987 */ 4988 struct rack_sendmap *rsm; 4989 int32_t recovery; 4990 4991 if (tp->t_timers->tt_flags & TT_STOPPED) { 4992 return (1); 4993 } 4994 recovery = IN_RECOVERY(tp->t_flags); 4995 counter_u64_add(rack_to_tot, 1); 4996 if (rack->r_state && (rack->r_state != tp->t_state)) 4997 rack_set_state(tp, rack); 4998 rack->rc_on_min_to = 0; 4999 rsm = rack_check_recovery_mode(tp, cts); 5000 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5001 if (rsm) { 5002 uint32_t rtt; 5003 5004 rack->r_ctl.rc_resend = rsm; 5005 if (rack->use_rack_rr) { 5006 /* 5007 * Don't accumulate extra pacing delay 5008 * we are allowing the rack timer to 5009 * over-ride pacing i.e. rrr takes precedence 5010 * if the pacing interval is longer than the rrr 5011 * time (in other words we get the min pacing 5012 * time versus rrr pacing time). 
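* Concretely, setting r_timer_override and clearing PACE_PKT_OUTPUT
* just below lets rack_output() push the retransmission out right
* away instead of sitting out the remainder of the pacing slot.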
5013 */ 5014 rack->r_timer_override = 1; 5015 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5016 } 5017 rtt = rack->rc_rack_rtt; 5018 if (rtt == 0) 5019 rtt = 1; 5020 if (rack->rack_no_prr == 0) { 5021 if ((recovery == 0) && 5022 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5023 /* 5024 * The rack-timeout that enter's us into recovery 5025 * will force out one MSS and set us up so that we 5026 * can do one more send in 2*rtt (transitioning the 5027 * rack timeout into a rack-tlp). 5028 */ 5029 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5030 rack->r_timer_override = 1; 5031 rack_log_to_prr(rack, 3, 0); 5032 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 5033 rack->use_rack_rr) { 5034 /* 5035 * When a rack timer goes, if the rack rr is 5036 * on, arrange it so we can send a full segment 5037 * overriding prr (though we pay a price for this 5038 * for future new sends). 5039 */ 5040 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5041 rack_log_to_prr(rack, 4, 0); 5042 } 5043 } 5044 } 5045 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5046 if (rsm == NULL) { 5047 /* restart a timer and return 1 */ 5048 rack_start_hpts_timer(rack, tp, cts, 5049 0, 0, 0); 5050 return (1); 5051 } 5052 return (0); 5053 } 5054 5055 static __inline void 5056 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5057 struct rack_sendmap *rsm, uint32_t start) 5058 { 5059 int idx; 5060 5061 nrsm->r_start = start; 5062 nrsm->r_end = rsm->r_end; 5063 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5064 nrsm->r_flags = rsm->r_flags; 5065 nrsm->r_dupack = rsm->r_dupack; 5066 nrsm->usec_orig_send = rsm->usec_orig_send; 5067 nrsm->r_rtr_bytes = 0; 5068 rsm->r_end = nrsm->r_start; 5069 nrsm->r_just_ret = rsm->r_just_ret; 5070 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 5071 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 5072 } 5073 } 5074 5075 static struct rack_sendmap * 5076 rack_merge_rsm(struct tcp_rack *rack, 5077 struct rack_sendmap *l_rsm, 5078 struct rack_sendmap *r_rsm) 5079 { 5080 /* 5081 * We are merging two ack'd RSM's, 5082 * the l_rsm is on the left (lower seq 5083 * values) and the r_rsm is on the right 5084 * (higher seq value). The simplest way 5085 * to merge these is to move the right 5086 * one into the left. I don't think there 5087 * is any reason we need to try to find 5088 * the oldest (or last oldest retransmitted). 5089 */ 5090 struct rack_sendmap *rm; 5091 5092 l_rsm->r_end = r_rsm->r_end; 5093 if (l_rsm->r_dupack < r_rsm->r_dupack) 5094 l_rsm->r_dupack = r_rsm->r_dupack; 5095 if (r_rsm->r_rtr_bytes) 5096 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 5097 if (r_rsm->r_in_tmap) { 5098 /* This really should not happen */ 5099 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 5100 r_rsm->r_in_tmap = 0; 5101 } 5102 5103 /* Now the flags */ 5104 if (r_rsm->r_flags & RACK_HAS_FIN) 5105 l_rsm->r_flags |= RACK_HAS_FIN; 5106 if (r_rsm->r_flags & RACK_TLP) 5107 l_rsm->r_flags |= RACK_TLP; 5108 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 5109 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 5110 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 5111 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 5112 /* 5113 * If both are app-limited then let the 5114 * free lower the count. If right is app 5115 * limited and left is not, transfer. 
5116 */ 5117 l_rsm->r_flags |= RACK_APP_LIMITED; 5118 r_rsm->r_flags &= ~RACK_APP_LIMITED; 5119 if (r_rsm == rack->r_ctl.rc_first_appl) 5120 rack->r_ctl.rc_first_appl = l_rsm; 5121 } 5122 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 5123 #ifdef INVARIANTS 5124 if (rm != r_rsm) { 5125 panic("removing head in rack:%p rsm:%p rm:%p", 5126 rack, r_rsm, rm); 5127 } 5128 #endif 5129 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 5130 /* Transfer the split limit to the map we free */ 5131 r_rsm->r_limit_type = l_rsm->r_limit_type; 5132 l_rsm->r_limit_type = 0; 5133 } 5134 rack_free(rack, r_rsm); 5135 return(l_rsm); 5136 } 5137 5138 /* 5139 * TLP Timer, here we simply setup what segment we want to 5140 * have the TLP expire on, the normal rack_output() will then 5141 * send it out. 5142 * 5143 * We return 1, saying don't proceed with rack_output only 5144 * when all timers have been stopped (destroyed PCB?). 5145 */ 5146 static int 5147 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5148 { 5149 /* 5150 * Tail Loss Probe. 5151 */ 5152 struct rack_sendmap *rsm = NULL; 5153 struct rack_sendmap *insret; 5154 struct socket *so; 5155 uint32_t amm, old_prr_snd = 0; 5156 uint32_t out, avail; 5157 int collapsed_win = 0; 5158 5159 if (tp->t_timers->tt_flags & TT_STOPPED) { 5160 return (1); 5161 } 5162 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5163 /* Its not time yet */ 5164 return (0); 5165 } 5166 if (ctf_progress_timeout_check(tp, true)) { 5167 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5168 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5169 return (1); 5170 } 5171 /* 5172 * A TLP timer has expired. We have been idle for 2 rtts. So we now 5173 * need to figure out how to force a full MSS segment out. 5174 */ 5175 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 5176 counter_u64_add(rack_tlp_tot, 1); 5177 if (rack->r_state && (rack->r_state != tp->t_state)) 5178 rack_set_state(tp, rack); 5179 so = tp->t_inpcb->inp_socket; 5180 avail = sbavail(&so->so_snd); 5181 out = tp->snd_max - tp->snd_una; 5182 if (out > tp->snd_wnd) { 5183 /* special case, we need a retransmission */ 5184 collapsed_win = 1; 5185 goto need_retran; 5186 } 5187 /* 5188 * Check our send oldest always settings, and if 5189 * there is an oldest to send jump to the need_retran. 5190 */ 5191 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 5192 goto need_retran; 5193 5194 if (avail > out) { 5195 /* New data is available */ 5196 amm = avail - out; 5197 if (amm > ctf_fixed_maxseg(tp)) { 5198 amm = ctf_fixed_maxseg(tp); 5199 if ((amm + out) > tp->snd_wnd) { 5200 /* We are rwnd limited */ 5201 goto need_retran; 5202 } 5203 } else if (amm < ctf_fixed_maxseg(tp)) { 5204 /* not enough to fill a MTU */ 5205 goto need_retran; 5206 } 5207 if (IN_RECOVERY(tp->t_flags)) { 5208 /* Unlikely */ 5209 if (rack->rack_no_prr == 0) { 5210 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 5211 if (out + amm <= tp->snd_wnd) { 5212 rack->r_ctl.rc_prr_sndcnt = amm; 5213 rack_log_to_prr(rack, 4, 0); 5214 } 5215 } else 5216 goto need_retran; 5217 } else { 5218 /* Set the send-new override */ 5219 if (out + amm <= tp->snd_wnd) 5220 rack->r_ctl.rc_tlp_new_data = amm; 5221 else 5222 goto need_retran; 5223 } 5224 rack->r_ctl.rc_tlpsend = NULL; 5225 counter_u64_add(rack_tlp_newdata, 1); 5226 goto send; 5227 } 5228 need_retran: 5229 /* 5230 * Ok we need to arrange the last un-acked segment to be re-sent, or 5231 * optionally the first un-acked segment. 
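* As implemented below: with rack_always_send_oldest set the probe
* comes from the head of the time-ordered tmap (the oldest un-acked
* send); otherwise it is the highest-sequence entry of the rb-tree,
* stepping back past ACKED/FIN entries via rack_find_high_nonack().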
5232 */ 5233 if (collapsed_win == 0) { 5234 if (rack_always_send_oldest) 5235 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5236 else { 5237 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5238 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 5239 rsm = rack_find_high_nonack(rack, rsm); 5240 } 5241 } 5242 if (rsm == NULL) { 5243 counter_u64_add(rack_tlp_does_nada, 1); 5244 #ifdef TCP_BLACKBOX 5245 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5246 #endif 5247 goto out; 5248 } 5249 } else { 5250 /* 5251 * We must find the last segment 5252 * that was acceptable by the client. 5253 */ 5254 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5255 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 5256 /* Found one */ 5257 break; 5258 } 5259 } 5260 if (rsm == NULL) { 5261 /* None? if so send the first */ 5262 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5263 if (rsm == NULL) { 5264 counter_u64_add(rack_tlp_does_nada, 1); 5265 #ifdef TCP_BLACKBOX 5266 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5267 #endif 5268 goto out; 5269 } 5270 } 5271 } 5272 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 5273 /* 5274 * We need to split this the last segment in two. 5275 */ 5276 struct rack_sendmap *nrsm; 5277 5278 nrsm = rack_alloc_full_limit(rack); 5279 if (nrsm == NULL) { 5280 /* 5281 * No memory to split, we will just exit and punt 5282 * off to the RXT timer. 5283 */ 5284 counter_u64_add(rack_tlp_does_nada, 1); 5285 goto out; 5286 } 5287 rack_clone_rsm(rack, nrsm, rsm, 5288 (rsm->r_end - ctf_fixed_maxseg(tp))); 5289 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5290 #ifdef INVARIANTS 5291 if (insret != NULL) { 5292 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5293 nrsm, insret, rack, rsm); 5294 } 5295 #endif 5296 if (rsm->r_in_tmap) { 5297 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5298 nrsm->r_in_tmap = 1; 5299 } 5300 rsm->r_flags &= (~RACK_HAS_FIN); 5301 rsm = nrsm; 5302 } 5303 rack->r_ctl.rc_tlpsend = rsm; 5304 send: 5305 rack->r_timer_override = 1; 5306 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5307 return (0); 5308 out: 5309 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5310 return (0); 5311 } 5312 5313 /* 5314 * Delayed ack Timer, here we simply need to setup the 5315 * ACK_NOW flag and remove the DELACK flag. From there 5316 * the output routine will send the ack out. 5317 * 5318 * We only return 1, saying don't proceed, if all timers 5319 * are stopped (destroyed PCB?). 5320 */ 5321 static int 5322 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5323 { 5324 if (tp->t_timers->tt_flags & TT_STOPPED) { 5325 return (1); 5326 } 5327 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 5328 tp->t_flags &= ~TF_DELACK; 5329 tp->t_flags |= TF_ACKNOW; 5330 KMOD_TCPSTAT_INC(tcps_delack); 5331 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5332 return (0); 5333 } 5334 5335 /* 5336 * Persists timer, here we simply send the 5337 * same thing as a keepalive will. 5338 * the one byte send. 5339 * 5340 * We only return 1, saying don't proceed, if all timers 5341 * are stopped (destroyed PCB?). 
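* (The probe built below with tcpip_maketemplate()/tcp_respond() is
* actually a zero-length segment sent with SEQ = snd_una - 1, which
* lies outside the peer's receive window and therefore forces it to
* answer with an ACK, just as the keep-alive path does.)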
5342 */ 5343 static int 5344 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5345 { 5346 struct tcptemp *t_template; 5347 struct inpcb *inp; 5348 int32_t retval = 1; 5349 5350 inp = tp->t_inpcb; 5351 5352 if (tp->t_timers->tt_flags & TT_STOPPED) { 5353 return (1); 5354 } 5355 if (rack->rc_in_persist == 0) 5356 return (0); 5357 if (ctf_progress_timeout_check(tp, false)) { 5358 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5359 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5360 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5361 return (1); 5362 } 5363 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 5364 /* 5365 * Persistence timer into zero window. Force a byte to be output, if 5366 * possible. 5367 */ 5368 KMOD_TCPSTAT_INC(tcps_persisttimeo); 5369 /* 5370 * Hack: if the peer is dead/unreachable, we do not time out if the 5371 * window is closed. After a full backoff, drop the connection if 5372 * the idle time (no responses to probes) reaches the maximum 5373 * backoff that we would use if retransmitting. 5374 */ 5375 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 5376 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 5377 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 5378 KMOD_TCPSTAT_INC(tcps_persistdrop); 5379 retval = 1; 5380 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5381 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5382 goto out; 5383 } 5384 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 5385 tp->snd_una == tp->snd_max) 5386 rack_exit_persist(tp, rack, cts); 5387 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 5388 /* 5389 * If the user has closed the socket then drop a persisting 5390 * connection after a much reduced timeout. 5391 */ 5392 if (tp->t_state > TCPS_CLOSE_WAIT && 5393 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 5394 retval = 1; 5395 KMOD_TCPSTAT_INC(tcps_persistdrop); 5396 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5397 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5398 goto out; 5399 } 5400 t_template = tcpip_maketemplate(rack->rc_inp); 5401 if (t_template) { 5402 /* only set it if we were answered */ 5403 if (rack->forced_ack == 0) { 5404 rack->forced_ack = 1; 5405 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5406 } 5407 tcp_respond(tp, t_template->tt_ipgen, 5408 &t_template->tt_t, (struct mbuf *)NULL, 5409 tp->rcv_nxt, tp->snd_una - 1, 0); 5410 /* This sends an ack */ 5411 if (tp->t_flags & TF_DELACK) 5412 tp->t_flags &= ~TF_DELACK; 5413 free(t_template, M_TEMP); 5414 } 5415 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5416 tp->t_rxtshift++; 5417 out: 5418 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 5419 rack_start_hpts_timer(rack, tp, cts, 5420 0, 0, 0); 5421 return (retval); 5422 } 5423 5424 /* 5425 * If a keepalive goes off, we had no other timers 5426 * happening. We always return 1 here since this 5427 * routine either drops the connection or sends 5428 * out a segment with respond. 5429 */ 5430 static int 5431 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5432 { 5433 struct tcptemp *t_template; 5434 struct inpcb *inp; 5435 5436 if (tp->t_timers->tt_flags & TT_STOPPED) { 5437 return (1); 5438 } 5439 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 5440 inp = tp->t_inpcb; 5441 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 5442 /* 5443 * Keep-alive timer went off; send something or drop connection if 5444 * idle for too long. 
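* The "too long" test below is TP_KEEPIDLE(tp) + TP_MAXIDLE(tp) of
* idle time; assuming the stock defaults are untouched (2 hours of
* idle plus 8 probes at 75 second intervals) that works out to
* roughly 2 hours and 10 minutes.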
5445 */ 5446 KMOD_TCPSTAT_INC(tcps_keeptimeo); 5447 if (tp->t_state < TCPS_ESTABLISHED) 5448 goto dropit; 5449 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5450 tp->t_state <= TCPS_CLOSING) { 5451 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 5452 goto dropit; 5453 /* 5454 * Send a packet designed to force a response if the peer is 5455 * up and reachable: either an ACK if the connection is 5456 * still alive, or an RST if the peer has closed the 5457 * connection due to timeout or reboot. Using sequence 5458 * number tp->snd_una-1 causes the transmitted zero-length 5459 * segment to lie outside the receive window; by the 5460 * protocol spec, this requires the correspondent TCP to 5461 * respond. 5462 */ 5463 KMOD_TCPSTAT_INC(tcps_keepprobe); 5464 t_template = tcpip_maketemplate(inp); 5465 if (t_template) { 5466 if (rack->forced_ack == 0) { 5467 rack->forced_ack = 1; 5468 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5469 } 5470 tcp_respond(tp, t_template->tt_ipgen, 5471 &t_template->tt_t, (struct mbuf *)NULL, 5472 tp->rcv_nxt, tp->snd_una - 1, 0); 5473 free(t_template, M_TEMP); 5474 } 5475 } 5476 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 5477 return (1); 5478 dropit: 5479 KMOD_TCPSTAT_INC(tcps_keepdrops); 5480 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 5481 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5482 return (1); 5483 } 5484 5485 /* 5486 * Retransmit helper function, clear up all the ack 5487 * flags and take care of important book keeping. 5488 */ 5489 static void 5490 rack_remxt_tmr(struct tcpcb *tp) 5491 { 5492 /* 5493 * The retransmit timer went off, all sack'd blocks must be 5494 * un-acked. 5495 */ 5496 struct rack_sendmap *rsm, *trsm = NULL; 5497 struct tcp_rack *rack; 5498 int32_t cnt = 0; 5499 5500 rack = (struct tcp_rack *)tp->t_fb_ptr; 5501 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 5502 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 5503 if (rack->r_state && (rack->r_state != tp->t_state)) 5504 rack_set_state(tp, rack); 5505 /* 5506 * Ideally we would like to be able to 5507 * mark SACK-PASS on anything not acked here. 5508 * However, if we do that we would burst out 5509 * all that data 1ms apart. This would be unwise, 5510 * so for now we will just let the normal rxt timer 5511 * and tlp timer take care of it. 
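* What the loop below does instead is strip the ACKED/SACK state
* from every entry and re-thread the formerly ACKed ones back onto
* the tmap in sequence order, so everything becomes eligible for
* retransmission again under the normal timers and pacing.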
5512 */ 5513 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5514 if (rsm->r_flags & RACK_ACKED) { 5515 cnt++; 5516 rsm->r_dupack = 0; 5517 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5518 if (rsm->r_in_tmap == 0) { 5519 /* We must re-add it back to the tlist */ 5520 if (trsm == NULL) { 5521 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5522 } else { 5523 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 5524 } 5525 rsm->r_in_tmap = 1; 5526 } 5527 } 5528 trsm = rsm; 5529 if (rsm->r_flags & RACK_ACKED) 5530 rsm->r_flags |= RACK_WAS_ACKED; 5531 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 5532 } 5533 /* Clear the count (we just un-acked them) */ 5534 rack->r_ctl.rc_sacked = 0; 5535 rack->r_ctl.rc_agg_delayed = 0; 5536 rack->r_early = 0; 5537 rack->r_ctl.rc_agg_early = 0; 5538 rack->r_late = 0; 5539 /* Clear the tlp rtx mark */ 5540 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5541 rack->r_ctl.rc_prr_sndcnt = 0; 5542 rack_log_to_prr(rack, 6, 0); 5543 rack->r_timer_override = 1; 5544 } 5545 5546 static void 5547 rack_cc_conn_init(struct tcpcb *tp) 5548 { 5549 struct tcp_rack *rack; 5550 5551 rack = (struct tcp_rack *)tp->t_fb_ptr; 5552 cc_conn_init(tp); 5553 /* 5554 * We want a chance to stay in slowstart as 5555 * we create a connection. TCP spec says that 5556 * initially ssthresh is infinite. For our 5557 * purposes that is the snd_wnd. 5558 */ 5559 if (tp->snd_ssthresh < tp->snd_wnd) { 5560 tp->snd_ssthresh = tp->snd_wnd; 5561 } 5562 /* 5563 * We also want to assure a IW worth of 5564 * data can get inflight. 5565 */ 5566 if (rc_init_window(rack) < tp->snd_cwnd) 5567 tp->snd_cwnd = rc_init_window(rack); 5568 } 5569 5570 /* 5571 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 5572 * we will setup to retransmit the lowest seq number outstanding. 5573 */ 5574 static int 5575 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5576 { 5577 int32_t rexmt; 5578 struct inpcb *inp; 5579 int32_t retval = 0; 5580 bool isipv6; 5581 5582 inp = tp->t_inpcb; 5583 if (tp->t_timers->tt_flags & TT_STOPPED) { 5584 return (1); 5585 } 5586 if (ctf_progress_timeout_check(tp, false)) { 5587 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5588 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5589 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5590 return (1); 5591 } 5592 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 5593 if (TCPS_HAVEESTABLISHED(tp->t_state) && 5594 (tp->snd_una == tp->snd_max)) { 5595 /* Nothing outstanding .. nothing to do */ 5596 return (0); 5597 } 5598 /* 5599 * Retransmission timer went off. Message has not been acked within 5600 * retransmit interval. Back off to a longer retransmit interval 5601 * and retransmit one segment. 5602 */ 5603 rack_remxt_tmr(tp); 5604 if ((rack->r_ctl.rc_resend == NULL) || 5605 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 5606 /* 5607 * If the rwnd collapsed on 5608 * the one we are retransmitting 5609 * it does not count against the 5610 * rxt count. 5611 */ 5612 tp->t_rxtshift++; 5613 } 5614 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5615 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5616 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5617 retval = 1; 5618 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5619 tcp_set_inp_to_drop(rack->rc_inp, 5620 (tp->t_softerror ? 
(uint16_t) tp->t_softerror : ETIMEDOUT)); 5621 goto out; 5622 } 5623 if (tp->t_state == TCPS_SYN_SENT) { 5624 /* 5625 * If the SYN was retransmitted, indicate CWND to be limited 5626 * to 1 segment in cc_conn_init(). 5627 */ 5628 tp->snd_cwnd = 1; 5629 } else if (tp->t_rxtshift == 1) { 5630 /* 5631 * first retransmit; record ssthresh and cwnd so they can be 5632 * recovered if this turns out to be a "bad" retransmit. A 5633 * retransmit is considered "bad" if an ACK for this segment 5634 * is received within RTT/2 interval; the assumption here is 5635 * that the ACK was already in flight. See "On Estimating 5636 * End-to-End Network Path Properties" by Allman and Paxson 5637 * for more details. 5638 */ 5639 tp->snd_cwnd_prev = tp->snd_cwnd; 5640 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5641 tp->snd_recover_prev = tp->snd_recover; 5642 if (IN_FASTRECOVERY(tp->t_flags)) 5643 tp->t_flags |= TF_WASFRECOVERY; 5644 else 5645 tp->t_flags &= ~TF_WASFRECOVERY; 5646 if (IN_CONGRECOVERY(tp->t_flags)) 5647 tp->t_flags |= TF_WASCRECOVERY; 5648 else 5649 tp->t_flags &= ~TF_WASCRECOVERY; 5650 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5651 tp->t_flags |= TF_PREVVALID; 5652 } else 5653 tp->t_flags &= ~TF_PREVVALID; 5654 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5655 if ((tp->t_state == TCPS_SYN_SENT) || 5656 (tp->t_state == TCPS_SYN_RECEIVED)) 5657 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 5658 else 5659 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5660 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5661 max(MSEC_2_TICKS(rack_rto_min), rexmt), 5662 MSEC_2_TICKS(rack_rto_max)); 5663 /* 5664 * We enter the path for PLMTUD if connection is established or, if 5665 * connection is FIN_WAIT_1 status, reason for the last is that if 5666 * amount of data we send is very small, we could send it in couple 5667 * of packets and process straight to FIN. In that case we won't 5668 * catch ESTABLISHED state. 5669 */ 5670 #ifdef INET6 5671 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5672 #else 5673 isipv6 = false; 5674 #endif 5675 if (((V_tcp_pmtud_blackhole_detect == 1) || 5676 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5677 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5678 ((tp->t_state == TCPS_ESTABLISHED) || 5679 (tp->t_state == TCPS_FIN_WAIT_1))) { 5680 /* 5681 * Idea here is that at each stage of mtu probe (usually, 5682 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5683 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5684 * should take care of that. 5685 */ 5686 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5687 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5688 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5689 tp->t_rxtshift % 2 == 0)) { 5690 /* 5691 * Enter Path MTU Black-hole Detection mechanism: - 5692 * Disable Path MTU Discovery (IP "DF" bit). - 5693 * Reduce MTU to lower value than what we negotiated 5694 * with peer. 5695 */ 5696 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5697 /* Record that we may have found a black hole. */ 5698 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5699 /* Keep track of previous MSS. */ 5700 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5701 } 5702 5703 /* 5704 * Reduce the MSS to blackhole value or to the 5705 * default in an attempt to retransmit. 5706 */ 5707 #ifdef INET6 5708 if (isipv6 && 5709 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5710 /* Use the sysctl tuneable blackhole MSS. 
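* (V_tcp_v6pmtud_blackhole_mss; commonly 1220 bytes unless the
* administrator has tuned the sysctl.)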
*/ 5711 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5712 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5713 } else if (isipv6) { 5714 /* Use the default MSS. */ 5715 tp->t_maxseg = V_tcp_v6mssdflt; 5716 /* 5717 * Disable Path MTU Discovery when we switch 5718 * to minmss. 5719 */ 5720 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5721 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5722 } 5723 #endif 5724 #if defined(INET6) && defined(INET) 5725 else 5726 #endif 5727 #ifdef INET 5728 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5729 /* Use the sysctl tuneable blackhole MSS. */ 5730 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5731 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5732 } else { 5733 /* Use the default MSS. */ 5734 tp->t_maxseg = V_tcp_mssdflt; 5735 /* 5736 * Disable Path MTU Discovery when we switch 5737 * to minmss. 5738 */ 5739 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5740 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5741 } 5742 #endif 5743 } else { 5744 /* 5745 * If further retransmissions are still unsuccessful 5746 * with a lowered MTU, maybe this isn't a blackhole 5747 * and we restore the previous MSS and blackhole 5748 * detection flags. The limit '6' is determined by 5749 * giving each probe stage (1448, 1188, 524) 2 5750 * chances to recover. 5751 */ 5752 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5753 (tp->t_rxtshift >= 6)) { 5754 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5755 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5756 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5757 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5758 } 5759 } 5760 } 5761 /* 5762 * If we backed off this far, our srtt estimate is probably bogus. 5763 * Clobber it so we'll take the next rtt measurement as our srtt; 5764 * move the current srtt into rttvar to keep the current retransmit 5765 * times until then. 5766 */ 5767 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5768 #ifdef INET6 5769 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 5770 in6_losing(tp->t_inpcb); 5771 else 5772 #endif 5773 in_losing(tp->t_inpcb); 5774 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5775 tp->t_srtt = 0; 5776 } 5777 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5778 tp->snd_recover = tp->snd_max; 5779 tp->t_flags |= TF_ACKNOW; 5780 tp->t_rtttime = 0; 5781 rack_cong_signal(tp, NULL, CC_RTO); 5782 out: 5783 return (retval); 5784 } 5785 5786 static int 5787 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 5788 { 5789 int32_t ret = 0; 5790 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5791 5792 if (timers == 0) { 5793 return (0); 5794 } 5795 if (tp->t_state == TCPS_LISTEN) { 5796 /* no timers on listen sockets */ 5797 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5798 return (0); 5799 return (1); 5800 } 5801 if ((timers & PACE_TMR_RACK) && 5802 rack->rc_on_min_to) { 5803 /* 5804 * For the rack timer when we 5805 * are on a min-timeout (which means rrr_conf = 3) 5806 * we don't want to check the timer. It may 5807 * be going off for a pace and thats ok we 5808 * want to send the retransmit (if its ready). 5809 * 5810 * If its on a normal rack timer (non-min) then 5811 * we will check if its expired. 
5812 */ 5813 goto skip_time_check; 5814 } 5815 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5816 uint32_t left; 5817 5818 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5819 ret = -1; 5820 rack_log_to_processing(rack, cts, ret, 0); 5821 return (0); 5822 } 5823 if (hpts_calling == 0) { 5824 /* 5825 * A user send or queued mbuf (sack) has called us? We 5826 * return 0 and let the pacing guards 5827 * deal with it if they should or 5828 * should not cause a send. 5829 */ 5830 ret = -2; 5831 rack_log_to_processing(rack, cts, ret, 0); 5832 return (0); 5833 } 5834 /* 5835 * Ok our timer went off early and we are not paced false 5836 * alarm, go back to sleep. 5837 */ 5838 ret = -3; 5839 left = rack->r_ctl.rc_timer_exp - cts; 5840 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 5841 rack_log_to_processing(rack, cts, ret, left); 5842 return (1); 5843 } 5844 skip_time_check: 5845 rack->rc_tmr_stopped = 0; 5846 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5847 if (timers & PACE_TMR_DELACK) { 5848 ret = rack_timeout_delack(tp, rack, cts); 5849 } else if (timers & PACE_TMR_RACK) { 5850 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5851 ret = rack_timeout_rack(tp, rack, cts); 5852 } else if (timers & PACE_TMR_TLP) { 5853 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5854 ret = rack_timeout_tlp(tp, rack, cts); 5855 } else if (timers & PACE_TMR_RXT) { 5856 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5857 ret = rack_timeout_rxt(tp, rack, cts); 5858 } else if (timers & PACE_TMR_PERSIT) { 5859 ret = rack_timeout_persist(tp, rack, cts); 5860 } else if (timers & PACE_TMR_KEEP) { 5861 ret = rack_timeout_keepalive(tp, rack, cts); 5862 } 5863 rack_log_to_processing(rack, cts, ret, timers); 5864 return (ret); 5865 } 5866 5867 static void 5868 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 5869 { 5870 struct timeval tv; 5871 uint32_t us_cts, flags_on_entry; 5872 uint8_t hpts_removed = 0; 5873 5874 flags_on_entry = rack->r_ctl.rc_hpts_flags; 5875 us_cts = tcp_get_usecs(&tv); 5876 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 5877 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 5878 ((tp->snd_max - tp->snd_una) == 0))) { 5879 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5880 hpts_removed = 1; 5881 /* If we were not delayed cancel out the flag. */ 5882 if ((tp->snd_max - tp->snd_una) == 0) 5883 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5884 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5885 } 5886 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5887 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5888 if (rack->rc_inp->inp_in_hpts && 5889 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 5890 /* 5891 * Canceling timer's when we have no output being 5892 * paced. We also must remove ourselves from the 5893 * hpts. 
5894 */ 5895 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5896 hpts_removed = 1; 5897 } 5898 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 5899 } 5900 if (hpts_removed == 0) 5901 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5902 } 5903 5904 static void 5905 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 5906 { 5907 return; 5908 } 5909 5910 static int 5911 rack_stopall(struct tcpcb *tp) 5912 { 5913 struct tcp_rack *rack; 5914 rack = (struct tcp_rack *)tp->t_fb_ptr; 5915 rack->t_timers_stopped = 1; 5916 return (0); 5917 } 5918 5919 static void 5920 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 5921 { 5922 return; 5923 } 5924 5925 static int 5926 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 5927 { 5928 return (0); 5929 } 5930 5931 static void 5932 rack_stop_all_timers(struct tcpcb *tp) 5933 { 5934 struct tcp_rack *rack; 5935 5936 /* 5937 * Assure no timers are running. 5938 */ 5939 if (tcp_timer_active(tp, TT_PERSIST)) { 5940 /* We enter in persists, set the flag appropriately */ 5941 rack = (struct tcp_rack *)tp->t_fb_ptr; 5942 rack->rc_in_persist = 1; 5943 } 5944 tcp_timer_suspend(tp, TT_PERSIST); 5945 tcp_timer_suspend(tp, TT_REXMT); 5946 tcp_timer_suspend(tp, TT_KEEP); 5947 tcp_timer_suspend(tp, TT_DELACK); 5948 } 5949 5950 static void 5951 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 5952 struct rack_sendmap *rsm, uint32_t ts) 5953 { 5954 int32_t idx; 5955 5956 rsm->r_rtr_cnt++; 5957 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5958 rsm->r_dupack = 0; 5959 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 5960 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 5961 rsm->r_flags |= RACK_OVERMAX; 5962 } 5963 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 5964 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 5965 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 5966 } 5967 idx = rsm->r_rtr_cnt - 1; 5968 rsm->r_tim_lastsent[idx] = ts; 5969 if (rsm->r_flags & RACK_ACKED) { 5970 /* Problably MTU discovery messing with us */ 5971 rsm->r_flags &= ~RACK_ACKED; 5972 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 5973 } 5974 if (rsm->r_in_tmap) { 5975 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5976 rsm->r_in_tmap = 0; 5977 } 5978 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5979 rsm->r_in_tmap = 1; 5980 if (rsm->r_flags & RACK_SACK_PASSED) { 5981 /* We have retransmitted due to the SACK pass */ 5982 rsm->r_flags &= ~RACK_SACK_PASSED; 5983 rsm->r_flags |= RACK_WAS_SACKPASS; 5984 } 5985 } 5986 5987 static uint32_t 5988 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 5989 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) 5990 { 5991 /* 5992 * We (re-)transmitted starting at rsm->r_start for some length 5993 * (possibly less than r_end. 5994 */ 5995 struct rack_sendmap *nrsm, *insret; 5996 uint32_t c_end; 5997 int32_t len; 5998 5999 len = *lenp; 6000 c_end = rsm->r_start + len; 6001 if (SEQ_GEQ(c_end, rsm->r_end)) { 6002 /* 6003 * We retransmitted the whole piece or more than the whole 6004 * slopping into the next rsm. 6005 */ 6006 rack_update_rsm(tp, rack, rsm, ts); 6007 if (c_end == rsm->r_end) { 6008 *lenp = 0; 6009 return (0); 6010 } else { 6011 int32_t act_len; 6012 6013 /* Hangs over the end return whats left */ 6014 act_len = rsm->r_end - rsm->r_start; 6015 *lenp = (len - act_len); 6016 return (rsm->r_end); 6017 } 6018 /* We don't get out of this block. 
*/ 6019 } 6020 /* 6021 * Here we retransmitted less than the whole thing which means we 6022 * have to split this into what was transmitted and what was not. 6023 */ 6024 nrsm = rack_alloc_full_limit(rack); 6025 if (nrsm == NULL) { 6026 /* 6027 * We can't get memory, so lets not proceed. 6028 */ 6029 *lenp = 0; 6030 return (0); 6031 } 6032 /* 6033 * So here we are going to take the original rsm and make it what we 6034 * retransmitted. nrsm will be the tail portion we did not 6035 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 6036 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 6037 * 1, 6 and the new piece will be 6, 11. 6038 */ 6039 rack_clone_rsm(rack, nrsm, rsm, c_end); 6040 nrsm->r_dupack = 0; 6041 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 6042 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6043 #ifdef INVARIANTS 6044 if (insret != NULL) { 6045 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6046 nrsm, insret, rack, rsm); 6047 } 6048 #endif 6049 if (rsm->r_in_tmap) { 6050 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6051 nrsm->r_in_tmap = 1; 6052 } 6053 rsm->r_flags &= (~RACK_HAS_FIN); 6054 rack_update_rsm(tp, rack, rsm, ts); 6055 *lenp = 0; 6056 return (0); 6057 } 6058 6059 static void 6060 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 6061 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 6062 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) 6063 { 6064 struct tcp_rack *rack; 6065 struct rack_sendmap *rsm, *nrsm, *insret, fe; 6066 register uint32_t snd_max, snd_una; 6067 6068 /* 6069 * Add to the RACK log of packets in flight or retransmitted. If 6070 * there is a TS option we will use the TS echoed, if not we will 6071 * grab a TS. 6072 * 6073 * Retransmissions will increment the count and move the ts to its 6074 * proper place. Note that if options do not include TS's then we 6075 * won't be able to effectively use the ACK for an RTT on a retran. 6076 * 6077 * Notes about r_start and r_end. Lets consider a send starting at 6078 * sequence 1 for 10 bytes. In such an example the r_start would be 6079 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 6080 * This means that r_end is actually the first sequence for the next 6081 * slot (11). 6082 * 6083 */ 6084 /* 6085 * If err is set what do we do XXXrrs? should we not add the thing? 6086 * -- i.e. return if err != 0 or should we pretend we sent it? -- 6087 * i.e. proceed with add ** do this for now. 6088 */ 6089 INP_WLOCK_ASSERT(tp->t_inpcb); 6090 if (err) 6091 /* 6092 * We don't log errors -- we could but snd_max does not 6093 * advance in this case either. 6094 */ 6095 return; 6096 6097 if (th_flags & TH_RST) { 6098 /* 6099 * We don't log resets and we return immediately from 6100 * sending 6101 */ 6102 return; 6103 } 6104 rack = (struct tcp_rack *)tp->t_fb_ptr; 6105 snd_una = tp->snd_una; 6106 if (SEQ_LEQ((seq_out + len), snd_una)) { 6107 /* Are sending an old segment to induce an ack (keep-alive)? */ 6108 return; 6109 } 6110 if (SEQ_LT(seq_out, snd_una)) { 6111 /* huh? should we panic? */ 6112 uint32_t end; 6113 6114 end = seq_out + len; 6115 seq_out = snd_una; 6116 if (SEQ_GEQ(end, seq_out)) 6117 len = end - seq_out; 6118 else 6119 len = 0; 6120 } 6121 snd_max = tp->snd_max; 6122 if (th_flags & (TH_SYN | TH_FIN)) { 6123 /* 6124 * The call to rack_log_output is made before bumping 6125 * snd_max. 
This means we can record one extra byte on a SYN 6126 * or FIN if seq_out is adding more on and a FIN is present 6127 * (and we are not resending). 6128 */ 6129 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 6130 len++; 6131 if (th_flags & TH_FIN) 6132 len++; 6133 if (SEQ_LT(snd_max, tp->snd_nxt)) { 6134 /* 6135 * The add/update as not been done for the FIN/SYN 6136 * yet. 6137 */ 6138 snd_max = tp->snd_nxt; 6139 } 6140 } 6141 if (len == 0) { 6142 /* We don't log zero window probes */ 6143 return; 6144 } 6145 rack->r_ctl.rc_time_last_sent = ts; 6146 if (IN_RECOVERY(tp->t_flags)) { 6147 rack->r_ctl.rc_prr_out += len; 6148 } 6149 /* First question is it a retransmission or new? */ 6150 if (seq_out == snd_max) { 6151 /* Its new */ 6152 again: 6153 rsm = rack_alloc(rack); 6154 if (rsm == NULL) { 6155 /* 6156 * Hmm out of memory and the tcb got destroyed while 6157 * we tried to wait. 6158 */ 6159 return; 6160 } 6161 if (th_flags & TH_FIN) { 6162 rsm->r_flags = RACK_HAS_FIN; 6163 } else { 6164 rsm->r_flags = 0; 6165 } 6166 rsm->r_tim_lastsent[0] = ts; 6167 rsm->r_rtr_cnt = 1; 6168 rsm->r_rtr_bytes = 0; 6169 rsm->usec_orig_send = us_cts; 6170 if (th_flags & TH_SYN) { 6171 /* The data space is one beyond snd_una */ 6172 rsm->r_flags |= RACK_HAS_SIN; 6173 rsm->r_start = seq_out + 1; 6174 rsm->r_end = rsm->r_start + (len - 1); 6175 } else { 6176 /* Normal case */ 6177 rsm->r_start = seq_out; 6178 rsm->r_end = rsm->r_start + len; 6179 } 6180 rsm->r_dupack = 0; 6181 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6182 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6183 #ifdef INVARIANTS 6184 if (insret != NULL) { 6185 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6186 nrsm, insret, rack, rsm); 6187 } 6188 #endif 6189 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6190 rsm->r_in_tmap = 1; 6191 /* 6192 * Special case detection, is there just a single 6193 * packet outstanding when we are not in recovery? 6194 * 6195 * If this is true mark it so. 6196 */ 6197 if ((IN_RECOVERY(tp->t_flags) == 0) && 6198 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 6199 struct rack_sendmap *prsm; 6200 6201 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6202 if (prsm) 6203 prsm->r_one_out_nr = 1; 6204 } 6205 return; 6206 } 6207 /* 6208 * If we reach here its a retransmission and we need to find it. 6209 */ 6210 memset(&fe, 0, sizeof(fe)); 6211 more: 6212 if (hintrsm && (hintrsm->r_start == seq_out)) { 6213 rsm = hintrsm; 6214 hintrsm = NULL; 6215 } else { 6216 /* No hints sorry */ 6217 rsm = NULL; 6218 } 6219 if ((rsm) && (rsm->r_start == seq_out)) { 6220 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6221 if (len == 0) { 6222 return; 6223 } else { 6224 goto more; 6225 } 6226 } 6227 /* Ok it was not the last pointer go through it the hard way. 
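* The hard way is an rb-tree lookup keyed off the on-stack fe with
* fe.r_start = seq_out; if the entry found starts exactly at seq_out
* we update it in place, and if it merely contains seq_out we first
* split it with rack_clone_rsm() so the retransmitted span is
* accounted for separately.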
*/ 6228 refind: 6229 fe.r_start = seq_out; 6230 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6231 if (rsm) { 6232 if (rsm->r_start == seq_out) { 6233 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6234 if (len == 0) { 6235 return; 6236 } else { 6237 goto refind; 6238 } 6239 } 6240 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6241 /* Transmitted within this piece */ 6242 /* 6243 * Ok we must split off the front and then let the 6244 * update do the rest 6245 */ 6246 nrsm = rack_alloc_full_limit(rack); 6247 if (nrsm == NULL) { 6248 rack_update_rsm(tp, rack, rsm, ts); 6249 return; 6250 } 6251 /* 6252 * copy rsm to nrsm and then trim the front of rsm 6253 * to not include this part. 6254 */ 6255 rack_clone_rsm(rack, nrsm, rsm, seq_out); 6256 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6257 #ifdef INVARIANTS 6258 if (insret != NULL) { 6259 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6260 nrsm, insret, rack, rsm); 6261 } 6262 #endif 6263 if (rsm->r_in_tmap) { 6264 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6265 nrsm->r_in_tmap = 1; 6266 } 6267 rsm->r_flags &= (~RACK_HAS_FIN); 6268 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 6269 if (len == 0) { 6270 return; 6271 } else if (len > 0) 6272 goto refind; 6273 } 6274 } 6275 /* 6276 * Hmm not found in map did they retransmit both old and on into the 6277 * new? 6278 */ 6279 if (seq_out == tp->snd_max) { 6280 goto again; 6281 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6282 #ifdef INVARIANTS 6283 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6284 seq_out, len, tp->snd_una, tp->snd_max); 6285 printf("Starting Dump of all rack entries\n"); 6286 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6287 printf("rsm:%p start:%u end:%u\n", 6288 rsm, rsm->r_start, rsm->r_end); 6289 } 6290 printf("Dump complete\n"); 6291 panic("seq_out not found rack:%p tp:%p", 6292 rack, tp); 6293 #endif 6294 } else { 6295 #ifdef INVARIANTS 6296 /* 6297 * Hmm beyond sndmax? (only if we are using the new rtt-pack 6298 * flag) 6299 */ 6300 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6301 seq_out, len, tp->snd_max, tp); 6302 #endif 6303 } 6304 } 6305 6306 /* 6307 * Record one of the RTT updates from an ack into 6308 * our sample structure. 6309 */ 6310 6311 static void 6312 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 6313 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 6314 { 6315 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6316 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 6317 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 6318 } 6319 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6320 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 6321 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 6322 } 6323 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 6324 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 6325 rack->r_ctl.rc_gp_lowrtt = us_rtt; 6326 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 6327 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 6328 } 6329 if ((confidence == 1) && 6330 ((rsm == NULL) || 6331 (rsm->r_just_ret) || 6332 (rsm->r_one_out_nr && 6333 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 6334 /* 6335 * If the rsm had a just return 6336 * hit it then we can't trust the 6337 * rtt measurement for buffer deterimination 6338 * Note that a confidence of 2, indicates 6339 * SACK'd which overrides the r_just_ret or 6340 * the r_one_out_nr. 
If it was a CUM-ACK and 6341 * we had only two outstanding, but get an 6342 * ack for only 1. Then that also lowers our 6343 * confidence. 6344 */ 6345 confidence = 0; 6346 } 6347 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6348 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 6349 if (rack->r_ctl.rack_rs.confidence == 0) { 6350 /* 6351 * We take anything with no current confidence 6352 * saved. 6353 */ 6354 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6355 rack->r_ctl.rack_rs.confidence = confidence; 6356 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6357 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 6358 /* 6359 * Once we have a confident number, 6360 * we can update it with a smaller 6361 * value since this confident number 6362 * may include the DSACK time until 6363 * the next segment (the second one) arrived. 6364 */ 6365 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6366 rack->r_ctl.rack_rs.confidence = confidence; 6367 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6368 } 6369 } 6370 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 6371 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 6372 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 6373 rack->r_ctl.rack_rs.rs_rtt_cnt++; 6374 } 6375 6376 /* 6377 * Collect new round-trip time estimate 6378 * and update averages and current timeout. 6379 */ 6380 static void 6381 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 6382 { 6383 int32_t delta; 6384 uint32_t o_srtt, o_var; 6385 int32_t hrtt_up = 0; 6386 int32_t rtt; 6387 6388 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 6389 /* No valid sample */ 6390 return; 6391 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 6392 /* We are to use the lowest RTT seen in a single ack */ 6393 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 6394 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 6395 /* We are to use the highest RTT seen in a single ack */ 6396 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 6397 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 6398 /* We are to use the average RTT seen in a single ack */ 6399 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 6400 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 6401 } else { 6402 #ifdef INVARIANTS 6403 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 6404 #endif 6405 return; 6406 } 6407 if (rtt == 0) 6408 rtt = 1; 6409 if (rack->rc_gp_rtt_set == 0) { 6410 /* 6411 * With no RTT we have to accept 6412 * even one we are not confident of. 6413 */ 6414 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 6415 rack->rc_gp_rtt_set = 1; 6416 } else if (rack->r_ctl.rack_rs.confidence) { 6417 /* update the running gp srtt */ 6418 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 6419 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 6420 } 6421 if (rack->r_ctl.rack_rs.confidence) { 6422 /* 6423 * record the low and high for highly buffered path computation, 6424 * we only do this if we are confident (not a retransmission). 6425 */ 6426 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 6427 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6428 hrtt_up = 1; 6429 } 6430 if (rack->rc_highly_buffered == 0) { 6431 /* 6432 * Currently once we declare a path has 6433 * highly buffered there is no going 6434 * back, which may be a problem... 
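* The test below is a plain ratio of the extremes seen so far. With
* purely illustrative numbers: a highest confident RTT of 80000 us
* against a lowest of 1000 us gives 80, and the path is flagged as
* highly buffered once that ratio exceeds rack_hbp_thresh.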
6435 */ 6436 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 6437 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 6438 rack->r_ctl.rc_highest_us_rtt, 6439 rack->r_ctl.rc_lowest_us_rtt, 6440 RACK_RTTS_SEEHBP); 6441 rack->rc_highly_buffered = 1; 6442 } 6443 } 6444 } 6445 if ((rack->r_ctl.rack_rs.confidence) || 6446 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 6447 /* 6448 * If we are highly confident of it <or> it was 6449 * never retransmitted we accept it as the last us_rtt. 6450 */ 6451 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6452 /* The lowest rtt can be set if its was not retransmited */ 6453 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 6454 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6455 if (rack->r_ctl.rc_lowest_us_rtt == 0) 6456 rack->r_ctl.rc_lowest_us_rtt = 1; 6457 } 6458 } 6459 rack_log_rtt_sample(rack, rtt); 6460 o_srtt = tp->t_srtt; 6461 o_var = tp->t_rttvar; 6462 rack = (struct tcp_rack *)tp->t_fb_ptr; 6463 if (tp->t_srtt != 0) { 6464 /* 6465 * srtt is stored as fixed point with 5 bits after the 6466 * binary point (i.e., scaled by 8). The following magic is 6467 * equivalent to the smoothing algorithm in rfc793 with an 6468 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 6469 * Adjust rtt to origin 0. 6470 */ 6471 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 6472 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 6473 6474 tp->t_srtt += delta; 6475 if (tp->t_srtt <= 0) 6476 tp->t_srtt = 1; 6477 6478 /* 6479 * We accumulate a smoothed rtt variance (actually, a 6480 * smoothed mean difference), then set the retransmit timer 6481 * to smoothed rtt + 4 times the smoothed variance. rttvar 6482 * is stored as fixed point with 4 bits after the binary 6483 * point (scaled by 16). The following is equivalent to 6484 * rfc793 smoothing with an alpha of .75 (rttvar = 6485 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 6486 * wired-in beta. 6487 */ 6488 if (delta < 0) 6489 delta = -delta; 6490 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 6491 tp->t_rttvar += delta; 6492 if (tp->t_rttvar <= 0) 6493 tp->t_rttvar = 1; 6494 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 6495 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6496 } else { 6497 /* 6498 * No rtt measurement yet - use the unsmoothed rtt. Set the 6499 * variance to half the rtt (so our first retransmit happens 6500 * at 3*rtt). 6501 */ 6502 tp->t_srtt = rtt << TCP_RTT_SHIFT; 6503 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 6504 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6505 } 6506 KMOD_TCPSTAT_INC(tcps_rttupdated); 6507 tp->t_rttupdated++; 6508 #ifdef STATS 6509 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 6510 #endif 6511 tp->t_rxtshift = 0; 6512 6513 /* 6514 * the retransmit should happen at rtt + 4 * rttvar. Because of the 6515 * way we do the smoothing, srtt and rttvar will each average +1/2 6516 * tick of bias. When we compute the retransmit timer, we want 1/2 6517 * tick of rounding and 1 extra tick because of +-1/2 tick 6518 * uncertainty in the firing of the timer. The bias will give us 6519 * exactly the 1.5 tick we need. But, because the bias is 6520 * statistical, we have to test that we don't drop below the minimum 6521 * feasible timer (which is 2 ticks). 
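* In other words, the RANGESET below clamps TCP_REXMTVAL() (roughly
* srtt + 4 * rttvar once the fixed-point scaling is removed) to no
* less than the larger of MSEC_2_TICKS(rack_rto_min) and rtt + 2,
* and to no more than MSEC_2_TICKS(rack_rto_max).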
6522 */ 6523 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 6524 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 6525 tp->t_softerror = 0; 6526 } 6527 6528 static void 6529 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 6530 uint32_t t, uint32_t cts) 6531 { 6532 /* 6533 * For this RSM, we acknowledged the data from a previous 6534 * transmission, not the last one we made. This means we did a false 6535 * retransmit. 6536 */ 6537 struct tcp_rack *rack; 6538 6539 if (rsm->r_flags & RACK_HAS_FIN) { 6540 /* 6541 * The sending of the FIN often is multiple sent when we 6542 * have everything outstanding ack'd. We ignore this case 6543 * since its over now. 6544 */ 6545 return; 6546 } 6547 if (rsm->r_flags & RACK_TLP) { 6548 /* 6549 * We expect TLP's to have this occur. 6550 */ 6551 return; 6552 } 6553 rack = (struct tcp_rack *)tp->t_fb_ptr; 6554 /* should we undo cc changes and exit recovery? */ 6555 if (IN_RECOVERY(tp->t_flags)) { 6556 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 6557 /* 6558 * Undo what we ratched down and exit recovery if 6559 * possible 6560 */ 6561 EXIT_RECOVERY(tp->t_flags); 6562 tp->snd_recover = tp->snd_una; 6563 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 6564 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 6565 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 6566 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 6567 } 6568 } 6569 if (rsm->r_flags & RACK_WAS_SACKPASS) { 6570 /* 6571 * We retransmitted based on a sack and the earlier 6572 * retransmission ack'd it - re-ordering is occuring. 6573 */ 6574 counter_u64_add(rack_reorder_seen, 1); 6575 rack->r_ctl.rc_reorder_ts = cts; 6576 } 6577 counter_u64_add(rack_badfr, 1); 6578 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 6579 } 6580 6581 static void 6582 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 6583 { 6584 /* 6585 * Apply to filter the inbound us-rtt at us_cts. 6586 */ 6587 uint32_t old_rtt; 6588 6589 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 6590 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 6591 us_rtt, us_cts); 6592 if (rack->r_ctl.last_pacing_time && 6593 rack->rc_gp_dyn_mul && 6594 (rack->r_ctl.last_pacing_time > us_rtt)) 6595 rack->pacing_longer_than_rtt = 1; 6596 else 6597 rack->pacing_longer_than_rtt = 0; 6598 if (old_rtt > us_rtt) { 6599 /* We just hit a new lower rtt time */ 6600 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 6601 __LINE__, RACK_RTTS_NEWRTT); 6602 /* 6603 * Only count it if its lower than what we saw within our 6604 * calculated range. 6605 */ 6606 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 6607 if (rack_probertt_lower_within && 6608 rack->rc_gp_dyn_mul && 6609 (rack->use_fixed_rate == 0) && 6610 (rack->rc_always_pace)) { 6611 /* 6612 * We are seeing a new lower rtt very close 6613 * to the time that we would have entered probe-rtt. 6614 * This is probably due to the fact that a peer flow 6615 * has entered probe-rtt. Lets go in now too. 
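* The math below treats rack_probertt_lower_within as a percentage
* of rack_time_between_probertt. Purely for illustration, with a 10
* percent setting and a 9.6 second interval val becomes 960000 us,
* so we join the peer's probe-rtt whenever we are already inside the
* final 10 percent of our own between-probertt interval.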
6616 */ 6617 uint32_t val; 6618 6619 val = rack_probertt_lower_within * rack_time_between_probertt; 6620 val /= 100; 6621 if ((rack->in_probe_rtt == 0) && 6622 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 6623 rack_enter_probertt(rack, us_cts); 6624 } 6625 } 6626 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6627 } 6628 } 6629 } 6630 6631 static int 6632 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 6633 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 6634 { 6635 int32_t i; 6636 uint32_t t, len_acked; 6637 6638 if ((rsm->r_flags & RACK_ACKED) || 6639 (rsm->r_flags & RACK_WAS_ACKED)) 6640 /* Already done */ 6641 return (0); 6642 6643 if (ack_type == CUM_ACKED) { 6644 if (SEQ_GT(th_ack, rsm->r_end)) 6645 len_acked = rsm->r_end - rsm->r_start; 6646 else 6647 len_acked = th_ack - rsm->r_start; 6648 } else 6649 len_acked = rsm->r_end - rsm->r_start; 6650 if (rsm->r_rtr_cnt == 1) { 6651 uint32_t us_rtt; 6652 6653 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6654 if ((int)t <= 0) 6655 t = 1; 6656 if (!tp->t_rttlow || tp->t_rttlow > t) 6657 tp->t_rttlow = t; 6658 if (!rack->r_ctl.rc_rack_min_rtt || 6659 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6660 rack->r_ctl.rc_rack_min_rtt = t; 6661 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6662 rack->r_ctl.rc_rack_min_rtt = 1; 6663 } 6664 } 6665 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; 6666 if (us_rtt == 0) 6667 us_rtt = 1; 6668 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 6669 if (ack_type == SACKED) 6670 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 6671 else { 6672 /* 6673 * For cum-ack we are only confident if what 6674 * is being acked is included in a measurement. 6675 * Otherwise it could be an idle period that 6676 * includes Delayed-ack time. 6677 */ 6678 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 6679 (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); 6680 } 6681 if ((rsm->r_flags & RACK_TLP) && 6682 (!IN_RECOVERY(tp->t_flags))) { 6683 /* Segment was a TLP and our retrans matched */ 6684 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 6685 rack->r_ctl.rc_rsm_start = tp->snd_max; 6686 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 6687 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 6688 rack_cong_signal(tp, NULL, CC_NDUPACK); 6689 /* 6690 * When we enter recovery we need to assure 6691 * we send one packet. 6692 */ 6693 if (rack->rack_no_prr == 0) { 6694 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6695 rack_log_to_prr(rack, 7, 0); 6696 } 6697 } 6698 } 6699 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6700 /* New more recent rack_tmit_time */ 6701 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6702 rack->rc_rack_rtt = t; 6703 } 6704 return (1); 6705 } 6706 /* 6707 * We clear the soft/rxtshift since we got an ack. 6708 * There is no assurance we will call the commit() function 6709 * so we need to clear these to avoid incorrect handling. 6710 */ 6711 tp->t_rxtshift = 0; 6712 tp->t_softerror = 0; 6713 if ((to->to_flags & TOF_TS) && 6714 (ack_type == CUM_ACKED) && 6715 (to->to_tsecr) && 6716 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 6717 /* 6718 * Now which timestamp does it match? In this block the ACK 6719 * must be coming from a previous transmission. 
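 *
 * We walk r_tim_lastsent[] looking for the transmission whose send time
 * matches the echoed timestamp (to->to_tsecr). If the match is not the
 * most recent transmission, the later retransmission was most likely
 * unnecessary and rack_earlier_retran() is told about it.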
6720 */ 6721 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6722 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 6723 t = cts - rsm->r_tim_lastsent[i]; 6724 if ((int)t <= 0) 6725 t = 1; 6726 if ((i + 1) < rsm->r_rtr_cnt) { 6727 /* Likely */ 6728 rack_earlier_retran(tp, rsm, t, cts); 6729 } 6730 if (!tp->t_rttlow || tp->t_rttlow > t) 6731 tp->t_rttlow = t; 6732 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6733 rack->r_ctl.rc_rack_min_rtt = t; 6734 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6735 rack->r_ctl.rc_rack_min_rtt = 1; 6736 } 6737 } 6738 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 6739 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6740 /* New more recent rack_tmit_time */ 6741 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6742 rack->rc_rack_rtt = t; 6743 } 6744 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, 6745 rsm->r_rtr_cnt); 6746 return (1); 6747 } 6748 } 6749 goto ts_not_found; 6750 } else { 6751 /* 6752 * OK, this is a SACK block that we retransmitted, or a Windows 6753 * machine without timestamps. We can tell nothing from the 6754 * timestamp, since either it is not there or it is the time the 6755 * peer last received a segment that moved its cum-ack point forward. 6756 */ 6757 ts_not_found: 6758 i = rsm->r_rtr_cnt - 1; 6759 t = cts - rsm->r_tim_lastsent[i]; 6760 if ((int)t <= 0) 6761 t = 1; 6762 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6763 /* 6764 * We retransmitted and the ack came back in less 6765 * than the smallest rtt we have observed. We most 6766 * likely did an improper retransmit as outlined in 6767 * 4.2 Step 3 point 2 in the rack-draft. 6768 */ 6769 i = rsm->r_rtr_cnt - 2; 6770 t = cts - rsm->r_tim_lastsent[i]; 6771 rack_earlier_retran(tp, rsm, t, cts); 6772 } else if (rack->r_ctl.rc_rack_min_rtt) { 6773 /* 6774 * We retransmitted it and the retransmit did the 6775 * job. 6776 */ 6777 if (!rack->r_ctl.rc_rack_min_rtt || 6778 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6779 rack->r_ctl.rc_rack_min_rtt = t; 6780 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6781 rack->r_ctl.rc_rack_min_rtt = 1; 6782 } 6783 } 6784 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 6785 /* New more recent rack_tmit_time */ 6786 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 6787 rack->rc_rack_rtt = t; 6788 } 6789 return (1); 6790 } 6791 } 6792 return (0); 6793 } 6794 6795 /* 6796 * Mark the SACK_PASSED flag on all entries sent prior to rsm (send-order wise). 6797 */ 6798 static void 6799 rack_log_sack_passed(struct tcpcb *tp, 6800 struct tcp_rack *rack, struct rack_sendmap *rsm) 6801 { 6802 struct rack_sendmap *nrsm; 6803 6804 nrsm = rsm; 6805 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 6806 rack_head, r_tnext) { 6807 if (nrsm == rsm) { 6808 /* Skip the original segment; it is the one being acked */ 6809 continue; 6810 } 6811 if (nrsm->r_flags & RACK_ACKED) { 6812 /* 6813 * Skip ack'd segments, though we 6814 * should not see these, since tmap 6815 * should not have ack'd segments. 6816 */ 6817 continue; 6818 } 6819 if (nrsm->r_flags & RACK_SACK_PASSED) { 6820 /* 6821 * We found one that is already marked 6822 * passed, we have been here before and 6823 * so all others below this are marked.
6824 */ 6825 break; 6826 } 6827 nrsm->r_flags |= RACK_SACK_PASSED; 6828 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 6829 } 6830 } 6831 6832 static void 6833 rack_need_set_test(struct tcpcb *tp, 6834 struct tcp_rack *rack, 6835 struct rack_sendmap *rsm, 6836 tcp_seq th_ack, 6837 int line, 6838 int use_which) 6839 { 6840 6841 if ((tp->t_flags & TF_GPUTINPROG) && 6842 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6843 /* 6844 * We were app limited, and this ack 6845 * butts up or goes beyond the point where we want 6846 * to start our next measurement. We need 6847 * to record the new gput_ts as here and 6848 * possibly update the start sequence. 6849 */ 6850 uint32_t seq, ts; 6851 6852 if (rsm->r_rtr_cnt > 1) { 6853 /* 6854 * This is a retransmit, can we 6855 * really make any assessment at this 6856 * point? We are not really sure of 6857 * the timestamp, is it this or the 6858 * previous transmission? 6859 * 6860 * Lets wait for something better that 6861 * is not retransmitted. 6862 */ 6863 return; 6864 } 6865 seq = tp->gput_seq; 6866 ts = tp->gput_ts; 6867 rack->app_limited_needs_set = 0; 6868 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 6869 /* Do we start at a new end? */ 6870 if ((use_which == RACK_USE_BEG) && 6871 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 6872 /* 6873 * When we get an ACK that just eats 6874 * up some of the rsm, we set RACK_USE_BEG 6875 * since whats at r_start (i.e. th_ack) 6876 * is left unacked and thats where the 6877 * measurement not starts. 6878 */ 6879 tp->gput_seq = rsm->r_start; 6880 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6881 } 6882 if ((use_which == RACK_USE_END) && 6883 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6884 /* 6885 * We use the end when the cumack 6886 * is moving forward and completely 6887 * deleting the rsm passed so basically 6888 * r_end holds th_ack. 6889 * 6890 * For SACK's we also want to use the end 6891 * since this piece just got sacked and 6892 * we want to target anything after that 6893 * in our measurement. 6894 */ 6895 tp->gput_seq = rsm->r_end; 6896 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6897 } 6898 if (use_which == RACK_USE_END_OR_THACK) { 6899 /* 6900 * special case for ack moving forward, 6901 * not a sack, we need to move all the 6902 * way up to where this ack cum-ack moves 6903 * to. 6904 */ 6905 if (SEQ_GT(th_ack, rsm->r_end)) 6906 tp->gput_seq = th_ack; 6907 else 6908 tp->gput_seq = rsm->r_end; 6909 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6910 } 6911 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 6912 /* 6913 * We moved beyond this guy's range, re-calculate 6914 * the new end point. 6915 */ 6916 if (rack->rc_gp_filled == 0) { 6917 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 6918 } else { 6919 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 6920 } 6921 } 6922 /* 6923 * We are moving the goal post, we may be able to clear the 6924 * measure_saw_probe_rtt flag. 6925 */ 6926 if ((rack->in_probe_rtt == 0) && 6927 (rack->measure_saw_probe_rtt) && 6928 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 6929 rack->measure_saw_probe_rtt = 0; 6930 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 6931 seq, tp->gput_seq, 0, 5, line, NULL); 6932 if (rack->rc_gp_filled && 6933 ((tp->gput_ack - tp->gput_seq) < 6934 max(rc_init_window(rack), (MIN_GP_WIN * 6935 ctf_fixed_maxseg(tp))))) { 6936 /* 6937 * There is no sense of continuing this measurement 6938 * because its too small to gain us anything we 6939 * trust. 
Skip it and that way we can start a new 6940 * measurement quicker. 6941 */ 6942 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 6943 0, 0, 0, 6, __LINE__, NULL); 6944 tp->t_flags &= ~TF_GPUTINPROG; 6945 } 6946 } 6947 } 6948 6949 static uint32_t 6950 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 6951 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 6952 { 6953 uint32_t start, end, changed = 0; 6954 struct rack_sendmap stack_map; 6955 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 6956 int32_t used_ref = 1; 6957 int moved = 0; 6958 6959 start = sack->start; 6960 end = sack->end; 6961 rsm = *prsm; 6962 memset(&fe, 0, sizeof(fe)); 6963 do_rest_ofb: 6964 if ((rsm == NULL) || 6965 (SEQ_LT(end, rsm->r_start)) || 6966 (SEQ_GEQ(start, rsm->r_end)) || 6967 (SEQ_LT(start, rsm->r_start))) { 6968 /* 6969 * We are not in the right spot, 6970 * find the correct spot in the tree. 6971 */ 6972 used_ref = 0; 6973 fe.r_start = start; 6974 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6975 moved++; 6976 } 6977 if (rsm == NULL) { 6978 /* TSNH */ 6979 goto out; 6980 } 6981 /* Ok we have an ACK for some piece of this rsm */ 6982 if (rsm->r_start != start) { 6983 if ((rsm->r_flags & RACK_ACKED) == 0) { 6984 /** 6985 * Need to split this in two pieces the before and after, 6986 * the before remains in the map, the after must be 6987 * added. In other words we have: 6988 * rsm |--------------| 6989 * sackblk |-------> 6990 * rsm will become 6991 * rsm |---| 6992 * and nrsm will be the sacked piece 6993 * nrsm |----------| 6994 * 6995 * But before we start down that path lets 6996 * see if the sack spans over on top of 6997 * the next guy and it is already sacked. 6998 */ 6999 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7000 if (next && (next->r_flags & RACK_ACKED) && 7001 SEQ_GEQ(end, next->r_start)) { 7002 /** 7003 * So the next one is already acked, and 7004 * we can thus by hookery use our stack_map 7005 * to reflect the piece being sacked and 7006 * then adjust the two tree entries moving 7007 * the start and ends around. So we start like: 7008 * rsm |------------| (not-acked) 7009 * next |-----------| (acked) 7010 * sackblk |--------> 7011 * We want to end like so: 7012 * rsm |------| (not-acked) 7013 * next |-----------------| (acked) 7014 * nrsm |-----| 7015 * Where nrsm is a temporary stack piece we 7016 * use to update all the gizmos. 7017 */ 7018 /* Copy up our fudge block */ 7019 nrsm = &stack_map; 7020 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7021 /* Now adjust our tree blocks */ 7022 rsm->r_end = start; 7023 next->r_start = start; 7024 /* Clear out the dup ack count of the remainder */ 7025 rsm->r_dupack = 0; 7026 rsm->r_just_ret = 0; 7027 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7028 /* Now lets make sure our fudge block is right */ 7029 nrsm->r_start = start; 7030 /* Now lets update all the stats and such */ 7031 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7032 if (rack->app_limited_needs_set) 7033 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7034 changed += (nrsm->r_end - nrsm->r_start); 7035 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7036 if (nrsm->r_flags & RACK_SACK_PASSED) { 7037 counter_u64_add(rack_reorder_seen, 1); 7038 rack->r_ctl.rc_reorder_ts = cts; 7039 } 7040 /* 7041 * Now we want to go up from rsm (the 7042 * one left un-acked) to the next one 7043 * in the tmap. 
We do this so when 7044 * we walk backwards we include marking 7045 * sack-passed on rsm (The one passed in 7046 * is skipped since it is generally called 7047 * on something sacked before removing it 7048 * from the tmap). 7049 */ 7050 if (rsm->r_in_tmap) { 7051 nrsm = TAILQ_NEXT(rsm, r_tnext); 7052 /* 7053 * Now that we have the next 7054 * one walk backwards from there. 7055 */ 7056 if (nrsm && nrsm->r_in_tmap) 7057 rack_log_sack_passed(tp, rack, nrsm); 7058 } 7059 /* Now are we done? */ 7060 if (SEQ_LT(end, next->r_end) || 7061 (end == next->r_end)) { 7062 /* Done with block */ 7063 goto out; 7064 } 7065 counter_u64_add(rack_sack_used_next_merge, 1); 7066 /* Postion for the next block */ 7067 start = next->r_end; 7068 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 7069 if (rsm == NULL) 7070 goto out; 7071 } else { 7072 /** 7073 * We can't use any hookery here, so we 7074 * need to split the map. We enter like 7075 * so: 7076 * rsm |--------| 7077 * sackblk |-----> 7078 * We will add the new block nrsm and 7079 * that will be the new portion, and then 7080 * fall through after reseting rsm. So we 7081 * split and look like this: 7082 * rsm |----| 7083 * sackblk |-----> 7084 * nrsm |---| 7085 * We then fall through reseting 7086 * rsm to nrsm, so the next block 7087 * picks it up. 7088 */ 7089 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7090 if (nrsm == NULL) { 7091 /* 7092 * failed XXXrrs what can we do but loose the sack 7093 * info? 7094 */ 7095 goto out; 7096 } 7097 counter_u64_add(rack_sack_splits, 1); 7098 rack_clone_rsm(rack, nrsm, rsm, start); 7099 rsm->r_just_ret = 0; 7100 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7101 #ifdef INVARIANTS 7102 if (insret != NULL) { 7103 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7104 nrsm, insret, rack, rsm); 7105 } 7106 #endif 7107 if (rsm->r_in_tmap) { 7108 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7109 nrsm->r_in_tmap = 1; 7110 } 7111 rsm->r_flags &= (~RACK_HAS_FIN); 7112 /* Position us to point to the new nrsm that starts the sack blk */ 7113 rsm = nrsm; 7114 } 7115 } else { 7116 /* Already sacked this piece */ 7117 counter_u64_add(rack_sack_skipped_acked, 1); 7118 moved++; 7119 if (end == rsm->r_end) { 7120 /* Done with block */ 7121 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7122 goto out; 7123 } else if (SEQ_LT(end, rsm->r_end)) { 7124 /* A partial sack to a already sacked block */ 7125 moved++; 7126 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7127 goto out; 7128 } else { 7129 /* 7130 * The end goes beyond this guy 7131 * repostion the start to the 7132 * next block. 7133 */ 7134 start = rsm->r_end; 7135 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7136 if (rsm == NULL) 7137 goto out; 7138 } 7139 } 7140 } 7141 if (SEQ_GEQ(end, rsm->r_end)) { 7142 /** 7143 * The end of this block is either beyond this guy or right 7144 * at this guy. I.e.: 7145 * rsm --- |-----| 7146 * end |-----| 7147 * <or> 7148 * end |---------| 7149 */ 7150 if ((rsm->r_flags & RACK_ACKED) == 0) { 7151 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7152 changed += (rsm->r_end - rsm->r_start); 7153 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7154 if (rsm->r_in_tmap) /* should be true */ 7155 rack_log_sack_passed(tp, rack, rsm); 7156 /* Is Reordering occuring? 
*/ 7157 if (rsm->r_flags & RACK_SACK_PASSED) { 7158 rsm->r_flags &= ~RACK_SACK_PASSED; 7159 counter_u64_add(rack_reorder_seen, 1); 7160 rack->r_ctl.rc_reorder_ts = cts; 7161 } 7162 if (rack->app_limited_needs_set) 7163 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7164 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7165 rsm->r_flags |= RACK_ACKED; 7166 rsm->r_flags &= ~RACK_TLP; 7167 if (rsm->r_in_tmap) { 7168 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7169 rsm->r_in_tmap = 0; 7170 } 7171 } else { 7172 counter_u64_add(rack_sack_skipped_acked, 1); 7173 moved++; 7174 } 7175 if (end == rsm->r_end) { 7176 /* This block only - done, setup for next */ 7177 goto out; 7178 } 7179 /* 7180 * There is more not coverend by this rsm move on 7181 * to the next block in the RB tree. 7182 */ 7183 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7184 start = rsm->r_end; 7185 rsm = nrsm; 7186 if (rsm == NULL) 7187 goto out; 7188 goto do_rest_ofb; 7189 } 7190 /** 7191 * The end of this sack block is smaller than 7192 * our rsm i.e.: 7193 * rsm --- |-----| 7194 * end |--| 7195 */ 7196 if ((rsm->r_flags & RACK_ACKED) == 0) { 7197 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7198 if (prev && (prev->r_flags & RACK_ACKED)) { 7199 /** 7200 * Goal, we want the right remainder of rsm to shrink 7201 * in place and span from (rsm->r_start = end) to rsm->r_end. 7202 * We want to expand prev to go all the way 7203 * to prev->r_end <- end. 7204 * so in the tree we have before: 7205 * prev |--------| (acked) 7206 * rsm |-------| (non-acked) 7207 * sackblk |-| 7208 * We churn it so we end up with 7209 * prev |----------| (acked) 7210 * rsm |-----| (non-acked) 7211 * nrsm |-| (temporary) 7212 */ 7213 nrsm = &stack_map; 7214 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7215 prev->r_end = end; 7216 rsm->r_start = end; 7217 /* Now adjust nrsm (stack copy) to be 7218 * the one that is the small 7219 * piece that was "sacked". 7220 */ 7221 nrsm->r_end = end; 7222 rsm->r_dupack = 0; 7223 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7224 /* 7225 * Now nrsm is our new little piece 7226 * that is acked (which was merged 7227 * to prev). Update the rtt and changed 7228 * based on that. Also check for reordering. 7229 */ 7230 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7231 if (rack->app_limited_needs_set) 7232 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7233 changed += (nrsm->r_end - nrsm->r_start); 7234 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7235 if (nrsm->r_flags & RACK_SACK_PASSED) { 7236 counter_u64_add(rack_reorder_seen, 1); 7237 rack->r_ctl.rc_reorder_ts = cts; 7238 } 7239 rsm = prev; 7240 counter_u64_add(rack_sack_used_prev_merge, 1); 7241 } else { 7242 /** 7243 * This is the case where our previous 7244 * block is not acked either, so we must 7245 * split the block in two. 7246 */ 7247 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7248 if (nrsm == NULL) { 7249 /* failed rrs what can we do but loose the sack info? */ 7250 goto out; 7251 } 7252 /** 7253 * In this case nrsm becomes 7254 * nrsm->r_start = end; 7255 * nrsm->r_end = rsm->r_end; 7256 * which is un-acked. 7257 * <and> 7258 * rsm->r_end = nrsm->r_start; 7259 * i.e. the remaining un-acked 7260 * piece is left on the left 7261 * hand side. 
7262 * 7263 * So we start like this 7264 * rsm |----------| (not acked) 7265 * sackblk |---| 7266 * build it so we have 7267 * rsm |---| (acked) 7268 * nrsm |------| (not acked) 7269 */ 7270 counter_u64_add(rack_sack_splits, 1); 7271 rack_clone_rsm(rack, nrsm, rsm, end); 7272 rsm->r_flags &= (~RACK_HAS_FIN); 7273 rsm->r_just_ret = 0; 7274 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7275 #ifdef INVARIANTS 7276 if (insret != NULL) { 7277 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7278 nrsm, insret, rack, rsm); 7279 } 7280 #endif 7281 if (rsm->r_in_tmap) { 7282 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7283 nrsm->r_in_tmap = 1; 7284 } 7285 nrsm->r_dupack = 0; 7286 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7287 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7288 changed += (rsm->r_end - rsm->r_start); 7289 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7290 if (rsm->r_in_tmap) /* should be true */ 7291 rack_log_sack_passed(tp, rack, rsm); 7292 /* Is Reordering occuring? */ 7293 if (rsm->r_flags & RACK_SACK_PASSED) { 7294 rsm->r_flags &= ~RACK_SACK_PASSED; 7295 counter_u64_add(rack_reorder_seen, 1); 7296 rack->r_ctl.rc_reorder_ts = cts; 7297 } 7298 if (rack->app_limited_needs_set) 7299 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7300 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7301 rsm->r_flags |= RACK_ACKED; 7302 rsm->r_flags &= ~RACK_TLP; 7303 if (rsm->r_in_tmap) { 7304 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7305 rsm->r_in_tmap = 0; 7306 } 7307 } 7308 } else if (start != end){ 7309 /* 7310 * The block was already acked. 7311 */ 7312 counter_u64_add(rack_sack_skipped_acked, 1); 7313 moved++; 7314 } 7315 out: 7316 if (rsm && (rsm->r_flags & RACK_ACKED)) { 7317 /* 7318 * Now can we merge where we worked 7319 * with either the previous or 7320 * next block? 7321 */ 7322 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7323 while (next) { 7324 if (next->r_flags & RACK_ACKED) { 7325 /* yep this and next can be merged */ 7326 rsm = rack_merge_rsm(rack, rsm, next); 7327 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7328 } else 7329 break; 7330 } 7331 /* Now what about the previous? */ 7332 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7333 while (prev) { 7334 if (prev->r_flags & RACK_ACKED) { 7335 /* yep the previous and this can be merged */ 7336 rsm = rack_merge_rsm(rack, prev, rsm); 7337 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7338 } else 7339 break; 7340 } 7341 } 7342 if (used_ref == 0) { 7343 counter_u64_add(rack_sack_proc_all, 1); 7344 } else { 7345 counter_u64_add(rack_sack_proc_short, 1); 7346 } 7347 /* Save off the next one for quick reference. */ 7348 if (rsm) 7349 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7350 else 7351 nrsm = NULL; 7352 *prsm = rack->r_ctl.rc_sacklast = nrsm; 7353 /* Pass back the moved. 
*/ 7354 *moved_two = moved; 7355 return (changed); 7356 } 7357 7358 static void inline 7359 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 7360 { 7361 struct rack_sendmap *tmap; 7362 7363 tmap = NULL; 7364 while (rsm && (rsm->r_flags & RACK_ACKED)) { 7365 /* Its no longer sacked, mark it so */ 7366 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7367 #ifdef INVARIANTS 7368 if (rsm->r_in_tmap) { 7369 panic("rack:%p rsm:%p flags:0x%x in tmap?", 7370 rack, rsm, rsm->r_flags); 7371 } 7372 #endif 7373 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 7374 /* Rebuild it into our tmap */ 7375 if (tmap == NULL) { 7376 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7377 tmap = rsm; 7378 } else { 7379 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7380 tmap = rsm; 7381 } 7382 tmap->r_in_tmap = 1; 7383 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7384 } 7385 /* 7386 * Now lets possibly clear the sack filter so we start 7387 * recognizing sacks that cover this area. 7388 */ 7389 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 7390 7391 } 7392 7393 static void 7394 rack_do_decay(struct tcp_rack *rack) 7395 { 7396 struct timeval res; 7397 7398 #define timersub(tvp, uvp, vvp) \ 7399 do { \ 7400 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 7401 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 7402 if ((vvp)->tv_usec < 0) { \ 7403 (vvp)->tv_sec--; \ 7404 (vvp)->tv_usec += 1000000; \ 7405 } \ 7406 } while (0) 7407 7408 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 7409 #undef timersub 7410 7411 rack->r_ctl.input_pkt++; 7412 if ((rack->rc_in_persist) || 7413 (res.tv_sec >= 1) || 7414 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 7415 /* 7416 * Check for decay of non-SAD, 7417 * we want all SAD detection metrics to 7418 * decay 1/4 per second (or more) passed. 7419 */ 7420 uint32_t pkt_delta; 7421 7422 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 7423 /* Update our saved tracking values */ 7424 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 7425 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 7426 /* Now do we escape without decay? */ 7427 #ifdef NETFLIX_EXP_DETECTION 7428 if (rack->rc_in_persist || 7429 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 7430 (pkt_delta < tcp_sad_low_pps)){ 7431 /* 7432 * We don't decay idle connections 7433 * or ones that have a low input pps. 
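 * (Here "idle" means the connection is in persist or has nothing
 * outstanding (snd_max == snd_una), and a low input pps means fewer
 * than tcp_sad_low_pps packets arrived since the last decay pass,
 * based on the pkt_delta computed above.)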
7434 */ 7435 return; 7436 } 7437 /* Decay the counters */ 7438 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 7439 tcp_sad_decay_val); 7440 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 7441 tcp_sad_decay_val); 7442 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 7443 tcp_sad_decay_val); 7444 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 7445 tcp_sad_decay_val); 7446 #endif 7447 } 7448 } 7449 7450 static void 7451 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 7452 { 7453 uint32_t changed, entered_recovery = 0; 7454 struct tcp_rack *rack; 7455 struct rack_sendmap *rsm, *rm; 7456 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7457 register uint32_t th_ack; 7458 int32_t i, j, k, num_sack_blks = 0; 7459 uint32_t cts, acked, ack_point, sack_changed = 0; 7460 int loop_start = 0, moved_two = 0; 7461 uint32_t tsused; 7462 7463 INP_WLOCK_ASSERT(tp->t_inpcb); 7464 if (th->th_flags & TH_RST) { 7465 /* We don't log resets */ 7466 return; 7467 } 7468 rack = (struct tcp_rack *)tp->t_fb_ptr; 7469 cts = tcp_ts_getticks(); 7470 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7471 changed = 0; 7472 th_ack = th->th_ack; 7473 if (rack->sack_attack_disable == 0) 7474 rack_do_decay(rack); 7475 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 7476 /* 7477 * You only get credit for 7478 * MSS and greater (and you get extra 7479 * credit for larger cum-ack moves). 7480 */ 7481 int ac; 7482 7483 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 7484 rack->r_ctl.ack_count += ac; 7485 counter_u64_add(rack_ack_total, ac); 7486 } 7487 if (rack->r_ctl.ack_count > 0xfff00000) { 7488 /* 7489 * reduce the number to keep us under 7490 * a uint32_t. 7491 */ 7492 rack->r_ctl.ack_count /= 2; 7493 rack->r_ctl.sack_count /= 2; 7494 } 7495 if (SEQ_GT(th_ack, tp->snd_una)) { 7496 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 7497 tp->t_acktime = ticks; 7498 } 7499 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 7500 changed = th_ack - rsm->r_start; 7501 if (changed) { 7502 /* 7503 * The ACK point is advancing to th_ack, we must drop off 7504 * the packets in the rack log and calculate any eligble 7505 * RTT's. 7506 */ 7507 rack->r_wanted_output = 1; 7508 more: 7509 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7510 if (rsm == NULL) { 7511 if ((th_ack - 1) == tp->iss) { 7512 /* 7513 * For the SYN incoming case we will not 7514 * have called tcp_output for the sending of 7515 * the SYN, so there will be no map. All 7516 * other cases should probably be a panic. 7517 */ 7518 goto proc_sack; 7519 } 7520 if (tp->t_flags & TF_SENTFIN) { 7521 /* if we send a FIN we will not hav a map */ 7522 goto proc_sack; 7523 } 7524 #ifdef INVARIANTS 7525 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 7526 tp, 7527 th, tp->t_state, rack, 7528 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 7529 #endif 7530 goto proc_sack; 7531 } 7532 if (SEQ_LT(th_ack, rsm->r_start)) { 7533 /* Huh map is missing this */ 7534 #ifdef INVARIANTS 7535 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 7536 rsm->r_start, 7537 th_ack, tp->t_state, rack->r_state); 7538 #endif 7539 goto proc_sack; 7540 } 7541 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 7542 /* Now do we consume the whole thing? */ 7543 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7544 /* Its all consumed. 
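 * Everything covered by this map entry is now cumulatively acked; below
 * we credit back any retransmitted bytes it carried (rc_holes_rxt),
 * pull it out of the tree and the tmap, and free it back to the zone.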
*/ 7545 uint32_t left; 7546 uint8_t newly_acked; 7547 7548 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7549 rsm->r_rtr_bytes = 0; 7550 /* Record the time of highest cumack sent */ 7551 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7552 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7553 #ifdef INVARIANTS 7554 if (rm != rsm) { 7555 panic("removing head in rack:%p rsm:%p rm:%p", 7556 rack, rsm, rm); 7557 } 7558 #endif 7559 if (rsm->r_in_tmap) { 7560 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7561 rsm->r_in_tmap = 0; 7562 } 7563 newly_acked = 1; 7564 if (rsm->r_flags & RACK_ACKED) { 7565 /* 7566 * It was acked on the scoreboard -- remove 7567 * it from total 7568 */ 7569 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7570 newly_acked = 0; 7571 } else if (rsm->r_flags & RACK_SACK_PASSED) { 7572 /* 7573 * There are segments ACKED on the 7574 * scoreboard further up. We are seeing 7575 * reordering. 7576 */ 7577 rsm->r_flags &= ~RACK_SACK_PASSED; 7578 counter_u64_add(rack_reorder_seen, 1); 7579 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7580 rsm->r_flags |= RACK_ACKED; 7581 rack->r_ctl.rc_reorder_ts = cts; 7582 } 7583 left = th_ack - rsm->r_end; 7584 if (rack->app_limited_needs_set && newly_acked) 7585 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 7586 /* Free back to zone */ 7587 rack_free(rack, rsm); 7588 if (left) { 7589 goto more; 7590 } 7591 goto proc_sack; 7592 } 7593 if (rsm->r_flags & RACK_ACKED) { 7594 /* 7595 * It was acked on the scoreboard -- remove it from 7596 * total for the part being cum-acked. 7597 */ 7598 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7599 } 7600 /* 7601 * Clear the dup ack count for 7602 * the piece that remains. 7603 */ 7604 rsm->r_dupack = 0; 7605 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7606 if (rsm->r_rtr_bytes) { 7607 /* 7608 * It was retransmitted adjust the 7609 * sack holes for what was acked. 7610 */ 7611 int ack_am; 7612 7613 ack_am = (th_ack - rsm->r_start); 7614 if (ack_am >= rsm->r_rtr_bytes) { 7615 rack->r_ctl.rc_holes_rxt -= ack_am; 7616 rsm->r_rtr_bytes -= ack_am; 7617 } 7618 } 7619 /* 7620 * Update where the piece starts and record 7621 * the time of send of highest cumack sent. 7622 */ 7623 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7624 rsm->r_start = th_ack; 7625 if (rack->app_limited_needs_set) 7626 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 7627 } 7628 proc_sack: 7629 /* Check for reneging */ 7630 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7631 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 7632 /* 7633 * The peer has moved snd_una up to 7634 * the edge of this send, i.e. one 7635 * that it had previously acked. The only 7636 * way that can be true if the peer threw 7637 * away data (space issues) that it had 7638 * previously sacked (else it would have 7639 * given us snd_una up to (rsm->r_end). 7640 * We need to undo the acked markings here. 7641 * 7642 * Note we have to look to make sure th_ack is 7643 * our rsm->r_start in case we get an old ack 7644 * where th_ack is behind snd_una. 
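 *
 * Purely as an illustration: if the peer had previously SACKed bytes
 * 1000-2000 (so that entry is marked RACK_ACKED) and now sends a
 * cumulative ack of 1000 with that entry still at the head of the map,
 * it has thrown the SACKed data away; rack_peer_reneges() below clears
 * the acked markings and puts the range back on the tmap so it can be
 * sent again.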
7645 */ 7646 rack_peer_reneges(rack, rsm, th->th_ack); 7647 } 7648 if ((to->to_flags & TOF_SACK) == 0) { 7649 /* We are done nothing left */ 7650 goto out; 7651 } 7652 /* Sack block processing */ 7653 if (SEQ_GT(th_ack, tp->snd_una)) 7654 ack_point = th_ack; 7655 else 7656 ack_point = tp->snd_una; 7657 for (i = 0; i < to->to_nsacks; i++) { 7658 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7659 &sack, sizeof(sack)); 7660 sack.start = ntohl(sack.start); 7661 sack.end = ntohl(sack.end); 7662 if (SEQ_GT(sack.end, sack.start) && 7663 SEQ_GT(sack.start, ack_point) && 7664 SEQ_LT(sack.start, tp->snd_max) && 7665 SEQ_GT(sack.end, ack_point) && 7666 SEQ_LEQ(sack.end, tp->snd_max)) { 7667 sack_blocks[num_sack_blks] = sack; 7668 num_sack_blks++; 7669 #ifdef NETFLIX_STATS 7670 } else if (SEQ_LEQ(sack.start, th_ack) && 7671 SEQ_LEQ(sack.end, th_ack)) { 7672 /* 7673 * Its a D-SACK block. 7674 */ 7675 tcp_record_dsack(sack.start, sack.end); 7676 #endif 7677 } 7678 } 7679 /* 7680 * Sort the SACK blocks so we can update the rack scoreboard with 7681 * just one pass. 7682 */ 7683 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 7684 num_sack_blks, th->th_ack); 7685 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 7686 if (num_sack_blks == 0) { 7687 /* Nothing to sack (DSACKs?) */ 7688 goto out_with_totals; 7689 } 7690 if (num_sack_blks < 2) { 7691 /* Only one, we don't need to sort */ 7692 goto do_sack_work; 7693 } 7694 /* Sort the sacks */ 7695 for (i = 0; i < num_sack_blks; i++) { 7696 for (j = i + 1; j < num_sack_blks; j++) { 7697 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7698 sack = sack_blocks[i]; 7699 sack_blocks[i] = sack_blocks[j]; 7700 sack_blocks[j] = sack; 7701 } 7702 } 7703 } 7704 /* 7705 * Now are any of the sack block ends the same (yes some 7706 * implementations send these)? 7707 */ 7708 again: 7709 if (num_sack_blks == 0) 7710 goto out_with_totals; 7711 if (num_sack_blks > 1) { 7712 for (i = 0; i < num_sack_blks; i++) { 7713 for (j = i + 1; j < num_sack_blks; j++) { 7714 if (sack_blocks[i].end == sack_blocks[j].end) { 7715 /* 7716 * Ok these two have the same end we 7717 * want the smallest end and then 7718 * throw away the larger and start 7719 * again. 7720 */ 7721 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7722 /* 7723 * The second block covers 7724 * more area use that 7725 */ 7726 sack_blocks[i].start = sack_blocks[j].start; 7727 } 7728 /* 7729 * Now collapse out the dup-sack and 7730 * lower the count 7731 */ 7732 for (k = (j + 1); k < num_sack_blks; k++) { 7733 sack_blocks[j].start = sack_blocks[k].start; 7734 sack_blocks[j].end = sack_blocks[k].end; 7735 j++; 7736 } 7737 num_sack_blks--; 7738 goto again; 7739 } 7740 } 7741 } 7742 } 7743 do_sack_work: 7744 /* 7745 * First lets look to see if 7746 * we have retransmitted and 7747 * can use the transmit next? 7748 */ 7749 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7750 if (rsm && 7751 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 7752 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 7753 /* 7754 * We probably did the FR and the next 7755 * SACK in continues as we would expect. 
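 * (The head of rc_tmap is, roughly, the oldest transmission that has
 * not yet been sacked or acked, so when the first and lowest sack
 * block overlaps it we can hand that block straight to
 * rack_proc_sack_blk() with the cached rsm instead of searching.)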
7756 */ 7757 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 7758 if (acked) { 7759 rack->r_wanted_output = 1; 7760 changed += acked; 7761 sack_changed += acked; 7762 } 7763 if (num_sack_blks == 1) { 7764 /* 7765 * This is what we would expect from 7766 * a normal implementation to happen 7767 * after we have retransmitted the FR, 7768 * i.e the sack-filter pushes down 7769 * to 1 block and the next to be retransmitted 7770 * is the sequence in the sack block (has more 7771 * are acked). Count this as ACK'd data to boost 7772 * up the chances of recovering any false positives. 7773 */ 7774 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 7775 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 7776 counter_u64_add(rack_express_sack, 1); 7777 if (rack->r_ctl.ack_count > 0xfff00000) { 7778 /* 7779 * reduce the number to keep us under 7780 * a uint32_t. 7781 */ 7782 rack->r_ctl.ack_count /= 2; 7783 rack->r_ctl.sack_count /= 2; 7784 } 7785 goto out_with_totals; 7786 } else { 7787 /* 7788 * Start the loop through the 7789 * rest of blocks, past the first block. 7790 */ 7791 moved_two = 0; 7792 loop_start = 1; 7793 } 7794 } 7795 /* Its a sack of some sort */ 7796 rack->r_ctl.sack_count++; 7797 if (rack->r_ctl.sack_count > 0xfff00000) { 7798 /* 7799 * reduce the number to keep us under 7800 * a uint32_t. 7801 */ 7802 rack->r_ctl.ack_count /= 2; 7803 rack->r_ctl.sack_count /= 2; 7804 } 7805 counter_u64_add(rack_sack_total, 1); 7806 if (rack->sack_attack_disable) { 7807 /* An attacker disablement is in place */ 7808 if (num_sack_blks > 1) { 7809 rack->r_ctl.sack_count += (num_sack_blks - 1); 7810 rack->r_ctl.sack_moved_extra++; 7811 counter_u64_add(rack_move_some, 1); 7812 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 7813 rack->r_ctl.sack_moved_extra /= 2; 7814 rack->r_ctl.sack_noextra_move /= 2; 7815 } 7816 } 7817 goto out; 7818 } 7819 rsm = rack->r_ctl.rc_sacklast; 7820 for (i = loop_start; i < num_sack_blks; i++) { 7821 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 7822 if (acked) { 7823 rack->r_wanted_output = 1; 7824 changed += acked; 7825 sack_changed += acked; 7826 } 7827 if (moved_two) { 7828 /* 7829 * If we did not get a SACK for at least a MSS and 7830 * had to move at all, or if we moved more than our 7831 * threshold, it counts against the "extra" move. 7832 */ 7833 rack->r_ctl.sack_moved_extra += moved_two; 7834 counter_u64_add(rack_move_some, 1); 7835 } else { 7836 /* 7837 * else we did not have to move 7838 * any more than we would expect. 7839 */ 7840 rack->r_ctl.sack_noextra_move++; 7841 counter_u64_add(rack_move_none, 1); 7842 } 7843 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 7844 /* 7845 * If the SACK was not a full MSS then 7846 * we add to sack_count the number of 7847 * MSS's (or possibly more than 7848 * a MSS if its a TSO send) we had to skip by. 7849 */ 7850 rack->r_ctl.sack_count += moved_two; 7851 counter_u64_add(rack_sack_total, moved_two); 7852 } 7853 /* 7854 * Now we need to setup for the next 7855 * round. First we make sure we won't 7856 * exceed the size of our uint32_t on 7857 * the various counts, and then clear out 7858 * moved_two. 
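 * (Halving the counters in pairs keeps their ratios roughly intact
 * while guaranteeing the 32-bit values cannot wrap before the next
 * check; 0xfff00000 is just a high-water mark well short of the
 * uint32_t limit.)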
7859 */ 7860 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 7861 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 7862 rack->r_ctl.sack_moved_extra /= 2; 7863 rack->r_ctl.sack_noextra_move /= 2; 7864 } 7865 if (rack->r_ctl.sack_count > 0xfff00000) { 7866 rack->r_ctl.ack_count /= 2; 7867 rack->r_ctl.sack_count /= 2; 7868 } 7869 moved_two = 0; 7870 } 7871 out_with_totals: 7872 if (num_sack_blks > 1) { 7873 /* 7874 * You get an extra stroke if 7875 * you have more than one sack-blk, this 7876 * could be where we are skipping forward 7877 * and the sack-filter is still working, or 7878 * it could be an attacker constantly 7879 * moving us. 7880 */ 7881 rack->r_ctl.sack_moved_extra++; 7882 counter_u64_add(rack_move_some, 1); 7883 } 7884 out: 7885 #ifdef NETFLIX_EXP_DETECTION 7886 if ((rack->do_detection || tcp_force_detection) && 7887 tcp_sack_to_ack_thresh && 7888 tcp_sack_to_move_thresh && 7889 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 7890 /* 7891 * We have thresholds set to find 7892 * possible attackers and disable sack. 7893 * Check them. 7894 */ 7895 uint64_t ackratio, moveratio, movetotal; 7896 7897 /* Log detecting */ 7898 rack_log_sad(rack, 1); 7899 ackratio = (uint64_t)(rack->r_ctl.sack_count); 7900 ackratio *= (uint64_t)(1000); 7901 if (rack->r_ctl.ack_count) 7902 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 7903 else { 7904 /* We really should not hit here */ 7905 ackratio = 1000; 7906 } 7907 if ((rack->sack_attack_disable == 0) && 7908 (ackratio > rack_highest_sack_thresh_seen)) 7909 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 7910 movetotal = rack->r_ctl.sack_moved_extra; 7911 movetotal += rack->r_ctl.sack_noextra_move; 7912 moveratio = rack->r_ctl.sack_moved_extra; 7913 moveratio *= (uint64_t)1000; 7914 if (movetotal) 7915 moveratio /= movetotal; 7916 else { 7917 /* No moves, thats pretty good */ 7918 moveratio = 0; 7919 } 7920 if ((rack->sack_attack_disable == 0) && 7921 (moveratio > rack_highest_move_thresh_seen)) 7922 rack_highest_move_thresh_seen = (uint32_t)moveratio; 7923 if (rack->sack_attack_disable == 0) { 7924 if ((ackratio > tcp_sack_to_ack_thresh) && 7925 (moveratio > tcp_sack_to_move_thresh)) { 7926 /* Disable sack processing */ 7927 rack->sack_attack_disable = 1; 7928 if (rack->r_rep_attack == 0) { 7929 rack->r_rep_attack = 1; 7930 counter_u64_add(rack_sack_attacks_detected, 1); 7931 } 7932 if (tcp_attack_on_turns_on_logging) { 7933 /* 7934 * Turn on logging, used for debugging 7935 * false positives. 
7936 */ 7937 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 7938 } 7939 /* Clamp the cwnd at flight size */ 7940 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 7941 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 7942 rack_log_sad(rack, 2); 7943 } 7944 } else { 7945 /* We are sack-disabled check for false positives */ 7946 if ((ackratio <= tcp_restoral_thresh) || 7947 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 7948 rack->sack_attack_disable = 0; 7949 rack_log_sad(rack, 3); 7950 /* Restart counting */ 7951 rack->r_ctl.sack_count = 0; 7952 rack->r_ctl.sack_moved_extra = 0; 7953 rack->r_ctl.sack_noextra_move = 1; 7954 rack->r_ctl.ack_count = max(1, 7955 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 7956 7957 if (rack->r_rep_reverse == 0) { 7958 rack->r_rep_reverse = 1; 7959 counter_u64_add(rack_sack_attacks_reversed, 1); 7960 } 7961 /* Restore the cwnd */ 7962 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 7963 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 7964 } 7965 } 7966 } 7967 #endif 7968 if (changed) { 7969 /* Something changed cancel the rack timer */ 7970 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 7971 } 7972 tsused = tcp_ts_getticks(); 7973 rsm = tcp_rack_output(tp, rack, tsused); 7974 if ((!IN_RECOVERY(tp->t_flags)) && 7975 rsm) { 7976 /* Enter recovery */ 7977 rack->r_ctl.rc_rsm_start = rsm->r_start; 7978 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7979 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7980 entered_recovery = 1; 7981 rack_cong_signal(tp, NULL, CC_NDUPACK); 7982 /* 7983 * When we enter recovery we need to assure we send 7984 * one packet. 7985 */ 7986 if (rack->rack_no_prr == 0) { 7987 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 7988 rack_log_to_prr(rack, 8, 0); 7989 } 7990 rack->r_timer_override = 1; 7991 rack->r_early = 0; 7992 rack->r_ctl.rc_agg_early = 0; 7993 } else if (IN_RECOVERY(tp->t_flags) && 7994 rsm && 7995 (rack->r_rr_config == 3)) { 7996 /* 7997 * Assure we can output and we get no 7998 * remembered pace time except the retransmit. 
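 * (Setting r_timer_override and clearing PACE_PKT_OUTPUT below pushes
 * us into the output path right away, with rc_resend pointing at the
 * rack-retransmit candidate, rather than waiting out a previously
 * scheduled pacing slot.)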
7999 */ 8000 rack->r_timer_override = 1; 8001 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8002 rack->r_ctl.rc_resend = rsm; 8003 } 8004 if (IN_RECOVERY(tp->t_flags) && 8005 (rack->rack_no_prr == 0) && 8006 (entered_recovery == 0)) { 8007 /* Deal with PRR here (in recovery only) */ 8008 uint32_t pipe, snd_una; 8009 8010 rack->r_ctl.rc_prr_delivered += changed; 8011 /* Compute prr_sndcnt */ 8012 if (SEQ_GT(tp->snd_una, th_ack)) { 8013 snd_una = tp->snd_una; 8014 } else { 8015 snd_una = th_ack; 8016 } 8017 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 8018 if (pipe > tp->snd_ssthresh) { 8019 long sndcnt; 8020 8021 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 8022 if (rack->r_ctl.rc_prr_recovery_fs > 0) 8023 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 8024 else { 8025 rack->r_ctl.rc_prr_sndcnt = 0; 8026 rack_log_to_prr(rack, 9, 0); 8027 sndcnt = 0; 8028 } 8029 sndcnt++; 8030 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 8031 sndcnt -= rack->r_ctl.rc_prr_out; 8032 else 8033 sndcnt = 0; 8034 rack->r_ctl.rc_prr_sndcnt = sndcnt; 8035 rack_log_to_prr(rack, 10, 0); 8036 } else { 8037 uint32_t limit; 8038 8039 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 8040 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 8041 else 8042 limit = 0; 8043 if (changed > limit) 8044 limit = changed; 8045 limit += ctf_fixed_maxseg(tp); 8046 if (tp->snd_ssthresh > pipe) { 8047 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 8048 rack_log_to_prr(rack, 11, 0); 8049 } else { 8050 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 8051 rack_log_to_prr(rack, 12, 0); 8052 } 8053 } 8054 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 8055 ((rack->rc_inp->inp_in_hpts == 0) && 8056 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 8057 /* 8058 * If you are pacing output you don't want 8059 * to override. 8060 */ 8061 rack->r_early = 0; 8062 rack->r_ctl.rc_agg_early = 0; 8063 rack->r_timer_override = 1; 8064 } 8065 } 8066 } 8067 8068 static void 8069 rack_strike_dupack(struct tcp_rack *rack) 8070 { 8071 struct rack_sendmap *rsm; 8072 8073 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8074 if (rsm && (rsm->r_dupack < 0xff)) { 8075 rsm->r_dupack++; 8076 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 8077 rack->r_wanted_output = 1; 8078 rack->r_timer_override = 1; 8079 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 8080 } else { 8081 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 8082 } 8083 } 8084 } 8085 8086 static void 8087 rack_check_bottom_drag(struct tcpcb *tp, 8088 struct tcp_rack *rack, 8089 struct socket *so, int32_t acked) 8090 { 8091 uint32_t segsiz, minseg; 8092 8093 segsiz = ctf_fixed_maxseg(tp); 8094 minseg = segsiz; 8095 8096 if (tp->snd_max == tp->snd_una) { 8097 /* 8098 * We are doing dynamic pacing and we are way 8099 * under. Basically everything got acked while 8100 * we were still waiting on the pacer to expire. 8101 * 8102 * This means we need to boost the b/w in 8103 * addition to any earlier boosting of 8104 * the multipler. 8105 */ 8106 rack->rc_dragged_bottom = 1; 8107 rack_validate_multipliers_at_or_above100(rack); 8108 /* 8109 * Lets use the segment bytes acked plus 8110 * the lowest RTT seen as the basis to 8111 * form a b/w estimate. This will be off 8112 * due to the fact that the true estimate 8113 * should be around 1/2 the time of the RTT 8114 * but we can settle for that. 
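 *
 * Rough numbers, purely as an illustration: if about 64000 bytes were
 * just acked and the rtt used is 20000 usec, the calculation below
 * yields 64000 * 1000000 / 20000 = 3,200,000 bytes per second, before
 * it is limited by last_max_bw and, when no goodput estimate exists
 * yet, by the ONE_POINT_TWO_MEG clamp.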
8115 */ 8116 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 8117 acked) { 8118 uint64_t bw, calc_bw, rtt; 8119 8120 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8121 bw = acked; 8122 calc_bw = bw * 1000000; 8123 calc_bw /= rtt; 8124 if (rack->r_ctl.last_max_bw && 8125 (rack->r_ctl.last_max_bw < calc_bw)) { 8126 /* 8127 * If we have a last calculated max bw 8128 * enforce it. 8129 */ 8130 calc_bw = rack->r_ctl.last_max_bw; 8131 } 8132 /* now plop it in */ 8133 if (rack->rc_gp_filled == 0) { 8134 if (calc_bw > ONE_POINT_TWO_MEG) { 8135 /* 8136 * If we have no measurement 8137 * don't let us set in more than 8138 * 1.2Mbps. If we are still too 8139 * low after pacing with this we 8140 * will hopefully have a max b/w 8141 * available to sanity check things. 8142 */ 8143 calc_bw = ONE_POINT_TWO_MEG; 8144 } 8145 rack->r_ctl.rc_rtt_diff = 0; 8146 rack->r_ctl.gp_bw = calc_bw; 8147 rack->rc_gp_filled = 1; 8148 rack->r_ctl.num_avg = RACK_REQ_AVG; 8149 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8150 } else if (calc_bw > rack->r_ctl.gp_bw) { 8151 rack->r_ctl.rc_rtt_diff = 0; 8152 rack->r_ctl.num_avg = RACK_REQ_AVG; 8153 rack->r_ctl.gp_bw = calc_bw; 8154 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8155 } else 8156 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8157 /* 8158 * For acks over 1mss we do a extra boost to simulate 8159 * where we would get 2 acks (we want 110 for the mul). 8160 */ 8161 if (acked > segsiz) 8162 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8163 } else { 8164 /* 8165 * Huh, this should not be, settle 8166 * for just an old increase. 8167 */ 8168 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8169 } 8170 } else if ((IN_RECOVERY(tp->t_flags) == 0) && 8171 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 8172 minseg)) && 8173 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 8174 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 8175 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 8176 (segsiz * rack_req_segs))) { 8177 /* 8178 * We are doing dynamic GP pacing and 8179 * we have everything except 1MSS or less 8180 * bytes left out. We are still pacing away. 8181 * And there is data that could be sent, This 8182 * means we are inserting delayed ack time in 8183 * our measurements because we are pacing too slow. 8184 */ 8185 rack_validate_multipliers_at_or_above100(rack); 8186 rack->rc_dragged_bottom = 1; 8187 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8188 } 8189 } 8190 8191 /* 8192 * Return value of 1, we do not need to call rack_process_data(). 8193 * return value of 0, rack_process_data can be called. 8194 * For ret_val if its 0 the TCP is locked, if its non-zero 8195 * its unlocked and probably unsafe to touch the TCB. 
8196 */ 8197 static int 8198 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8199 struct tcpcb *tp, struct tcpopt *to, 8200 uint32_t tiwin, int32_t tlen, 8201 int32_t * ofia, int32_t thflags, int32_t * ret_val) 8202 { 8203 int32_t ourfinisacked = 0; 8204 int32_t nsegs, acked_amount; 8205 int32_t acked; 8206 struct mbuf *mfree; 8207 struct tcp_rack *rack; 8208 int32_t under_pacing = 0; 8209 int32_t recovery = 0; 8210 8211 rack = (struct tcp_rack *)tp->t_fb_ptr; 8212 if (SEQ_GT(th->th_ack, tp->snd_max)) { 8213 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 8214 rack->r_wanted_output = 1; 8215 return (1); 8216 } 8217 if (rack->rc_gp_filled && 8218 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 8219 under_pacing = 1; 8220 } 8221 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 8222 if (rack->rc_in_persist) 8223 tp->t_rxtshift = 0; 8224 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 8225 rack_strike_dupack(rack); 8226 rack_log_ack(tp, to, th); 8227 } 8228 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8229 /* 8230 * Old ack, behind (or duplicate to) the last one rcv'd 8231 * Note: Should mark reordering is occuring! We should also 8232 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 8233 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 8234 * retran and> ack 3 8235 */ 8236 return (0); 8237 } 8238 /* 8239 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 8240 * something we sent. 8241 */ 8242 if (tp->t_flags & TF_NEEDSYN) { 8243 /* 8244 * T/TCP: Connection was half-synchronized, and our SYN has 8245 * been ACK'd (so connection is now fully synchronized). Go 8246 * to non-starred state, increment snd_una for ACK of SYN, 8247 * and check if we can do window scaling. 8248 */ 8249 tp->t_flags &= ~TF_NEEDSYN; 8250 tp->snd_una++; 8251 /* Do window scaling? */ 8252 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8253 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8254 tp->rcv_scale = tp->request_r_scale; 8255 /* Send window already scaled. */ 8256 } 8257 } 8258 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8259 INP_WLOCK_ASSERT(tp->t_inpcb); 8260 8261 acked = BYTES_THIS_ACK(tp, th); 8262 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 8263 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8264 /* 8265 * If we just performed our first retransmit, and the ACK arrives 8266 * within our recovery window, then it was a mistake to do the 8267 * retransmit in the first place. Recover our original cwnd and 8268 * ssthresh, and proceed to transmit where we left off. 8269 */ 8270 if (tp->t_flags & TF_PREVVALID) { 8271 tp->t_flags &= ~TF_PREVVALID; 8272 if (tp->t_rxtshift == 1 && 8273 (int)(ticks - tp->t_badrxtwin) < 0) 8274 rack_cong_signal(tp, th, CC_RTO_ERR); 8275 } 8276 if (acked) { 8277 /* assure we are not backed off */ 8278 tp->t_rxtshift = 0; 8279 rack->rc_tlp_in_progress = 0; 8280 rack->r_ctl.rc_tlp_cnt_out = 0; 8281 /* 8282 * If it is the RXT timer we want to 8283 * stop it, so we can restart a TLP. 8284 */ 8285 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 8286 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8287 #ifdef NETFLIX_HTTP_LOGGING 8288 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 8289 #endif 8290 } 8291 /* 8292 * If we have a timestamp reply, update smoothed round trip time. If 8293 * no timestamp is present but transmit timer is running and timed 8294 * sequence number was acked, update smoothed round trip time. 
Since 8295 * we now have an rtt measurement, cancel the timer backoff (cf., 8296 * Phil Karn's retransmit alg.). Recompute the initial retransmit 8297 * timer. 8298 * 8299 * Some boxes send broken timestamp replies during the SYN+ACK 8300 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8301 * and blow up the retransmit timer. 8302 */ 8303 /* 8304 * If all outstanding data is acked, stop retransmit timer and 8305 * remember to restart (more output or persist). If there is more 8306 * data to be acked, restart retransmit timer, using current 8307 * (possibly backed-off) value. 8308 */ 8309 if (acked == 0) { 8310 if (ofia) 8311 *ofia = ourfinisacked; 8312 return (0); 8313 } 8314 if (rack->r_ctl.rc_early_recovery) { 8315 if (IN_RECOVERY(tp->t_flags)) { 8316 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8317 (SEQ_LT(th->th_ack, tp->snd_max))) { 8318 tcp_rack_partialack(tp, th); 8319 } else { 8320 rack_post_recovery(tp, th); 8321 recovery = 1; 8322 } 8323 } 8324 } 8325 /* 8326 * Let the congestion control algorithm update congestion control 8327 * related information. This typically means increasing the 8328 * congestion window. 8329 */ 8330 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 8331 SOCKBUF_LOCK(&so->so_snd); 8332 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 8333 tp->snd_wnd -= acked_amount; 8334 mfree = sbcut_locked(&so->so_snd, acked_amount); 8335 if ((sbused(&so->so_snd) == 0) && 8336 (acked > acked_amount) && 8337 (tp->t_state >= TCPS_FIN_WAIT_1) && 8338 (tp->t_flags & TF_SENTFIN)) { 8339 /* 8340 * We must be sure our fin 8341 * was sent and acked (we can be 8342 * in FIN_WAIT_1 without having 8343 * sent the fin). 8344 */ 8345 ourfinisacked = 1; 8346 } 8347 SOCKBUF_UNLOCK(&so->so_snd); 8348 tp->t_flags |= TF_WAKESOW; 8349 m_freem(mfree); 8350 if (rack->r_ctl.rc_early_recovery == 0) { 8351 if (IN_RECOVERY(tp->t_flags)) { 8352 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8353 (SEQ_LT(th->th_ack, tp->snd_max))) { 8354 tcp_rack_partialack(tp, th); 8355 } else { 8356 rack_post_recovery(tp, th); 8357 } 8358 } 8359 } 8360 tp->snd_una = th->th_ack; 8361 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 8362 tp->snd_recover = tp->snd_una; 8363 8364 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 8365 tp->snd_nxt = tp->snd_una; 8366 } 8367 if (under_pacing && 8368 (rack->use_fixed_rate == 0) && 8369 (rack->in_probe_rtt == 0) && 8370 rack->rc_gp_dyn_mul && 8371 rack->rc_always_pace) { 8372 /* Check if we are dragging bottom */ 8373 rack_check_bottom_drag(tp, rack, so, acked); 8374 } 8375 if (tp->snd_una == tp->snd_max) { 8376 /* Nothing left outstanding */ 8377 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 8378 if (rack->r_ctl.rc_went_idle_time == 0) 8379 rack->r_ctl.rc_went_idle_time = 1; 8380 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 8381 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8382 tp->t_acktime = 0; 8383 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8384 /* Set need output so persist might get set */ 8385 rack->r_wanted_output = 1; 8386 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8387 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8388 (sbavail(&so->so_snd) == 0) && 8389 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 8390 /* 8391 * The socket was gone and the 8392 * peer sent data, time to 8393 * reset him. 
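 * (The code below sets *ret_val, logs the end status, closes the tcb
 * and then sends the reset via ctf_do_dropwithreset(), so the caller
 * must not touch tp afterwards.)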
8394 */ 8395 *ret_val = 1; 8396 /* tcp_close will kill the inp pre-log the Reset */ 8397 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 8398 tp = tcp_close(tp); 8399 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 8400 return (1); 8401 } 8402 } 8403 if (ofia) 8404 *ofia = ourfinisacked; 8405 return (0); 8406 } 8407 8408 static void 8409 rack_collapsed_window(struct tcp_rack *rack) 8410 { 8411 /* 8412 * Now we must walk the 8413 * send map and divide the 8414 * ones left stranded. These 8415 * guys can't cause us to abort 8416 * the connection and are really 8417 * "unsent". However if a buggy 8418 * client actually did keep some 8419 * of the data i.e. collapsed the win 8420 * and refused to ack and then opened 8421 * the win and acked that data. We would 8422 * get into an ack war, the simplier 8423 * method then of just pretending we 8424 * did not send those segments something 8425 * won't work. 8426 */ 8427 struct rack_sendmap *rsm, *nrsm, fe, *insret; 8428 tcp_seq max_seq; 8429 8430 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 8431 memset(&fe, 0, sizeof(fe)); 8432 fe.r_start = max_seq; 8433 /* Find the first seq past or at maxseq */ 8434 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8435 if (rsm == NULL) { 8436 /* Nothing to do strange */ 8437 rack->rc_has_collapsed = 0; 8438 return; 8439 } 8440 /* 8441 * Now do we need to split at 8442 * the collapse point? 8443 */ 8444 if (SEQ_GT(max_seq, rsm->r_start)) { 8445 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8446 if (nrsm == NULL) { 8447 /* We can't get a rsm, mark all? */ 8448 nrsm = rsm; 8449 goto no_split; 8450 } 8451 /* Clone it */ 8452 rack_clone_rsm(rack, nrsm, rsm, max_seq); 8453 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8454 #ifdef INVARIANTS 8455 if (insret != NULL) { 8456 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8457 nrsm, insret, rack, rsm); 8458 } 8459 #endif 8460 if (rsm->r_in_tmap) { 8461 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8462 nrsm->r_in_tmap = 1; 8463 } 8464 /* 8465 * Set in the new RSM as the 8466 * collapsed starting point 8467 */ 8468 rsm = nrsm; 8469 } 8470 no_split: 8471 counter_u64_add(rack_collapsed_win, 1); 8472 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 8473 nrsm->r_flags |= RACK_RWND_COLLAPSED; 8474 rack->rc_has_collapsed = 1; 8475 } 8476 } 8477 8478 static void 8479 rack_un_collapse_window(struct tcp_rack *rack) 8480 { 8481 struct rack_sendmap *rsm; 8482 8483 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 8484 if (rsm->r_flags & RACK_RWND_COLLAPSED) 8485 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8486 else 8487 break; 8488 } 8489 rack->rc_has_collapsed = 0; 8490 } 8491 8492 static void 8493 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 8494 int32_t tlen, int32_t tfo_syn) 8495 { 8496 if (DELAY_ACK(tp, tlen) || tfo_syn) { 8497 if (rack->rc_dack_mode && 8498 (tlen > 500) && 8499 (rack->rc_dack_toggle == 1)) { 8500 goto no_delayed_ack; 8501 } 8502 rack_timer_cancel(tp, rack, 8503 rack->r_ctl.rc_rcvtime, __LINE__); 8504 tp->t_flags |= TF_DELACK; 8505 } else { 8506 no_delayed_ack: 8507 rack->r_wanted_output = 1; 8508 tp->t_flags |= TF_ACKNOW; 8509 if (rack->rc_dack_mode) { 8510 if (tp->t_flags & TF_DELACK) 8511 rack->rc_dack_toggle = 1; 8512 else 8513 rack->rc_dack_toggle = 0; 8514 } 8515 } 8516 } 8517 /* 8518 * Return value of 1, the TCB is unlocked and most 8519 * likely gone, return value of 0, the TCP is still 8520 * locked. 
8521 */ 8522 static int 8523 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8524 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8525 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8526 { 8527 /* 8528 * Update window information. Don't look at window if no ACK: TAC's 8529 * send garbage on first SYN. 8530 */ 8531 int32_t nsegs; 8532 int32_t tfo_syn; 8533 struct tcp_rack *rack; 8534 8535 rack = (struct tcp_rack *)tp->t_fb_ptr; 8536 INP_WLOCK_ASSERT(tp->t_inpcb); 8537 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8538 if ((thflags & TH_ACK) && 8539 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8540 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8541 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8542 /* keep track of pure window updates */ 8543 if (tlen == 0 && 8544 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8545 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8546 tp->snd_wnd = tiwin; 8547 tp->snd_wl1 = th->th_seq; 8548 tp->snd_wl2 = th->th_ack; 8549 if (tp->snd_wnd > tp->max_sndwnd) 8550 tp->max_sndwnd = tp->snd_wnd; 8551 rack->r_wanted_output = 1; 8552 } else if (thflags & TH_ACK) { 8553 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8554 tp->snd_wnd = tiwin; 8555 tp->snd_wl1 = th->th_seq; 8556 tp->snd_wl2 = th->th_ack; 8557 } 8558 } 8559 if (tp->snd_wnd < ctf_outstanding(tp)) 8560 /* The peer collapsed the window */ 8561 rack_collapsed_window(rack); 8562 else if (rack->rc_has_collapsed) 8563 rack_un_collapse_window(rack); 8564 /* Was persist timer active and now we have window space? */ 8565 if ((rack->rc_in_persist != 0) && 8566 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8567 rack->r_ctl.rc_pace_min_segs))) { 8568 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8569 tp->snd_nxt = tp->snd_max; 8570 /* Make sure we output to start the timer */ 8571 rack->r_wanted_output = 1; 8572 } 8573 /* Do we enter persists? */ 8574 if ((rack->rc_in_persist == 0) && 8575 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8576 TCPS_HAVEESTABLISHED(tp->t_state) && 8577 (tp->snd_max == tp->snd_una) && 8578 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8579 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8580 /* 8581 * Here the rwnd is less than 8582 * the pacing size, we are established, 8583 * nothing is outstanding, and there is 8584 * data to send. Enter persists. 8585 */ 8586 tp->snd_nxt = tp->snd_una; 8587 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8588 } 8589 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8590 m_freem(m); 8591 return (0); 8592 } 8593 /* 8594 * don't process the URG bit, ignore them drag 8595 * along the up. 8596 */ 8597 tp->rcv_up = tp->rcv_nxt; 8598 INP_WLOCK_ASSERT(tp->t_inpcb); 8599 8600 /* 8601 * Process the segment text, merging it into the TCP sequencing 8602 * queue, and arranging for acknowledgment of receipt if necessary. 8603 * This process logically involves adjusting tp->rcv_wnd as data is 8604 * presented to the user (this happens in tcp_usrreq.c, case 8605 * PRU_RCVD). If a FIN has already been received on this connection 8606 * then we just ignore the text. 
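 * Illustrative example (values are hypothetical): with rcv_nxt == 1000,
 * an empty reassembly queue and an in-order 1448 byte segment
 * (th_seq == 1000), the inline path below appends straight to so_rcv
 * and advances rcv_nxt to 2448; had th_seq been 2448 instead, the
 * segment would go through tcp_reass() and force an immediate ACK.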
8607 */ 8608 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8609 IS_FASTOPEN(tp->t_flags)); 8610 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8611 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8612 tcp_seq save_start = th->th_seq; 8613 tcp_seq save_rnxt = tp->rcv_nxt; 8614 int save_tlen = tlen; 8615 8616 m_adj(m, drop_hdrlen); /* delayed header drop */ 8617 /* 8618 * Insert segment which includes th into TCP reassembly 8619 * queue with control block tp. Set thflags to whether 8620 * reassembly now includes a segment with FIN. This handles 8621 * the common case inline (segment is the next to be 8622 * received on an established connection, and the queue is 8623 * empty), avoiding linkage into and removal from the queue 8624 * and repetition of various conversions. Set DELACK for 8625 * segments received in order, but ack immediately when 8626 * segments are out of order (so fast retransmit can work). 8627 */ 8628 if (th->th_seq == tp->rcv_nxt && 8629 SEGQ_EMPTY(tp) && 8630 (TCPS_HAVEESTABLISHED(tp->t_state) || 8631 tfo_syn)) { 8632 #ifdef NETFLIX_SB_LIMITS 8633 u_int mcnt, appended; 8634 8635 if (so->so_rcv.sb_shlim) { 8636 mcnt = m_memcnt(m); 8637 appended = 0; 8638 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8639 CFO_NOSLEEP, NULL) == false) { 8640 counter_u64_add(tcp_sb_shlim_fails, 1); 8641 m_freem(m); 8642 return (0); 8643 } 8644 } 8645 #endif 8646 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 8647 tp->rcv_nxt += tlen; 8648 if (tlen && 8649 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8650 (tp->t_fbyte_in == 0)) { 8651 tp->t_fbyte_in = ticks; 8652 if (tp->t_fbyte_in == 0) 8653 tp->t_fbyte_in = 1; 8654 if (tp->t_fbyte_out && tp->t_fbyte_in) 8655 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8656 } 8657 thflags = th->th_flags & TH_FIN; 8658 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8659 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8660 SOCKBUF_LOCK(&so->so_rcv); 8661 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8662 m_freem(m); 8663 } else 8664 #ifdef NETFLIX_SB_LIMITS 8665 appended = 8666 #endif 8667 sbappendstream_locked(&so->so_rcv, m, 0); 8668 SOCKBUF_UNLOCK(&so->so_rcv); 8669 tp->t_flags |= TF_WAKESOR; 8670 #ifdef NETFLIX_SB_LIMITS 8671 if (so->so_rcv.sb_shlim && appended != mcnt) 8672 counter_fo_release(so->so_rcv.sb_shlim, 8673 mcnt - appended); 8674 #endif 8675 } else { 8676 /* 8677 * XXX: Due to the header drop above "th" is 8678 * theoretically invalid by now. Fortunately 8679 * m_adj() doesn't actually frees any mbufs when 8680 * trimming from the head. 8681 */ 8682 tcp_seq temp = save_start; 8683 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8684 tp->t_flags |= TF_ACKNOW; 8685 } 8686 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 8687 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8688 /* 8689 * DSACK actually handled in the fastpath 8690 * above. 8691 */ 8692 RACK_OPTS_INC(tcp_sack_path_1); 8693 tcp_update_sack_list(tp, save_start, 8694 save_start + save_tlen); 8695 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8696 if ((tp->rcv_numsacks >= 1) && 8697 (tp->sackblks[0].end == save_start)) { 8698 /* 8699 * Partial overlap, recorded at todrop 8700 * above. 8701 */ 8702 RACK_OPTS_INC(tcp_sack_path_2a); 8703 tcp_update_sack_list(tp, 8704 tp->sackblks[0].start, 8705 tp->sackblks[0].end); 8706 } else { 8707 RACK_OPTS_INC(tcp_sack_path_2b); 8708 tcp_update_dsack_list(tp, save_start, 8709 save_start + save_tlen); 8710 } 8711 } else if (tlen >= save_tlen) { 8712 /* Update of sackblks. 
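 * The peer retransmitted data we already hold, so the duplicate range
 * is advertised back as a DSACK block (cf. RFC 2883) by
 * tcp_update_dsack_list() below.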
*/ 8713 RACK_OPTS_INC(tcp_sack_path_3); 8714 tcp_update_dsack_list(tp, save_start, 8715 save_start + save_tlen); 8716 } else if (tlen > 0) { 8717 RACK_OPTS_INC(tcp_sack_path_4); 8718 tcp_update_dsack_list(tp, save_start, 8719 save_start + tlen); 8720 } 8721 } 8722 } else { 8723 m_freem(m); 8724 thflags &= ~TH_FIN; 8725 } 8726 8727 /* 8728 * If FIN is received ACK the FIN and let the user know that the 8729 * connection is closing. 8730 */ 8731 if (thflags & TH_FIN) { 8732 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8733 socantrcvmore(so); 8734 /* The socket upcall is handled by socantrcvmore. */ 8735 tp->t_flags &= ~TF_WAKESOR; 8736 /* 8737 * If connection is half-synchronized (ie NEEDSYN 8738 * flag on) then delay ACK, so it may be piggybacked 8739 * when SYN is sent. Otherwise, since we received a 8740 * FIN then no more input can be expected, send ACK 8741 * now. 8742 */ 8743 if (tp->t_flags & TF_NEEDSYN) { 8744 rack_timer_cancel(tp, rack, 8745 rack->r_ctl.rc_rcvtime, __LINE__); 8746 tp->t_flags |= TF_DELACK; 8747 } else { 8748 tp->t_flags |= TF_ACKNOW; 8749 } 8750 tp->rcv_nxt++; 8751 } 8752 switch (tp->t_state) { 8753 /* 8754 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8755 * CLOSE_WAIT state. 8756 */ 8757 case TCPS_SYN_RECEIVED: 8758 tp->t_starttime = ticks; 8759 /* FALLTHROUGH */ 8760 case TCPS_ESTABLISHED: 8761 rack_timer_cancel(tp, rack, 8762 rack->r_ctl.rc_rcvtime, __LINE__); 8763 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8764 break; 8765 8766 /* 8767 * If still in FIN_WAIT_1 STATE FIN has not been 8768 * acked so enter the CLOSING state. 8769 */ 8770 case TCPS_FIN_WAIT_1: 8771 rack_timer_cancel(tp, rack, 8772 rack->r_ctl.rc_rcvtime, __LINE__); 8773 tcp_state_change(tp, TCPS_CLOSING); 8774 break; 8775 8776 /* 8777 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8778 * starting the time-wait timer, turning off the 8779 * other standard timers. 8780 */ 8781 case TCPS_FIN_WAIT_2: 8782 rack_timer_cancel(tp, rack, 8783 rack->r_ctl.rc_rcvtime, __LINE__); 8784 tcp_twstart(tp); 8785 return (1); 8786 } 8787 } 8788 /* 8789 * Return any desired output. 8790 */ 8791 if ((tp->t_flags & TF_ACKNOW) || 8792 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 8793 rack->r_wanted_output = 1; 8794 } 8795 INP_WLOCK_ASSERT(tp->t_inpcb); 8796 return (0); 8797 } 8798 8799 /* 8800 * Here nothing is really faster, its just that we 8801 * have broken out the fast-data path also just like 8802 * the fast-ack. 8803 */ 8804 static int 8805 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8806 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8807 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8808 { 8809 int32_t nsegs; 8810 int32_t newsize = 0; /* automatic sockbuf scaling */ 8811 struct tcp_rack *rack; 8812 #ifdef NETFLIX_SB_LIMITS 8813 u_int mcnt, appended; 8814 #endif 8815 #ifdef TCPDEBUG 8816 /* 8817 * The size of tcp_saveipgen must be the size of the max ip header, 8818 * now IPv6. 8819 */ 8820 u_char tcp_saveipgen[IP6_HDR_LEN]; 8821 struct tcphdr tcp_savetcp; 8822 short ostate = 0; 8823 8824 #endif 8825 /* 8826 * If last ACK falls within this segment's sequence numbers, record 8827 * the timestamp. NOTE that the test is modified according to the 8828 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
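 * The tests below are fast-path preconditions; if any of them fails we
 * return 0 and the caller falls back to the slow path.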
8829 */ 8830 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 8831 return (0); 8832 } 8833 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8834 return (0); 8835 } 8836 if (tiwin && tiwin != tp->snd_wnd) { 8837 return (0); 8838 } 8839 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8840 return (0); 8841 } 8842 if (__predict_false((to->to_flags & TOF_TS) && 8843 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8844 return (0); 8845 } 8846 if (__predict_false((th->th_ack != tp->snd_una))) { 8847 return (0); 8848 } 8849 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8850 return (0); 8851 } 8852 if ((to->to_flags & TOF_TS) != 0 && 8853 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8854 tp->ts_recent_age = tcp_ts_getticks(); 8855 tp->ts_recent = to->to_tsval; 8856 } 8857 rack = (struct tcp_rack *)tp->t_fb_ptr; 8858 /* 8859 * This is a pure, in-sequence data packet with nothing on the 8860 * reassembly queue and we have enough buffer space to take it. 8861 */ 8862 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8863 8864 #ifdef NETFLIX_SB_LIMITS 8865 if (so->so_rcv.sb_shlim) { 8866 mcnt = m_memcnt(m); 8867 appended = 0; 8868 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8869 CFO_NOSLEEP, NULL) == false) { 8870 counter_u64_add(tcp_sb_shlim_fails, 1); 8871 m_freem(m); 8872 return (1); 8873 } 8874 } 8875 #endif 8876 /* Clean receiver SACK report if present */ 8877 if (tp->rcv_numsacks) 8878 tcp_clean_sackreport(tp); 8879 KMOD_TCPSTAT_INC(tcps_preddat); 8880 tp->rcv_nxt += tlen; 8881 if (tlen && 8882 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8883 (tp->t_fbyte_in == 0)) { 8884 tp->t_fbyte_in = ticks; 8885 if (tp->t_fbyte_in == 0) 8886 tp->t_fbyte_in = 1; 8887 if (tp->t_fbyte_out && tp->t_fbyte_in) 8888 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8889 } 8890 /* 8891 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 8892 */ 8893 tp->snd_wl1 = th->th_seq; 8894 /* 8895 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 8896 */ 8897 tp->rcv_up = tp->rcv_nxt; 8898 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8899 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8900 #ifdef TCPDEBUG 8901 if (so->so_options & SO_DEBUG) 8902 tcp_trace(TA_INPUT, ostate, tp, 8903 (void *)tcp_saveipgen, &tcp_savetcp, 0); 8904 #endif 8905 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 8906 8907 /* Add data to socket buffer. */ 8908 SOCKBUF_LOCK(&so->so_rcv); 8909 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8910 m_freem(m); 8911 } else { 8912 /* 8913 * Set new socket buffer size. Give up when limit is 8914 * reached. 8915 */ 8916 if (newsize) 8917 if (!sbreserve_locked(&so->so_rcv, 8918 newsize, so, NULL)) 8919 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 8920 m_adj(m, drop_hdrlen); /* delayed header drop */ 8921 #ifdef NETFLIX_SB_LIMITS 8922 appended = 8923 #endif 8924 sbappendstream_locked(&so->so_rcv, m, 0); 8925 ctf_calc_rwin(so, tp); 8926 } 8927 SOCKBUF_UNLOCK(&so->so_rcv); 8928 tp->t_flags |= TF_WAKESOR; 8929 #ifdef NETFLIX_SB_LIMITS 8930 if (so->so_rcv.sb_shlim && mcnt != appended) 8931 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 8932 #endif 8933 rack_handle_delayed_ack(tp, rack, tlen, 0); 8934 if (tp->snd_una == tp->snd_max) 8935 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8936 return (1); 8937 } 8938 8939 /* 8940 * This subfunction is used to try to highly optimize the 8941 * fast path. We again allow window updates that are 8942 * in sequence to remain in the fast-path. We also add 8943 * in the __predict's to attempt to help the compiler. 
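 * (__predict_true()/__predict_false() are FreeBSD's wrappers around the
 * compiler's __builtin_expect() branch-prediction hint.)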
8944 * Note that if we return a 0, then we can *not* process 8945 * it and the caller should push the packet into the 8946 * slow-path. 8947 */ 8948 static int 8949 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8950 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8951 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 8952 { 8953 int32_t acked; 8954 int32_t nsegs; 8955 #ifdef TCPDEBUG 8956 /* 8957 * The size of tcp_saveipgen must be the size of the max ip header, 8958 * now IPv6. 8959 */ 8960 u_char tcp_saveipgen[IP6_HDR_LEN]; 8961 struct tcphdr tcp_savetcp; 8962 short ostate = 0; 8963 #endif 8964 int32_t under_pacing = 0; 8965 struct tcp_rack *rack; 8966 8967 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8968 /* Old ack, behind (or duplicate to) the last one rcv'd */ 8969 return (0); 8970 } 8971 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 8972 /* Above what we have sent? */ 8973 return (0); 8974 } 8975 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8976 /* We are retransmitting */ 8977 return (0); 8978 } 8979 if (__predict_false(tiwin == 0)) { 8980 /* zero window */ 8981 return (0); 8982 } 8983 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 8984 /* We need a SYN or a FIN, unlikely.. */ 8985 return (0); 8986 } 8987 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 8988 /* Timestamp is behind .. old ack with seq wrap? */ 8989 return (0); 8990 } 8991 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 8992 /* Still recovering */ 8993 return (0); 8994 } 8995 rack = (struct tcp_rack *)tp->t_fb_ptr; 8996 if (rack->r_ctl.rc_sacked) { 8997 /* We have sack holes on our scoreboard */ 8998 return (0); 8999 } 9000 /* Ok if we reach here, we can process a fast-ack */ 9001 if (rack->rc_gp_filled && 9002 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9003 under_pacing = 1; 9004 } 9005 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9006 rack_log_ack(tp, to, th); 9007 /* Did the window get updated? */ 9008 if (tiwin != tp->snd_wnd) { 9009 tp->snd_wnd = tiwin; 9010 tp->snd_wl1 = th->th_seq; 9011 if (tp->snd_wnd > tp->max_sndwnd) 9012 tp->max_sndwnd = tp->snd_wnd; 9013 } 9014 /* Do we exit persists? */ 9015 if ((rack->rc_in_persist != 0) && 9016 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 9017 rack->r_ctl.rc_pace_min_segs))) { 9018 rack_exit_persist(tp, rack, cts); 9019 } 9020 /* Do we enter persists? */ 9021 if ((rack->rc_in_persist == 0) && 9022 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 9023 TCPS_HAVEESTABLISHED(tp->t_state) && 9024 (tp->snd_max == tp->snd_una) && 9025 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 9026 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 9027 /* 9028 * Here the rwnd is less than 9029 * the pacing size, we are established, 9030 * nothing is outstanding, and there is 9031 * data to send. Enter persists. 9032 */ 9033 tp->snd_nxt = tp->snd_una; 9034 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 9035 } 9036 /* 9037 * If last ACK falls within this segment's sequence numbers, record 9038 * the timestamp. NOTE that the test is modified according to the 9039 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 9040 */ 9041 if ((to->to_flags & TOF_TS) != 0 && 9042 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 9043 tp->ts_recent_age = tcp_ts_getticks(); 9044 tp->ts_recent = to->to_tsval; 9045 } 9046 /* 9047 * This is a pure ack for outstanding data. 
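 * Worked example (hypothetical numbers): with snd_una == 1000 and
 * th_ack == 2448, BYTES_THIS_ACK() below yields acked == 1448, all of
 * which is dropped from the send buffer by sbdrop().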
9048 */ 9049 KMOD_TCPSTAT_INC(tcps_predack); 9050 9051 /* 9052 * "bad retransmit" recovery. 9053 */ 9054 if (tp->t_flags & TF_PREVVALID) { 9055 tp->t_flags &= ~TF_PREVVALID; 9056 if (tp->t_rxtshift == 1 && 9057 (int)(ticks - tp->t_badrxtwin) < 0) 9058 rack_cong_signal(tp, th, CC_RTO_ERR); 9059 } 9060 /* 9061 * Recalculate the transmit timer / rtt. 9062 * 9063 * Some boxes send broken timestamp replies during the SYN+ACK 9064 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9065 * and blow up the retransmit timer. 9066 */ 9067 acked = BYTES_THIS_ACK(tp, th); 9068 9069 #ifdef TCP_HHOOK 9070 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 9071 hhook_run_tcp_est_in(tp, th, to); 9072 #endif 9073 9074 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9075 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9076 sbdrop(&so->so_snd, acked); 9077 if (acked) { 9078 /* assure we are not backed off */ 9079 tp->t_rxtshift = 0; 9080 rack->rc_tlp_in_progress = 0; 9081 rack->r_ctl.rc_tlp_cnt_out = 0; 9082 /* 9083 * If it is the RXT timer we want to 9084 * stop it, so we can restart a TLP. 9085 */ 9086 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9087 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9088 #ifdef NETFLIX_HTTP_LOGGING 9089 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9090 #endif 9091 } 9092 /* 9093 * Let the congestion control algorithm update congestion control 9094 * related information. This typically means increasing the 9095 * congestion window. 9096 */ 9097 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 9098 9099 tp->snd_una = th->th_ack; 9100 if (tp->snd_wnd < ctf_outstanding(tp)) { 9101 /* The peer collapsed the window */ 9102 rack_collapsed_window(rack); 9103 } else if (rack->rc_has_collapsed) 9104 rack_un_collapse_window(rack); 9105 9106 /* 9107 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 9108 */ 9109 tp->snd_wl2 = th->th_ack; 9110 tp->t_dupacks = 0; 9111 m_freem(m); 9112 /* ND6_HINT(tp); *//* Some progress has been made. */ 9113 9114 /* 9115 * If all outstanding data are acked, stop retransmit timer, 9116 * otherwise restart timer using current (possibly backed-off) 9117 * value. If process is waiting for space, wakeup/selwakeup/signal. 9118 * If data are ready to send, let tcp_output decide between more 9119 * output or persist. 9120 */ 9121 #ifdef TCPDEBUG 9122 if (so->so_options & SO_DEBUG) 9123 tcp_trace(TA_INPUT, ostate, tp, 9124 (void *)tcp_saveipgen, 9125 &tcp_savetcp, 0); 9126 #endif 9127 if (under_pacing && 9128 (rack->use_fixed_rate == 0) && 9129 (rack->in_probe_rtt == 0) && 9130 rack->rc_gp_dyn_mul && 9131 rack->rc_always_pace) { 9132 /* Check if we are dragging bottom */ 9133 rack_check_bottom_drag(tp, rack, so, acked); 9134 } 9135 if (tp->snd_una == tp->snd_max) { 9136 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9137 if (rack->r_ctl.rc_went_idle_time == 0) 9138 rack->r_ctl.rc_went_idle_time = 1; 9139 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9140 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9141 tp->t_acktime = 0; 9142 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9143 } 9144 /* Wake up the socket if we have room to write more */ 9145 tp->t_flags |= TF_WAKESOW; 9146 if (sbavail(&so->so_snd)) { 9147 rack->r_wanted_output = 1; 9148 } 9149 return (1); 9150 } 9151 9152 /* 9153 * Return value of 1, the TCB is unlocked and most 9154 * likely gone, return value of 0, the TCP is still 9155 * locked. 
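 * (For SYN_SENT the only acceptable ACK below is one in the range
 * (iss, snd_max]; anything outside it is answered with a reset.)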
9156 */ 9157 static int 9158 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 9159 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9160 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9161 { 9162 int32_t ret_val = 0; 9163 int32_t todrop; 9164 int32_t ourfinisacked = 0; 9165 struct tcp_rack *rack; 9166 9167 ctf_calc_rwin(so, tp); 9168 /* 9169 * If the state is SYN_SENT: if seg contains an ACK, but not for our 9170 * SYN, drop the input. if seg contains a RST, then drop the 9171 * connection. if seg does not contain SYN, then drop it. Otherwise 9172 * this is an acceptable SYN segment initialize tp->rcv_nxt and 9173 * tp->irs if seg contains ack then advance tp->snd_una if seg 9174 * contains an ECE and ECN support is enabled, the stream is ECN 9175 * capable. if SYN has been acked change to ESTABLISHED else 9176 * SYN_RCVD state arrange for segment to be acked (eventually) 9177 * continue processing rest of data/controls. 9178 */ 9179 if ((thflags & TH_ACK) && 9180 (SEQ_LEQ(th->th_ack, tp->iss) || 9181 SEQ_GT(th->th_ack, tp->snd_max))) { 9182 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9183 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9184 return (1); 9185 } 9186 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 9187 TCP_PROBE5(connect__refused, NULL, tp, 9188 mtod(m, const char *), tp, th); 9189 tp = tcp_drop(tp, ECONNREFUSED); 9190 ctf_do_drop(m, tp); 9191 return (1); 9192 } 9193 if (thflags & TH_RST) { 9194 ctf_do_drop(m, tp); 9195 return (1); 9196 } 9197 if (!(thflags & TH_SYN)) { 9198 ctf_do_drop(m, tp); 9199 return (1); 9200 } 9201 tp->irs = th->th_seq; 9202 tcp_rcvseqinit(tp); 9203 rack = (struct tcp_rack *)tp->t_fb_ptr; 9204 if (thflags & TH_ACK) { 9205 int tfo_partial = 0; 9206 9207 KMOD_TCPSTAT_INC(tcps_connects); 9208 soisconnected(so); 9209 #ifdef MAC 9210 mac_socketpeer_set_from_mbuf(m, so); 9211 #endif 9212 /* Do window scaling on this connection? */ 9213 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9214 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9215 tp->rcv_scale = tp->request_r_scale; 9216 } 9217 tp->rcv_adv += min(tp->rcv_wnd, 9218 TCP_MAXWIN << tp->rcv_scale); 9219 /* 9220 * If not all the data that was sent in the TFO SYN 9221 * has been acked, resend the remainder right away. 9222 */ 9223 if (IS_FASTOPEN(tp->t_flags) && 9224 (tp->snd_una != tp->snd_max)) { 9225 tp->snd_nxt = th->th_ack; 9226 tfo_partial = 1; 9227 } 9228 /* 9229 * If there's data, delay ACK; if there's also a FIN ACKNOW 9230 * will be turned on later. 9231 */ 9232 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 9233 rack_timer_cancel(tp, rack, 9234 rack->r_ctl.rc_rcvtime, __LINE__); 9235 tp->t_flags |= TF_DELACK; 9236 } else { 9237 rack->r_wanted_output = 1; 9238 tp->t_flags |= TF_ACKNOW; 9239 rack->rc_dack_toggle = 0; 9240 } 9241 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 9242 (V_tcp_do_ecn == 1)) { 9243 tp->t_flags2 |= TF2_ECN_PERMIT; 9244 KMOD_TCPSTAT_INC(tcps_ecn_shs); 9245 } 9246 if (SEQ_GT(th->th_ack, tp->snd_una)) { 9247 /* 9248 * We advance snd_una for the 9249 * fast open case. If th_ack is 9250 * acknowledging data beyond 9251 * snd_una we can't just call 9252 * ack-processing since the 9253 * data stream in our send-map 9254 * will start at snd_una + 1 (one 9255 * beyond the SYN). If its just 9256 * equal we don't need to do that 9257 * and there is no send_map. 9258 */ 9259 tp->snd_una++; 9260 } 9261 /* 9262 * Received <SYN,ACK> in SYN_SENT[*] state. 
Transitions: 9263 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 9264 */ 9265 tp->t_starttime = ticks; 9266 if (tp->t_flags & TF_NEEDFIN) { 9267 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9268 tp->t_flags &= ~TF_NEEDFIN; 9269 thflags &= ~TH_SYN; 9270 } else { 9271 tcp_state_change(tp, TCPS_ESTABLISHED); 9272 TCP_PROBE5(connect__established, NULL, tp, 9273 mtod(m, const char *), tp, th); 9274 rack_cc_conn_init(tp); 9275 } 9276 } else { 9277 /* 9278 * Received initial SYN in SYN-SENT[*] state => simultaneous 9279 * open. If segment contains CC option and there is a 9280 * cached CC, apply TAO test. If it succeeds, connection is * 9281 * half-synchronized. Otherwise, do 3-way handshake: 9282 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 9283 * there was no CC option, clear cached CC value. 9284 */ 9285 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 9286 tcp_state_change(tp, TCPS_SYN_RECEIVED); 9287 } 9288 INP_WLOCK_ASSERT(tp->t_inpcb); 9289 /* 9290 * Advance th->th_seq to correspond to first data byte. If data, 9291 * trim to stay within window, dropping FIN if necessary. 9292 */ 9293 th->th_seq++; 9294 if (tlen > tp->rcv_wnd) { 9295 todrop = tlen - tp->rcv_wnd; 9296 m_adj(m, -todrop); 9297 tlen = tp->rcv_wnd; 9298 thflags &= ~TH_FIN; 9299 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 9300 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 9301 } 9302 tp->snd_wl1 = th->th_seq - 1; 9303 tp->rcv_up = th->th_seq; 9304 /* 9305 * Client side of transaction: already sent SYN and data. If the 9306 * remote host used T/TCP to validate the SYN, our data will be 9307 * ACK'd; if so, enter normal data segment processing in the middle 9308 * of step 5, ack processing. Otherwise, goto step 6. 9309 */ 9310 if (thflags & TH_ACK) { 9311 /* For syn-sent we need to possibly update the rtt */ 9312 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9313 uint32_t t; 9314 9315 t = tcp_ts_getticks() - to->to_tsecr; 9316 if (!tp->t_rttlow || tp->t_rttlow > t) 9317 tp->t_rttlow = t; 9318 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9319 tcp_rack_xmit_timer_commit(rack, tp); 9320 } 9321 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 9322 return (ret_val); 9323 /* We may have changed to FIN_WAIT_1 above */ 9324 if (tp->t_state == TCPS_FIN_WAIT_1) { 9325 /* 9326 * In FIN_WAIT_1 STATE in addition to the processing 9327 * for the ESTABLISHED state if our FIN is now 9328 * acknowledged then enter FIN_WAIT_2. 9329 */ 9330 if (ourfinisacked) { 9331 /* 9332 * If we can't receive any more data, then 9333 * closing user can proceed. Starting the 9334 * timer is contrary to the specification, 9335 * but if we don't get a FIN we'll hang 9336 * forever. 9337 * 9338 * XXXjl: we should release the tp also, and 9339 * use a compressed state. 9340 */ 9341 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9342 soisdisconnected(so); 9343 tcp_timer_activate(tp, TT_2MSL, 9344 (tcp_fast_finwait2_recycle ? 9345 tcp_finwait2_timeout : 9346 TP_MAXIDLE(tp))); 9347 } 9348 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9349 } 9350 } 9351 } 9352 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9353 tiwin, thflags, nxt_pkt)); 9354 } 9355 9356 /* 9357 * Return value of 1, the TCB is unlocked and most 9358 * likely gone, return value of 0, the TCP is still 9359 * locked. 
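 * (In SYN_RECEIVED, segments with a sequence number below irs are also
 * rejected below, as a partial fix for the "LAND" DoS attack.)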
9360 */ 9361 static int 9362 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 9363 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9364 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9365 { 9366 struct tcp_rack *rack; 9367 int32_t ret_val = 0; 9368 int32_t ourfinisacked = 0; 9369 9370 ctf_calc_rwin(so, tp); 9371 if ((thflags & TH_ACK) && 9372 (SEQ_LEQ(th->th_ack, tp->snd_una) || 9373 SEQ_GT(th->th_ack, tp->snd_max))) { 9374 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9375 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9376 return (1); 9377 } 9378 rack = (struct tcp_rack *)tp->t_fb_ptr; 9379 if (IS_FASTOPEN(tp->t_flags)) { 9380 /* 9381 * When a TFO connection is in SYN_RECEIVED, the 9382 * only valid packets are the initial SYN, a 9383 * retransmit/copy of the initial SYN (possibly with 9384 * a subset of the original data), a valid ACK, a 9385 * FIN, or a RST. 9386 */ 9387 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9388 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9389 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9390 return (1); 9391 } else if (thflags & TH_SYN) { 9392 /* non-initial SYN is ignored */ 9393 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9394 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9395 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9396 ctf_do_drop(m, NULL); 9397 return (0); 9398 } 9399 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9400 ctf_do_drop(m, NULL); 9401 return (0); 9402 } 9403 } 9404 if ((thflags & TH_RST) || 9405 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9406 return (ctf_process_rst(m, th, so, tp)); 9407 /* 9408 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9409 * it's less than ts_recent, drop it. 9410 */ 9411 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9412 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9413 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9414 return (ret_val); 9415 } 9416 /* 9417 * In the SYN-RECEIVED state, validate that the packet belongs to 9418 * this connection before trimming the data to fit the receive 9419 * window. Check the sequence number versus IRS since we know the 9420 * sequence numbers haven't wrapped. This is a partial fix for the 9421 * "LAND" DoS attack. 9422 */ 9423 if (SEQ_LT(th->th_seq, tp->irs)) { 9424 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9425 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9426 return (1); 9427 } 9428 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9429 return (ret_val); 9430 } 9431 /* 9432 * If last ACK falls within this segment's sequence numbers, record 9433 * its timestamp. NOTE: 1) That the test incorporates suggestions 9434 * from the latest proposal of the tcplw@cray.com list (Braden 9435 * 1993/04/26). 2) That updating only on newer timestamps interferes 9436 * with our earlier PAWS tests, so this check should be solely 9437 * predicated on the sequence space of this segment. 3) That we 9438 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9439 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9440 * SEG.Len, This modified check allows us to overcome RFC1323's 9441 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9442 * p.869. In such cases, we can still calculate the RTT correctly 9443 * when RCV.NXT == Last.ACK.Sent. 
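 * Worked example of the modified test (hypothetical values):
 * last_ack_sent == 4001 and a bare FIN arrives with th_seq == 4001 and
 * tlen == 0; the FIN counts as one unit of sequence space, so
 * 4001 <= 4001 + 0 + 1 holds and ts_recent is refreshed below.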
9444 */ 9445 if ((to->to_flags & TOF_TS) != 0 && 9446 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9447 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9448 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9449 tp->ts_recent_age = tcp_ts_getticks(); 9450 tp->ts_recent = to->to_tsval; 9451 } 9452 tp->snd_wnd = tiwin; 9453 /* 9454 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9455 * is on (half-synchronized state), then queue data for later 9456 * processing; else drop segment and return. 9457 */ 9458 if ((thflags & TH_ACK) == 0) { 9459 if (IS_FASTOPEN(tp->t_flags)) { 9460 rack_cc_conn_init(tp); 9461 } 9462 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9463 tiwin, thflags, nxt_pkt)); 9464 } 9465 KMOD_TCPSTAT_INC(tcps_connects); 9466 soisconnected(so); 9467 /* Do window scaling? */ 9468 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9469 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9470 tp->rcv_scale = tp->request_r_scale; 9471 } 9472 /* 9473 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9474 * FIN-WAIT-1 9475 */ 9476 tp->t_starttime = ticks; 9477 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9478 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9479 tp->t_tfo_pending = NULL; 9480 } 9481 if (tp->t_flags & TF_NEEDFIN) { 9482 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9483 tp->t_flags &= ~TF_NEEDFIN; 9484 } else { 9485 tcp_state_change(tp, TCPS_ESTABLISHED); 9486 TCP_PROBE5(accept__established, NULL, tp, 9487 mtod(m, const char *), tp, th); 9488 /* 9489 * TFO connections call cc_conn_init() during SYN 9490 * processing. Calling it again here for such connections 9491 * is not harmless as it would undo the snd_cwnd reduction 9492 * that occurs when a TFO SYN|ACK is retransmitted. 9493 */ 9494 if (!IS_FASTOPEN(tp->t_flags)) 9495 rack_cc_conn_init(tp); 9496 } 9497 /* 9498 * Account for the ACK of our SYN prior to 9499 * regular ACK processing below, except for 9500 * simultaneous SYN, which is handled later. 9501 */ 9502 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9503 tp->snd_una++; 9504 /* 9505 * If segment contains data or ACK, will call tcp_reass() later; if 9506 * not, do so now to pass queued data to user. 9507 */ 9508 if (tlen == 0 && (thflags & TH_FIN) == 0) 9509 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9510 (struct mbuf *)0); 9511 tp->snd_wl1 = th->th_seq - 1; 9512 /* For syn-recv we need to possibly update the rtt */ 9513 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9514 uint32_t t; 9515 9516 t = tcp_ts_getticks() - to->to_tsecr; 9517 if (!tp->t_rttlow || tp->t_rttlow > t) 9518 tp->t_rttlow = t; 9519 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9520 tcp_rack_xmit_timer_commit(rack, tp); 9521 } 9522 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9523 return (ret_val); 9524 } 9525 if (tp->t_state == TCPS_FIN_WAIT_1) { 9526 /* We could have went to FIN_WAIT_1 (or EST) above */ 9527 /* 9528 * In FIN_WAIT_1 STATE in addition to the processing for the 9529 * ESTABLISHED state if our FIN is now acknowledged then 9530 * enter FIN_WAIT_2. 9531 */ 9532 if (ourfinisacked) { 9533 /* 9534 * If we can't receive any more data, then closing 9535 * user can proceed. Starting the timer is contrary 9536 * to the specification, but if we don't get a FIN 9537 * we'll hang forever. 9538 * 9539 * XXXjl: we should release the tp also, and use a 9540 * compressed state. 
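 * (The 2MSL timer armed below uses tcp_finwait2_timeout when
 * tcp_fast_finwait2_recycle is enabled and TP_MAXIDLE(tp) otherwise.)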
9541 */ 9542 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9543 soisdisconnected(so); 9544 tcp_timer_activate(tp, TT_2MSL, 9545 (tcp_fast_finwait2_recycle ? 9546 tcp_finwait2_timeout : 9547 TP_MAXIDLE(tp))); 9548 } 9549 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9550 } 9551 } 9552 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9553 tiwin, thflags, nxt_pkt)); 9554 } 9555 9556 /* 9557 * Return value of 1, the TCB is unlocked and most 9558 * likely gone, return value of 0, the TCP is still 9559 * locked. 9560 */ 9561 static int 9562 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9563 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9564 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9565 { 9566 int32_t ret_val = 0; 9567 struct tcp_rack *rack; 9568 9569 /* 9570 * Header prediction: check for the two common cases of a 9571 * uni-directional data xfer. If the packet has no control flags, 9572 * is in-sequence, the window didn't change and we're not 9573 * retransmitting, it's a candidate. If the length is zero and the 9574 * ack moved forward, we're the sender side of the xfer. Just free 9575 * the data acked & wake any higher level process that was blocked 9576 * waiting for space. If the length is non-zero and the ack didn't 9577 * move, we're the receiver side. If we're getting packets in-order 9578 * (the reassembly queue is empty), add the data toc The socket 9579 * buffer and note that we need a delayed ack. Make sure that the 9580 * hidden state-flags are also off. Since we check for 9581 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 9582 */ 9583 rack = (struct tcp_rack *)tp->t_fb_ptr; 9584 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9585 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 9586 __predict_true(SEGQ_EMPTY(tp)) && 9587 __predict_true(th->th_seq == tp->rcv_nxt)) { 9588 if (tlen == 0) { 9589 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9590 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 9591 return (0); 9592 } 9593 } else { 9594 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9595 tiwin, nxt_pkt, iptos)) { 9596 return (0); 9597 } 9598 } 9599 } 9600 ctf_calc_rwin(so, tp); 9601 9602 if ((thflags & TH_RST) || 9603 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9604 return (ctf_process_rst(m, th, so, tp)); 9605 9606 /* 9607 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9608 * synchronized state. 9609 */ 9610 if (thflags & TH_SYN) { 9611 ctf_challenge_ack(m, th, tp, &ret_val); 9612 return (ret_val); 9613 } 9614 /* 9615 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9616 * it's less than ts_recent, drop it. 9617 */ 9618 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9619 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9620 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9621 return (ret_val); 9622 } 9623 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9624 return (ret_val); 9625 } 9626 /* 9627 * If last ACK falls within this segment's sequence numbers, record 9628 * its timestamp. NOTE: 1) That the test incorporates suggestions 9629 * from the latest proposal of the tcplw@cray.com list (Braden 9630 * 1993/04/26). 2) That updating only on newer timestamps interferes 9631 * with our earlier PAWS tests, so this check should be solely 9632 * predicated on the sequence space of this segment. 
3) That we 9633 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9634 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9635 * SEG.Len, This modified check allows us to overcome RFC1323's 9636 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9637 * p.869. In such cases, we can still calculate the RTT correctly 9638 * when RCV.NXT == Last.ACK.Sent. 9639 */ 9640 if ((to->to_flags & TOF_TS) != 0 && 9641 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9642 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9643 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9644 tp->ts_recent_age = tcp_ts_getticks(); 9645 tp->ts_recent = to->to_tsval; 9646 } 9647 /* 9648 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9649 * is on (half-synchronized state), then queue data for later 9650 * processing; else drop segment and return. 9651 */ 9652 if ((thflags & TH_ACK) == 0) { 9653 if (tp->t_flags & TF_NEEDSYN) { 9654 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9655 tiwin, thflags, nxt_pkt)); 9656 9657 } else if (tp->t_flags & TF_ACKNOW) { 9658 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9659 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 9660 return (ret_val); 9661 } else { 9662 ctf_do_drop(m, NULL); 9663 return (0); 9664 } 9665 } 9666 /* 9667 * Ack processing. 9668 */ 9669 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9670 return (ret_val); 9671 } 9672 if (sbavail(&so->so_snd)) { 9673 if (ctf_progress_timeout_check(tp, true)) { 9674 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 9675 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9676 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9677 return (1); 9678 } 9679 } 9680 /* State changes only happen in rack_process_data() */ 9681 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9682 tiwin, thflags, nxt_pkt)); 9683 } 9684 9685 /* 9686 * Return value of 1, the TCB is unlocked and most 9687 * likely gone, return value of 0, the TCP is still 9688 * locked. 9689 */ 9690 static int 9691 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9692 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9693 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9694 { 9695 int32_t ret_val = 0; 9696 9697 ctf_calc_rwin(so, tp); 9698 if ((thflags & TH_RST) || 9699 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9700 return (ctf_process_rst(m, th, so, tp)); 9701 /* 9702 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9703 * synchronized state. 9704 */ 9705 if (thflags & TH_SYN) { 9706 ctf_challenge_ack(m, th, tp, &ret_val); 9707 return (ret_val); 9708 } 9709 /* 9710 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9711 * it's less than ts_recent, drop it. 9712 */ 9713 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9714 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9715 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9716 return (ret_val); 9717 } 9718 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9719 return (ret_val); 9720 } 9721 /* 9722 * If last ACK falls within this segment's sequence numbers, record 9723 * its timestamp. NOTE: 1) That the test incorporates suggestions 9724 * from the latest proposal of the tcplw@cray.com list (Braden 9725 * 1993/04/26). 
2) That updating only on newer timestamps interferes 9726 * with our earlier PAWS tests, so this check should be solely 9727 * predicated on the sequence space of this segment. 3) That we 9728 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9729 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9730 * SEG.Len, This modified check allows us to overcome RFC1323's 9731 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9732 * p.869. In such cases, we can still calculate the RTT correctly 9733 * when RCV.NXT == Last.ACK.Sent. 9734 */ 9735 if ((to->to_flags & TOF_TS) != 0 && 9736 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9737 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9738 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9739 tp->ts_recent_age = tcp_ts_getticks(); 9740 tp->ts_recent = to->to_tsval; 9741 } 9742 /* 9743 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9744 * is on (half-synchronized state), then queue data for later 9745 * processing; else drop segment and return. 9746 */ 9747 if ((thflags & TH_ACK) == 0) { 9748 if (tp->t_flags & TF_NEEDSYN) { 9749 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9750 tiwin, thflags, nxt_pkt)); 9751 9752 } else if (tp->t_flags & TF_ACKNOW) { 9753 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9754 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9755 return (ret_val); 9756 } else { 9757 ctf_do_drop(m, NULL); 9758 return (0); 9759 } 9760 } 9761 /* 9762 * Ack processing. 9763 */ 9764 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9765 return (ret_val); 9766 } 9767 if (sbavail(&so->so_snd)) { 9768 if (ctf_progress_timeout_check(tp, true)) { 9769 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9770 tp, tick, PROGRESS_DROP, __LINE__); 9771 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9772 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9773 return (1); 9774 } 9775 } 9776 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9777 tiwin, thflags, nxt_pkt)); 9778 } 9779 9780 static int 9781 rack_check_data_after_close(struct mbuf *m, 9782 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 9783 { 9784 struct tcp_rack *rack; 9785 9786 rack = (struct tcp_rack *)tp->t_fb_ptr; 9787 if (rack->rc_allow_data_af_clo == 0) { 9788 close_now: 9789 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9790 /* tcp_close will kill the inp pre-log the Reset */ 9791 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9792 tp = tcp_close(tp); 9793 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9794 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9795 return (1); 9796 } 9797 if (sbavail(&so->so_snd) == 0) 9798 goto close_now; 9799 /* Ok we allow data that is ignored and a followup reset */ 9800 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9801 tp->rcv_nxt = th->th_seq + *tlen; 9802 tp->t_flags2 |= TF2_DROP_AF_DATA; 9803 rack->r_wanted_output = 1; 9804 *tlen = 0; 9805 return (0); 9806 } 9807 9808 /* 9809 * Return value of 1, the TCB is unlocked and most 9810 * likely gone, return value of 0, the TCP is still 9811 * locked. 
9812 */ 9813 static int 9814 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9815 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9816 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9817 { 9818 int32_t ret_val = 0; 9819 int32_t ourfinisacked = 0; 9820 9821 ctf_calc_rwin(so, tp); 9822 9823 if ((thflags & TH_RST) || 9824 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9825 return (ctf_process_rst(m, th, so, tp)); 9826 /* 9827 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9828 * synchronized state. 9829 */ 9830 if (thflags & TH_SYN) { 9831 ctf_challenge_ack(m, th, tp, &ret_val); 9832 return (ret_val); 9833 } 9834 /* 9835 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9836 * it's less than ts_recent, drop it. 9837 */ 9838 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9839 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9840 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9841 return (ret_val); 9842 } 9843 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9844 return (ret_val); 9845 } 9846 /* 9847 * If new data are received on a connection after the user processes 9848 * are gone, then RST the other end. 9849 */ 9850 if ((so->so_state & SS_NOFDREF) && tlen) { 9851 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9852 return (1); 9853 } 9854 /* 9855 * If last ACK falls within this segment's sequence numbers, record 9856 * its timestamp. NOTE: 1) That the test incorporates suggestions 9857 * from the latest proposal of the tcplw@cray.com list (Braden 9858 * 1993/04/26). 2) That updating only on newer timestamps interferes 9859 * with our earlier PAWS tests, so this check should be solely 9860 * predicated on the sequence space of this segment. 3) That we 9861 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9862 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9863 * SEG.Len, This modified check allows us to overcome RFC1323's 9864 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9865 * p.869. In such cases, we can still calculate the RTT correctly 9866 * when RCV.NXT == Last.ACK.Sent. 9867 */ 9868 if ((to->to_flags & TOF_TS) != 0 && 9869 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9870 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9871 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9872 tp->ts_recent_age = tcp_ts_getticks(); 9873 tp->ts_recent = to->to_tsval; 9874 } 9875 /* 9876 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9877 * is on (half-synchronized state), then queue data for later 9878 * processing; else drop segment and return. 9879 */ 9880 if ((thflags & TH_ACK) == 0) { 9881 if (tp->t_flags & TF_NEEDSYN) { 9882 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9883 tiwin, thflags, nxt_pkt)); 9884 } else if (tp->t_flags & TF_ACKNOW) { 9885 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9886 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9887 return (ret_val); 9888 } else { 9889 ctf_do_drop(m, NULL); 9890 return (0); 9891 } 9892 } 9893 /* 9894 * Ack processing. 9895 */ 9896 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9897 return (ret_val); 9898 } 9899 if (ourfinisacked) { 9900 /* 9901 * If we can't receive any more data, then closing user can 9902 * proceed. Starting the timer is contrary to the 9903 * specification, but if we don't get a FIN we'll hang 9904 * forever. 
9905 * 9906 * XXXjl: we should release the tp also, and use a 9907 * compressed state. 9908 */ 9909 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9910 soisdisconnected(so); 9911 tcp_timer_activate(tp, TT_2MSL, 9912 (tcp_fast_finwait2_recycle ? 9913 tcp_finwait2_timeout : 9914 TP_MAXIDLE(tp))); 9915 } 9916 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9917 } 9918 if (sbavail(&so->so_snd)) { 9919 if (ctf_progress_timeout_check(tp, true)) { 9920 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9921 tp, tick, PROGRESS_DROP, __LINE__); 9922 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9923 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9924 return (1); 9925 } 9926 } 9927 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9928 tiwin, thflags, nxt_pkt)); 9929 } 9930 9931 /* 9932 * Return value of 1, the TCB is unlocked and most 9933 * likely gone, return value of 0, the TCP is still 9934 * locked. 9935 */ 9936 static int 9937 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 9938 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9939 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9940 { 9941 int32_t ret_val = 0; 9942 int32_t ourfinisacked = 0; 9943 9944 ctf_calc_rwin(so, tp); 9945 9946 if ((thflags & TH_RST) || 9947 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9948 return (ctf_process_rst(m, th, so, tp)); 9949 /* 9950 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9951 * synchronized state. 9952 */ 9953 if (thflags & TH_SYN) { 9954 ctf_challenge_ack(m, th, tp, &ret_val); 9955 return (ret_val); 9956 } 9957 /* 9958 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9959 * it's less than ts_recent, drop it. 9960 */ 9961 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9962 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9963 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9964 return (ret_val); 9965 } 9966 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9967 return (ret_val); 9968 } 9969 /* 9970 * If new data are received on a connection after the user processes 9971 * are gone, then RST the other end. 9972 */ 9973 if ((so->so_state & SS_NOFDREF) && tlen) { 9974 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9975 return (1); 9976 } 9977 /* 9978 * If last ACK falls within this segment's sequence numbers, record 9979 * its timestamp. NOTE: 1) That the test incorporates suggestions 9980 * from the latest proposal of the tcplw@cray.com list (Braden 9981 * 1993/04/26). 2) That updating only on newer timestamps interferes 9982 * with our earlier PAWS tests, so this check should be solely 9983 * predicated on the sequence space of this segment. 3) That we 9984 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9985 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9986 * SEG.Len, This modified check allows us to overcome RFC1323's 9987 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9988 * p.869. In such cases, we can still calculate the RTT correctly 9989 * when RCV.NXT == Last.ACK.Sent. 
9990 */ 9991 if ((to->to_flags & TOF_TS) != 0 && 9992 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9993 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9994 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9995 tp->ts_recent_age = tcp_ts_getticks(); 9996 tp->ts_recent = to->to_tsval; 9997 } 9998 /* 9999 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10000 * is on (half-synchronized state), then queue data for later 10001 * processing; else drop segment and return. 10002 */ 10003 if ((thflags & TH_ACK) == 0) { 10004 if (tp->t_flags & TF_NEEDSYN) { 10005 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10006 tiwin, thflags, nxt_pkt)); 10007 } else if (tp->t_flags & TF_ACKNOW) { 10008 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10009 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 10010 return (ret_val); 10011 } else { 10012 ctf_do_drop(m, NULL); 10013 return (0); 10014 } 10015 } 10016 /* 10017 * Ack processing. 10018 */ 10019 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10020 return (ret_val); 10021 } 10022 if (ourfinisacked) { 10023 tcp_twstart(tp); 10024 m_freem(m); 10025 return (1); 10026 } 10027 if (sbavail(&so->so_snd)) { 10028 if (ctf_progress_timeout_check(tp, true)) { 10029 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10030 tp, tick, PROGRESS_DROP, __LINE__); 10031 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10032 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10033 return (1); 10034 } 10035 } 10036 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10037 tiwin, thflags, nxt_pkt)); 10038 } 10039 10040 /* 10041 * Return value of 1, the TCB is unlocked and most 10042 * likely gone, return value of 0, the TCP is still 10043 * locked. 10044 */ 10045 static int 10046 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10047 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10048 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10049 { 10050 int32_t ret_val = 0; 10051 int32_t ourfinisacked = 0; 10052 10053 ctf_calc_rwin(so, tp); 10054 10055 if ((thflags & TH_RST) || 10056 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10057 return (ctf_process_rst(m, th, so, tp)); 10058 /* 10059 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10060 * synchronized state. 10061 */ 10062 if (thflags & TH_SYN) { 10063 ctf_challenge_ack(m, th, tp, &ret_val); 10064 return (ret_val); 10065 } 10066 /* 10067 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10068 * it's less than ts_recent, drop it. 10069 */ 10070 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10071 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10072 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10073 return (ret_val); 10074 } 10075 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10076 return (ret_val); 10077 } 10078 /* 10079 * If new data are received on a connection after the user processes 10080 * are gone, then RST the other end. 10081 */ 10082 if ((so->so_state & SS_NOFDREF) && tlen) { 10083 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10084 return (1); 10085 } 10086 /* 10087 * If last ACK falls within this segment's sequence numbers, record 10088 * its timestamp. NOTE: 1) That the test incorporates suggestions 10089 * from the latest proposal of the tcplw@cray.com list (Braden 10090 * 1993/04/26). 
2) That updating only on newer timestamps interferes 10091 * with our earlier PAWS tests, so this check should be solely 10092 * predicated on the sequence space of this segment. 3) That we 10093 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10094 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10095 * SEG.Len, This modified check allows us to overcome RFC1323's 10096 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10097 * p.869. In such cases, we can still calculate the RTT correctly 10098 * when RCV.NXT == Last.ACK.Sent. 10099 */ 10100 if ((to->to_flags & TOF_TS) != 0 && 10101 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10102 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10103 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10104 tp->ts_recent_age = tcp_ts_getticks(); 10105 tp->ts_recent = to->to_tsval; 10106 } 10107 /* 10108 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10109 * is on (half-synchronized state), then queue data for later 10110 * processing; else drop segment and return. 10111 */ 10112 if ((thflags & TH_ACK) == 0) { 10113 if (tp->t_flags & TF_NEEDSYN) { 10114 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10115 tiwin, thflags, nxt_pkt)); 10116 } else if (tp->t_flags & TF_ACKNOW) { 10117 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10118 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10119 return (ret_val); 10120 } else { 10121 ctf_do_drop(m, NULL); 10122 return (0); 10123 } 10124 } 10125 /* 10126 * case TCPS_LAST_ACK: Ack processing. 10127 */ 10128 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10129 return (ret_val); 10130 } 10131 if (ourfinisacked) { 10132 tp = tcp_close(tp); 10133 ctf_do_drop(m, tp); 10134 return (1); 10135 } 10136 if (sbavail(&so->so_snd)) { 10137 if (ctf_progress_timeout_check(tp, true)) { 10138 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10139 tp, tick, PROGRESS_DROP, __LINE__); 10140 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10141 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10142 return (1); 10143 } 10144 } 10145 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10146 tiwin, thflags, nxt_pkt)); 10147 } 10148 10149 /* 10150 * Return value of 1, the TCB is unlocked and most 10151 * likely gone, return value of 0, the TCP is still 10152 * locked. 10153 */ 10154 static int 10155 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 10156 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10157 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10158 { 10159 int32_t ret_val = 0; 10160 int32_t ourfinisacked = 0; 10161 10162 ctf_calc_rwin(so, tp); 10163 10164 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 10165 if ((thflags & TH_RST) || 10166 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10167 return (ctf_process_rst(m, th, so, tp)); 10168 /* 10169 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10170 * synchronized state. 10171 */ 10172 if (thflags & TH_SYN) { 10173 ctf_challenge_ack(m, th, tp, &ret_val); 10174 return (ret_val); 10175 } 10176 /* 10177 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10178 * it's less than ts_recent, drop it. 
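 * Illustrative case (hypothetical values): ts_recent == 100000 and a
 * stale duplicate arrives carrying tsval == 99990; TSTMP_LT() is true,
 * so the segment is handed to ctf_ts_check() instead of being processed
 * normally.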
10179 */ 10180 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10181 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10182 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10183 return (ret_val); 10184 } 10185 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10186 return (ret_val); 10187 } 10188 /* 10189 * If new data are received on a connection after the user processes 10190 * are gone, then RST the other end. 10191 */ 10192 if ((so->so_state & SS_NOFDREF) && 10193 tlen) { 10194 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10195 return (1); 10196 } 10197 /* 10198 * If last ACK falls within this segment's sequence numbers, record 10199 * its timestamp. NOTE: 1) That the test incorporates suggestions 10200 * from the latest proposal of the tcplw@cray.com list (Braden 10201 * 1993/04/26). 2) That updating only on newer timestamps interferes 10202 * with our earlier PAWS tests, so this check should be solely 10203 * predicated on the sequence space of this segment. 3) That we 10204 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10205 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10206 * SEG.Len, This modified check allows us to overcome RFC1323's 10207 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10208 * p.869. In such cases, we can still calculate the RTT correctly 10209 * when RCV.NXT == Last.ACK.Sent. 10210 */ 10211 if ((to->to_flags & TOF_TS) != 0 && 10212 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10213 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10214 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10215 tp->ts_recent_age = tcp_ts_getticks(); 10216 tp->ts_recent = to->to_tsval; 10217 } 10218 /* 10219 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10220 * is on (half-synchronized state), then queue data for later 10221 * processing; else drop segment and return. 10222 */ 10223 if ((thflags & TH_ACK) == 0) { 10224 if (tp->t_flags & TF_NEEDSYN) { 10225 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10226 tiwin, thflags, nxt_pkt)); 10227 } else if (tp->t_flags & TF_ACKNOW) { 10228 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10229 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10230 return (ret_val); 10231 } else { 10232 ctf_do_drop(m, NULL); 10233 return (0); 10234 } 10235 } 10236 /* 10237 * Ack processing. 
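* Unlike the CLOSING and LAST-ACK handlers above, ourfinisacked is not
* consulted here: our FIN was already acknowledged when the connection
* entered FIN-WAIT-2, so only window updates and data from the peer are
* left to process.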
10238 */ 10239 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10240 return (ret_val); 10241 } 10242 if (sbavail(&so->so_snd)) { 10243 if (ctf_progress_timeout_check(tp, true)) { 10244 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10245 tp, tick, PROGRESS_DROP, __LINE__); 10246 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10247 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10248 return (1); 10249 } 10250 } 10251 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10252 tiwin, thflags, nxt_pkt)); 10253 } 10254 10255 static void inline 10256 rack_clear_rate_sample(struct tcp_rack *rack) 10257 { 10258 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 10259 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 10260 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 10261 } 10262 10263 static void 10264 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) 10265 { 10266 uint64_t bw_est, rate_wanted; 10267 int chged = 0; 10268 uint32_t user_max; 10269 10270 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 10271 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 10272 chged = 1; 10273 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 10274 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 10275 if (user_max != rack->r_ctl.rc_pace_max_segs) 10276 chged = 1; 10277 } 10278 if (rack->rc_force_max_seg) { 10279 rack->r_ctl.rc_pace_max_segs = user_max; 10280 } else if (rack->use_fixed_rate) { 10281 bw_est = rack_get_bw(rack); 10282 if ((rack->r_ctl.crte == NULL) || 10283 (bw_est != rack->r_ctl.crte->rate)) { 10284 rack->r_ctl.rc_pace_max_segs = user_max; 10285 } else { 10286 /* We are pacing right at the hardware rate */ 10287 uint32_t segsiz; 10288 10289 segsiz = min(ctf_fixed_maxseg(tp), 10290 rack->r_ctl.rc_pace_min_segs); 10291 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 10292 tp, bw_est, segsiz, 0, 10293 rack->r_ctl.crte, NULL); 10294 } 10295 } else if (rack->rc_always_pace) { 10296 if (rack->r_ctl.gp_bw || 10297 #ifdef NETFLIX_PEAKRATE 10298 rack->rc_tp->t_maxpeakrate || 10299 #endif 10300 rack->r_ctl.init_rate) { 10301 /* We have a rate of some sort set */ 10302 uint32_t orig; 10303 10304 bw_est = rack_get_bw(rack); 10305 orig = rack->r_ctl.rc_pace_max_segs; 10306 rate_wanted = rack_get_output_bw(rack, bw_est, NULL); 10307 if (rate_wanted) { 10308 /* We have something */ 10309 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 10310 rate_wanted, 10311 ctf_fixed_maxseg(rack->rc_tp)); 10312 } else 10313 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 10314 if (orig != rack->r_ctl.rc_pace_max_segs) 10315 chged = 1; 10316 } else if ((rack->r_ctl.gp_bw == 0) && 10317 (rack->r_ctl.rc_pace_max_segs == 0)) { 10318 /* 10319 * If we have nothing limit us to bursting 10320 * out IW sized pieces. 10321 */ 10322 chged = 1; 10323 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 10324 } 10325 } 10326 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 10327 chged = 1; 10328 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 10329 } 10330 if (chged) 10331 rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); 10332 } 10333 10334 static int 10335 rack_init(struct tcpcb *tp) 10336 { 10337 struct tcp_rack *rack = NULL; 10338 struct rack_sendmap *insret; 10339 uint32_t iwin, snt, us_cts; 10340 10341 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 10342 if (tp->t_fb_ptr == NULL) { 10343 /* 10344 * We need to allocate memory but cant. 
The INP and INP_INFO 10345 * locks and they are recusive (happens during setup. So a 10346 * scheme to drop the locks fails :( 10347 * 10348 */ 10349 return (ENOMEM); 10350 } 10351 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 10352 10353 rack = (struct tcp_rack *)tp->t_fb_ptr; 10354 RB_INIT(&rack->r_ctl.rc_mtree); 10355 TAILQ_INIT(&rack->r_ctl.rc_free); 10356 TAILQ_INIT(&rack->r_ctl.rc_tmap); 10357 rack->rc_tp = tp; 10358 if (tp->t_inpcb) { 10359 rack->rc_inp = tp->t_inpcb; 10360 } 10361 /* Probably not needed but lets be sure */ 10362 rack_clear_rate_sample(rack); 10363 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 10364 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 10365 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 10366 if (use_rack_rr) 10367 rack->use_rack_rr = 1; 10368 if (V_tcp_delack_enabled) 10369 tp->t_delayed_ack = 1; 10370 else 10371 tp->t_delayed_ack = 0; 10372 if (rack_enable_shared_cwnd) 10373 rack->rack_enable_scwnd = 1; 10374 rack->rc_user_set_max_segs = rack_hptsi_segments; 10375 rack->rc_force_max_seg = 0; 10376 if (rack_use_imac_dack) 10377 rack->rc_dack_mode = 1; 10378 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 10379 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 10380 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 10381 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 10382 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 10383 rack->r_ctl.rc_early_recovery = rack_early_recovery; 10384 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 10385 rack->r_ctl.rc_highest_us_rtt = 0; 10386 if (rack_disable_prr) 10387 rack->rack_no_prr = 1; 10388 if (rack_gp_no_rec_chg) 10389 rack->rc_gp_no_rec_chg = 1; 10390 rack->rc_always_pace = rack_pace_every_seg; 10391 if (rack_enable_mqueue_for_nonpaced) 10392 rack->r_mbuf_queue = 1; 10393 else 10394 rack->r_mbuf_queue = 0; 10395 if (rack->r_mbuf_queue || rack->rc_always_pace) 10396 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 10397 else 10398 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10399 rack_set_pace_segments(tp, rack, __LINE__); 10400 if (rack_limits_scwnd) 10401 rack->r_limit_scw = 1; 10402 else 10403 rack->r_limit_scw = 0; 10404 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 10405 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 10406 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 10407 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 10408 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 10409 rack->r_ctl.rc_min_to = rack_min_to; 10410 microuptime(&rack->r_ctl.act_rcv_time); 10411 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10412 rack->r_running_late = 0; 10413 rack->r_running_early = 0; 10414 rack->rc_init_win = rack_default_init_window; 10415 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 10416 if (rack_do_dyn_mul) { 10417 /* When dynamic adjustment is on CA needs to start at 100% */ 10418 rack->rc_gp_dyn_mul = 1; 10419 if (rack_do_dyn_mul >= 100) 10420 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 10421 } else 10422 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 10423 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 10424 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 10425 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 10426 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 10427 rack_probertt_filter_life); 10428 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10429 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 10430 rack->r_ctl.rc_time_of_last_probertt = us_cts; 
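/*
 * The fields initialized here seed the probe-RTT machinery: rc_gp_min_rtt
 * is a time-filtered minimum of the measured RTT (its window being
 * rack_probertt_filter_life), while rc_lower_rtt_us_cts and
 * rc_time_of_last_probertt record, in microsecond ticks, when a lower
 * RTT was last observed and when we last probed, so that
 * rack_check_probe_rtt() can later judge when the next probe-RTT
 * episode is due.
 */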
10431 rack->r_ctl.rc_time_probertt_starts = 0; 10432 /* Do we force on detection? */ 10433 #ifdef NETFLIX_EXP_DETECTION 10434 if (tcp_force_detection) 10435 rack->do_detection = 1; 10436 else 10437 #endif 10438 rack->do_detection = 0; 10439 if (rack_non_rxt_use_cr) 10440 rack->rack_rec_nonrxt_use_cr = 1; 10441 if (tp->snd_una != tp->snd_max) { 10442 /* Create a send map for the current outstanding data */ 10443 struct rack_sendmap *rsm; 10444 10445 rsm = rack_alloc(rack); 10446 if (rsm == NULL) { 10447 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10448 tp->t_fb_ptr = NULL; 10449 return (ENOMEM); 10450 } 10451 rsm->r_flags = RACK_OVERMAX; 10452 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 10453 rsm->r_rtr_cnt = 1; 10454 rsm->r_rtr_bytes = 0; 10455 rsm->r_start = tp->snd_una; 10456 if (tp->t_flags & TF_SENTFIN) { 10457 rsm->r_end = tp->snd_max - 1; 10458 rsm->r_flags |= RACK_HAS_FIN; 10459 } else { 10460 rsm->r_end = tp->snd_max; 10461 } 10462 rsm->usec_orig_send = us_cts; 10463 rsm->r_dupack = 0; 10464 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10465 #ifdef INVARIANTS 10466 if (insret != NULL) { 10467 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 10468 insret, rack, rsm); 10469 } 10470 #endif 10471 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10472 rsm->r_in_tmap = 1; 10473 } 10474 /* Cancel the GP measurement in progress */ 10475 tp->t_flags &= ~TF_GPUTINPROG; 10476 if (SEQ_GT(tp->snd_max, tp->iss)) 10477 snt = tp->snd_max - tp->iss; 10478 else 10479 snt = 0; 10480 iwin = rc_init_window(rack); 10481 if (snt < iwin) { 10482 /* We are not past the initial window 10483 * so we need to make sure cwnd is 10484 * correct. 10485 */ 10486 if (tp->snd_cwnd < iwin) 10487 tp->snd_cwnd = iwin; 10488 /* 10489 * If we are within the initial window 10490 * we want ssthresh to be unlimited. Setting 10491 * it to the rwnd (which the default stack does 10492 * and older racks) is not really a good idea 10493 * since we want to be in SS and grow both the 10494 * cwnd and the rwnd (via dynamic rwnd growth). If 10495 * we set it to the rwnd then as the peer grows its 10496 * rwnd we will be stuck in CA and never hit SS. 10497 * 10498 * Its far better to raise it up high (this takes the 10499 * risk that there as been a loss already, probably 10500 * we should have an indicator in all stacks of loss 10501 * but we don't), but considering the normal use this 10502 * is a risk worth taking. The consequences of not 10503 * hitting SS are far worse than going one more time 10504 * into it early on (before we have sent even a IW). 10505 * It is highly unlikely that we will have had a loss 10506 * before getting the IW out. 10507 */ 10508 tp->snd_ssthresh = 0xffffffff; 10509 } 10510 rack_stop_all_timers(tp); 10511 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10512 rack_log_rtt_shrinks(rack, us_cts, 0, 10513 __LINE__, RACK_RTTS_INIT); 10514 return (0); 10515 } 10516 10517 static int 10518 rack_handoff_ok(struct tcpcb *tp) 10519 { 10520 if ((tp->t_state == TCPS_CLOSED) || 10521 (tp->t_state == TCPS_LISTEN)) { 10522 /* Sure no problem though it may not stick */ 10523 return (0); 10524 } 10525 if ((tp->t_state == TCPS_SYN_SENT) || 10526 (tp->t_state == TCPS_SYN_RECEIVED)) { 10527 /* 10528 * We really don't know if you support sack, 10529 * you have to get to ESTAB or beyond to tell. 
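* The peer's SACK permission rides in the SYN options, which may still
* be sitting in the syncache at this point, so the hand-off is deferred
* until the handshake completes and TF_SACK_PERMIT is authoritative.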
10530 */ 10531 return (EAGAIN); 10532 } 10533 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 10534 /* 10535 * Rack will only send a FIN after all data is acknowledged. 10536 * So in this case we have more data outstanding. We can't 10537 * switch stacks until either all data and only the FIN 10538 * is left (in which case rack_init() now knows how 10539 * to deal with that) <or> all is acknowledged and we 10540 * are only left with incoming data, though why you 10541 * would want to switch to rack after all data is acknowledged 10542 * I have no idea (rrs)! 10543 */ 10544 return (EAGAIN); 10545 } 10546 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 10547 return (0); 10548 } 10549 /* 10550 * If we reach here we don't do SACK on this connection so we can 10551 * never do rack. 10552 */ 10553 return (EINVAL); 10554 } 10555 10556 static void 10557 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10558 { 10559 if (tp->t_fb_ptr) { 10560 struct tcp_rack *rack; 10561 struct rack_sendmap *rsm, *nrsm, *rm; 10562 10563 rack = (struct tcp_rack *)tp->t_fb_ptr; 10564 #ifdef NETFLIX_SHARED_CWND 10565 if (rack->r_ctl.rc_scw) { 10566 uint32_t limit; 10567 10568 if (rack->r_limit_scw) 10569 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 10570 else 10571 limit = 0; 10572 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 10573 rack->r_ctl.rc_scw_index, 10574 limit); 10575 rack->r_ctl.rc_scw = NULL; 10576 } 10577 #endif 10578 /* rack does not use force data but other stacks may clear it */ 10579 tp->t_flags &= ~TF_FORCEDATA; 10580 if (tp->t_inpcb) { 10581 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10582 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10583 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 10584 } 10585 #ifdef TCP_BLACKBOX 10586 tcp_log_flowend(tp); 10587 #endif 10588 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 10589 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10590 #ifdef INVARIANTS 10591 if (rm != rsm) { 10592 panic("At fini, rack:%p rsm:%p rm:%p", 10593 rack, rsm, rm); 10594 } 10595 #endif 10596 uma_zfree(rack_zone, rsm); 10597 } 10598 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10599 while (rsm) { 10600 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 10601 uma_zfree(rack_zone, rsm); 10602 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10603 } 10604 rack->rc_free_cnt = 0; 10605 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10606 tp->t_fb_ptr = NULL; 10607 } 10608 /* Cancel the GP measurement in progress */ 10609 tp->t_flags &= ~TF_GPUTINPROG; 10610 /* Make sure snd_nxt is correctly set */ 10611 tp->snd_nxt = tp->snd_max; 10612 } 10613 10614 static void 10615 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 10616 { 10617 switch (tp->t_state) { 10618 case TCPS_SYN_SENT: 10619 rack->r_state = TCPS_SYN_SENT; 10620 rack->r_substate = rack_do_syn_sent; 10621 break; 10622 case TCPS_SYN_RECEIVED: 10623 rack->r_state = TCPS_SYN_RECEIVED; 10624 rack->r_substate = rack_do_syn_recv; 10625 break; 10626 case TCPS_ESTABLISHED: 10627 rack_set_pace_segments(tp, rack, __LINE__); 10628 rack->r_state = TCPS_ESTABLISHED; 10629 rack->r_substate = rack_do_established; 10630 break; 10631 case TCPS_CLOSE_WAIT: 10632 rack->r_state = TCPS_CLOSE_WAIT; 10633 rack->r_substate = rack_do_close_wait; 10634 break; 10635 case TCPS_FIN_WAIT_1: 10636 rack->r_state = TCPS_FIN_WAIT_1; 10637 rack->r_substate = rack_do_fin_wait_1; 10638 break; 10639 case TCPS_CLOSING: 10640 rack->r_state = TCPS_CLOSING; 10641 rack->r_substate = rack_do_closing; 
10642 break; 10643 case TCPS_LAST_ACK: 10644 rack->r_state = TCPS_LAST_ACK; 10645 rack->r_substate = rack_do_lastack; 10646 break; 10647 case TCPS_FIN_WAIT_2: 10648 rack->r_state = TCPS_FIN_WAIT_2; 10649 rack->r_substate = rack_do_fin_wait_2; 10650 break; 10651 case TCPS_LISTEN: 10652 case TCPS_CLOSED: 10653 case TCPS_TIME_WAIT: 10654 default: 10655 break; 10656 }; 10657 } 10658 10659 static void 10660 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 10661 { 10662 /* 10663 * We received an ack, and then did not 10664 * call send or were bounced out due to the 10665 * hpts was running. Now a timer is up as well, is 10666 * it the right timer? 10667 */ 10668 struct rack_sendmap *rsm; 10669 int tmr_up; 10670 10671 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 10672 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 10673 return; 10674 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10675 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 10676 (tmr_up == PACE_TMR_RXT)) { 10677 /* Should be an RXT */ 10678 return; 10679 } 10680 if (rsm == NULL) { 10681 /* Nothing outstanding? */ 10682 if (tp->t_flags & TF_DELACK) { 10683 if (tmr_up == PACE_TMR_DELACK) 10684 /* We are supposed to have delayed ack up and we do */ 10685 return; 10686 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 10687 /* 10688 * if we hit enobufs then we would expect the possiblity 10689 * of nothing outstanding and the RXT up (and the hptsi timer). 10690 */ 10691 return; 10692 } else if (((V_tcp_always_keepalive || 10693 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 10694 (tp->t_state <= TCPS_CLOSING)) && 10695 (tmr_up == PACE_TMR_KEEP) && 10696 (tp->snd_max == tp->snd_una)) { 10697 /* We should have keep alive up and we do */ 10698 return; 10699 } 10700 } 10701 if (SEQ_GT(tp->snd_max, tp->snd_una) && 10702 ((tmr_up == PACE_TMR_TLP) || 10703 (tmr_up == PACE_TMR_RACK) || 10704 (tmr_up == PACE_TMR_RXT))) { 10705 /* 10706 * Either a Rack, TLP or RXT is fine if we 10707 * have outstanding data. 10708 */ 10709 return; 10710 } else if (tmr_up == PACE_TMR_DELACK) { 10711 /* 10712 * If the delayed ack was going to go off 10713 * before the rtx/tlp/rack timer were going to 10714 * expire, then that would be the timer in control. 10715 * Note we don't check the time here trusting the 10716 * code is correct. 10717 */ 10718 return; 10719 } 10720 /* 10721 * Ok the timer originally started is not what we want now. 10722 * We will force the hpts to be stopped if any, and restart 10723 * with the slot set to what was in the saved slot. 
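* If the stale hpts entry was a pacer deadline (PACE_PKT_OUTPUT) that
* has not yet expired, the unspent time is credited to rc_agg_early
* before the entry is removed; rack_timer_cancel() and
* rack_start_hpts_timer() then re-arm whatever timer is appropriate now.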
10724 */
10725 if (rack->rc_inp->inp_in_hpts) {
10726 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
10727 uint32_t us_cts;
10728
10729 us_cts = tcp_get_usecs(NULL);
10730 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
10731 rack->r_early = 1;
10732 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
10733 }
10734 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
10735 }
10736 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
10737 }
10738 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10739 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0);
10740 }
10741
10742 static int
10743 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
10744 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
10745 int32_t nxt_pkt, struct timeval *tv)
10746 {
10747 int32_t thflags, retval, did_out = 0;
10748 int32_t way_out = 0;
10749 uint32_t cts;
10750 uint32_t tiwin;
10751 struct timespec ts;
10752 struct tcpopt to;
10753 struct tcp_rack *rack;
10754 struct rack_sendmap *rsm;
10755 int32_t prev_state = 0;
10756 uint32_t us_cts;
10757 /*
10758 * tv passed from common code is from either M_TSTMP_LRO or
10759 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The
10760 * rack_pacing stack assumes tv always refers to 'now', so we overwrite
10761 * tv here to guarantee that.
10762 */
10763 if (m->m_flags & M_TSTMP_LRO)
10764 tcp_get_usecs(tv);
10765
10766 cts = tcp_tv_to_mssectick(tv);
10767 rack = (struct tcp_rack *)tp->t_fb_ptr;
10768
10769 if ((m->m_flags & M_TSTMP) ||
10770 (m->m_flags & M_TSTMP_LRO)) {
10771 mbuf_tstmp2timespec(m, &ts);
10772 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
10773 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
10774 } else
10775 rack->r_ctl.act_rcv_time = *tv;
10776 kern_prefetch(rack, &prev_state);
10777 prev_state = 0;
10778 thflags = th->th_flags;
10779
10780 NET_EPOCH_ASSERT();
10781 INP_WLOCK_ASSERT(tp->t_inpcb);
10782 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
10783 __func__));
10784 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
10785 __func__));
10786 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
10787 union tcp_log_stackspecific log;
10788 struct timeval ltv;
10789 #ifdef NETFLIX_HTTP_LOGGING
10790 struct http_sendfile_track *http_req;
10791
10792 if (SEQ_GT(th->th_ack, tp->snd_una)) {
10793 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
10794 } else {
10795 http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
10796 }
10797 #endif
10798 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
10799 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
10800 log.u_bbr.ininput = rack->rc_inp->inp_in_input;
10801 if (rack->rack_no_prr == 0)
10802 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
10803 else
10804 log.u_bbr.flex1 = 0;
10805 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
10806 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
10807 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
10808 log.u_bbr.flex3 = m->m_flags;
10809 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
10810 if (m->m_flags & M_TSTMP) {
10811 /* Record the hardware timestamp if present */
10812 mbuf_tstmp2timespec(m, &ts);
10813 ltv.tv_sec = ts.tv_sec;
10814 ltv.tv_usec = ts.tv_nsec / 1000;
10815 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
10816 } else if (m->m_flags & M_TSTMP_LRO) {
10817 /* Record the LRO arrival timestamp */
10818 mbuf_tstmp2timespec(m, &ts);
10819 ltv.tv_sec = ts.tv_sec;
10820 ltv.tv_usec = ts.tv_nsec / 1000;
10821 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
10822 }
10823 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
10824 /* Log the rcv time */
10825 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
10826 #ifdef NETFLIX_HTTP_LOGGING
10827 log.u_bbr.applimited = tp->t_http_closed;
10828 log.u_bbr.applimited <<= 8;
10829 log.u_bbr.applimited |= tp->t_http_open;
10830 log.u_bbr.applimited <<= 8;
10831 log.u_bbr.applimited |= tp->t_http_req;
10832 if (http_req) {
10833 /* Copy out any client req info */
10834 /* seconds */
10835 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
10836 /* useconds */
10837 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
10838 log.u_bbr.rttProp = http_req->timestamp;
10839 log.u_bbr.cur_del_rate = http_req->start;
10840 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
10841 log.u_bbr.flex8 |= 1;
10842 } else {
10843 log.u_bbr.flex8 |= 2;
10844 log.u_bbr.bw_inuse = http_req->end;
10845 }
10846 log.u_bbr.flex6 = http_req->start_seq;
10847 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
10848 log.u_bbr.flex8 |= 4;
10849 log.u_bbr.epoch = http_req->end_seq;
10850 }
10851 }
10852 #endif
10853 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
10854 tlen, &log, true, &ltv);
10855 }
10856 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
10857 way_out = 4;
10858 retval = 0;
10859 goto done_with_input;
10860 }
10861 /*
10862 * If a segment with the ACK-bit set arrives in the SYN-SENT state
10863 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
10864 */
10865 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
10866 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
10867 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
10868 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
10869 return (1);
10870 }
10871
10872 /*
10873 * Parse options on any incoming segment.
10874 */
10875 tcp_dooptions(&to, (u_char *)(th + 1),
10876 (th->th_off << 2) - sizeof(struct tcphdr),
10877 (thflags & TH_SYN) ? TO_SYN : 0);
10878
10879 /*
10880 * If timestamps were negotiated during SYN/ACK and a
10881 * segment without a timestamp is received, silently drop
10882 * the segment, unless it is a RST segment or missing timestamps are
10883 * tolerated.
10884 * See section 3.2 of RFC 7323.
10885 */
10886 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
10887 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
10888 way_out = 5;
10889 retval = 0;
10890 goto done_with_input;
10891 }
10892
10893 /*
10894 * Segment received on connection. Reset idle time and keep-alive
10895 * timer. XXX: This should be done after segment validation to
10896 * ignore broken/spoofed segs.
10897 */
10898 if (tp->t_idle_reduce &&
10899 (tp->snd_max == tp->snd_una) &&
10900 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
10901 counter_u64_add(rack_input_idle_reduces, 1);
10902 rack_cc_after_idle(rack, tp);
10903 }
10904 tp->t_rcvtime = ticks;
10905 /*
10906 * Unscale the window into a 32-bit value. For the SYN_SENT state
10907 * the scale is zero.
10908 */
10909 tiwin = th->th_win << tp->snd_scale;
10910 #ifdef STATS
10911 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
10912 #endif
10913 if (tiwin > rack->r_ctl.rc_high_rwnd)
10914 rack->r_ctl.rc_high_rwnd = tiwin;
10915 /*
10916 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
10917 * this to occur after we've validated the segment.
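* The handling below mirrors the base stack: an incoming CWR stops us
* from echoing ECE, a CE codepoint in the IP header arms ECE on the next
* ACK we send, and an ECE from the peer is handed to the congestion
* control module as a congestion signal via rack_cong_signal(CC_ECN).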
10918 */ 10919 if (tp->t_flags2 & TF2_ECN_PERMIT) { 10920 if (thflags & TH_CWR) { 10921 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 10922 tp->t_flags |= TF_ACKNOW; 10923 } 10924 switch (iptos & IPTOS_ECN_MASK) { 10925 case IPTOS_ECN_CE: 10926 tp->t_flags2 |= TF2_ECN_SND_ECE; 10927 KMOD_TCPSTAT_INC(tcps_ecn_ce); 10928 break; 10929 case IPTOS_ECN_ECT0: 10930 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 10931 break; 10932 case IPTOS_ECN_ECT1: 10933 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 10934 break; 10935 } 10936 10937 /* Process a packet differently from RFC3168. */ 10938 cc_ecnpkt_handler(tp, th, iptos); 10939 10940 /* Congestion experienced. */ 10941 if (thflags & TH_ECE) { 10942 rack_cong_signal(tp, th, CC_ECN); 10943 } 10944 } 10945 10946 /* 10947 * If echoed timestamp is later than the current time, fall back to 10948 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 10949 * were used when this connection was established. 10950 */ 10951 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 10952 to.to_tsecr -= tp->ts_offset; 10953 if (TSTMP_GT(to.to_tsecr, cts)) 10954 to.to_tsecr = 0; 10955 } 10956 10957 /* 10958 * If its the first time in we need to take care of options and 10959 * verify we can do SACK for rack! 10960 */ 10961 if (rack->r_state == 0) { 10962 /* Should be init'd by rack_init() */ 10963 KASSERT(rack->rc_inp != NULL, 10964 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 10965 if (rack->rc_inp == NULL) { 10966 rack->rc_inp = tp->t_inpcb; 10967 } 10968 10969 /* 10970 * Process options only when we get SYN/ACK back. The SYN 10971 * case for incoming connections is handled in tcp_syncache. 10972 * According to RFC1323 the window field in a SYN (i.e., a 10973 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 10974 * this is traditional behavior, may need to be cleaned up. 10975 */ 10976 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 10977 /* Handle parallel SYN for ECN */ 10978 if (!(thflags & TH_ACK) && 10979 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 10980 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 10981 tp->t_flags2 |= TF2_ECN_PERMIT; 10982 tp->t_flags2 |= TF2_ECN_SND_ECE; 10983 TCPSTAT_INC(tcps_ecn_shs); 10984 } 10985 if ((to.to_flags & TOF_SCALE) && 10986 (tp->t_flags & TF_REQ_SCALE)) { 10987 tp->t_flags |= TF_RCVD_SCALE; 10988 tp->snd_scale = to.to_wscale; 10989 } else 10990 tp->t_flags &= ~TF_REQ_SCALE; 10991 /* 10992 * Initial send window. It will be updated with the 10993 * next incoming segment to the scaled value. 10994 */ 10995 tp->snd_wnd = th->th_win; 10996 if ((to.to_flags & TOF_TS) && 10997 (tp->t_flags & TF_REQ_TSTMP)) { 10998 tp->t_flags |= TF_RCVD_TSTMP; 10999 tp->ts_recent = to.to_tsval; 11000 tp->ts_recent_age = cts; 11001 } else 11002 tp->t_flags &= ~TF_REQ_TSTMP; 11003 if (to.to_flags & TOF_MSS) 11004 tcp_mss(tp, to.to_mss); 11005 if ((tp->t_flags & TF_SACK_PERMIT) && 11006 (to.to_flags & TOF_SACKPERM) == 0) 11007 tp->t_flags &= ~TF_SACK_PERMIT; 11008 if (IS_FASTOPEN(tp->t_flags)) { 11009 if (to.to_flags & TOF_FASTOPEN) { 11010 uint16_t mss; 11011 11012 if (to.to_flags & TOF_MSS) 11013 mss = to.to_mss; 11014 else 11015 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 11016 mss = TCP6_MSS; 11017 else 11018 mss = TCP_MSS; 11019 tcp_fastopen_update_cache(tp, mss, 11020 to.to_tfo_len, to.to_tfo_cookie); 11021 } else 11022 tcp_fastopen_disable_path(tp); 11023 } 11024 } 11025 /* 11026 * At this point we are at the initial call. Here we decide 11027 * if we are doing RACK or not. 
We do this by seeing if 11028 * TF_SACK_PERMIT is set and the sack-not-required is clear. 11029 * The code now does do dup-ack counting so if you don't 11030 * switch back you won't get rack & TLP, but you will still 11031 * get this stack. 11032 */ 11033 11034 if ((rack_sack_not_required == 0) && 11035 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 11036 tcp_switch_back_to_default(tp); 11037 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11038 tlen, iptos); 11039 return (1); 11040 } 11041 /* Set the flag */ 11042 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11043 tcp_set_hpts(tp->t_inpcb); 11044 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 11045 } 11046 if (thflags & TH_FIN) 11047 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11048 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11049 if ((rack->rc_gp_dyn_mul) && 11050 (rack->use_fixed_rate == 0) && 11051 (rack->rc_always_pace)) { 11052 /* Check in on probertt */ 11053 rack_check_probe_rtt(rack, us_cts); 11054 } 11055 if (rack->forced_ack) { 11056 uint32_t us_rtt; 11057 11058 /* 11059 * A persist or keep-alive was forced out, update our 11060 * min rtt time. Note we do not worry about lost 11061 * retransmissions since KEEP-ALIVES and persists 11062 * are usually way long on times of sending (though 11063 * if we were really paranoid or worried we could 11064 * at least use timestamps if available to validate). 11065 */ 11066 rack->forced_ack = 0; 11067 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 11068 if (us_rtt == 0) 11069 us_rtt = 1; 11070 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 11071 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 11072 } 11073 /* 11074 * This is the one exception case where we set the rack state 11075 * always. All other times (timers etc) we must have a rack-state 11076 * set (so we assure we have done the checks above for SACK). 11077 */ 11078 rack->r_ctl.rc_rcvtime = cts; 11079 if (rack->r_state != tp->t_state) 11080 rack_set_state(tp, rack); 11081 if (SEQ_GT(th->th_ack, tp->snd_una) && 11082 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 11083 kern_prefetch(rsm, &prev_state); 11084 prev_state = rack->r_state; 11085 rack_clear_rate_sample(rack); 11086 retval = (*rack->r_substate) (m, th, so, 11087 tp, &to, drop_hdrlen, 11088 tlen, tiwin, thflags, nxt_pkt, iptos); 11089 #ifdef INVARIANTS 11090 if ((retval == 0) && 11091 (tp->t_inpcb == NULL)) { 11092 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11093 retval, tp, prev_state); 11094 } 11095 #endif 11096 if (retval == 0) { 11097 /* 11098 * If retval is 1 the tcb is unlocked and most likely the tp 11099 * is gone. 11100 */ 11101 INP_WLOCK_ASSERT(tp->t_inpcb); 11102 if ((rack->rc_gp_dyn_mul) && 11103 (rack->rc_always_pace) && 11104 (rack->use_fixed_rate == 0) && 11105 rack->in_probe_rtt && 11106 (rack->r_ctl.rc_time_probertt_starts == 0)) { 11107 /* 11108 * If we are going for target, lets recheck before 11109 * we output. 11110 */ 11111 rack_check_probe_rtt(rack, us_cts); 11112 } 11113 if (rack->set_pacing_done_a_iw == 0) { 11114 /* How much has been acked? 
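* As a rough example, with a 1448 byte maximum segment this re-runs
* rack_set_pace_segments() once a bit more than 14480 bytes (ten full
* segments) past the ISS have been cumulatively acked, i.e. once the
* initial window is clearly behind us.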
*/ 11115 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 11116 /* We have enough to set in the pacing segment size */ 11117 rack->set_pacing_done_a_iw = 1; 11118 rack_set_pace_segments(tp, rack, __LINE__); 11119 } 11120 } 11121 tcp_rack_xmit_timer_commit(rack, tp); 11122 if (nxt_pkt == 0) { 11123 if (rack->r_wanted_output != 0) { 11124 do_output_now: 11125 did_out = 1; 11126 (void)tp->t_fb->tfb_tcp_output(tp); 11127 } 11128 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 11129 } 11130 if ((nxt_pkt == 0) && 11131 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11132 (SEQ_GT(tp->snd_max, tp->snd_una) || 11133 (tp->t_flags & TF_DELACK) || 11134 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11135 (tp->t_state <= TCPS_CLOSING)))) { 11136 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 11137 if ((tp->snd_max == tp->snd_una) && 11138 ((tp->t_flags & TF_DELACK) == 0) && 11139 (rack->rc_inp->inp_in_hpts) && 11140 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11141 /* keep alive not needed if we are hptsi output yet */ 11142 ; 11143 } else { 11144 int late = 0; 11145 if (rack->rc_inp->inp_in_hpts) { 11146 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 11147 us_cts = tcp_get_usecs(NULL); 11148 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11149 rack->r_early = 1; 11150 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 11151 } else 11152 late = 1; 11153 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11154 } 11155 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11156 } 11157 if (late && (did_out == 0)) { 11158 /* 11159 * We are late in the sending 11160 * and we did not call the output 11161 * (this probably should not happen). 11162 */ 11163 goto do_output_now; 11164 } 11165 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 11166 } 11167 way_out = 1; 11168 } else if (nxt_pkt == 0) { 11169 /* Do we have the correct timer running? */ 11170 rack_timer_audit(tp, rack, &so->so_snd); 11171 way_out = 2; 11172 } 11173 done_with_input: 11174 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 11175 if (did_out) 11176 rack->r_wanted_output = 0; 11177 #ifdef INVARIANTS 11178 if (tp->t_inpcb == NULL) { 11179 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11180 did_out, 11181 retval, tp, prev_state); 11182 } 11183 #endif 11184 } 11185 return (retval); 11186 } 11187 11188 void 11189 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11190 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11191 { 11192 struct timeval tv; 11193 11194 /* First lets see if we have old packets */ 11195 if (tp->t_in_pkt) { 11196 if (ctf_do_queued_segments(so, tp, 1)) { 11197 m_freem(m); 11198 return; 11199 } 11200 } 11201 if (m->m_flags & M_TSTMP_LRO) { 11202 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 11203 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 11204 } else { 11205 /* Should not be should we kassert instead? 
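* (Input is normally expected to arrive through LRO with an arrival
* timestamp already attached; when it does not, sampling the clock here
* at worst makes this packet appear slightly newer than it really is.)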
*/ 11206 tcp_get_usecs(&tv); 11207 } 11208 if(rack_do_segment_nounlock(m, th, so, tp, 11209 drop_hdrlen, tlen, iptos, 0, &tv) == 0) { 11210 tcp_handle_wakeup(tp, so); 11211 INP_WUNLOCK(tp->t_inpcb); 11212 } 11213 } 11214 11215 struct rack_sendmap * 11216 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 11217 { 11218 struct rack_sendmap *rsm = NULL; 11219 int32_t idx; 11220 uint32_t srtt = 0, thresh = 0, ts_low = 0; 11221 11222 /* Return the next guy to be re-transmitted */ 11223 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 11224 return (NULL); 11225 } 11226 if (tp->t_flags & TF_SENTFIN) { 11227 /* retran the end FIN? */ 11228 return (NULL); 11229 } 11230 /* ok lets look at this one */ 11231 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11232 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 11233 goto check_it; 11234 } 11235 rsm = rack_find_lowest_rsm(rack); 11236 if (rsm == NULL) { 11237 return (NULL); 11238 } 11239 check_it: 11240 if (rsm->r_flags & RACK_ACKED) { 11241 return (NULL); 11242 } 11243 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 11244 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 11245 /* Its not yet ready */ 11246 return (NULL); 11247 } 11248 srtt = rack_grab_rtt(tp, rack); 11249 idx = rsm->r_rtr_cnt - 1; 11250 ts_low = rsm->r_tim_lastsent[idx]; 11251 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 11252 if ((tsused == ts_low) || 11253 (TSTMP_LT(tsused, ts_low))) { 11254 /* No time since sending */ 11255 return (NULL); 11256 } 11257 if ((tsused - ts_low) < thresh) { 11258 /* It has not been long enough yet */ 11259 return (NULL); 11260 } 11261 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11262 ((rsm->r_flags & RACK_SACK_PASSED) && 11263 (rack->sack_attack_disable == 0))) { 11264 /* 11265 * We have passed the dup-ack threshold <or> 11266 * a SACK has indicated this is missing. 11267 * Note that if you are a declared attacker 11268 * it is only the dup-ack threshold that 11269 * will cause retransmits. 
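* Both gates have been applied by this point: the candidate must have
* aged past the rack threshold computed above (roughly an RTT plus the
* allowance for reordering) and must look lost either by the classic
* duplicate-ACK count or by a SACK that passed over it.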
11270 */ 11271 /* log retransmit reason */ 11272 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 11273 return (rsm); 11274 } 11275 return (NULL); 11276 } 11277 11278 static void 11279 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 11280 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 11281 int line, struct rack_sendmap *rsm) 11282 { 11283 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11284 union tcp_log_stackspecific log; 11285 struct timeval tv; 11286 11287 memset(&log, 0, sizeof(log)); 11288 log.u_bbr.flex1 = slot; 11289 log.u_bbr.flex2 = len; 11290 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 11291 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 11292 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 11293 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 11294 log.u_bbr.use_lt_bw = rack->app_limited_needs_set; 11295 log.u_bbr.use_lt_bw <<= 1; 11296 log.u_bbr.use_lt_bw = rack->rc_gp_filled; 11297 log.u_bbr.use_lt_bw <<= 1; 11298 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 11299 log.u_bbr.use_lt_bw <<= 1; 11300 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 11301 log.u_bbr.pkt_epoch = line; 11302 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 11303 log.u_bbr.bw_inuse = bw_est; 11304 log.u_bbr.delRate = bw; 11305 if (rack->r_ctl.gp_bw == 0) 11306 log.u_bbr.cur_del_rate = 0; 11307 else 11308 log.u_bbr.cur_del_rate = rack_get_bw(rack); 11309 log.u_bbr.rttProp = len_time; 11310 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 11311 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 11312 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 11313 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 11314 /* We are in slow start */ 11315 log.u_bbr.flex7 = 1; 11316 } else { 11317 /* we are on congestion avoidance */ 11318 log.u_bbr.flex7 = 0; 11319 } 11320 log.u_bbr.flex8 = method; 11321 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11322 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11323 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 11324 log.u_bbr.cwnd_gain <<= 1; 11325 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 11326 log.u_bbr.cwnd_gain <<= 1; 11327 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 11328 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11329 &rack->rc_inp->inp_socket->so_rcv, 11330 &rack->rc_inp->inp_socket->so_snd, 11331 BBR_LOG_HPTSI_CALC, 0, 11332 0, &log, false, &tv); 11333 } 11334 } 11335 11336 static uint32_t 11337 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 11338 { 11339 uint32_t new_tso, user_max; 11340 11341 user_max = rack->rc_user_set_max_segs * mss; 11342 if (rack->rc_force_max_seg) { 11343 return (user_max); 11344 } 11345 if (rack->use_fixed_rate && 11346 ((rack->r_ctl.crte == NULL) || 11347 (bw != rack->r_ctl.crte->rate))) { 11348 /* Use the user mss since we are not exactly matched */ 11349 return (user_max); 11350 } 11351 new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 11352 if (new_tso > user_max) 11353 new_tso = user_max; 11354 return(new_tso); 11355 } 11356 11357 static void 11358 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, 11359 uint64_t rate, uint64_t hw_rate, int line, 11360 int error) 11361 { 11362 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11363 union tcp_log_stackspecific log; 11364 struct timeval tv; 11365 11366 memset(&log, 0, sizeof(log)); 11367 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 11368 log.u_bbr.flex2 = (hw_rate & 
0x00000000ffffffff); 11369 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 11370 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 11371 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11372 log.u_bbr.bw_inuse = rate; 11373 log.u_bbr.flex5 = line; 11374 log.u_bbr.flex6 = error; 11375 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 11376 log.u_bbr.flex8 = rack->use_fixed_rate; 11377 log.u_bbr.flex8 <<= 1; 11378 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 11379 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 11380 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11381 &rack->rc_inp->inp_socket->so_rcv, 11382 &rack->rc_inp->inp_socket->so_snd, 11383 BBR_LOG_HDWR_PACE, 0, 11384 0, &log, false, &tv); 11385 } 11386 } 11387 11388 static int32_t 11389 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) 11390 { 11391 uint64_t lentim, fill_bw; 11392 11393 /* Lets first see if we are full, if so continue with normal rate */ 11394 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 11395 return (slot); 11396 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 11397 return (slot); 11398 if (rack->r_ctl.rc_last_us_rtt == 0) 11399 return (slot); 11400 if (rack->rc_pace_fill_if_rttin_range && 11401 (rack->r_ctl.rc_last_us_rtt >= 11402 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 11403 /* The rtt is huge, N * smallest, lets not fill */ 11404 return (slot); 11405 } 11406 /* 11407 * first lets calculate the b/w based on the last us-rtt 11408 * and the sndwnd. 11409 */ 11410 fill_bw = rack->r_ctl.cwnd_to_use; 11411 /* Take the rwnd if its smaller */ 11412 if (fill_bw > rack->rc_tp->snd_wnd) 11413 fill_bw = rack->rc_tp->snd_wnd; 11414 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 11415 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 11416 /* We are below the min b/w */ 11417 if (fill_bw < RACK_MIN_BW) 11418 return (slot); 11419 /* 11420 * Ok fill_bw holds our mythical b/w to fill the cwnd 11421 * in a rtt, what does that time wise equate too? 11422 */ 11423 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 11424 lentim /= fill_bw; 11425 if (lentim < slot) { 11426 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 11427 0, lentim, 12, __LINE__, NULL); 11428 return ((int32_t)lentim); 11429 } else 11430 return (slot); 11431 } 11432 11433 static int32_t 11434 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 11435 { 11436 struct rack_sendmap *lrsm; 11437 int32_t slot = 0; 11438 int err; 11439 11440 if (rack->rc_always_pace == 0) { 11441 /* 11442 * We use the most optimistic possible cwnd/srtt for 11443 * sending calculations. This will make our 11444 * calculation anticipate getting more through 11445 * quicker then possible. But thats ok we don't want 11446 * the peer to have a gap in data sending. 11447 */ 11448 uint32_t srtt, cwnd, tr_perms = 0; 11449 int32_t reduce = 0; 11450 11451 old_method: 11452 /* 11453 * We keep no precise pacing with the old method 11454 * instead we use the pacer to mitigate bursts. 
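* As a purely illustrative example: with a 100000 byte cwnd and a
* 20 msec srtt, tr_perms below comes out to 5000 bytes per msec, so a
* 20000 byte send yields a 4 msec slot; a rack_slot_reduction of 4 would
* trim that to 3 msec, i.e. 3000 usecs after the HPTS_USEC_IN_MSEC
* scaling.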
11455 */ 11456 rack->r_ctl.rc_agg_delayed = 0; 11457 rack->r_early = 0; 11458 rack->r_late = 0; 11459 rack->r_ctl.rc_agg_early = 0; 11460 if (rack->r_ctl.rc_rack_min_rtt) 11461 srtt = rack->r_ctl.rc_rack_min_rtt; 11462 else 11463 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 11464 if (rack->r_ctl.rc_rack_largest_cwnd) 11465 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 11466 else 11467 cwnd = rack->r_ctl.cwnd_to_use; 11468 tr_perms = cwnd / srtt; 11469 if (tr_perms == 0) { 11470 tr_perms = ctf_fixed_maxseg(tp); 11471 } 11472 /* 11473 * Calculate how long this will take to drain, if 11474 * the calculation comes out to zero, thats ok we 11475 * will use send_a_lot to possibly spin around for 11476 * more increasing tot_len_this_send to the point 11477 * that its going to require a pace, or we hit the 11478 * cwnd. Which in that case we are just waiting for 11479 * a ACK. 11480 */ 11481 slot = len / tr_perms; 11482 /* Now do we reduce the time so we don't run dry? */ 11483 if (slot && rack_slot_reduction) { 11484 reduce = (slot / rack_slot_reduction); 11485 if (reduce < slot) { 11486 slot -= reduce; 11487 } else 11488 slot = 0; 11489 } 11490 slot *= HPTS_USEC_IN_MSEC; 11491 if (rsm == NULL) { 11492 /* 11493 * We always consider ourselves app limited with old style 11494 * that are not retransmits. This could be the initial 11495 * measurement, but thats ok its all setup and specially 11496 * handled. If another send leaks out, then that too will 11497 * be mark app-limited. 11498 */ 11499 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11500 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { 11501 rack->r_ctl.rc_first_appl = lrsm; 11502 lrsm->r_flags |= RACK_APP_LIMITED; 11503 rack->r_ctl.rc_app_limited_cnt++; 11504 } 11505 } 11506 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); 11507 } else { 11508 uint64_t bw_est, res, lentim, rate_wanted; 11509 uint32_t orig_val, srtt, segs, oh; 11510 11511 if ((rack->r_rr_config == 1) && rsm) { 11512 return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); 11513 } 11514 if (rack->use_fixed_rate) { 11515 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 11516 } else if ((rack->r_ctl.init_rate == 0) && 11517 #ifdef NETFLIX_PEAKRATE 11518 (rack->rc_tp->t_maxpeakrate == 0) && 11519 #endif 11520 (rack->r_ctl.gp_bw == 0)) { 11521 /* no way to yet do an estimate */ 11522 bw_est = rate_wanted = 0; 11523 } else { 11524 bw_est = rack_get_bw(rack); 11525 rate_wanted = rack_get_output_bw(rack, bw_est, rsm); 11526 } 11527 if ((bw_est == 0) || (rate_wanted == 0)) { 11528 /* 11529 * No way yet to make a b/w estimate or 11530 * our raise is set incorrectly. 11531 */ 11532 goto old_method; 11533 } 11534 /* We need to account for all the overheads */ 11535 segs = (len + segsiz - 1) / segsiz; 11536 /* 11537 * We need the diff between 1514 bytes (e-mtu with e-hdr) 11538 * and how much data we put in each packet. Yes this 11539 * means we may be off if we are larger than 1500 bytes 11540 * or smaller. But this just makes us more conservative. 
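* Purely as an illustration (taking rate_wanted to be in bytes per
* second): with len 14480 and segsiz 1448 that is 10 segments and oh is
* 1514 - 1448 = 66, so 660 bytes of estimated header overhead are added;
* at a rate_wanted of 1250000 bytes/sec (10 Mbit/s) the division below
* yields roughly 12100 usecs of pacing delay.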
11541 */ 11542 if (ETHERNET_SEGMENT_SIZE > segsiz) 11543 oh = ETHERNET_SEGMENT_SIZE - segsiz; 11544 else 11545 oh = 0; 11546 segs *= oh; 11547 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 11548 res = lentim / rate_wanted; 11549 slot = (uint32_t)res; 11550 orig_val = rack->r_ctl.rc_pace_max_segs; 11551 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11552 /* Did we change the TSO size, if so log it */ 11553 if (rack->r_ctl.rc_pace_max_segs != orig_val) 11554 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 11555 if ((rack->rc_pace_to_cwnd) && 11556 (rack->in_probe_rtt == 0) && 11557 (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 11558 /* 11559 * We want to pace at our rate *or* faster to 11560 * fill the cwnd to the max if its not full. 11561 */ 11562 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); 11563 } 11564 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 11565 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 11566 if ((rack->rack_hdw_pace_ena) && 11567 (rack->rack_hdrw_pacing == 0) && 11568 (rack->rack_attempt_hdwr_pace == 0)) { 11569 /* 11570 * Lets attempt to turn on hardware pacing 11571 * if we can. 11572 */ 11573 rack->rack_attempt_hdwr_pace = 1; 11574 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 11575 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11576 rate_wanted, 11577 RS_PACING_GEQ, 11578 &err, NULL); 11579 if (rack->r_ctl.crte) { 11580 rack->rack_hdrw_pacing = 1; 11581 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, segsiz, 11582 0, rack->r_ctl.crte, 11583 NULL); 11584 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11585 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11586 err); 11587 } 11588 } else if (rack->rack_hdrw_pacing && 11589 (rack->r_ctl.crte->rate != rate_wanted)) { 11590 /* Do we need to adjust our rate? */ 11591 const struct tcp_hwrate_limit_table *nrte; 11592 11593 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 11594 rack->rc_tp, 11595 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11596 rate_wanted, 11597 RS_PACING_GEQ, 11598 &err, NULL); 11599 if (nrte == NULL) { 11600 /* Lost the rate */ 11601 rack->rack_hdrw_pacing = 0; 11602 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11603 } else if (nrte != rack->r_ctl.crte) { 11604 rack->r_ctl.crte = nrte; 11605 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, 11606 segsiz, 0, 11607 rack->r_ctl.crte, 11608 NULL); 11609 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11610 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11611 err); 11612 } 11613 } 11614 } 11615 if (rack_limit_time_with_srtt && 11616 (rack->use_fixed_rate == 0) && 11617 #ifdef NETFLIX_PEAKRATE 11618 (rack->rc_tp->t_maxpeakrate == 0) && 11619 #endif 11620 (rack->rack_hdrw_pacing == 0)) { 11621 /* 11622 * Sanity check, we do not allow the pacing delay 11623 * to be longer than the SRTT of the path. If it is 11624 * a slow path, then adding a packet should increase 11625 * the RTT and compensate for this i.e. the srtt will 11626 * be greater so the allowed pacing time will be greater. 11627 * 11628 * Note this restriction is not for where a peak rate 11629 * is set, we are doing fixed pacing or hardware pacing. 
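* Concretely, when rack_limit_time_with_srtt is enabled and neither a
* fixed rate, a peak rate nor hardware pacing is in use, a computed slot
* longer than one smoothed RTT is clamped to the SRTT (converted to
* usecs) below, so a single pacing gap never exceeds the current RTT
* estimate.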
11630 */ 11631 if (rack->rc_tp->t_srtt) 11632 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 11633 else 11634 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 11635 if (srtt < slot) { 11636 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 11637 slot = srtt; 11638 } 11639 } 11640 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 11641 } 11642 if (slot) 11643 counter_u64_add(rack_calc_nonzero, 1); 11644 else 11645 counter_u64_add(rack_calc_zero, 1); 11646 return (slot); 11647 } 11648 11649 static void 11650 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 11651 tcp_seq startseq, uint32_t sb_offset) 11652 { 11653 struct rack_sendmap *my_rsm = NULL; 11654 struct rack_sendmap fe; 11655 11656 if (tp->t_state < TCPS_ESTABLISHED) { 11657 /* 11658 * We don't start any measurements if we are 11659 * not at least established. 11660 */ 11661 return; 11662 } 11663 tp->t_flags |= TF_GPUTINPROG; 11664 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 11665 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 11666 tp->gput_seq = startseq; 11667 rack->app_limited_needs_set = 0; 11668 if (rack->in_probe_rtt) 11669 rack->measure_saw_probe_rtt = 1; 11670 else if ((rack->measure_saw_probe_rtt) && 11671 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 11672 rack->measure_saw_probe_rtt = 0; 11673 if (rack->rc_gp_filled) 11674 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11675 else { 11676 /* Special case initial measurement */ 11677 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); 11678 } 11679 /* 11680 * We take a guess out into the future, 11681 * if we have no measurement and no 11682 * initial rate, we measure the first 11683 * initial-windows worth of data to 11684 * speed up getting some GP measurement and 11685 * thus start pacing. 11686 */ 11687 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 11688 rack->app_limited_needs_set = 1; 11689 tp->gput_ack = startseq + max(rc_init_window(rack), 11690 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 11691 rack_log_pacing_delay_calc(rack, 11692 tp->gput_seq, 11693 tp->gput_ack, 11694 0, 11695 tp->gput_ts, 11696 rack->r_ctl.rc_app_limited_cnt, 11697 9, 11698 __LINE__, NULL); 11699 return; 11700 } 11701 if (sb_offset) { 11702 /* 11703 * We are out somewhere in the sb 11704 * can we use the already outstanding data? 11705 */ 11706 11707 if (rack->r_ctl.rc_app_limited_cnt == 0) { 11708 /* 11709 * Yes first one is good and in this case 11710 * the tp->gput_ts is correctly set based on 11711 * the last ack that arrived (no need to 11712 * set things up when an ack comes in). 11713 */ 11714 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11715 if ((my_rsm == NULL) || 11716 (my_rsm->r_rtr_cnt != 1)) { 11717 /* retransmission? */ 11718 goto use_latest; 11719 } 11720 } else { 11721 if (rack->r_ctl.rc_first_appl == NULL) { 11722 /* 11723 * If rc_first_appl is NULL 11724 * then the cnt should be 0. 11725 * This is probably an error, maybe 11726 * a KASSERT would be approprate. 11727 */ 11728 goto use_latest; 11729 } 11730 /* 11731 * If we have a marker pointer to the last one that is 11732 * app limited we can use that, but we need to set 11733 * things up so that when it gets ack'ed we record 11734 * the ack time (if its not already acked). 11735 */ 11736 rack->app_limited_needs_set = 1; 11737 /* 11738 * We want to get to the rsm that is either 11739 * next with space i.e. 
over 1 MSS or the one 11740 * after that (after the app-limited). 11741 */ 11742 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11743 rack->r_ctl.rc_first_appl); 11744 if (my_rsm) { 11745 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 11746 /* Have to use the next one */ 11747 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11748 my_rsm); 11749 else { 11750 /* Use after the first MSS of it is acked */ 11751 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 11752 goto start_set; 11753 } 11754 } 11755 if ((my_rsm == NULL) || 11756 (my_rsm->r_rtr_cnt != 1)) { 11757 /* 11758 * Either its a retransmit or 11759 * the last is the app-limited one. 11760 */ 11761 goto use_latest; 11762 } 11763 } 11764 tp->gput_seq = my_rsm->r_start; 11765 start_set: 11766 if (my_rsm->r_flags & RACK_ACKED) { 11767 /* 11768 * This one has been acked use the arrival ack time 11769 */ 11770 tp->gput_ts = my_rsm->r_ack_arrival; 11771 rack->app_limited_needs_set = 0; 11772 } 11773 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11774 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 11775 rack_log_pacing_delay_calc(rack, 11776 tp->gput_seq, 11777 tp->gput_ack, 11778 (uint64_t)my_rsm, 11779 tp->gput_ts, 11780 rack->r_ctl.rc_app_limited_cnt, 11781 9, 11782 __LINE__, NULL); 11783 return; 11784 } 11785 11786 use_latest: 11787 /* 11788 * We don't know how long we may have been 11789 * idle or if this is the first-send. Lets 11790 * setup the flag so we will trim off 11791 * the first ack'd data so we get a true 11792 * measurement. 11793 */ 11794 rack->app_limited_needs_set = 1; 11795 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 11796 /* Find this guy so we can pull the send time */ 11797 fe.r_start = startseq; 11798 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 11799 if (my_rsm) { 11800 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11801 if (my_rsm->r_flags & RACK_ACKED) { 11802 /* 11803 * Unlikely since its probably what was 11804 * just transmitted (but I am paranoid). 11805 */ 11806 tp->gput_ts = my_rsm->r_ack_arrival; 11807 rack->app_limited_needs_set = 0; 11808 } 11809 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 11810 /* This also is unlikely */ 11811 tp->gput_seq = my_rsm->r_start; 11812 } 11813 } else { 11814 /* 11815 * TSNH unless we have some send-map limit, 11816 * and even at that it should not be hitting 11817 * that limit (we should have stopped sending). 11818 */ 11819 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 11820 } 11821 rack_log_pacing_delay_calc(rack, 11822 tp->gput_seq, 11823 tp->gput_ack, 11824 (uint64_t)my_rsm, 11825 tp->gput_ts, 11826 rack->r_ctl.rc_app_limited_cnt, 11827 9, __LINE__, NULL); 11828 } 11829 11830 static inline uint32_t 11831 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 11832 uint32_t avail, int32_t sb_offset) 11833 { 11834 uint32_t len; 11835 uint32_t sendwin; 11836 11837 if (tp->snd_wnd > cwnd_to_use) 11838 sendwin = cwnd_to_use; 11839 else 11840 sendwin = tp->snd_wnd; 11841 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11842 /* We never want to go over our peers rcv-window */ 11843 len = 0; 11844 } else { 11845 uint32_t flight; 11846 11847 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 11848 if (flight >= sendwin) { 11849 /* 11850 * We have in flight what we are allowed by cwnd (if 11851 * it was rwnd blocking it would have hit above out 11852 * >= tp->snd_wnd). 
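* When we are not blocked, the math below works out like this (purely
* illustrative numbers): with cwnd_to_use 40000, snd_wnd 60000, 10000
* bytes in flight (none sacked) and 35000 bytes in the socket buffer at
* sb_offset 10000, sendwin is 40000 and len starts at 30000; the rwnd
* clip leaves that alone (30000 plus 10000 outstanding stays under
* 60000) and the socket-buffer clip then trims it to 25000.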
11853 */ 11854 return (0); 11855 } 11856 len = sendwin - flight; 11857 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11858 /* We would send too much (beyond the rwnd) */ 11859 len = tp->snd_wnd - ctf_outstanding(tp); 11860 } 11861 if ((len + sb_offset) > avail) { 11862 /* 11863 * We don't have that much in the SB, how much is 11864 * there? 11865 */ 11866 len = avail - sb_offset; 11867 } 11868 } 11869 return (len); 11870 } 11871 11872 static int 11873 rack_output(struct tcpcb *tp) 11874 { 11875 struct socket *so; 11876 uint32_t recwin; 11877 uint32_t sb_offset; 11878 int32_t len, flags, error = 0; 11879 struct mbuf *m; 11880 struct mbuf *mb; 11881 uint32_t if_hw_tsomaxsegcount = 0; 11882 uint32_t if_hw_tsomaxsegsize; 11883 int32_t segsiz, minseg; 11884 long tot_len_this_send = 0; 11885 struct ip *ip = NULL; 11886 #ifdef TCPDEBUG 11887 struct ipovly *ipov = NULL; 11888 #endif 11889 struct udphdr *udp = NULL; 11890 struct tcp_rack *rack; 11891 struct tcphdr *th; 11892 uint8_t pass = 0; 11893 uint8_t mark = 0; 11894 uint8_t wanted_cookie = 0; 11895 u_char opt[TCP_MAXOLEN]; 11896 unsigned ipoptlen, optlen, hdrlen, ulen=0; 11897 uint32_t rack_seq; 11898 11899 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 11900 unsigned ipsec_optlen = 0; 11901 11902 #endif 11903 int32_t idle, sendalot; 11904 int32_t sub_from_prr = 0; 11905 volatile int32_t sack_rxmit; 11906 struct rack_sendmap *rsm = NULL; 11907 int32_t tso, mtu; 11908 struct tcpopt to; 11909 int32_t slot = 0; 11910 int32_t sup_rack = 0; 11911 uint32_t cts, us_cts, delayed, early; 11912 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 11913 uint32_t cwnd_to_use; 11914 int32_t do_a_prefetch; 11915 int32_t prefetch_rsm = 0; 11916 int32_t orig_len; 11917 struct timeval tv; 11918 int32_t prefetch_so_done = 0; 11919 struct tcp_log_buffer *lgb = NULL; 11920 struct inpcb *inp; 11921 struct sockbuf *sb; 11922 #ifdef INET6 11923 struct ip6_hdr *ip6 = NULL; 11924 int32_t isipv6; 11925 #endif 11926 uint8_t filled_all = 0; 11927 bool hw_tls = false; 11928 11929 /* setup and take the cache hits here */ 11930 rack = (struct tcp_rack *)tp->t_fb_ptr; 11931 inp = rack->rc_inp; 11932 so = inp->inp_socket; 11933 sb = &so->so_snd; 11934 kern_prefetch(sb, &do_a_prefetch); 11935 do_a_prefetch = 1; 11936 hpts_calling = inp->inp_hpts_calls; 11937 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 11938 11939 NET_EPOCH_ASSERT(); 11940 INP_WLOCK_ASSERT(inp); 11941 #ifdef TCP_OFFLOAD 11942 if (tp->t_flags & TF_TOE) 11943 return (tcp_offload_output(tp)); 11944 #endif 11945 /* 11946 * For TFO connections in SYN_RECEIVED, only allow the initial 11947 * SYN|ACK and those sent by the retransmit timer. 11948 */ 11949 if (IS_FASTOPEN(tp->t_flags) && 11950 (tp->t_state == TCPS_SYN_RECEIVED) && 11951 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 11952 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 11953 return (0); 11954 #ifdef INET6 11955 if (rack->r_state) { 11956 /* Use the cache line loaded if possible */ 11957 isipv6 = rack->r_is_v6; 11958 } else { 11959 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 11960 } 11961 #endif 11962 early = 0; 11963 us_cts = tcp_get_usecs(&tv); 11964 cts = tcp_tv_to_mssectick(&tv); 11965 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 11966 inp->inp_in_hpts) { 11967 /* 11968 * We are on the hpts for some timer but not hptsi output. 11969 * Remove from the hpts unconditionally. 11970 */ 11971 rack_timer_cancel(tp, rack, cts, __LINE__); 11972 } 11973 /* Are we pacing and late? 
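 * "Late" here means the pacer slot we had scheduled on the hpts
 * (rc_last_output_to) has already passed. The amount we overshot
 * is accumulated below in rc_agg_delayed and flagged via r_late so
 * later pacing calculations can make the time back up.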
*/ 11974 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 11975 TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { 11976 /* We are delayed */ 11977 delayed = us_cts - rack->r_ctl.rc_last_output_to; 11978 } else { 11979 delayed = 0; 11980 } 11981 /* Do the timers, which may override the pacer */ 11982 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 11983 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 11984 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 11985 return (0); 11986 } 11987 } 11988 if ((rack->r_timer_override) || 11989 (delayed) || 11990 (tp->t_state < TCPS_ESTABLISHED)) { 11991 if (tp->t_inpcb->inp_in_hpts) 11992 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11993 } else if (tp->t_inpcb->inp_in_hpts) { 11994 /* 11995 * On the hpts you can't pass even if ACKNOW is on, we will 11996 * when the hpts fires. 11997 */ 11998 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 11999 return (0); 12000 } 12001 inp->inp_hpts_calls = 0; 12002 /* Finish out both pacing early and late accounting */ 12003 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12004 TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12005 early = rack->r_ctl.rc_last_output_to - us_cts; 12006 } else 12007 early = 0; 12008 if (delayed) { 12009 rack->r_ctl.rc_agg_delayed += delayed; 12010 rack->r_late = 1; 12011 } else if (early) { 12012 rack->r_ctl.rc_agg_early += early; 12013 rack->r_early = 1; 12014 } 12015 /* Now that early/late accounting is done turn off the flag */ 12016 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12017 rack->r_wanted_output = 0; 12018 rack->r_timer_override = 0; 12019 /* 12020 * For TFO connections in SYN_SENT or SYN_RECEIVED, 12021 * only allow the initial SYN or SYN|ACK and those sent 12022 * by the retransmit timer. 12023 */ 12024 if (IS_FASTOPEN(tp->t_flags) && 12025 ((tp->t_state == TCPS_SYN_RECEIVED) || 12026 (tp->t_state == TCPS_SYN_SENT)) && 12027 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12028 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12029 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12030 goto just_return_nolock; 12031 } 12032 /* 12033 * Determine length of data that should be transmitted, and flags 12034 * that will be used. If there is some data or critical controls 12035 * (SYN, RST) to send, then transmit; otherwise, investigate 12036 * further. 12037 */ 12038 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 12039 if (tp->t_idle_reduce) { 12040 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 12041 rack_cc_after_idle(rack, tp); 12042 } 12043 tp->t_flags &= ~TF_LASTIDLE; 12044 if (idle) { 12045 if (tp->t_flags & TF_MORETOCOME) { 12046 tp->t_flags |= TF_LASTIDLE; 12047 idle = 0; 12048 } 12049 } 12050 if ((tp->snd_una == tp->snd_max) && 12051 rack->r_ctl.rc_went_idle_time && 12052 TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { 12053 idle = us_cts - rack->r_ctl.rc_went_idle_time; 12054 if (idle > rack_min_probertt_hold) { 12055 /* Count as a probe rtt */ 12056 if (rack->in_probe_rtt == 0) { 12057 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12058 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 12059 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 12060 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 12061 } else { 12062 rack_exit_probertt(rack, us_cts); 12063 } 12064 } 12065 idle = 0; 12066 } 12067 again: 12068 /* 12069 * If we've recently taken a timeout, snd_max will be greater than 12070 * snd_nxt. 
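 * (Typically because the retransmit timeout pulled snd_nxt back
 * toward snd_una so the lost data can be resent from there.)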
There may be SACK information that allows us to avoid 12071 * resending already delivered data. Adjust snd_nxt accordingly. 12072 */ 12073 sendalot = 0; 12074 us_cts = tcp_get_usecs(&tv); 12075 cts = tcp_tv_to_mssectick(&tv); 12076 tso = 0; 12077 mtu = 0; 12078 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12079 minseg = segsiz; 12080 sb_offset = tp->snd_max - tp->snd_una; 12081 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12082 #ifdef NETFLIX_SHARED_CWND 12083 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 12084 rack->rack_enable_scwnd) { 12085 /* We are doing cwnd sharing */ 12086 if (rack->rc_gp_filled && 12087 (rack->rack_attempted_scwnd == 0) && 12088 (rack->r_ctl.rc_scw == NULL) && 12089 tp->t_lib) { 12090 /* The pcbid is in, lets make an attempt */ 12091 counter_u64_add(rack_try_scwnd, 1); 12092 rack->rack_attempted_scwnd = 1; 12093 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 12094 &rack->r_ctl.rc_scw_index, 12095 segsiz); 12096 } 12097 if (rack->r_ctl.rc_scw && 12098 (rack->rack_scwnd_is_idle == 1) && 12099 (rack->rc_in_persist == 0) && 12100 sbavail(sb)) { 12101 /* we are no longer out of data */ 12102 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12103 rack->rack_scwnd_is_idle = 0; 12104 } 12105 if (rack->r_ctl.rc_scw) { 12106 /* First lets update and get the cwnd */ 12107 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 12108 rack->r_ctl.rc_scw_index, 12109 tp->snd_cwnd, tp->snd_wnd, segsiz); 12110 } 12111 } 12112 #endif 12113 flags = tcp_outflags[tp->t_state]; 12114 while (rack->rc_free_cnt < rack_free_cache) { 12115 rsm = rack_alloc(rack); 12116 if (rsm == NULL) { 12117 if (inp->inp_hpts_calls) 12118 /* Retry in a ms */ 12119 slot = (1 * HPTS_USEC_IN_MSEC); 12120 goto just_return_nolock; 12121 } 12122 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 12123 rack->rc_free_cnt++; 12124 rsm = NULL; 12125 } 12126 if (inp->inp_hpts_calls) 12127 inp->inp_hpts_calls = 0; 12128 sack_rxmit = 0; 12129 len = 0; 12130 rsm = NULL; 12131 if (flags & TH_RST) { 12132 SOCKBUF_LOCK(sb); 12133 goto send; 12134 } 12135 if (rack->r_ctl.rc_resend) { 12136 /* Retransmit timer */ 12137 rsm = rack->r_ctl.rc_resend; 12138 rack->r_ctl.rc_resend = NULL; 12139 rsm->r_flags &= ~RACK_TLP; 12140 len = rsm->r_end - rsm->r_start; 12141 sack_rxmit = 1; 12142 sendalot = 0; 12143 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12144 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12145 __func__, __LINE__, 12146 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12147 sb_offset = rsm->r_start - tp->snd_una; 12148 if (len >= segsiz) 12149 len = segsiz; 12150 } else if ((rack->rc_in_persist == 0) && 12151 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 12152 /* We have a retransmit that takes precedence */ 12153 rsm->r_flags &= ~RACK_TLP; 12154 if ((!IN_RECOVERY(tp->t_flags)) && 12155 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 12156 /* Enter recovery if not induced by a time-out */ 12157 rack->r_ctl.rc_rsm_start = rsm->r_start; 12158 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 12159 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 12160 rack_cong_signal(tp, NULL, CC_NDUPACK); 12161 /* 12162 * When we enter recovery we need to assure we send 12163 * one packet. 
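 * PRR otherwise gates how much recovery may transmit; seeding
 * rc_prr_sndcnt with one segment below makes sure the retransmit
 * is not starved right at entry.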
12164 */ 12165 if (rack->rack_no_prr == 0) { 12166 rack->r_ctl.rc_prr_sndcnt = segsiz; 12167 rack_log_to_prr(rack, 13, 0); 12168 } 12169 } 12170 #ifdef INVARIANTS 12171 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 12172 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 12173 tp, rack, rsm, rsm->r_start, tp->snd_una); 12174 } 12175 #endif 12176 len = rsm->r_end - rsm->r_start; 12177 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12178 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12179 __func__, __LINE__, 12180 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12181 sb_offset = rsm->r_start - tp->snd_una; 12182 /* Can we send it within the PRR boundary? */ 12183 if (rack->rack_no_prr == 0) { 12184 if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 12185 /* It does not fit */ 12186 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 12187 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12188 /* 12189 * prr is less than a segment, we 12190 * have more acks due in besides 12191 * what we need to resend. Lets not send 12192 * to avoid sending small pieces of 12193 * what we need to retransmit. 12194 */ 12195 len = 0; 12196 goto just_return_nolock; 12197 } 12198 len = rack->r_ctl.rc_prr_sndcnt; 12199 } 12200 } 12201 sendalot = 0; 12202 if (len >= segsiz) 12203 len = segsiz; 12204 if (len > 0) { 12205 sub_from_prr = 1; 12206 sack_rxmit = 1; 12207 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12208 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12209 min(len, segsiz)); 12210 counter_u64_add(rack_rtm_prr_retran, 1); 12211 } 12212 } else if (rack->r_ctl.rc_tlpsend) { 12213 /* Tail loss probe */ 12214 long cwin; 12215 long tlen; 12216 12217 doing_tlp = 1; 12218 /* 12219 * Check if we can do a TLP with a RACK'd packet 12220 * this can happen if we are not doing the rack 12221 * cheat and we skipped to a TLP and it 12222 * went off. 12223 */ 12224 rsm = rack->r_ctl.rc_tlpsend; 12225 rsm->r_flags |= RACK_TLP; 12226 rack->r_ctl.rc_tlpsend = NULL; 12227 sack_rxmit = 1; 12228 tlen = rsm->r_end - rsm->r_start; 12229 if (tlen > segsiz) 12230 tlen = segsiz; 12231 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12232 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12233 __func__, __LINE__, 12234 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12235 sb_offset = rsm->r_start - tp->snd_una; 12236 cwin = min(tp->snd_wnd, tlen); 12237 len = cwin; 12238 } 12239 /* 12240 * Enforce a connection sendmap count limit if set 12241 * as long as we are not retransmiting. 12242 */ 12243 if ((rsm == NULL) && 12244 (rack->do_detection == 0) && 12245 (V_tcp_map_entries_limit > 0) && 12246 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12247 counter_u64_add(rack_to_alloc_limited, 1); 12248 if (!rack->alloc_limit_reported) { 12249 rack->alloc_limit_reported = 1; 12250 counter_u64_add(rack_alloc_limited_conns, 1); 12251 } 12252 goto just_return_nolock; 12253 } 12254 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 12255 /* we are retransmitting the fin */ 12256 len--; 12257 if (len) { 12258 /* 12259 * When retransmitting data do *not* include the 12260 * FIN. This could happen from a TLP probe. 12261 */ 12262 flags &= ~TH_FIN; 12263 } 12264 } 12265 #ifdef INVARIANTS 12266 /* For debugging */ 12267 rack->r_ctl.rc_rsm_at_retran = rsm; 12268 #endif 12269 /* 12270 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12271 * state flags. 
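 * (TF_NEEDSYN/TF_NEEDFIN request that the bit be added even though
 * tcp_outflags[] for the current state does not carry it, i.e. the
 * state transition is implied but the segment has not gone out yet.)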
12272 */ 12273 if (tp->t_flags & TF_NEEDFIN) 12274 flags |= TH_FIN; 12275 if (tp->t_flags & TF_NEEDSYN) 12276 flags |= TH_SYN; 12277 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12278 void *end_rsm; 12279 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 12280 if (end_rsm) 12281 kern_prefetch(end_rsm, &prefetch_rsm); 12282 prefetch_rsm = 1; 12283 } 12284 SOCKBUF_LOCK(sb); 12285 /* 12286 * If snd_nxt == snd_max and we have transmitted a FIN, the 12287 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12288 * negative length. This can also occur when TCP opens up its 12289 * congestion window while receiving additional duplicate acks after 12290 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12291 * the fast-retransmit. 12292 * 12293 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12294 * set to snd_una, the sb_offset will be 0, and the length may wind 12295 * up 0. 12296 * 12297 * If sack_rxmit is true we are retransmitting from the scoreboard 12298 * in which case len is already set. 12299 */ 12300 if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 12301 uint32_t avail; 12302 12303 avail = sbavail(sb); 12304 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 12305 sb_offset = tp->snd_nxt - tp->snd_una; 12306 else 12307 sb_offset = 0; 12308 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 12309 if (rack->r_ctl.rc_tlp_new_data) { 12310 /* TLP is forcing out new data */ 12311 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 12312 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 12313 } 12314 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 12315 len = tp->snd_wnd; 12316 else 12317 len = rack->r_ctl.rc_tlp_new_data; 12318 rack->r_ctl.rc_tlp_new_data = 0; 12319 new_data_tlp = doing_tlp = 1; 12320 } else 12321 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 12322 if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { 12323 /* 12324 * For prr=off, we need to send only 1 MSS 12325 * at a time. We do this because another sack could 12326 * be arriving that causes us to send retransmits and 12327 * we don't want to be on a long pace due to a larger send 12328 * that keeps us from sending out the retransmit. 12329 */ 12330 len = segsiz; 12331 } 12332 } else { 12333 uint32_t outstanding; 12334 12335 /* 12336 * We are inside of a SACK recovery episode and are 12337 * sending new data, having retransmitted all the 12338 * data possible so far in the scoreboard. 12339 */ 12340 outstanding = tp->snd_max - tp->snd_una; 12341 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 12342 if (tp->snd_wnd > outstanding) { 12343 len = tp->snd_wnd - outstanding; 12344 /* Check to see if we have the data */ 12345 if ((sb_offset + len) > avail) { 12346 /* It does not all fit */ 12347 if (avail > sb_offset) 12348 len = avail - sb_offset; 12349 else 12350 len = 0; 12351 } 12352 } else 12353 len = 0; 12354 } else if (avail > sb_offset) 12355 len = avail - sb_offset; 12356 else 12357 len = 0; 12358 if (len > 0) { 12359 if (len > rack->r_ctl.rc_prr_sndcnt) 12360 len = rack->r_ctl.rc_prr_sndcnt; 12361 if (len > 0) { 12362 sub_from_prr = 1; 12363 counter_u64_add(rack_rtm_prr_newdata, 1); 12364 } 12365 } 12366 if (len > segsiz) { 12367 /* 12368 * We should never send more than a MSS when 12369 * retransmitting or sending new data in prr 12370 * mode unless the override flag is on. 
Most 12371 * likely the PRR algorithm is not going to 12372 * let us send a lot as well :-) 12373 */ 12374 if (rack->r_ctl.rc_prr_sendalot == 0) 12375 len = segsiz; 12376 } else if (len < segsiz) { 12377 /* 12378 * Do we send any? The idea here is if the 12379 * send empty's the socket buffer we want to 12380 * do it. However if not then lets just wait 12381 * for our prr_sndcnt to get bigger. 12382 */ 12383 long leftinsb; 12384 12385 leftinsb = sbavail(sb) - sb_offset; 12386 if (leftinsb > len) { 12387 /* This send does not empty the sb */ 12388 len = 0; 12389 } 12390 } 12391 } 12392 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 12393 /* 12394 * If you have not established 12395 * and are not doing FAST OPEN 12396 * no data please. 12397 */ 12398 if ((sack_rxmit == 0) && 12399 (!IS_FASTOPEN(tp->t_flags))){ 12400 len = 0; 12401 sb_offset = 0; 12402 } 12403 } 12404 if (prefetch_so_done == 0) { 12405 kern_prefetch(so, &prefetch_so_done); 12406 prefetch_so_done = 1; 12407 } 12408 /* 12409 * Lop off SYN bit if it has already been sent. However, if this is 12410 * SYN-SENT state and if segment contains data and if we don't know 12411 * that foreign host supports TAO, suppress sending segment. 12412 */ 12413 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 12414 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 12415 /* 12416 * When sending additional segments following a TFO SYN|ACK, 12417 * do not include the SYN bit. 12418 */ 12419 if (IS_FASTOPEN(tp->t_flags) && 12420 (tp->t_state == TCPS_SYN_RECEIVED)) 12421 flags &= ~TH_SYN; 12422 } 12423 /* 12424 * Be careful not to send data and/or FIN on SYN segments. This 12425 * measure is needed to prevent interoperability problems with not 12426 * fully conformant TCP implementations. 12427 */ 12428 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12429 len = 0; 12430 flags &= ~TH_FIN; 12431 } 12432 /* 12433 * On TFO sockets, ensure no data is sent in the following cases: 12434 * 12435 * - When retransmitting SYN|ACK on a passively-created socket 12436 * 12437 * - When retransmitting SYN on an actively created socket 12438 * 12439 * - When sending a zero-length cookie (cookie request) on an 12440 * actively created socket 12441 * 12442 * - When the socket is in the CLOSED state (RST is being sent) 12443 */ 12444 if (IS_FASTOPEN(tp->t_flags) && 12445 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12446 ((tp->t_state == TCPS_SYN_SENT) && 12447 (tp->t_tfo_client_cookie_len == 0)) || 12448 (flags & TH_RST))) { 12449 sack_rxmit = 0; 12450 len = 0; 12451 } 12452 /* Without fast-open there should never be data sent on a SYN */ 12453 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 12454 tp->snd_nxt = tp->iss; 12455 len = 0; 12456 } 12457 orig_len = len; 12458 if (len <= 0) { 12459 /* 12460 * If FIN has been sent but not acked, but we haven't been 12461 * called to retransmit, len will be < 0. Otherwise, window 12462 * shrank after we sent into it. If window shrank to 0, 12463 * cancel pending retransmit, pull snd_nxt back to (closed) 12464 * window, and set the persist timer if it isn't already 12465 * going. If the window didn't close completely, just wait 12466 * for an ACK. 12467 * 12468 * We also do a general check here to ensure that we will 12469 * set the persist timer when we have data to send, but a 12470 * 0-byte window. This makes sure the persist timer is set 12471 * even if the packet hits one of the "goto send" lines 12472 * below. 
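 * Concretely, the check below fires when the window is closed
 * (snd_wnd == 0), nothing is outstanding (snd_una == snd_max), we
 * are established and data is still waiting in the socket buffer;
 * we then pull snd_nxt back to the closed window edge and enter
 * persists.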
12473 */ 12474 len = 0; 12475 if ((tp->snd_wnd == 0) && 12476 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12477 (tp->snd_una == tp->snd_max) && 12478 (sb_offset < (int)sbavail(sb))) { 12479 tp->snd_nxt = tp->snd_una; 12480 rack_enter_persist(tp, rack, cts); 12481 } 12482 } else if ((rsm == NULL) && 12483 ((doing_tlp == 0) || (new_data_tlp == 1)) && 12484 (len < rack->r_ctl.rc_pace_max_segs)) { 12485 /* 12486 * We are not sending a maximum sized segment for 12487 * some reason. Should we not send anything (think 12488 * sws or persists)? 12489 */ 12490 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && 12491 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12492 (len < minseg) && 12493 (len < (int)(sbavail(sb) - sb_offset))) { 12494 /* 12495 * Here the rwnd is less than 12496 * the minimum pacing size, this is not a retransmit, 12497 * we are established and 12498 * the send is not the last in the socket buffer 12499 * we send nothing, and we may enter persists 12500 * if nothing is outstanding. 12501 */ 12502 len = 0; 12503 if (tp->snd_max == tp->snd_una) { 12504 /* 12505 * Nothing out we can 12506 * go into persists. 12507 */ 12508 rack_enter_persist(tp, rack, cts); 12509 tp->snd_nxt = tp->snd_una; 12510 } 12511 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 12512 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12513 (len < (int)(sbavail(sb) - sb_offset)) && 12514 (len < minseg)) { 12515 /* 12516 * Here we are not retransmitting, and 12517 * the cwnd is not so small that we could 12518 * not send at least a min size (rxt timer 12519 * not having gone off), We have 2 segments or 12520 * more already in flight, its not the tail end 12521 * of the socket buffer and the cwnd is blocking 12522 * us from sending out a minimum pacing segment size. 12523 * Lets not send anything. 12524 */ 12525 len = 0; 12526 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 12527 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 12528 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12529 (len < (int)(sbavail(sb) - sb_offset)) && 12530 (TCPS_HAVEESTABLISHED(tp->t_state))) { 12531 /* 12532 * Here we have a send window but we have 12533 * filled it up and we can't send another pacing segment. 12534 * We also have in flight more than 2 segments 12535 * and we are not completing the sb i.e. we allow 12536 * the last bytes of the sb to go out even if 12537 * its not a full pacing segment. 12538 */ 12539 len = 0; 12540 } 12541 } 12542 /* len will be >= 0 after this point. */ 12543 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 12544 tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); 12545 /* 12546 * Decide if we can use TCP Segmentation Offloading (if supported by 12547 * hardware). 12548 * 12549 * TSO may only be used if we are in a pure bulk sending state. The 12550 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 12551 * options prevent using TSO. With TSO the TCP header is the same 12552 * (except for the sequence number) for all generated packets. This 12553 * makes it impossible to transmit any options which vary per 12554 * generated segment or packet. 12555 * 12556 * IPv4 handling has a clear separation of ip options and ip header 12557 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 12558 * the right thing below to provide length of just ip options and thus 12559 * checking for ipoptlen is enough to decide if ip options are present. 
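 * Summing up the checks below: TSO is only used when the stack and
 * NIC allow it (TF_TSO and tcp_do_tso), the send is bigger than one
 * segment, there is no UDP tunneling port, no TCP-MD5 signature, no
 * SACK blocks to advertise, this is not a SACK retransmission, and
 * no IP or IPsec options are present.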
12560 */ 12561 12562 #ifdef INET6 12563 if (isipv6) 12564 ipoptlen = ip6_optlen(tp->t_inpcb); 12565 else 12566 #endif 12567 if (tp->t_inpcb->inp_options) 12568 ipoptlen = tp->t_inpcb->inp_options->m_len - 12569 offsetof(struct ipoption, ipopt_list); 12570 else 12571 ipoptlen = 0; 12572 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12573 /* 12574 * Pre-calculate here as we save another lookup into the darknesses 12575 * of IPsec that way and can actually decide if TSO is ok. 12576 */ 12577 #ifdef INET6 12578 if (isipv6 && IPSEC_ENABLED(ipv6)) 12579 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 12580 #ifdef INET 12581 else 12582 #endif 12583 #endif /* INET6 */ 12584 #ifdef INET 12585 if (IPSEC_ENABLED(ipv4)) 12586 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 12587 #endif /* INET */ 12588 #endif 12589 12590 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12591 ipoptlen += ipsec_optlen; 12592 #endif 12593 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 12594 (tp->t_port == 0) && 12595 ((tp->t_flags & TF_SIGNATURE) == 0) && 12596 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 12597 ipoptlen == 0) 12598 tso = 1; 12599 { 12600 uint32_t outstanding; 12601 12602 outstanding = tp->snd_max - tp->snd_una; 12603 if (tp->t_flags & TF_SENTFIN) { 12604 /* 12605 * If we sent a fin, snd_max is 1 higher than 12606 * snd_una 12607 */ 12608 outstanding--; 12609 } 12610 if (sack_rxmit) { 12611 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 12612 flags &= ~TH_FIN; 12613 } else { 12614 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 12615 sbused(sb))) 12616 flags &= ~TH_FIN; 12617 } 12618 } 12619 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 12620 (long)TCP_MAXWIN << tp->rcv_scale); 12621 12622 /* 12623 * Sender silly window avoidance. We transmit under the following 12624 * conditions when len is non-zero: 12625 * 12626 * - We have a full segment (or more with TSO) - This is the last 12627 * buffer in a write()/send() and we are either idle or running 12628 * NODELAY - we've timed out (e.g. persist timer) - we have more 12629 * then 1/2 the maximum send window's worth of data (receiver may be 12630 * limited the window size) - we need to retransmit 12631 */ 12632 if (len) { 12633 if (len >= segsiz) { 12634 goto send; 12635 } 12636 /* 12637 * NOTE! on localhost connections an 'ack' from the remote 12638 * end may occur synchronously with the output and cause us 12639 * to flush a buffer queued with moretocome. XXX 12640 * 12641 */ 12642 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 12643 (idle || (tp->t_flags & TF_NODELAY)) && 12644 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12645 (tp->t_flags & TF_NOPUSH) == 0) { 12646 pass = 2; 12647 goto send; 12648 } 12649 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 12650 pass = 22; 12651 goto send; 12652 } 12653 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 12654 pass = 4; 12655 goto send; 12656 } 12657 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 12658 pass = 5; 12659 goto send; 12660 } 12661 if (sack_rxmit) { 12662 pass = 6; 12663 goto send; 12664 } 12665 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 12666 (ctf_outstanding(tp) < (segsiz * 2))) { 12667 /* 12668 * We have less than two MSS outstanding (delayed ack) 12669 * and our rwnd will not let us send a full sized 12670 * MSS. Lets go ahead and let this small segment 12671 * out because we want to try to have at least two 12672 * packets inflight to not be caught by delayed ack. 
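 * For example, with one segment already outstanding and less than
 * a full MSS of window left, pushing out the small remainder gives
 * the peer two segments to acknowledge, so it will normally ACK at
 * once instead of sitting on its delayed-ACK timer.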
12673 */ 12674 pass = 12; 12675 goto send; 12676 } 12677 } 12678 /* 12679 * Sending of standalone window updates. 12680 * 12681 * Window updates are important when we close our window due to a 12682 * full socket buffer and are opening it again after the application 12683 * reads data from it. Once the window has opened again and the 12684 * remote end starts to send again the ACK clock takes over and 12685 * provides the most current window information. 12686 * 12687 * We must avoid the silly window syndrome whereas every read from 12688 * the receive buffer, no matter how small, causes a window update 12689 * to be sent. We also should avoid sending a flurry of window 12690 * updates when the socket buffer had queued a lot of data and the 12691 * application is doing small reads. 12692 * 12693 * Prevent a flurry of pointless window updates by only sending an 12694 * update when we can increase the advertized window by more than 12695 * 1/4th of the socket buffer capacity. When the buffer is getting 12696 * full or is very small be more aggressive and send an update 12697 * whenever we can increase by two mss sized segments. In all other 12698 * situations the ACK's to new incoming data will carry further 12699 * window increases. 12700 * 12701 * Don't send an independent window update if a delayed ACK is 12702 * pending (it will get piggy-backed on it) or the remote side 12703 * already has done a half-close and won't send more data. Skip 12704 * this if the connection is in T/TCP half-open state. 12705 */ 12706 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 12707 !(tp->t_flags & TF_DELACK) && 12708 !TCPS_HAVERCVDFIN(tp->t_state)) { 12709 /* 12710 * "adv" is the amount we could increase the window, taking 12711 * into account that we are limited by TCP_MAXWIN << 12712 * tp->rcv_scale. 12713 */ 12714 int32_t adv; 12715 int oldwin; 12716 12717 adv = recwin; 12718 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 12719 oldwin = (tp->rcv_adv - tp->rcv_nxt); 12720 if (adv > oldwin) 12721 adv -= oldwin; 12722 else { 12723 /* We can't increase the window */ 12724 adv = 0; 12725 } 12726 } else 12727 oldwin = 0; 12728 12729 /* 12730 * If the new window size ends up being the same as or less 12731 * than the old size when it is scaled, then don't force 12732 * a window update. 12733 */ 12734 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 12735 goto dontupdate; 12736 12737 if (adv >= (int32_t)(2 * segsiz) && 12738 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 12739 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 12740 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 12741 pass = 7; 12742 goto send; 12743 } 12744 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 12745 pass = 23; 12746 goto send; 12747 } 12748 } 12749 dontupdate: 12750 12751 /* 12752 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 12753 * is also a catch-all for the retransmit timer timeout case. 12754 */ 12755 if (tp->t_flags & TF_ACKNOW) { 12756 pass = 8; 12757 goto send; 12758 } 12759 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 12760 pass = 9; 12761 goto send; 12762 } 12763 /* 12764 * If our state indicates that FIN should be sent and we have not 12765 * yet done so, then we need to send. 12766 */ 12767 if ((flags & TH_FIN) && 12768 (tp->snd_nxt == tp->snd_una)) { 12769 pass = 11; 12770 goto send; 12771 } 12772 /* 12773 * No reason to send a segment, just return. 
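 * The just_return path below classifies why nothing was sent
 * (rwnd-, app-, cwnd- or PRR-limited) for the goodput measurement
 * accounting, checks whether we should move into persists, and
 * arms the hpts timer before returning.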
12774 */ 12775 just_return: 12776 SOCKBUF_UNLOCK(sb); 12777 just_return_nolock: 12778 { 12779 int app_limited = CTF_JR_SENT_DATA; 12780 12781 if (tot_len_this_send > 0) { 12782 /* Make sure snd_nxt is up to max */ 12783 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 12784 tp->snd_nxt = tp->snd_max; 12785 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 12786 } else { 12787 int end_window = 0; 12788 uint32_t seq = tp->gput_ack; 12789 12790 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12791 if (rsm) { 12792 /* 12793 * Mark the last sent that we just-returned (hinting 12794 * that delayed ack may play a role in any rtt measurement). 12795 */ 12796 rsm->r_just_ret = 1; 12797 } 12798 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12799 rack->r_ctl.rc_agg_delayed = 0; 12800 rack->r_early = 0; 12801 rack->r_late = 0; 12802 rack->r_ctl.rc_agg_early = 0; 12803 if ((ctf_outstanding(tp) + 12804 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 12805 minseg)) >= tp->snd_wnd) { 12806 /* We are limited by the rwnd */ 12807 app_limited = CTF_JR_RWND_LIMITED; 12808 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12809 /* We are limited by whats available -- app limited */ 12810 app_limited = CTF_JR_APP_LIMITED; 12811 } else if ((idle == 0) && 12812 ((tp->t_flags & TF_NODELAY) == 0) && 12813 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12814 (len < segsiz)) { 12815 /* 12816 * No delay is not on and the 12817 * user is sending less than 1MSS. This 12818 * brings out SWS avoidance so we 12819 * don't send. Another app-limited case. 12820 */ 12821 app_limited = CTF_JR_APP_LIMITED; 12822 } else if (tp->t_flags & TF_NOPUSH) { 12823 /* 12824 * The user has requested no push of 12825 * the last segment and we are 12826 * at the last segment. Another app 12827 * limited case. 12828 */ 12829 app_limited = CTF_JR_APP_LIMITED; 12830 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 12831 /* Its the cwnd */ 12832 app_limited = CTF_JR_CWND_LIMITED; 12833 } else if (rack->rc_in_persist == 1) { 12834 /* We are in persists */ 12835 app_limited = CTF_JR_PERSISTS; 12836 } else if (IN_RECOVERY(tp->t_flags) && 12837 (rack->rack_no_prr == 0) && 12838 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12839 app_limited = CTF_JR_PRR; 12840 } else { 12841 /* Now why here are we not sending? */ 12842 #ifdef NOW 12843 #ifdef INVARIANTS 12844 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 12845 #endif 12846 #endif 12847 app_limited = CTF_JR_ASSESSING; 12848 } 12849 /* 12850 * App limited in some fashion, for our pacing GP 12851 * measurements we don't want any gap (even cwnd). 12852 * Close down the measurement window. 12853 */ 12854 if (rack_cwnd_block_ends_measure && 12855 ((app_limited == CTF_JR_CWND_LIMITED) || 12856 (app_limited == CTF_JR_PRR))) { 12857 /* 12858 * The reason we are not sending is 12859 * the cwnd (or prr). We have been configured 12860 * to end the measurement window in 12861 * this case. 12862 */ 12863 end_window = 1; 12864 } else if (app_limited == CTF_JR_PERSISTS) { 12865 /* 12866 * We never end the measurement window 12867 * in persists, though in theory we 12868 * should be only entering after everything 12869 * is acknowledged (so we will probably 12870 * never come here). 12871 */ 12872 end_window = 0; 12873 } else if (rack_rwnd_block_ends_measure && 12874 (app_limited == CTF_JR_RWND_LIMITED)) { 12875 /* 12876 * We are rwnd limited and have been 12877 * configured to end the measurement 12878 * window in this case. 
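 * The idea is the same as the cwnd/prr case: if the peer's window,
 * not the path, is what is holding us back, a goodput sample taken
 * across the stall would understate the available bandwidth, so we
 * close the measurement window when so configured.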
12879 */ 12880 end_window = 1; 12881 } else if (app_limited == CTF_JR_APP_LIMITED) { 12882 /* 12883 * A true application limited period, we have 12884 * ran out of data. 12885 */ 12886 end_window = 1; 12887 } else if (app_limited == CTF_JR_ASSESSING) { 12888 /* 12889 * In the assessing case we hit the end of 12890 * the if/else and had no known reason 12891 * This will panic us under invariants.. 12892 * 12893 * If we get this out in logs we need to 12894 * investagate which reason we missed. 12895 */ 12896 end_window = 1; 12897 } 12898 if (end_window) { 12899 uint8_t log = 0; 12900 12901 if ((tp->t_flags & TF_GPUTINPROG) && 12902 SEQ_GT(tp->gput_ack, tp->snd_max)) { 12903 /* Mark the last packet has app limited */ 12904 tp->gput_ack = tp->snd_max; 12905 log = 1; 12906 } 12907 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12908 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 12909 if (rack->r_ctl.rc_app_limited_cnt == 0) 12910 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 12911 else { 12912 /* 12913 * Go out to the end app limited and mark 12914 * this new one as next and move the end_appl up 12915 * to this guy. 12916 */ 12917 if (rack->r_ctl.rc_end_appl) 12918 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 12919 rack->r_ctl.rc_end_appl = rsm; 12920 } 12921 rsm->r_flags |= RACK_APP_LIMITED; 12922 rack->r_ctl.rc_app_limited_cnt++; 12923 } 12924 if (log) 12925 rack_log_pacing_delay_calc(rack, 12926 rack->r_ctl.rc_app_limited_cnt, seq, 12927 tp->gput_ack, 0, 0, 4, __LINE__, NULL); 12928 } 12929 } 12930 if (slot) { 12931 /* set the rack tcb into the slot N */ 12932 counter_u64_add(rack_paced_segments, 1); 12933 } else if (tot_len_this_send) { 12934 counter_u64_add(rack_unpaced_segments, 1); 12935 } 12936 /* Check if we need to go into persists or not */ 12937 if ((rack->rc_in_persist == 0) && 12938 (tp->snd_max == tp->snd_una) && 12939 TCPS_HAVEESTABLISHED(tp->t_state) && 12940 sbavail(sb) && 12941 (sbavail(sb) > tp->snd_wnd) && 12942 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 12943 /* Yes lets make sure to move to persist before timer-start */ 12944 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12945 } 12946 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 12947 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 12948 } 12949 #ifdef NETFLIX_SHARED_CWND 12950 if ((sbavail(sb) == 0) && 12951 rack->r_ctl.rc_scw) { 12952 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12953 rack->rack_scwnd_is_idle = 1; 12954 } 12955 #endif 12956 return (0); 12957 12958 send: 12959 if ((flags & TH_FIN) && 12960 sbavail(sb)) { 12961 /* 12962 * We do not transmit a FIN 12963 * with data outstanding. We 12964 * need to make it so all data 12965 * is acked first. 12966 */ 12967 flags &= ~TH_FIN; 12968 } 12969 /* Enforce stack imposed max seg size if we have one */ 12970 if (rack->r_ctl.rc_pace_max_segs && 12971 (len > rack->r_ctl.rc_pace_max_segs)) { 12972 mark = 1; 12973 len = rack->r_ctl.rc_pace_max_segs; 12974 } 12975 SOCKBUF_LOCK_ASSERT(sb); 12976 if (len > 0) { 12977 if (len >= segsiz) 12978 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 12979 else 12980 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 12981 } 12982 /* 12983 * Before ESTABLISHED, force sending of initial options unless TCP 12984 * set not to do any options. NOTE: we assume that the IP/TCP header 12985 * plus TCP options always fit in a single mbuf, leaving room for a 12986 * maximum link header, i.e. 
max_linkhdr + sizeof (struct tcpiphdr) 12987 * + optlen <= MCLBYTES 12988 */ 12989 optlen = 0; 12990 #ifdef INET6 12991 if (isipv6) 12992 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12993 else 12994 #endif 12995 hdrlen = sizeof(struct tcpiphdr); 12996 12997 /* 12998 * Compute options for segment. We only have to care about SYN and 12999 * established connection segments. Options for SYN-ACK segments 13000 * are handled in TCP syncache. 13001 */ 13002 to.to_flags = 0; 13003 if ((tp->t_flags & TF_NOOPT) == 0) { 13004 /* Maximum segment size. */ 13005 if (flags & TH_SYN) { 13006 tp->snd_nxt = tp->iss; 13007 to.to_mss = tcp_mssopt(&inp->inp_inc); 13008 #ifdef NETFLIX_TCPOUDP 13009 if (tp->t_port) 13010 to.to_mss -= V_tcp_udp_tunneling_overhead; 13011 #endif 13012 to.to_flags |= TOF_MSS; 13013 13014 /* 13015 * On SYN or SYN|ACK transmits on TFO connections, 13016 * only include the TFO option if it is not a 13017 * retransmit, as the presence of the TFO option may 13018 * have caused the original SYN or SYN|ACK to have 13019 * been dropped by a middlebox. 13020 */ 13021 if (IS_FASTOPEN(tp->t_flags) && 13022 (tp->t_rxtshift == 0)) { 13023 if (tp->t_state == TCPS_SYN_RECEIVED) { 13024 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 13025 to.to_tfo_cookie = 13026 (u_int8_t *)&tp->t_tfo_cookie.server; 13027 to.to_flags |= TOF_FASTOPEN; 13028 wanted_cookie = 1; 13029 } else if (tp->t_state == TCPS_SYN_SENT) { 13030 to.to_tfo_len = 13031 tp->t_tfo_client_cookie_len; 13032 to.to_tfo_cookie = 13033 tp->t_tfo_cookie.client; 13034 to.to_flags |= TOF_FASTOPEN; 13035 wanted_cookie = 1; 13036 /* 13037 * If we wind up having more data to 13038 * send with the SYN than can fit in 13039 * one segment, don't send any more 13040 * until the SYN|ACK comes back from 13041 * the other end. 13042 */ 13043 sendalot = 0; 13044 } 13045 } 13046 } 13047 /* Window scaling. */ 13048 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 13049 to.to_wscale = tp->request_r_scale; 13050 to.to_flags |= TOF_SCALE; 13051 } 13052 /* Timestamps. */ 13053 if ((tp->t_flags & TF_RCVD_TSTMP) || 13054 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 13055 to.to_tsval = cts + tp->ts_offset; 13056 to.to_tsecr = tp->ts_recent; 13057 to.to_flags |= TOF_TS; 13058 } 13059 /* Set receive buffer autosizing timestamp. */ 13060 if (tp->rfbuf_ts == 0 && 13061 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 13062 tp->rfbuf_ts = tcp_ts_getticks(); 13063 /* Selective ACK's. */ 13064 if (flags & TH_SYN) 13065 to.to_flags |= TOF_SACKPERM; 13066 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 13067 tp->rcv_numsacks > 0) { 13068 to.to_flags |= TOF_SACK; 13069 to.to_nsacks = tp->rcv_numsacks; 13070 to.to_sacks = (u_char *)tp->sackblks; 13071 } 13072 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13073 /* TCP-MD5 (RFC2385). */ 13074 if (tp->t_flags & TF_SIGNATURE) 13075 to.to_flags |= TOF_SIGNATURE; 13076 #endif /* TCP_SIGNATURE */ 13077 13078 /* Processing the options. */ 13079 hdrlen += optlen = tcp_addoptions(&to, opt); 13080 /* 13081 * If we wanted a TFO option to be added, but it was unable 13082 * to fit, ensure no data is sent. 13083 */ 13084 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 13085 !(to.to_flags & TOF_FASTOPEN)) 13086 len = 0; 13087 } 13088 #ifdef NETFLIX_TCPOUDP 13089 if (tp->t_port) { 13090 if (V_tcp_udp_tunneling_port == 0) { 13091 /* The port was removed?? 
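 * t_port marks this connection as UDP-encapsulated, but
 * V_tcp_udp_tunneling_port has since been cleared, so we can no
 * longer build the encapsulation header; fail the transmit.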
*/ 13092 SOCKBUF_UNLOCK(&so->so_snd); 13093 return (EHOSTUNREACH); 13094 } 13095 hdrlen += sizeof(struct udphdr); 13096 } 13097 #endif 13098 #ifdef INET6 13099 if (isipv6) 13100 ipoptlen = ip6_optlen(tp->t_inpcb); 13101 else 13102 #endif 13103 if (tp->t_inpcb->inp_options) 13104 ipoptlen = tp->t_inpcb->inp_options->m_len - 13105 offsetof(struct ipoption, ipopt_list); 13106 else 13107 ipoptlen = 0; 13108 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13109 ipoptlen += ipsec_optlen; 13110 #endif 13111 13112 /* 13113 * Adjust data length if insertion of options will bump the packet 13114 * length beyond the t_maxseg length. Clear the FIN bit because we 13115 * cut off the tail of the segment. 13116 */ 13117 if (len + optlen + ipoptlen > tp->t_maxseg) { 13118 if (tso) { 13119 uint32_t if_hw_tsomax; 13120 uint32_t moff; 13121 int32_t max_len; 13122 13123 /* extract TSO information */ 13124 if_hw_tsomax = tp->t_tsomax; 13125 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13126 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13127 KASSERT(ipoptlen == 0, 13128 ("%s: TSO can't do IP options", __func__)); 13129 13130 /* 13131 * Check if we should limit by maximum payload 13132 * length: 13133 */ 13134 if (if_hw_tsomax != 0) { 13135 /* compute maximum TSO length */ 13136 max_len = (if_hw_tsomax - hdrlen - 13137 max_linkhdr); 13138 if (max_len <= 0) { 13139 len = 0; 13140 } else if (len > max_len) { 13141 sendalot = 1; 13142 len = max_len; 13143 mark = 2; 13144 } 13145 } 13146 /* 13147 * Prevent the last segment from being fractional 13148 * unless the send sockbuf can be emptied: 13149 */ 13150 max_len = (tp->t_maxseg - optlen); 13151 if ((sb_offset + len) < sbavail(sb)) { 13152 moff = len % (u_int)max_len; 13153 if (moff != 0) { 13154 mark = 3; 13155 len -= moff; 13156 } 13157 } 13158 /* 13159 * In case there are too many small fragments don't 13160 * use TSO: 13161 */ 13162 if (len <= segsiz) { 13163 mark = 4; 13164 tso = 0; 13165 } 13166 /* 13167 * Send the FIN in a separate segment after the bulk 13168 * sending is done. We don't trust the TSO 13169 * implementations to clear the FIN flag on all but 13170 * the last segment. 13171 */ 13172 if (tp->t_flags & TF_NEEDFIN) { 13173 sendalot = 4; 13174 } 13175 } else { 13176 mark = 5; 13177 if (optlen + ipoptlen >= tp->t_maxseg) { 13178 /* 13179 * Since we don't have enough space to put 13180 * the IP header chain and the TCP header in 13181 * one packet as required by RFC 7112, don't 13182 * send it. Also ensure that at least one 13183 * byte of the payload can be put into the 13184 * TCP segment. 13185 */ 13186 SOCKBUF_UNLOCK(&so->so_snd); 13187 error = EMSGSIZE; 13188 sack_rxmit = 0; 13189 goto out; 13190 } 13191 len = tp->t_maxseg - optlen - ipoptlen; 13192 sendalot = 5; 13193 } 13194 } else { 13195 tso = 0; 13196 mark = 6; 13197 } 13198 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13199 ("%s: len > IP_MAXPACKET", __func__)); 13200 #ifdef DIAGNOSTIC 13201 #ifdef INET6 13202 if (max_linkhdr + hdrlen > MCLBYTES) 13203 #else 13204 if (max_linkhdr + hdrlen > MHLEN) 13205 #endif 13206 panic("tcphdr too big"); 13207 #endif 13208 13209 /* 13210 * This KASSERT is here to catch edge cases at a well defined place. 13211 * Before, those had triggered (random) panic conditions further 13212 * down. 13213 */ 13214 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13215 if ((len == 0) && 13216 (flags & TH_FIN) && 13217 (sbused(sb))) { 13218 /* 13219 * We have outstanding data, don't send a fin by itself!. 
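 * The FIN will go out on a later call, once the remaining data has
 * been sent and acknowledged and the socket buffer has drained
 * (the check at the send: label strips TH_FIN while data remains).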
13220 */ 13221 goto just_return; 13222 } 13223 /* 13224 * Grab a header mbuf, attaching a copy of data to be transmitted, 13225 * and initialize the header from the template for sends on this 13226 * connection. 13227 */ 13228 if (len) { 13229 uint32_t max_val; 13230 uint32_t moff; 13231 13232 if (rack->r_ctl.rc_pace_max_segs) 13233 max_val = rack->r_ctl.rc_pace_max_segs; 13234 else if (rack->rc_user_set_max_segs) 13235 max_val = rack->rc_user_set_max_segs * segsiz; 13236 else 13237 max_val = len; 13238 /* 13239 * We allow a limit on sending with hptsi. 13240 */ 13241 if (len > max_val) { 13242 mark = 7; 13243 len = max_val; 13244 } 13245 #ifdef INET6 13246 if (MHLEN < hdrlen + max_linkhdr) 13247 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13248 else 13249 #endif 13250 m = m_gethdr(M_NOWAIT, MT_DATA); 13251 13252 if (m == NULL) { 13253 SOCKBUF_UNLOCK(sb); 13254 error = ENOBUFS; 13255 sack_rxmit = 0; 13256 goto out; 13257 } 13258 m->m_data += max_linkhdr; 13259 m->m_len = hdrlen; 13260 13261 /* 13262 * Start the m_copy functions from the closest mbuf to the 13263 * sb_offset in the socket buffer chain. 13264 */ 13265 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13266 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13267 m_copydata(mb, moff, (int)len, 13268 mtod(m, caddr_t)+hdrlen); 13269 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13270 sbsndptr_adv(sb, mb, len); 13271 m->m_len += len; 13272 } else { 13273 struct sockbuf *msb; 13274 13275 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13276 msb = NULL; 13277 else 13278 msb = sb; 13279 m->m_next = tcp_m_copym( 13280 mb, moff, &len, 13281 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 13282 ((rsm == NULL) ? hw_tls : 0) 13283 #ifdef NETFLIX_COPY_ARGS 13284 , &filled_all 13285 #endif 13286 ); 13287 if (len <= (tp->t_maxseg - optlen)) { 13288 /* 13289 * Must have ran out of mbufs for the copy 13290 * shorten it to no longer need tso. Lets 13291 * not put on sendalot since we are low on 13292 * mbufs. 13293 */ 13294 tso = 0; 13295 } 13296 if (m->m_next == NULL) { 13297 SOCKBUF_UNLOCK(sb); 13298 (void)m_free(m); 13299 error = ENOBUFS; 13300 sack_rxmit = 0; 13301 goto out; 13302 } 13303 } 13304 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 13305 if (rsm && (rsm->r_flags & RACK_TLP)) { 13306 /* 13307 * TLP should not count in retran count, but 13308 * in its own bin 13309 */ 13310 counter_u64_add(rack_tlp_retran, 1); 13311 counter_u64_add(rack_tlp_retran_bytes, len); 13312 } else { 13313 tp->t_sndrexmitpack++; 13314 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 13315 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 13316 } 13317 #ifdef STATS 13318 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 13319 len); 13320 #endif 13321 } else { 13322 KMOD_TCPSTAT_INC(tcps_sndpack); 13323 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 13324 #ifdef STATS 13325 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 13326 len); 13327 #endif 13328 } 13329 /* 13330 * If we're sending everything we've got, set PUSH. (This 13331 * will keep happy those implementations which only give 13332 * data to the user when a buffer fills or a PUSH comes in.) 
13333 */ 13334 if (sb_offset + len == sbused(sb) && 13335 sbused(sb) && 13336 !(flags & TH_SYN)) 13337 flags |= TH_PUSH; 13338 13339 SOCKBUF_UNLOCK(sb); 13340 } else { 13341 SOCKBUF_UNLOCK(sb); 13342 if (tp->t_flags & TF_ACKNOW) 13343 KMOD_TCPSTAT_INC(tcps_sndacks); 13344 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13345 KMOD_TCPSTAT_INC(tcps_sndctrl); 13346 else 13347 KMOD_TCPSTAT_INC(tcps_sndwinup); 13348 13349 m = m_gethdr(M_NOWAIT, MT_DATA); 13350 if (m == NULL) { 13351 error = ENOBUFS; 13352 sack_rxmit = 0; 13353 goto out; 13354 } 13355 #ifdef INET6 13356 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13357 MHLEN >= hdrlen) { 13358 M_ALIGN(m, hdrlen); 13359 } else 13360 #endif 13361 m->m_data += max_linkhdr; 13362 m->m_len = hdrlen; 13363 } 13364 SOCKBUF_UNLOCK_ASSERT(sb); 13365 m->m_pkthdr.rcvif = (struct ifnet *)0; 13366 #ifdef MAC 13367 mac_inpcb_create_mbuf(inp, m); 13368 #endif 13369 #ifdef INET6 13370 if (isipv6) { 13371 ip6 = mtod(m, struct ip6_hdr *); 13372 #ifdef NETFLIX_TCPOUDP 13373 if (tp->t_port) { 13374 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 13375 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13376 udp->uh_dport = tp->t_port; 13377 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13378 udp->uh_ulen = htons(ulen); 13379 th = (struct tcphdr *)(udp + 1); 13380 } else 13381 #endif 13382 th = (struct tcphdr *)(ip6 + 1); 13383 tcpip_fillheaders(inp, 13384 #ifdef NETFLIX_TCPOUDP 13385 tp->t_port, 13386 #endif 13387 ip6, th); 13388 } else 13389 #endif /* INET6 */ 13390 { 13391 ip = mtod(m, struct ip *); 13392 #ifdef TCPDEBUG 13393 ipov = (struct ipovly *)ip; 13394 #endif 13395 #ifdef NETFLIX_TCPOUDP 13396 if (tp->t_port) { 13397 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 13398 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13399 udp->uh_dport = tp->t_port; 13400 ulen = hdrlen + len - sizeof(struct ip); 13401 udp->uh_ulen = htons(ulen); 13402 th = (struct tcphdr *)(udp + 1); 13403 } else 13404 #endif 13405 th = (struct tcphdr *)(ip + 1); 13406 tcpip_fillheaders(inp, 13407 #ifdef NETFLIX_TCPOUDP 13408 tp->t_port, 13409 #endif 13410 ip, th); 13411 } 13412 /* 13413 * Fill in fields, remembering maximum advertised window for use in 13414 * delaying messages about window sizes. If resending a FIN, be sure 13415 * not to use a new sequence number. 13416 */ 13417 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 13418 tp->snd_nxt == tp->snd_max) 13419 tp->snd_nxt--; 13420 /* 13421 * If we are starting a connection, send ECN setup SYN packet. If we 13422 * are on a retransmit, we may resend those bits a number of times 13423 * as per RFC 3168. 13424 */ 13425 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 13426 if (tp->t_rxtshift >= 1) { 13427 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 13428 flags |= TH_ECE | TH_CWR; 13429 } else 13430 flags |= TH_ECE | TH_CWR; 13431 } 13432 /* Handle parallel SYN for ECN */ 13433 if ((tp->t_state == TCPS_SYN_RECEIVED) && 13434 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 13435 flags |= TH_ECE; 13436 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13437 } 13438 if (tp->t_state == TCPS_ESTABLISHED && 13439 (tp->t_flags2 & TF2_ECN_PERMIT)) { 13440 /* 13441 * If the peer has ECN, mark data packets with ECN capable 13442 * transmission (ECT). Ignore pure ack packets, 13443 * retransmissions. 
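 * That is what the test below enforces: only new, non-retransmitted
 * data gets ECT(0), and CWR is set on the next such segment after we
 * have responded to the peer's ECE, after which the pending flag is
 * cleared.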
13444 */ 13445 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 13446 (sack_rxmit == 0)) { 13447 #ifdef INET6 13448 if (isipv6) 13449 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 13450 else 13451 #endif 13452 ip->ip_tos |= IPTOS_ECN_ECT0; 13453 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13454 /* 13455 * Reply with proper ECN notifications. 13456 * Only set CWR on new data segments. 13457 */ 13458 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 13459 flags |= TH_CWR; 13460 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 13461 } 13462 } 13463 if (tp->t_flags2 & TF2_ECN_SND_ECE) 13464 flags |= TH_ECE; 13465 } 13466 /* 13467 * If we are doing retransmissions, then snd_nxt will not reflect 13468 * the first unsent octet. For ACK only packets, we do not want the 13469 * sequence number of the retransmitted packet, we want the sequence 13470 * number of the next unsent octet. So, if there is no data (and no 13471 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13472 * ti_seq. But if we are in persist state, snd_max might reflect 13473 * one byte beyond the right edge of the window, so use snd_nxt in 13474 * that case, since we know we aren't doing a retransmission. 13475 * (retransmit and persist are mutually exclusive...) 13476 */ 13477 if (sack_rxmit == 0) { 13478 if (len || (flags & (TH_SYN | TH_FIN)) || 13479 rack->rc_in_persist) { 13480 th->th_seq = htonl(tp->snd_nxt); 13481 rack_seq = tp->snd_nxt; 13482 } else if (flags & TH_RST) { 13483 /* 13484 * For a Reset send the last cum ack in sequence 13485 * (this like any other choice may still generate a 13486 * challenge ack, if a ack-update packet is in 13487 * flight). 13488 */ 13489 th->th_seq = htonl(tp->snd_una); 13490 rack_seq = tp->snd_una; 13491 } else { 13492 th->th_seq = htonl(tp->snd_max); 13493 rack_seq = tp->snd_max; 13494 } 13495 } else { 13496 th->th_seq = htonl(rsm->r_start); 13497 rack_seq = rsm->r_start; 13498 } 13499 th->th_ack = htonl(tp->rcv_nxt); 13500 if (optlen) { 13501 bcopy(opt, th + 1, optlen); 13502 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13503 } 13504 th->th_flags = flags; 13505 /* 13506 * Calculate receive window. Don't shrink window, but avoid silly 13507 * window syndrome. 13508 * If a RST segment is sent, advertise a window of zero. 13509 */ 13510 if (flags & TH_RST) { 13511 recwin = 0; 13512 } else { 13513 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 13514 recwin < (long)segsiz) 13515 recwin = 0; 13516 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13517 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 13518 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 13519 } 13520 13521 /* 13522 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13523 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13524 * handled in syncache. 13525 */ 13526 if (flags & TH_SYN) 13527 th->th_win = htons((u_short) 13528 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13529 else { 13530 /* Avoid shrinking window with window scaling. */ 13531 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13532 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13533 } 13534 /* 13535 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13536 * window. This may cause the remote transmitter to stall. This 13537 * flag tells soreceive() to disable delayed acknowledgements when 13538 * draining the buffer. This can occur if the receiver is 13539 * attempting to read more data than can be buffered prior to 13540 * transmitting on the connection. 
13541 */ 13542 if (th->th_win == 0) { 13543 tp->t_sndzerowin++; 13544 tp->t_flags |= TF_RXWIN0SENT; 13545 } else 13546 tp->t_flags &= ~TF_RXWIN0SENT; 13547 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 13548 13549 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13550 if (to.to_flags & TOF_SIGNATURE) { 13551 /* 13552 * Calculate MD5 signature and put it into the place 13553 * determined before. 13554 * NOTE: since TCP options buffer doesn't point into 13555 * mbuf's data, calculate offset and use it. 13556 */ 13557 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13558 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13559 /* 13560 * Do not send segment if the calculation of MD5 13561 * digest has failed. 13562 */ 13563 goto out; 13564 } 13565 } 13566 #endif 13567 13568 /* 13569 * Put TCP length in extended header, and then checksum extended 13570 * header and data. 13571 */ 13572 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13573 #ifdef INET6 13574 if (isipv6) { 13575 /* 13576 * ip6_plen is not need to be filled now, and will be filled 13577 * in ip6_output. 13578 */ 13579 if (tp->t_port) { 13580 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13581 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13582 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13583 th->th_sum = htons(0); 13584 UDPSTAT_INC(udps_opackets); 13585 } else { 13586 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13587 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13588 th->th_sum = in6_cksum_pseudo(ip6, 13589 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 13590 0); 13591 } 13592 } 13593 #endif 13594 #if defined(INET6) && defined(INET) 13595 else 13596 #endif 13597 #ifdef INET 13598 { 13599 if (tp->t_port) { 13600 m->m_pkthdr.csum_flags = CSUM_UDP; 13601 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13602 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13603 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13604 th->th_sum = htons(0); 13605 UDPSTAT_INC(udps_opackets); 13606 } else { 13607 m->m_pkthdr.csum_flags = CSUM_TCP; 13608 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13609 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13610 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13611 IPPROTO_TCP + len + optlen)); 13612 } 13613 /* IP version must be set here for ipv4/ipv6 checking later */ 13614 KASSERT(ip->ip_v == IPVERSION, 13615 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13616 } 13617 #endif 13618 /* 13619 * Enable TSO and specify the size of the segments. The TCP pseudo 13620 * header checksum is always provided. XXX: Fixme: This is currently 13621 * not the case for IPv6. 13622 */ 13623 if (tso) { 13624 KASSERT(len > tp->t_maxseg - optlen, 13625 ("%s: len <= tso_segsz", __func__)); 13626 m->m_pkthdr.csum_flags |= CSUM_TSO; 13627 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 13628 } 13629 KASSERT(len + hdrlen == m_length(m, NULL), 13630 ("%s: mbuf chain different than expected: %d + %u != %u", 13631 __func__, len, hdrlen, m_length(m, NULL))); 13632 13633 #ifdef TCP_HHOOK 13634 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 13635 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13636 #endif 13637 #ifdef TCPDEBUG 13638 /* 13639 * Trace. 
13640 */ 13641 if (so->so_options & SO_DEBUG) { 13642 u_short save = 0; 13643 13644 #ifdef INET6 13645 if (!isipv6) 13646 #endif 13647 { 13648 save = ipov->ih_len; 13649 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13650 * (th->th_off << 2) */ ); 13651 } 13652 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13653 #ifdef INET6 13654 if (!isipv6) 13655 #endif 13656 ipov->ih_len = save; 13657 } 13658 #endif /* TCPDEBUG */ 13659 13660 /* We're getting ready to send; log now. */ 13661 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13662 union tcp_log_stackspecific log; 13663 struct timeval tv; 13664 13665 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13666 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13667 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13668 if (rack->rack_no_prr) 13669 log.u_bbr.flex1 = 0; 13670 else 13671 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13672 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 13673 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 13674 log.u_bbr.flex4 = orig_len; 13675 if (filled_all) 13676 log.u_bbr.flex5 = 0x80000000; 13677 else 13678 log.u_bbr.flex5 = 0; 13679 /* Save off the early/late values */ 13680 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 13681 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 13682 log.u_bbr.bw_inuse = rack_get_bw(rack); 13683 if (rsm || sack_rxmit) { 13684 if (doing_tlp) 13685 log.u_bbr.flex8 = 2; 13686 else 13687 log.u_bbr.flex8 = 1; 13688 } else { 13689 log.u_bbr.flex8 = 0; 13690 } 13691 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 13692 log.u_bbr.flex7 = mark; 13693 log.u_bbr.pkts_out = tp->t_maxseg; 13694 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13695 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13696 log.u_bbr.lt_epoch = cwnd_to_use; 13697 log.u_bbr.delivered = sendalot; 13698 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13699 len, &log, false, NULL, NULL, 0, &tv); 13700 } else 13701 lgb = NULL; 13702 13703 /* 13704 * Fill in IP length and desired time to live and send to IP level. 13705 * There should be a better way to handle ttl and tos; we could keep 13706 * them in the template, but need a way to checksum without them. 13707 */ 13708 /* 13709 * m->m_pkthdr.len should have been set before cksum calcuration, 13710 * because in6_cksum() need it. 13711 */ 13712 #ifdef INET6 13713 if (isipv6) { 13714 /* 13715 * we separately set hoplimit for every segment, since the 13716 * user might want to change the value via setsockopt. Also, 13717 * desired default hop limit might be changed via Neighbor 13718 * Discovery. 13719 */ 13720 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13721 13722 /* 13723 * Set the packet size here for the benefit of DTrace 13724 * probes. ip6_output() will set it properly; it's supposed 13725 * to include the option header lengths as well. 13726 */ 13727 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13728 13729 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 13730 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13731 else 13732 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13733 13734 if (tp->t_state == TCPS_SYN_SENT) 13735 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13736 13737 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13738 /* TODO: IPv6 IP6TOS_ECT bit on */ 13739 error = ip6_output(m, inp->in6p_outputopts, 13740 &inp->inp_route6, 13741 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13742 NULL, NULL, inp); 13743 13744 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13745 mtu = inp->inp_route6.ro_nh->nh_mtu; 13746 } 13747 #endif /* INET6 */ 13748 #if defined(INET) && defined(INET6) 13749 else 13750 #endif 13751 #ifdef INET 13752 { 13753 ip->ip_len = htons(m->m_pkthdr.len); 13754 #ifdef INET6 13755 if (inp->inp_vflag & INP_IPV6PROTO) 13756 ip->ip_ttl = in6_selecthlim(inp, NULL); 13757 #endif /* INET6 */ 13758 /* 13759 * If we do path MTU discovery, then we set DF on every 13760 * packet. This might not be the best thing to do according 13761 * to RFC3390 Section 2. However the tcp hostcache migitates 13762 * the problem so it affects only the first tcp connection 13763 * with a host. 13764 * 13765 * NB: Don't set DF on small MTU/MSS to have a safe 13766 * fallback. 13767 */ 13768 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13769 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13770 if (tp->t_port == 0 || len < V_tcp_minmss) { 13771 ip->ip_off |= htons(IP_DF); 13772 } 13773 } else { 13774 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13775 } 13776 13777 if (tp->t_state == TCPS_SYN_SENT) 13778 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13779 13780 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13781 13782 error = ip_output(m, inp->inp_options, &inp->inp_route, 13783 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13784 inp); 13785 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13786 mtu = inp->inp_route.ro_nh->nh_mtu; 13787 } 13788 #endif /* INET */ 13789 13790 out: 13791 if (lgb) { 13792 lgb->tlb_errno = error; 13793 lgb = NULL; 13794 } 13795 /* 13796 * In transmit state, time the transmission and arrange for the 13797 * retransmit. In persist state, just set snd_max. 13798 */ 13799 if (error == 0) { 13800 rack->forced_ack = 0; /* If we send something zap the FA flag */ 13801 if (rsm && (doing_tlp == 0)) { 13802 /* Set we retransmitted */ 13803 rack->rc_gp_saw_rec = 1; 13804 } else { 13805 if (cwnd_to_use > tp->snd_ssthresh) { 13806 /* Set we sent in CA */ 13807 rack->rc_gp_saw_ca = 1; 13808 } else { 13809 /* Set we sent in SS */ 13810 rack->rc_gp_saw_ss = 1; 13811 } 13812 } 13813 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13814 (tp->t_flags & TF_SACK_PERMIT) && 13815 tp->rcv_numsacks > 0) 13816 tcp_clean_dsack_blocks(tp); 13817 tot_len_this_send += len; 13818 if (len == 0) 13819 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 13820 else if (len == 1) { 13821 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 13822 } else if (len > 1) { 13823 int idx; 13824 13825 idx = (len / segsiz) + 3; 13826 if (idx >= TCP_MSS_ACCT_ATIMER) 13827 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 13828 else 13829 counter_u64_add(rack_out_size[idx], 1); 13830 } 13831 } 13832 if (rack->rack_no_prr == 0) { 13833 if (sub_from_prr && (error == 0)) { 13834 if (rack->r_ctl.rc_prr_sndcnt >= len) 13835 rack->r_ctl.rc_prr_sndcnt -= len; 13836 else 13837 rack->r_ctl.rc_prr_sndcnt = 0; 13838 } 13839 } 13840 sub_from_prr = 0; 13841 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 13842 pass, rsm, us_cts); 13843 if ((error == 0) && 13844 (len > 0) && 13845 (tp->snd_una == tp->snd_max)) 13846 rack->r_ctl.rc_tlp_rxt_last_time = cts; 13847 /* Now are we in persists? 
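 * If not, this is the normal transmit path: advance snd_nxt over what
 * was just sent, do the TLP bookkeeping, and possibly start the RTT and
 * goodput measurements.  In the persist case below only snd_max is
 * pulled forward, since snd_nxt has to stay put while the peer's window
 * is closed.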
*/ 13848 if (rack->rc_in_persist == 0) { 13849 tcp_seq startseq = tp->snd_nxt; 13850 13851 /* Track our lost count */ 13852 if (rsm && (doing_tlp == 0)) 13853 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 13854 /* 13855 * Advance snd_nxt over sequence space of this segment. 13856 */ 13857 if (error) 13858 /* We don't log or do anything with errors */ 13859 goto nomore; 13860 if (doing_tlp == 0) { 13861 if (rsm == NULL) { 13862 /* 13863 * Not a retransmission of some 13864 * sort, new data is going out so 13865 * clear our TLP count and flag. 13866 */ 13867 rack->rc_tlp_in_progress = 0; 13868 rack->r_ctl.rc_tlp_cnt_out = 0; 13869 } 13870 } else { 13871 /* 13872 * We have just sent a TLP, mark that it is true 13873 * and make sure our in progress is set so we 13874 * continue to check the count. 13875 */ 13876 rack->rc_tlp_in_progress = 1; 13877 rack->r_ctl.rc_tlp_cnt_out++; 13878 } 13879 if (flags & (TH_SYN | TH_FIN)) { 13880 if (flags & TH_SYN) 13881 tp->snd_nxt++; 13882 if (flags & TH_FIN) { 13883 tp->snd_nxt++; 13884 tp->t_flags |= TF_SENTFIN; 13885 } 13886 } 13887 /* In the ENOBUFS case we do *not* update snd_max */ 13888 if (sack_rxmit) 13889 goto nomore; 13890 13891 tp->snd_nxt += len; 13892 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 13893 if (tp->snd_una == tp->snd_max) { 13894 /* 13895 * Update the time we just added data since 13896 * none was outstanding. 13897 */ 13898 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13899 tp->t_acktime = ticks; 13900 } 13901 tp->snd_max = tp->snd_nxt; 13902 /* 13903 * Time this transmission if not a retransmission and 13904 * not currently timing anything. 13905 * This is only relevant in case of switching back to 13906 * the base stack. 13907 */ 13908 if (tp->t_rtttime == 0) { 13909 tp->t_rtttime = ticks; 13910 tp->t_rtseq = startseq; 13911 KMOD_TCPSTAT_INC(tcps_segstimed); 13912 } 13913 if (len && 13914 ((tp->t_flags & TF_GPUTINPROG) == 0)) 13915 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 13916 } 13917 } else { 13918 /* 13919 * Persist case, update snd_max but since we are in persist 13920 * mode (no window) we do not update snd_nxt. 13921 */ 13922 int32_t xlen = len; 13923 13924 if (error) 13925 goto nomore; 13926 13927 if (flags & TH_SYN) 13928 ++xlen; 13929 if (flags & TH_FIN) { 13930 ++xlen; 13931 tp->t_flags |= TF_SENTFIN; 13932 } 13933 /* In the ENOBUFS case we do *not* update snd_max */ 13934 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 13935 if (tp->snd_una == tp->snd_max) { 13936 /* 13937 * Update the time we just added data since 13938 * none was outstanding. 13939 */ 13940 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13941 tp->t_acktime = ticks; 13942 } 13943 tp->snd_max = tp->snd_nxt + len; 13944 } 13945 } 13946 nomore: 13947 if (error) { 13948 rack->r_ctl.rc_agg_delayed = 0; 13949 rack->r_early = 0; 13950 rack->r_late = 0; 13951 rack->r_ctl.rc_agg_early = 0; 13952 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 13953 /* 13954 * Failures do not advance the seq counter above. For the 13955 * case of ENOBUFS we will fall out and retry in 1ms with 13956 * the hpts. Everything else will just have to retransmit 13957 * with the timer. 13958 * 13959 * In any case, we do not want to loop around for another 13960 * send without a good reason. 
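 *
 * For example, on the first ENOBUFS the pacing slot computed below
 * starts at 1 ms, may be clamped to half of rc_rack_rtt, and is then
 * raised to the 10 ms floor, so the actual retry is normally paced at
 * least 10 ms out.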
13961 */ 13962 sendalot = 0; 13963 switch (error) { 13964 case EPERM: 13965 tp->t_softerror = error; 13966 return (error); 13967 case ENOBUFS: 13968 if (slot == 0) { 13969 /* 13970 * Pace us right away to retry in a some 13971 * time 13972 */ 13973 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 13974 if (rack->rc_enobuf < 126) 13975 rack->rc_enobuf++; 13976 if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { 13977 slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; 13978 } 13979 if (slot < (10 * HPTS_USEC_IN_MSEC)) 13980 slot = 10 * HPTS_USEC_IN_MSEC; 13981 } 13982 counter_u64_add(rack_saw_enobuf, 1); 13983 error = 0; 13984 goto enobufs; 13985 case EMSGSIZE: 13986 /* 13987 * For some reason the interface we used initially 13988 * to send segments changed to another or lowered 13989 * its MTU. If TSO was active we either got an 13990 * interface without TSO capabilits or TSO was 13991 * turned off. If we obtained mtu from ip_output() 13992 * then update it and try again. 13993 */ 13994 if (tso) 13995 tp->t_flags &= ~TF_TSO; 13996 if (mtu != 0) { 13997 tcp_mss_update(tp, -1, mtu, NULL, NULL); 13998 goto again; 13999 } 14000 slot = 10 * HPTS_USEC_IN_MSEC; 14001 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 14002 return (error); 14003 case ENETUNREACH: 14004 counter_u64_add(rack_saw_enetunreach, 1); 14005 case EHOSTDOWN: 14006 case EHOSTUNREACH: 14007 case ENETDOWN: 14008 if (TCPS_HAVERCVDSYN(tp->t_state)) { 14009 tp->t_softerror = error; 14010 } 14011 /* FALLTHROUGH */ 14012 default: 14013 slot = 10 * HPTS_USEC_IN_MSEC; 14014 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 14015 return (error); 14016 } 14017 } else { 14018 rack->rc_enobuf = 0; 14019 } 14020 KMOD_TCPSTAT_INC(tcps_sndtotal); 14021 14022 /* 14023 * Data sent (as far as we can tell). If this advertises a larger 14024 * window than any other segment, then remember the size of the 14025 * advertised window. Any pending ACK has now been sent. 14026 */ 14027 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 14028 tp->rcv_adv = tp->rcv_nxt + recwin; 14029 tp->last_ack_sent = tp->rcv_nxt; 14030 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 14031 enobufs: 14032 /* Assure when we leave that snd_nxt will point to top */ 14033 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 14034 tp->snd_nxt = tp->snd_max; 14035 if (sendalot) { 14036 /* Do we need to turn off sendalot? */ 14037 if (rack->r_ctl.rc_pace_max_segs && 14038 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 14039 /* We hit our max. */ 14040 sendalot = 0; 14041 } else if ((rack->rc_user_set_max_segs) && 14042 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 14043 /* We hit the user defined max */ 14044 sendalot = 0; 14045 } 14046 } 14047 if ((error == 0) && (flags & TH_FIN)) 14048 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 14049 if (flags & TH_RST) { 14050 /* 14051 * We don't send again after sending a RST. 14052 */ 14053 slot = 0; 14054 sendalot = 0; 14055 if (error == 0) 14056 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 14057 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 14058 /* 14059 * Get our pacing rate, if an error 14060 * occured in sending (ENOBUF) we would 14061 * hit the else if with slot preset. Other 14062 * errors return. 14063 */ 14064 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 14065 } 14066 if (rsm && 14067 rack->use_rack_rr) { 14068 /* Its a retransmit and we use the rack cheat? 
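 * If so, and either no pacing slot was chosen, pacing is disabled, or
 * rrr_conf pins us to the legacy behavior, fall back below to the old
 * fixed spacing of rc_min_to milliseconds between retransmissions.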
*/ 14069 if ((slot == 0) || 14070 (rack->rc_always_pace == 0) || 14071 (rack->r_rr_config == 1)) { 14072 /* 14073 * We have no pacing set or we 14074 * are using old-style rack or 14075 * we are overriden to use the old 1ms pacing. 14076 */ 14077 slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; 14078 } 14079 } 14080 if (slot) { 14081 /* set the rack tcb into the slot N */ 14082 counter_u64_add(rack_paced_segments, 1); 14083 } else if (sendalot) { 14084 if (len) 14085 counter_u64_add(rack_unpaced_segments, 1); 14086 sack_rxmit = 0; 14087 goto again; 14088 } else if (len) { 14089 counter_u64_add(rack_unpaced_segments, 1); 14090 } 14091 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 14092 return (error); 14093 } 14094 14095 static void 14096 rack_update_seg(struct tcp_rack *rack) 14097 { 14098 uint32_t orig_val; 14099 14100 orig_val = rack->r_ctl.rc_pace_max_segs; 14101 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 14102 if (orig_val != rack->r_ctl.rc_pace_max_segs) 14103 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 14104 } 14105 14106 /* 14107 * rack_ctloutput() must drop the inpcb lock before performing copyin on 14108 * socket option arguments. When it re-acquires the lock after the copy, it 14109 * has to revalidate that the connection is still valid for the socket 14110 * option. 14111 */ 14112 static int 14113 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 14114 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14115 { 14116 struct epoch_tracker et; 14117 uint64_t val; 14118 int32_t error = 0, optval; 14119 uint16_t ca, ss; 14120 14121 switch (sopt->sopt_name) { 14122 case TCP_RACK_PROP_RATE: /* URL:prop_rate */ 14123 case TCP_RACK_PROP : /* URL:prop */ 14124 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 14125 case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ 14126 case TCP_RACK_PACE_REDUCE: /* Not used */ 14127 /* Pacing related ones */ 14128 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 14129 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 14130 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 14131 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 14132 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 14133 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 14134 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 14135 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 14136 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 14137 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 14138 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 14139 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 14140 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 14141 /* End pacing related */ 14142 case TCP_DELACK: 14143 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 14144 case TCP_RACK_MIN_TO: /* URL:min_to */ 14145 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 14146 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 14147 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 14148 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 14149 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 14150 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 14151 case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ 14152 case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ 14153 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 14154 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 14155 case TCP_RACK_DO_DETECTION: /* URL:detect */ 14156 case TCP_NO_PRR: /* URL:noprr */ 14157 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 14158 case TCP_DATA_AFTER_CLOSE: 14159 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 
14160 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 14161 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 14162 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 14163 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 14164 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 14165 case TCP_RACK_PROFILE: /* URL:profile */ 14166 break; 14167 default: 14168 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14169 break; 14170 } 14171 INP_WUNLOCK(inp); 14172 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14173 if (error) 14174 return (error); 14175 INP_WLOCK(inp); 14176 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 14177 INP_WUNLOCK(inp); 14178 return (ECONNRESET); 14179 } 14180 tp = intotcpcb(inp); 14181 rack = (struct tcp_rack *)tp->t_fb_ptr; 14182 switch (sopt->sopt_name) { 14183 case TCP_RACK_PROFILE: 14184 RACK_OPTS_INC(tcp_profile); 14185 if (optval == 1) { 14186 /* pace_always=1 */ 14187 rack->rc_always_pace = 1; 14188 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14189 /* scwnd=1 */ 14190 rack->rack_enable_scwnd = 1; 14191 /* dynamic=100 */ 14192 rack->rc_gp_dyn_mul = 1; 14193 rack->r_ctl.rack_per_of_gp_ca = 100; 14194 /* rrr_conf=3 */ 14195 rack->r_rr_config = 3; 14196 /* npush=2 */ 14197 rack->r_ctl.rc_no_push_at_mrtt = 2; 14198 /* fillcw=1 */ 14199 rack->rc_pace_to_cwnd = 1; 14200 rack->rc_pace_fill_if_rttin_range = 0; 14201 rack->rtt_limit_mul = 0; 14202 /* noprr=1 */ 14203 rack->rack_no_prr = 1; 14204 /* lscwnd=1 */ 14205 rack->r_limit_scw = 1; 14206 } else if (optval == 2) { 14207 /* pace_always=1 */ 14208 rack->rc_always_pace = 1; 14209 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14210 /* scwnd=1 */ 14211 rack->rack_enable_scwnd = 1; 14212 /* dynamic=100 */ 14213 rack->rc_gp_dyn_mul = 1; 14214 rack->r_ctl.rack_per_of_gp_ca = 100; 14215 /* rrr_conf=3 */ 14216 rack->r_rr_config = 3; 14217 /* npush=2 */ 14218 rack->r_ctl.rc_no_push_at_mrtt = 2; 14219 /* fillcw=1 */ 14220 rack->rc_pace_to_cwnd = 1; 14221 rack->rc_pace_fill_if_rttin_range = 0; 14222 rack->rtt_limit_mul = 0; 14223 /* noprr=1 */ 14224 rack->rack_no_prr = 1; 14225 /* lscwnd=0 */ 14226 rack->r_limit_scw = 0; 14227 } 14228 break; 14229 case TCP_SHARED_CWND_TIME_LIMIT: 14230 RACK_OPTS_INC(tcp_lscwnd); 14231 if (optval) 14232 rack->r_limit_scw = 1; 14233 else 14234 rack->r_limit_scw = 0; 14235 break; 14236 case TCP_RACK_PACE_TO_FILL: 14237 RACK_OPTS_INC(tcp_fillcw); 14238 if (optval == 0) 14239 rack->rc_pace_to_cwnd = 0; 14240 else 14241 rack->rc_pace_to_cwnd = 1; 14242 if ((optval >= rack_gp_rtt_maxmul) && 14243 rack_gp_rtt_maxmul && 14244 (optval < 0xf)) { 14245 rack->rc_pace_fill_if_rttin_range = 1; 14246 rack->rtt_limit_mul = optval; 14247 } else { 14248 rack->rc_pace_fill_if_rttin_range = 0; 14249 rack->rtt_limit_mul = 0; 14250 } 14251 break; 14252 case TCP_RACK_NO_PUSH_AT_MAX: 14253 RACK_OPTS_INC(tcp_npush); 14254 if (optval == 0) 14255 rack->r_ctl.rc_no_push_at_mrtt = 0; 14256 else if (optval < 0xff) 14257 rack->r_ctl.rc_no_push_at_mrtt = optval; 14258 else 14259 error = EINVAL; 14260 break; 14261 case TCP_SHARED_CWND_ENABLE: 14262 RACK_OPTS_INC(tcp_rack_scwnd); 14263 if (optval == 0) 14264 rack->rack_enable_scwnd = 0; 14265 else 14266 rack->rack_enable_scwnd = 1; 14267 break; 14268 case TCP_RACK_MBUF_QUEUE: 14269 /* Now do we use the LRO mbuf-queue feature */ 14270 RACK_OPTS_INC(tcp_rack_mbufq); 14271 if (optval) 14272 rack->r_mbuf_queue = 1; 14273 else 14274 rack->r_mbuf_queue = 0; 14275 if (rack->r_mbuf_queue || rack->rc_always_pace) 14276 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14277 else 
14278 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14279 break; 14280 case TCP_RACK_NONRXT_CFG_RATE: 14281 RACK_OPTS_INC(tcp_rack_cfg_rate); 14282 if (optval == 0) 14283 rack->rack_rec_nonrxt_use_cr = 0; 14284 else 14285 rack->rack_rec_nonrxt_use_cr = 1; 14286 break; 14287 case TCP_NO_PRR: 14288 RACK_OPTS_INC(tcp_rack_noprr); 14289 if (optval == 0) 14290 rack->rack_no_prr = 0; 14291 else 14292 rack->rack_no_prr = 1; 14293 break; 14294 case TCP_TIMELY_DYN_ADJ: 14295 RACK_OPTS_INC(tcp_timely_dyn); 14296 if (optval == 0) 14297 rack->rc_gp_dyn_mul = 0; 14298 else { 14299 rack->rc_gp_dyn_mul = 1; 14300 if (optval >= 100) { 14301 /* 14302 * If the user sets something 100 or more 14303 * its the gp_ca value. 14304 */ 14305 rack->r_ctl.rack_per_of_gp_ca = optval; 14306 } 14307 } 14308 break; 14309 case TCP_RACK_DO_DETECTION: 14310 RACK_OPTS_INC(tcp_rack_do_detection); 14311 if (optval == 0) 14312 rack->do_detection = 0; 14313 else 14314 rack->do_detection = 1; 14315 break; 14316 case TCP_RACK_PROP_RATE: 14317 if ((optval <= 0) || (optval >= 100)) { 14318 error = EINVAL; 14319 break; 14320 } 14321 RACK_OPTS_INC(tcp_rack_prop_rate); 14322 rack->r_ctl.rc_prop_rate = optval; 14323 break; 14324 case TCP_RACK_TLP_USE: 14325 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 14326 error = EINVAL; 14327 break; 14328 } 14329 RACK_OPTS_INC(tcp_tlp_use); 14330 rack->rack_tlp_threshold_use = optval; 14331 break; 14332 case TCP_RACK_PROP: 14333 /* RACK proportional rate reduction (bool) */ 14334 RACK_OPTS_INC(tcp_rack_prop); 14335 rack->r_ctl.rc_prop_reduce = optval; 14336 break; 14337 case TCP_RACK_TLP_REDUCE: 14338 /* RACK TLP cwnd reduction (bool) */ 14339 RACK_OPTS_INC(tcp_rack_tlp_reduce); 14340 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 14341 break; 14342 case TCP_RACK_EARLY_RECOV: 14343 /* Should recovery happen early (bool) */ 14344 RACK_OPTS_INC(tcp_rack_early_recov); 14345 rack->r_ctl.rc_early_recovery = optval; 14346 break; 14347 14348 /* Pacing related ones */ 14349 case TCP_RACK_PACE_ALWAYS: 14350 /* 14351 * zero is old rack method, 1 is new 14352 * method using a pacing rate. 14353 */ 14354 RACK_OPTS_INC(tcp_rack_pace_always); 14355 if (optval > 0) 14356 rack->rc_always_pace = 1; 14357 else 14358 rack->rc_always_pace = 0; 14359 if (rack->r_mbuf_queue || rack->rc_always_pace) 14360 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14361 else 14362 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14363 /* A rate may be set irate or other, if so set seg size */ 14364 rack_update_seg(rack); 14365 break; 14366 case TCP_BBR_RACK_INIT_RATE: 14367 RACK_OPTS_INC(tcp_initial_rate); 14368 val = optval; 14369 /* Change from kbits per second to bytes per second */ 14370 val *= 1000; 14371 val /= 8; 14372 rack->r_ctl.init_rate = val; 14373 if (rack->rc_init_win != rack_default_init_window) { 14374 uint32_t win, snt; 14375 14376 /* 14377 * Options don't always get applied 14378 * in the order you think. So in order 14379 * to assure we update a cwnd we need 14380 * to check and see if we are still 14381 * where we should raise the cwnd. 
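 *
 * For example, an irate of 10000 (kbits/sec) was converted above into
 * an init_rate of 1,250,000 bytes/sec; if less than a full initial
 * window has been sent and snd_cwnd is still below rc_init_window(),
 * the cwnd is raised to that window so the configured rate can be put
 * to use right away.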
14382 */ 14383 win = rc_init_window(rack); 14384 if (SEQ_GT(tp->snd_max, tp->iss)) 14385 snt = tp->snd_max - tp->iss; 14386 else 14387 snt = 0; 14388 if ((snt < win) && 14389 (tp->snd_cwnd < win)) 14390 tp->snd_cwnd = win; 14391 } 14392 if (rack->rc_always_pace) 14393 rack_update_seg(rack); 14394 break; 14395 case TCP_BBR_IWINTSO: 14396 RACK_OPTS_INC(tcp_initial_win); 14397 if (optval && (optval <= 0xff)) { 14398 uint32_t win, snt; 14399 14400 rack->rc_init_win = optval; 14401 win = rc_init_window(rack); 14402 if (SEQ_GT(tp->snd_max, tp->iss)) 14403 snt = tp->snd_max - tp->iss; 14404 else 14405 snt = 0; 14406 if ((snt < win) && 14407 (tp->t_srtt | 14408 #ifdef NETFLIX_PEAKRATE 14409 tp->t_maxpeakrate | 14410 #endif 14411 rack->r_ctl.init_rate)) { 14412 /* 14413 * We are not past the initial window 14414 * and we have some bases for pacing, 14415 * so we need to possibly adjust up 14416 * the cwnd. Note even if we don't set 14417 * the cwnd, its still ok to raise the rc_init_win 14418 * which can be used coming out of idle when we 14419 * would have a rate. 14420 */ 14421 if (tp->snd_cwnd < win) 14422 tp->snd_cwnd = win; 14423 } 14424 if (rack->rc_always_pace) 14425 rack_update_seg(rack); 14426 } else 14427 error = EINVAL; 14428 break; 14429 case TCP_RACK_FORCE_MSEG: 14430 RACK_OPTS_INC(tcp_rack_force_max_seg); 14431 if (optval) 14432 rack->rc_force_max_seg = 1; 14433 else 14434 rack->rc_force_max_seg = 0; 14435 break; 14436 case TCP_RACK_PACE_MAX_SEG: 14437 /* Max segments size in a pace in bytes */ 14438 RACK_OPTS_INC(tcp_rack_max_seg); 14439 rack->rc_user_set_max_segs = optval; 14440 rack_set_pace_segments(tp, rack, __LINE__); 14441 break; 14442 case TCP_RACK_PACE_RATE_REC: 14443 /* Set the fixed pacing rate in Bytes per second ca */ 14444 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 14445 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14446 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14447 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14448 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14449 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14450 rack->use_fixed_rate = 1; 14451 rack_log_pacing_delay_calc(rack, 14452 rack->r_ctl.rc_fixed_pacing_rate_ss, 14453 rack->r_ctl.rc_fixed_pacing_rate_ca, 14454 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14455 __LINE__, NULL); 14456 break; 14457 14458 case TCP_RACK_PACE_RATE_SS: 14459 /* Set the fixed pacing rate in Bytes per second ca */ 14460 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 14461 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14462 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14463 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14464 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14465 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14466 rack->use_fixed_rate = 1; 14467 rack_log_pacing_delay_calc(rack, 14468 rack->r_ctl.rc_fixed_pacing_rate_ss, 14469 rack->r_ctl.rc_fixed_pacing_rate_ca, 14470 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14471 __LINE__, NULL); 14472 break; 14473 14474 case TCP_RACK_PACE_RATE_CA: 14475 /* Set the fixed pacing rate in Bytes per second ca */ 14476 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 14477 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14478 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14479 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14480 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14481 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14482 rack->use_fixed_rate = 1; 14483 rack_log_pacing_delay_calc(rack, 14484 rack->r_ctl.rc_fixed_pacing_rate_ss, 14485 rack->r_ctl.rc_fixed_pacing_rate_ca, 14486 
rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14487 __LINE__, NULL); 14488 break; 14489 case TCP_RACK_GP_INCREASE_REC: 14490 RACK_OPTS_INC(tcp_gp_inc_rec); 14491 rack->r_ctl.rack_per_of_gp_rec = optval; 14492 rack_log_pacing_delay_calc(rack, 14493 rack->r_ctl.rack_per_of_gp_ss, 14494 rack->r_ctl.rack_per_of_gp_ca, 14495 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14496 __LINE__, NULL); 14497 break; 14498 case TCP_RACK_GP_INCREASE_CA: 14499 RACK_OPTS_INC(tcp_gp_inc_ca); 14500 ca = optval; 14501 if (ca < 100) { 14502 /* 14503 * We don't allow any reduction 14504 * over the GP b/w. 14505 */ 14506 error = EINVAL; 14507 break; 14508 } 14509 rack->r_ctl.rack_per_of_gp_ca = ca; 14510 rack_log_pacing_delay_calc(rack, 14511 rack->r_ctl.rack_per_of_gp_ss, 14512 rack->r_ctl.rack_per_of_gp_ca, 14513 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14514 __LINE__, NULL); 14515 break; 14516 case TCP_RACK_GP_INCREASE_SS: 14517 RACK_OPTS_INC(tcp_gp_inc_ss); 14518 ss = optval; 14519 if (ss < 100) { 14520 /* 14521 * We don't allow any reduction 14522 * over the GP b/w. 14523 */ 14524 error = EINVAL; 14525 break; 14526 } 14527 rack->r_ctl.rack_per_of_gp_ss = ss; 14528 rack_log_pacing_delay_calc(rack, 14529 rack->r_ctl.rack_per_of_gp_ss, 14530 rack->r_ctl.rack_per_of_gp_ca, 14531 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14532 __LINE__, NULL); 14533 break; 14534 case TCP_RACK_RR_CONF: 14535 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 14536 if (optval && optval <= 3) 14537 rack->r_rr_config = optval; 14538 else 14539 rack->r_rr_config = 0; 14540 break; 14541 case TCP_BBR_HDWR_PACE: 14542 RACK_OPTS_INC(tcp_hdwr_pacing); 14543 if (optval){ 14544 if (rack->rack_hdrw_pacing == 0) { 14545 rack->rack_hdw_pace_ena = 1; 14546 rack->rack_attempt_hdwr_pace = 0; 14547 } else 14548 error = EALREADY; 14549 } else { 14550 rack->rack_hdw_pace_ena = 0; 14551 #ifdef RATELIMIT 14552 if (rack->rack_hdrw_pacing) { 14553 rack->rack_hdrw_pacing = 0; 14554 in_pcbdetach_txrtlmt(rack->rc_inp); 14555 } 14556 #endif 14557 } 14558 break; 14559 /* End Pacing related ones */ 14560 case TCP_RACK_PRR_SENDALOT: 14561 /* Allow PRR to send more than one seg */ 14562 RACK_OPTS_INC(tcp_rack_prr_sendalot); 14563 rack->r_ctl.rc_prr_sendalot = optval; 14564 break; 14565 case TCP_RACK_MIN_TO: 14566 /* Minimum time between rack t-o's in ms */ 14567 RACK_OPTS_INC(tcp_rack_min_to); 14568 rack->r_ctl.rc_min_to = optval; 14569 break; 14570 case TCP_RACK_EARLY_SEG: 14571 /* If early recovery max segments */ 14572 RACK_OPTS_INC(tcp_rack_early_seg); 14573 rack->r_ctl.rc_early_recovery_segs = optval; 14574 break; 14575 case TCP_RACK_REORD_THRESH: 14576 /* RACK reorder threshold (shift amount) */ 14577 RACK_OPTS_INC(tcp_rack_reord_thresh); 14578 if ((optval > 0) && (optval < 31)) 14579 rack->r_ctl.rc_reorder_shift = optval; 14580 else 14581 error = EINVAL; 14582 break; 14583 case TCP_RACK_REORD_FADE: 14584 /* Does reordering fade after ms time */ 14585 RACK_OPTS_INC(tcp_rack_reord_fade); 14586 rack->r_ctl.rc_reorder_fade = optval; 14587 break; 14588 case TCP_RACK_TLP_THRESH: 14589 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14590 RACK_OPTS_INC(tcp_rack_tlp_thresh); 14591 if (optval) 14592 rack->r_ctl.rc_tlp_threshold = optval; 14593 else 14594 error = EINVAL; 14595 break; 14596 case TCP_BBR_USE_RACK_RR: 14597 RACK_OPTS_INC(tcp_rack_rr); 14598 if (optval) 14599 rack->use_rack_rr = 1; 14600 else 14601 rack->use_rack_rr = 0; 14602 break; 14603 case TCP_RACK_PKT_DELAY: 14604 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14605 RACK_OPTS_INC(tcp_rack_pkt_delay); 14606 rack->r_ctl.rc_pkt_delay = optval; 14607 break; 14608 case TCP_RACK_TLP_INC_VAR: 14609 /* Does TLP include rtt variance in t-o */ 14610 error = EINVAL; 14611 break; 14612 case TCP_RACK_IDLE_REDUCE_HIGH: 14613 error = EINVAL; 14614 break; 14615 case TCP_DELACK: 14616 if (optval == 0) 14617 tp->t_delayed_ack = 0; 14618 else 14619 tp->t_delayed_ack = 1; 14620 if (tp->t_flags & TF_DELACK) { 14621 tp->t_flags &= ~TF_DELACK; 14622 tp->t_flags |= TF_ACKNOW; 14623 NET_EPOCH_ENTER(et); 14624 rack_output(tp); 14625 NET_EPOCH_EXIT(et); 14626 } 14627 break; 14628 14629 case TCP_BBR_RACK_RTT_USE: 14630 if ((optval != USE_RTT_HIGH) && 14631 (optval != USE_RTT_LOW) && 14632 (optval != USE_RTT_AVG)) 14633 error = EINVAL; 14634 else 14635 rack->r_ctl.rc_rate_sample_method = optval; 14636 break; 14637 case TCP_DATA_AFTER_CLOSE: 14638 if (optval) 14639 rack->rc_allow_data_af_clo = 1; 14640 else 14641 rack->rc_allow_data_af_clo = 0; 14642 break; 14643 case TCP_RACK_PACE_REDUCE: 14644 /* sysctl only now */ 14645 error = EINVAL; 14646 break; 14647 default: 14648 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14649 break; 14650 } 14651 #ifdef NETFLIX_STATS 14652 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 14653 #endif 14654 INP_WUNLOCK(inp); 14655 return (error); 14656 } 14657 14658 static int 14659 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 14660 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14661 { 14662 int32_t error, optval; 14663 uint64_t val; 14664 /* 14665 * Because all our options are either boolean or an int, we can just 14666 * pull everything into optval and then unlock and copy. If we ever 14667 * add an option that is not an int, then this will have quite an 14668 * impact on this routine.
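 * (TCP_BBR_RACK_INIT_RATE already has to squeeze its 64-bit rate back
 * into optval by converting it to kbits/sec below; anything genuinely
 * 64-bit or structured would need its own copyout path after the
 * switch.)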
14669 */ 14670 error = 0; 14671 switch (sopt->sopt_name) { 14672 case TCP_RACK_PROFILE: 14673 /* You cannot retrieve a profile, its write only */ 14674 error = EINVAL; 14675 break; 14676 case TCP_RACK_PACE_TO_FILL: 14677 optval = rack->rc_pace_to_cwnd; 14678 break; 14679 case TCP_RACK_NO_PUSH_AT_MAX: 14680 optval = rack->r_ctl.rc_no_push_at_mrtt; 14681 break; 14682 case TCP_SHARED_CWND_ENABLE: 14683 optval = rack->rack_enable_scwnd; 14684 break; 14685 case TCP_RACK_NONRXT_CFG_RATE: 14686 optval = rack->rack_rec_nonrxt_use_cr; 14687 break; 14688 case TCP_NO_PRR: 14689 optval = rack->rack_no_prr; 14690 break; 14691 case TCP_RACK_DO_DETECTION: 14692 optval = rack->do_detection; 14693 break; 14694 case TCP_RACK_MBUF_QUEUE: 14695 /* Now do we use the LRO mbuf-queue feature */ 14696 optval = rack->r_mbuf_queue; 14697 break; 14698 case TCP_TIMELY_DYN_ADJ: 14699 optval = rack->rc_gp_dyn_mul; 14700 break; 14701 case TCP_BBR_IWINTSO: 14702 optval = rack->rc_init_win; 14703 break; 14704 case TCP_RACK_PROP_RATE: 14705 optval = rack->r_ctl.rc_prop_rate; 14706 break; 14707 case TCP_RACK_PROP: 14708 /* RACK proportional rate reduction (bool) */ 14709 optval = rack->r_ctl.rc_prop_reduce; 14710 break; 14711 case TCP_RACK_TLP_REDUCE: 14712 /* RACK TLP cwnd reduction (bool) */ 14713 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 14714 break; 14715 case TCP_RACK_EARLY_RECOV: 14716 /* Should recovery happen early (bool) */ 14717 optval = rack->r_ctl.rc_early_recovery; 14718 break; 14719 case TCP_RACK_PACE_REDUCE: 14720 /* RACK Hptsi reduction factor (divisor) */ 14721 error = EINVAL; 14722 break; 14723 case TCP_BBR_RACK_INIT_RATE: 14724 val = rack->r_ctl.init_rate; 14725 /* convert to kbits per sec */ 14726 val *= 8; 14727 val /= 1000; 14728 optval = (uint32_t)val; 14729 break; 14730 case TCP_RACK_FORCE_MSEG: 14731 optval = rack->rc_force_max_seg; 14732 break; 14733 case TCP_RACK_PACE_MAX_SEG: 14734 /* Max segments in a pace */ 14735 optval = rack->rc_user_set_max_segs; 14736 break; 14737 case TCP_RACK_PACE_ALWAYS: 14738 /* Use the always pace method */ 14739 optval = rack->rc_always_pace; 14740 break; 14741 case TCP_RACK_PRR_SENDALOT: 14742 /* Allow PRR to send more than one seg */ 14743 optval = rack->r_ctl.rc_prr_sendalot; 14744 break; 14745 case TCP_RACK_MIN_TO: 14746 /* Minimum time between rack t-o's in ms */ 14747 optval = rack->r_ctl.rc_min_to; 14748 break; 14749 case TCP_RACK_EARLY_SEG: 14750 /* If early recovery max segments */ 14751 optval = rack->r_ctl.rc_early_recovery_segs; 14752 break; 14753 case TCP_RACK_REORD_THRESH: 14754 /* RACK reorder threshold (shift amount) */ 14755 optval = rack->r_ctl.rc_reorder_shift; 14756 break; 14757 case TCP_RACK_REORD_FADE: 14758 /* Does reordering fade after ms time */ 14759 optval = rack->r_ctl.rc_reorder_fade; 14760 break; 14761 case TCP_BBR_USE_RACK_RR: 14762 /* Do we use the rack cheat for rxt */ 14763 optval = rack->use_rack_rr; 14764 break; 14765 case TCP_RACK_RR_CONF: 14766 optval = rack->r_rr_config; 14767 break; 14768 case TCP_BBR_HDWR_PACE: 14769 optval = rack->rack_hdw_pace_ena; 14770 break; 14771 case TCP_RACK_TLP_THRESH: 14772 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14773 optval = rack->r_ctl.rc_tlp_threshold; 14774 break; 14775 case TCP_RACK_PKT_DELAY: 14776 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14777 optval = rack->r_ctl.rc_pkt_delay; 14778 break; 14779 case TCP_RACK_TLP_USE: 14780 optval = rack->rack_tlp_threshold_use; 14781 break; 14782 case TCP_RACK_TLP_INC_VAR: 14783 /* Does TLP include rtt variance in t-o */ 14784 error = EINVAL; 14785 break; 14786 case TCP_RACK_IDLE_REDUCE_HIGH: 14787 error = EINVAL; 14788 break; 14789 case TCP_RACK_PACE_RATE_CA: 14790 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 14791 break; 14792 case TCP_RACK_PACE_RATE_SS: 14793 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 14794 break; 14795 case TCP_RACK_PACE_RATE_REC: 14796 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 14797 break; 14798 case TCP_RACK_GP_INCREASE_SS: 14799 optval = rack->r_ctl.rack_per_of_gp_ca; 14800 break; 14801 case TCP_RACK_GP_INCREASE_CA: 14802 optval = rack->r_ctl.rack_per_of_gp_ss; 14803 break; 14804 case TCP_BBR_RACK_RTT_USE: 14805 optval = rack->r_ctl.rc_rate_sample_method; 14806 break; 14807 case TCP_DELACK: 14808 optval = tp->t_delayed_ack; 14809 break; 14810 case TCP_DATA_AFTER_CLOSE: 14811 optval = rack->rc_allow_data_af_clo; 14812 break; 14813 case TCP_SHARED_CWND_TIME_LIMIT: 14814 optval = rack->r_limit_scw; 14815 break; 14816 default: 14817 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14818 break; 14819 } 14820 INP_WUNLOCK(inp); 14821 if (error == 0) { 14822 error = sooptcopyout(sopt, &optval, sizeof optval); 14823 } 14824 return (error); 14825 } 14826 14827 static int 14828 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 14829 { 14830 int32_t error = EINVAL; 14831 struct tcp_rack *rack; 14832 14833 rack = (struct tcp_rack *)tp->t_fb_ptr; 14834 if (rack == NULL) { 14835 /* Huh? */ 14836 goto out; 14837 } 14838 if (sopt->sopt_dir == SOPT_SET) { 14839 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 14840 } else if (sopt->sopt_dir == SOPT_GET) { 14841 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 14842 } 14843 out: 14844 INP_WUNLOCK(inp); 14845 return (error); 14846 } 14847 14848 static int 14849 rack_pru_options(struct tcpcb *tp, int flags) 14850 { 14851 if (flags & PRUS_OOB) 14852 return (EOPNOTSUPP); 14853 return (0); 14854 } 14855 14856 static struct tcp_function_block __tcp_rack = { 14857 .tfb_tcp_block_name = __XSTRING(STACKNAME), 14858 .tfb_tcp_output = rack_output, 14859 .tfb_do_queued_segments = ctf_do_queued_segments, 14860 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 14861 .tfb_tcp_do_segment = rack_do_segment, 14862 .tfb_tcp_ctloutput = rack_ctloutput, 14863 .tfb_tcp_fb_init = rack_init, 14864 .tfb_tcp_fb_fini = rack_fini, 14865 .tfb_tcp_timer_stop_all = rack_stopall, 14866 .tfb_tcp_timer_activate = rack_timer_activate, 14867 .tfb_tcp_timer_active = rack_timer_active, 14868 .tfb_tcp_timer_stop = rack_timer_stop, 14869 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 14870 .tfb_tcp_handoff_ok = rack_handoff_ok, 14871 .tfb_pru_options = rack_pru_options, 14872 }; 14873 14874 static const char *rack_stack_names[] = { 14875 __XSTRING(STACKNAME), 14876 #ifdef STACKALIAS 14877 __XSTRING(STACKALIAS), 14878 #endif 14879 }; 14880 14881 static int 14882 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 14883 { 14884 memset(mem, 0, size); 14885 return (0); 14886 } 14887 14888 static void 14889 rack_dtor(void *mem, int32_t size, void *arg) 14890 { 14891 14892 } 14893 14894 static bool rack_mod_inited = false; 14895 14896 static int 14897 tcp_addrack(module_t mod, int32_t type, void *data) 14898 { 14899 int32_t err = 0; 14900 int num_stacks; 14901 14902 switch (type) { 14903 case 
MOD_LOAD: 14904 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 14905 sizeof(struct rack_sendmap), 14906 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 14907 14908 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 14909 sizeof(struct tcp_rack), 14910 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 14911 14912 sysctl_ctx_init(&rack_sysctl_ctx); 14913 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 14914 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 14915 OID_AUTO, 14916 #ifdef STACKALIAS 14917 __XSTRING(STACKALIAS), 14918 #else 14919 __XSTRING(STACKNAME), 14920 #endif 14921 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 14922 ""); 14923 if (rack_sysctl_root == NULL) { 14924 printf("Failed to add sysctl node\n"); 14925 err = EFAULT; 14926 goto free_uma; 14927 } 14928 rack_init_sysctls(); 14929 num_stacks = nitems(rack_stack_names); 14930 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 14931 rack_stack_names, &num_stacks); 14932 if (err) { 14933 printf("Failed to register %s stack name for " 14934 "%s module\n", rack_stack_names[num_stacks], 14935 __XSTRING(MODNAME)); 14936 sysctl_ctx_free(&rack_sysctl_ctx); 14937 free_uma: 14938 uma_zdestroy(rack_zone); 14939 uma_zdestroy(rack_pcb_zone); 14940 rack_counter_destroy(); 14941 printf("Failed to register rack module -- err:%d\n", err); 14942 return (err); 14943 } 14944 tcp_lro_reg_mbufq(); 14945 rack_mod_inited = true; 14946 break; 14947 case MOD_QUIESCE: 14948 err = deregister_tcp_functions(&__tcp_rack, true, false); 14949 break; 14950 case MOD_UNLOAD: 14951 err = deregister_tcp_functions(&__tcp_rack, false, true); 14952 if (err == EBUSY) 14953 break; 14954 if (rack_mod_inited) { 14955 uma_zdestroy(rack_zone); 14956 uma_zdestroy(rack_pcb_zone); 14957 sysctl_ctx_free(&rack_sysctl_ctx); 14958 rack_counter_destroy(); 14959 rack_mod_inited = false; 14960 } 14961 tcp_lro_dereg_mbufq(); 14962 err = 0; 14963 break; 14964 default: 14965 return (EOPNOTSUPP); 14966 } 14967 return (err); 14968 } 14969 14970 static moduledata_t tcp_rack = { 14971 .name = __XSTRING(MODNAME), 14972 .evhand = tcp_addrack, 14973 .priv = 0 14974 }; 14975 14976 MODULE_VERSION(MODNAME, 1); 14977 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 14978 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 14979
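/*
 * A minimal userland sketch (not part of the module above) of how an
 * application on FreeBSD might move a connection onto this stack and
 * then exercise a few of the options handled by rack_set_sockopt() and
 * rack_get_sockopt().  It assumes that <netinet/tcp.h> exposes the
 * TCP_FUNCTION_BLK and TCP_RACK_* definitions, that the tcp_rack module
 * is loaded, and that the stack registered under the name "rack"
 * (i.e. __XSTRING(STACKNAME)); adjust the name if STACKALIAS is in use.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <err.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct tcp_function_set tfs;
	int fd, profile = 1, one = 1, minto;
	socklen_t len = sizeof(minto);

	if ((fd = socket(AF_INET, SOCK_STREAM, 0)) == -1)
		err(1, "socket");

	/* Hand the connection to the RACK function block. */
	memset(&tfs, 0, sizeof(tfs));
	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
	if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs)) == -1)
		err(1, "TCP_FUNCTION_BLK");

	/* Apply canned profile 1 (pace_always, scwnd, fillcw, noprr, ...). */
	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE, &profile, sizeof(profile)) == -1)
		err(1, "TCP_RACK_PROFILE");

	/* Individual knobs all take the same int-sized optval. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_DATA_AFTER_CLOSE, &one, sizeof(one)) == -1)
		err(1, "TCP_DATA_AFTER_CLOSE");

	/* Read a value back through rack_get_sockopt(). */
	if (getsockopt(fd, IPPROTO_TCP, TCP_RACK_MIN_TO, &minto, &len) == -1)
		err(1, "TCP_RACK_MIN_TO");
	printf("rack minimum timeout: %d ms\n", minto);

	close(fd);
	return (0);
}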