/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * functions' role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
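/*
 * Conceptually (a rough sketch, not the literal dispatch code), an
 * arriving segment is handled as:
 *
 *	if ((tp->t_flags & TF_SACK_PERMIT) == 0)
 *		hand the connection back to the default FreeBSD stack;
 *	keep the RACK state in sync with the TCP state (rack_set_state());
 *	retval = (*rack->r_substate)(m, th, so, tp, ...);
 *
 * where r_substate points at one of the per-state handlers
 * (rack_do_established() and friends) prototyped below.
 */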
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;		/* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 0;
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250;	/* 250ms */
static int32_t rack_persist_max = 2000;	/* 2 seconds */
static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_hw_pace_adjust = 0;
/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000;	/* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;	/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;		/* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in us */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 200000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250;	/* Must move at least 250 useconds to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* what is the divisor max_rtt/min_rtt to decide a hbp */


/* Part of pacing */
static int32_t rack_max_per_above = 30;	/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combine these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

int32_t rack_clear_counter=0;

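/*
 * Sysctl handler that zeroes the RACK counters below.  Writing a 1 to the
 * "clear" OID (registered at the bottom of rack_init_sysctls()) resets
 * every counter; reads simply return rack_clear_counter.  Assuming the
 * usual stack name, that would look roughly like:
 *
 *	sysctl net.inet.tcp.rack.clear=1
 *
 * (the exact OID prefix depends on how rack_sysctl_root is created
 * elsewhere in this module).
 */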
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
    uint32_t stat;
    int32_t error;

    error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
    if (error || req->newptr == NULL)
        return error;

    error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
    if (error)
        return (error);
    if (stat == 1) {
#ifdef INVARIANTS
        printf("Clearing RACK counters\n");
#endif
        counter_u64_zero(rack_badfr);
        counter_u64_zero(rack_badfr_bytes);
        counter_u64_zero(rack_rtm_prr_retran);
        counter_u64_zero(rack_rtm_prr_newdata);
        counter_u64_zero(rack_timestamp_mismatch);
        counter_u64_zero(rack_reorder_seen);
        counter_u64_zero(rack_tlp_tot);
        counter_u64_zero(rack_tlp_newdata);
        counter_u64_zero(rack_tlp_retran);
        counter_u64_zero(rack_tlp_retran_bytes);
        counter_u64_zero(rack_tlp_retran_fail);
        counter_u64_zero(rack_to_tot);
        counter_u64_zero(rack_to_arm_rack);
        counter_u64_zero(rack_to_arm_tlp);
        counter_u64_zero(rack_paced_segments);
        counter_u64_zero(rack_calc_zero);
        counter_u64_zero(rack_calc_nonzero);
        counter_u64_zero(rack_unpaced_segments);
        counter_u64_zero(rack_saw_enobuf);
        counter_u64_zero(rack_saw_enetunreach);
        counter_u64_zero(rack_per_timer_hole);
        counter_u64_zero(rack_to_alloc_hard);
        counter_u64_zero(rack_to_alloc_emerg);
        counter_u64_zero(rack_sack_proc_all);
        counter_u64_zero(rack_sack_proc_short);
        counter_u64_zero(rack_sack_proc_restart);
        counter_u64_zero(rack_to_alloc);
        counter_u64_zero(rack_to_alloc_limited);
        counter_u64_zero(rack_alloc_limited_conns);
        counter_u64_zero(rack_split_limited);
        counter_u64_zero(rack_find_high);
        counter_u64_zero(rack_sack_attacks_detected);
        counter_u64_zero(rack_sack_attacks_reversed);
        counter_u64_zero(rack_sack_used_next_merge);
        counter_u64_zero(rack_sack_used_prev_merge);
        counter_u64_zero(rack_sack_splits);
        counter_u64_zero(rack_sack_skipped_acked);
        counter_u64_zero(rack_ack_total);
        counter_u64_zero(rack_express_sack);
        counter_u64_zero(rack_sack_total);
        counter_u64_zero(rack_move_none);
        counter_u64_zero(rack_move_some);
        counter_u64_zero(rack_used_tlpmethod);
        counter_u64_zero(rack_used_tlpmethod2);
        counter_u64_zero(rack_enter_tlp_calc);
        counter_u64_zero(rack_progress_drops);
        counter_u64_zero(rack_tlp_does_nada);
        counter_u64_zero(rack_try_scwnd);
        counter_u64_zero(rack_collapsed_win);
    }
    rack_clear_counter = 0;
    return (0);
}

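/*
 * Build the sysctl tree for this stack.  Everything hangs off of
 * rack_sysctl_root (created separately when the module loads) and is
 * grouped into the sub-nodes created below: sack_attack, stats,
 * probertt, pacing, timely, tlp, timers, measure and misc.
 */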
static void
rack_init_sysctls(void)
{
    struct sysctl_oid *rack_counters;
    struct sysctl_oid *rack_attack;
    struct sysctl_oid *rack_pacing;
    struct sysctl_oid *rack_timely;
    struct sysctl_oid *rack_timers;
    struct sysctl_oid *rack_tlp;
    struct sysctl_oid *rack_misc;
    struct sysctl_oid *rack_measure;
    struct sysctl_oid *rack_probertt;

    rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "sack_attack",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Sack Attack Counters and Controls");
    rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "stats",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Counters");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rate_sample_method", CTLFLAG_RW,
        &rack_rate_sample_method, USE_RTT_LOW,
        "What method should we use for rate sampling 0=high, 1=low");
    /* Probe rtt related controls */
    rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "probertt",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "ProbeRTT related Controls");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
        &rack_atexit_prtt_hbp, 130,
        "What percentage above goodput do we clamp CA/SS to at exit on a high-BDP path 130%");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
        &rack_atexit_prtt, 130,
        "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_mul", CTLFLAG_RW,
        &rack_per_of_gp_probertt, 60,
        "What percentage of goodput do we pace at in probertt");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
        &rack_per_of_gp_probertt_reduce, 10,
        "What percentage of goodput do we reduce every gp_srtt");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_low", CTLFLAG_RW,
        &rack_per_of_gp_lowthresh, 40,
        "What percentage of goodput do we allow the multiplier to fall to");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "time_between", CTLFLAG_RW,
        &rack_time_between_probertt, 96000000,
        "How many useconds between the lowest rtt falling must pass before we enter probertt");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "safety", CTLFLAG_RW,
        &rack_probe_rtt_safety_val, 2000000,
        "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "sets_cwnd", CTLFLAG_RW,
        &rack_probe_rtt_sets_cwnd, 0,
        "Do we set the cwnd too (if always_lower is on)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
        &rack_max_drain_wait, 2,
        "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
        &rack_must_drain, 1,
        "We must drain this many gp_srtt's waiting for flight to reach goal");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
        &rack_probertt_use_min_rtt_entry, 1,
        "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
        &rack_probertt_use_min_rtt_exit, 0,
        "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "length_div", CTLFLAG_RW,
        &rack_probertt_gpsrtt_cnt_div, 0,
        "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "length_mul", CTLFLAG_RW,
        &rack_probertt_gpsrtt_cnt_mul, 0,
        "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
        &rack_min_probertt_hold, 200000,
        "What is the minimum time we hold probertt at target");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "filter_life", CTLFLAG_RW,
        &rack_probertt_filter_life, 10000000,
        "What is the time for the filters life in useconds");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "lower_within", CTLFLAG_RW,
        &rack_probertt_lower_within, 10,
        "If the rtt goes lower within this percentage of the time, go into probe-rtt");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "must_move", CTLFLAG_RW,
        &rack_min_rtt_movement, 250,
        "How much is the minimum movement in rtt to count as a drop for probertt purposes");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
        &rack_probertt_clear_is, 1,
        "Do we clear I/S counts on exiting probe-rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
        &rack_max_drain_hbp, 1,
        "How many extra drain gpsrtt's do we get in highly buffered paths");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "hbp_threshold", CTLFLAG_RW,
        &rack_hbp_thresh, 3,
        "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
    /* Pacing related sysctls */
    rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "pacing",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Pacing related Controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "max_pace_over", CTLFLAG_RW,
        &rack_max_per_above, 30,
        "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "pace_to_one", CTLFLAG_RW,
        &rack_pace_one_seg, 0,
        "Do we allow low b/w pacing of 1MSS instead of two");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
        &rack_limit_time_with_srtt, 0,
        "Do we limit pacing time based on srtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "init_win", CTLFLAG_RW,
        &rack_default_init_window, 0,
        "Do we have a rack initial window 0 = system default");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW,
        &rack_hw_pace_adjust, 0,
        "What percentage do we raise the MSS by (11 = 1.1%)");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_ss", CTLFLAG_RW,
        &rack_per_of_gp_ss, 250,
        "If non zero, what percentage of goodput to pace at in slow start");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_ca", CTLFLAG_RW,
        &rack_per_of_gp_ca, 150,
        "If non zero, what percentage of goodput to pace at in congestion avoidance");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_rec", CTLFLAG_RW,
        &rack_per_of_gp_rec, 200,
        "If non zero, what percentage of goodput to pace at in recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "pace_max_seg", CTLFLAG_RW,
        &rack_hptsi_segments, 40,
        "What size is the max for TSO segments in pacing and burst mitigation");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "burst_reduces", CTLFLAG_RW,
        &rack_slot_reduction, 4,
        "When doing only burst mitigation what is the reduce divisor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "use_pacing", CTLFLAG_RW,
        &rack_pace_every_seg, 0,
        "If set we use pacing, if clear we use only the original burst mitigation");

    rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "timely",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Timely RTT Controls");
    /* Timely based GP dynamics */
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upper", CTLFLAG_RW,
        &rack_gp_per_bw_mul_up, 2,
        "Rack timely upper range for equal b/w (in percentage)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "lower", CTLFLAG_RW,
        &rack_gp_per_bw_mul_down, 4,
        "Rack timely lower range for equal b/w (in percentage)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
        &rack_gp_rtt_maxmul, 3,
        "Rack timely multiplier of lowest rtt for rtt_max");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_min_div", CTLFLAG_RW,
        &rack_gp_rtt_mindiv, 4,
        "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
        &rack_gp_rtt_minmul, 1,
        "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "decrease", CTLFLAG_RW,
        &rack_gp_decrease_per, 20,
        "Rack timely decrease percentage of our GP multiplication factor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "increase", CTLFLAG_RW,
        &rack_gp_increase_per, 2,
        "Rack timely increase percentage of our GP multiplication factor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "lowerbound", CTLFLAG_RW,
        &rack_per_lower_bound, 50,
        "Rack timely lowest percentage we allow GP multiplier to fall to");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upperboundss", CTLFLAG_RW,
        &rack_per_upper_bound_ss, 0,
        "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upperboundca", CTLFLAG_RW,
        &rack_per_upper_bound_ca, 0,
        "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "dynamicgp", CTLFLAG_RW,
        &rack_do_dyn_mul, 0,
        "Rack timely do we enable dynamic timely goodput by default");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "no_rec_red", CTLFLAG_RW,
        &rack_gp_no_rec_chg, 1,
        "Rack timely do we prohibit the recovery multiplier from being lowered");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
        &rack_timely_dec_clear, 6,
        "Rack timely what threshold do we count to before another boost during b/w descent");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "max_push_rise", CTLFLAG_RW,
        &rack_timely_max_push_rise, 3,
        "Rack timely how many times do we push up with b/w increase");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "max_push_drop", CTLFLAG_RW,
        &rack_timely_max_push_drop, 3,
        "Rack timely how many times do we push back on b/w descent");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "min_segs", CTLFLAG_RW,
        &rack_timely_min_segs, 4,
        "Rack timely when setting the cwnd what is the min num segments");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "noback_max", CTLFLAG_RW,
        &rack_use_max_for_nobackoff, 0,
        "Rack timely when deciding whether to back off on a loss, do we use under max rtt else min");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "interim_timely_only", CTLFLAG_RW,
        &rack_timely_int_timely_only, 0,
        "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "nonstop", CTLFLAG_RW,
        &rack_timely_no_stopping, 0,
        "Rack timely don't stop increase");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
        &rack_down_raise_thresh, 100,
        "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
        &rack_req_segs, 1,
        "Bottom dragging if not these many segments outstanding and room");

    /* TLP and Rack related parameters */
    rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "tlp",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "TLP and Rack related Controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "use_rrr", CTLFLAG_RW,
        &use_rack_rr, 1,
        "Do we use Rack Rapid Recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
        &rack_non_rxt_use_cr, 0,
        "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlpmethod", CTLFLAG_RW,
        &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
        "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "limit", CTLFLAG_RW,
        &rack_tlp_limit, 2,
        "How many TLP's can be sent without sending new data");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "use_greater", CTLFLAG_RW,
        &rack_tlp_use_greater, 1,
        "Should we use the rack_rtt time if its greater than srtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlpminto", CTLFLAG_RW,
        &rack_tlp_min, 10,
        "TLP minimum timeout per the specification (10ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "send_oldest", CTLFLAG_RW,
        &rack_always_send_oldest, 0,
        "Should we always send the oldest TLP and RACK-TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "rack_tlimit", CTLFLAG_RW,
        &rack_limited_retran, 0,
        "How many times can a rack timeout drive out sends");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlp_retry", CTLFLAG_RW,
        &rack_tlp_max_resend, 2,
        "How many times does TLP retry a single segment or multiple with no ACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
        &rack_lower_cwnd_at_tlp, 0,
        "When a TLP completes a retran should we enter recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "reorder_thresh", CTLFLAG_RW,
        &rack_reorder_thresh, 2,
        "What factor for rack will be added when seeing reordering (shift right)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
        &rack_tlp_thresh, 1,
        "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "reorder_fade", CTLFLAG_RW,
        &rack_reorder_fade, 0,
        "Does reorder detection fade, if so how many ms (0 means never)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "pktdelay", CTLFLAG_RW,
        &rack_pkt_delay, 1,
        "Extra RACK time (in ms) besides reordering thresh");

    /* Timer related controls */
    rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "timers",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Timer related controls");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "persmin", CTLFLAG_RW,
        &rack_persist_min, 250,
        "What is the minimum time in milliseconds between persists");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "persmax", CTLFLAG_RW,
        &rack_persist_max, 2000,
        "What is the largest delay in milliseconds between persists");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "delayed_ack", CTLFLAG_RW,
        &rack_delayed_ack_time, 200,
        "Delayed ack time (200ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "minrto", CTLFLAG_RW,
        &rack_rto_min, 0,
        "Minimum RTO in ms -- set with caution below 1000 due to TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "maxrto", CTLFLAG_RW,
        &rack_rto_max, 0,
        "Maximum RTO in ms -- should be at least as large as min_rto");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "minto", CTLFLAG_RW,
        &rack_min_to, 1,
        "Minimum rack timeout in milliseconds");
    /* Measure controls */
    rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "measure",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Measure related controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "wma_divisor", CTLFLAG_RW,
        &rack_wma_divisor, 8,
        "When doing b/w calculation what is the divisor for the WMA");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "end_cwnd", CTLFLAG_RW,
        &rack_cwnd_block_ends_measure, 0,
        "Does a cwnd just-return end the measurement window (app limited)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "end_rwnd", CTLFLAG_RW,
        &rack_rwnd_block_ends_measure, 0,
        "Does an rwnd just-return end the measurement window (app limited -- not persists)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_target", CTLFLAG_RW,
        &rack_def_data_window, 20,
        "What is the minimum target window (in mss) for a GP measurement");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "goal_bdp", CTLFLAG_RW,
        &rack_goal_bdp, 2,
        "What is the goal BDP to measure");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_srtts", CTLFLAG_RW,
        &rack_min_srtts, 1,
        "What is the minimum number of srtt's a GP measurement must span");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_measure_tim", CTLFLAG_RW,
        &rack_min_measure_usec, 0,
        "What is the minimum time for a measurement, if 0 this is off");
    /* Misc rack controls */
    rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "misc",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Misc related controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "shared_cwnd", CTLFLAG_RW,
        &rack_enable_shared_cwnd, 0,
        "Should RACK try to use the shared cwnd on connections where allowed");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
        &rack_limits_scwnd, 1,
        "Should RACK place low end time limits on the shared cwnd feature");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
        &rack_enable_mqueue_for_nonpaced, 0,
        "Should RACK use mbuf queuing for non-paced connections");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "iMac_dack", CTLFLAG_RW,
        &rack_use_imac_dack, 0,
        "Should RACK try to emulate iMac delayed ack");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "no_prr", CTLFLAG_RW,
        &rack_disable_prr, 0,
        "Should RACK not use prr and only pace (must have pacing on)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "bb_verbose", CTLFLAG_RW,
        &rack_verbose_logging, 0,
        "Should RACK black box logging be verbose");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "data_after_close", CTLFLAG_RW,
        &rack_ignore_data_after_close, 1,
        "Do we hold off sending a RST until all pending data is ack'd");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "no_sack_needed", CTLFLAG_RW,
        &rack_sack_not_required, 0,
        "Do we allow rack to run on connections not supporting SACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
        &rack_use_proportional_reduce, 0,
        "Should we proportionally reduce cwnd based on the number of losses");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "recovery_prop", CTLFLAG_RW,
        &rack_proportional_rate, 10,
        "What percent reduction per loss");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "prr_sendalot", CTLFLAG_RW,
        &rack_send_a_lot_in_prr, 1,
        "Send a lot in prr");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "earlyrecovery", CTLFLAG_RW,
        &rack_early_recovery, 1,
        "Do we do early recovery with rack");
    /* Sack Attacker detection stuff */
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
        &rack_highest_sack_thresh_seen, 0,
        "Highest sack to ack ratio seen");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
        &rack_highest_move_thresh_seen, 0,
        "Highest move to non-move ratio seen");
    rack_ack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "acktotal", CTLFLAG_RD,
        &rack_ack_total,
        "Total number of Ack's");
    rack_express_sack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
        &rack_express_sack,
        "Total number of express SACKs");
    rack_sack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "sacktotal", CTLFLAG_RD,
        &rack_sack_total,
        "Total number of SACKs");
    rack_move_none = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_none", CTLFLAG_RD,
        &rack_move_none,
        "Total number of SACK index reuse of positions under threshold");
    rack_move_some = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_some", CTLFLAG_RD,
        &rack_move_some,
        "Total number of SACK index reuse of positions over threshold");
    rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "attacks", CTLFLAG_RD,
        &rack_sack_attacks_detected,
        "Total number of SACK attackers that had sack disabled");
    rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "reversed", CTLFLAG_RD,
        &rack_sack_attacks_reversed,
        "Total number of SACK attackers that were later determined false positive");
    rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "nextmerge", CTLFLAG_RD,
        &rack_sack_used_next_merge,
        "Total number of times we used the next merge");
    rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "prevmerge", CTLFLAG_RD,
        &rack_sack_used_prev_merge,
        "Total number of times we used the prev merge");
    /* Counters */
    rack_badfr = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr", CTLFLAG_RD,
        &rack_badfr, "Total number of bad FRs");
    rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr_bytes", CTLFLAG_RD,
        &rack_badfr_bytes, "Total number of bad FR bytes");
    rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndret", CTLFLAG_RD,
        &rack_rtm_prr_retran,
        "Total number of prr based retransmits");
    rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndnew", CTLFLAG_RD,
        &rack_rtm_prr_newdata,
        "Total number of prr based new transmits");
    rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tsnf", CTLFLAG_RD,
        &rack_timestamp_mismatch,
        "Total number of timestamps that we could not find the reported ts");
    rack_find_high = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "findhigh", CTLFLAG_RD,
        &rack_find_high,
        "Total number of FIN causing find-high");
    rack_reorder_seen = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "reordering", CTLFLAG_RD,
        &rack_reorder_seen,
        "Total number of times we added delay due to reordering");
    rack_tlp_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_to_total", CTLFLAG_RD,
        &rack_tlp_tot,
        "Total number of tail loss probe expirations");
    rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_new", CTLFLAG_RD,
        &rack_tlp_newdata,
        "Total number of tail loss probe sending new data");
    rack_tlp_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran", CTLFLAG_RD,
        &rack_tlp_retran,
        "Total number of tail loss probe sending retransmitted data");
    rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
        &rack_tlp_retran_bytes,
        "Total bytes of tail loss probe sending retransmitted data");
    rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
        &rack_tlp_retran_fail,
        "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
    rack_to_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "rack_to_tot", CTLFLAG_RD,
        &rack_to_tot,
        "Total number of times the rack timeout expired");
    rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_rack", CTLFLAG_RD,
        &rack_to_arm_rack,
        "Total number of times the rack timer armed");
    rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_tlp", CTLFLAG_RD,
        &rack_to_arm_tlp,
        "Total number of times the tlp timer armed");
    rack_calc_zero = counter_u64_alloc(M_WAITOK);
    rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_zero", CTLFLAG_RD,
        &rack_calc_zero,
        "Total number of times pacing time worked out to zero");
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_nonzero", CTLFLAG_RD,
        &rack_calc_nonzero,
        "Total number of times pacing time worked out to non-zero");
    rack_paced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "paced", CTLFLAG_RD,
        &rack_paced_segments,
        "Total number of times a segment send caused hptsi");
    rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "unpaced", CTLFLAG_RD,
        &rack_unpaced_segments,
        "Total number of times a segment did not cause hptsi");
    rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enobufs", CTLFLAG_RD,
        &rack_saw_enobuf,
        "Total number of times a send returned ENOBUFS");
    rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
        &rack_saw_enetunreach,
        "Total number of times a send returned ENETUNREACH");
    rack_to_alloc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocs", CTLFLAG_RD,
        &rack_to_alloc,
        "Total allocations of tracking structures");
    rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allochard", CTLFLAG_RD,
        &rack_to_alloc_hard,
        "Total allocations done with sleeping the hard way");
    rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocemerg", CTLFLAG_RD,
        &rack_to_alloc_emerg,
        "Total allocations done from emergency cache");
    rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited", CTLFLAG_RD,
        &rack_to_alloc_limited,
        "Total allocations dropped due to limit");
    rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
        &rack_alloc_limited_conns,
        "Connections with allocations dropped due to limit");
    rack_split_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "split_limited", CTLFLAG_RD,
        &rack_split_limited,
        "Split allocations dropped due to limit");
    rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_long", CTLFLAG_RD,
        &rack_sack_proc_all,
        "Total times we had to walk whole list for sack processing");
    rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_restart", CTLFLAG_RD,
        &rack_sack_proc_restart,
        "Total times we had to walk whole list due to a restart");
    rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_short", CTLFLAG_RD,
        &rack_sack_proc_short,
        "Total times we took shortcut for sack processing");
    rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
        &rack_enter_tlp_calc,
        "Total times we called calc-tlp");
    rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
        &rack_used_tlpmethod,
        "Total number of times we hit TLP method 1");
    rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
&rack_used_tlpmethod2, 1382 "Total number of times we hit TLP method 2"); 1383 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1384 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1385 SYSCTL_CHILDREN(rack_attack), 1386 OID_AUTO, "skipacked", CTLFLAG_RD, 1387 &rack_sack_skipped_acked, 1388 "Total number of times we skipped previously sacked"); 1389 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1390 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1391 SYSCTL_CHILDREN(rack_attack), 1392 OID_AUTO, "ofsplit", CTLFLAG_RD, 1393 &rack_sack_splits, 1394 "Total number of times we did the old fashion tree split"); 1395 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1396 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1397 SYSCTL_CHILDREN(rack_counters), 1398 OID_AUTO, "prog_drops", CTLFLAG_RD, 1399 &rack_progress_drops, 1400 "Total number of progress drops"); 1401 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1402 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1403 SYSCTL_CHILDREN(rack_counters), 1404 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1405 &rack_input_idle_reduces, 1406 "Total number of idle reductions on input"); 1407 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1408 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1409 SYSCTL_CHILDREN(rack_counters), 1410 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1411 &rack_collapsed_win, 1412 "Total number of collapsed windows"); 1413 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1414 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1415 SYSCTL_CHILDREN(rack_counters), 1416 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1417 &rack_tlp_does_nada, 1418 "Total number of nada tlp calls"); 1419 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1420 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1421 SYSCTL_CHILDREN(rack_counters), 1422 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1423 &rack_try_scwnd, 1424 "Total number of scwnd attempts"); 1425 1426 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1427 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1428 SYSCTL_CHILDREN(rack_counters), 1429 OID_AUTO, "timer_hole", CTLFLAG_RD, 1430 &rack_per_timer_hole, 1431 "Total persists start in timer hole"); 1432 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1433 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1434 OID_AUTO, "outsize", CTLFLAG_RD, 1435 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1436 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1437 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1438 OID_AUTO, "opts", CTLFLAG_RD, 1439 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1440 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_sysctl_root), 1442 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1443 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1444 } 1445 1446 static __inline int 1447 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1448 { 1449 if (SEQ_GEQ(b->r_start, a->r_start) && 1450 SEQ_LT(b->r_start, a->r_end)) { 1451 /* 1452 * The entry b is within the 1453 * block a. i.e.: 1454 * a -- |-------------| 1455 * b -- |----| 1456 * <or> 1457 * b -- |------| 1458 * <or> 1459 * b -- |-----------| 1460 */ 1461 return (0); 1462 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1463 /* 1464 * b falls as either the next 1465 * sequence block after a so a 1466 * is said to be smaller than b. 
1467 * i.e: 1468 * a -- |------| 1469 * b -- |--------| 1470 * or 1471 * b -- |-----| 1472 */ 1473 return (1); 1474 } 1475 /* 1476 * Whats left is where a is 1477 * larger than b. i.e: 1478 * a -- |-------| 1479 * b -- |---| 1480 * or even possibly 1481 * b -- |--------------| 1482 */ 1483 return (-1); 1484 } 1485 1486 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1487 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1488 1489 static uint32_t 1490 rc_init_window(struct tcp_rack *rack) 1491 { 1492 uint32_t win; 1493 1494 if (rack->rc_init_win == 0) { 1495 /* 1496 * Nothing set by the user, use the system stack 1497 * default. 1498 */ 1499 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1500 } 1501 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1502 return(win); 1503 } 1504 1505 static uint64_t 1506 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1507 { 1508 if (IN_RECOVERY(rack->rc_tp->t_flags)) 1509 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1510 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1511 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1512 else 1513 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1514 } 1515 1516 static uint64_t 1517 rack_get_bw(struct tcp_rack *rack) 1518 { 1519 if (rack->use_fixed_rate) { 1520 /* Return the fixed pacing rate */ 1521 return (rack_get_fixed_pacing_bw(rack)); 1522 } 1523 if (rack->r_ctl.gp_bw == 0) { 1524 /* 1525 * We have yet no b/w measurement, 1526 * if we have a user set initial bw 1527 * return it. If we don't have that and 1528 * we have an srtt, use the tcp IW (10) to 1529 * calculate a fictional b/w over the SRTT 1530 * which is more or less a guess. Note 1531 * we don't use our IW from rack on purpose 1532 * so if we have like IW=30, we are not 1533 * calculating a "huge" b/w. 1534 */ 1535 uint64_t bw, srtt; 1536 if (rack->r_ctl.init_rate) 1537 return (rack->r_ctl.init_rate); 1538 1539 /* Has the user set a max peak rate? */ 1540 #ifdef NETFLIX_PEAKRATE 1541 if (rack->rc_tp->t_maxpeakrate) 1542 return (rack->rc_tp->t_maxpeakrate); 1543 #endif 1544 /* Ok lets come up with the IW guess, if we have a srtt */ 1545 if (rack->rc_tp->t_srtt == 0) { 1546 /* 1547 * Go with old pacing method 1548 * i.e. burst mitigation only. 1549 */ 1550 return (0); 1551 } 1552 /* Ok lets get the initial TCP win (not racks) */ 1553 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1554 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 1555 bw *= (uint64_t)USECS_IN_SECOND; 1556 bw /= srtt; 1557 return (bw); 1558 } else { 1559 uint64_t bw; 1560 1561 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { 1562 /* Averaging is done, we can return the value */ 1563 bw = rack->r_ctl.gp_bw; 1564 } else { 1565 /* Still doing initial average must calculate */ 1566 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; 1567 } 1568 #ifdef NETFLIX_PEAKRATE 1569 if ((rack->rc_tp->t_maxpeakrate) && 1570 (bw > rack->rc_tp->t_maxpeakrate)) { 1571 /* The user has set a peak rate to pace at 1572 * don't allow us to pace faster than that. 
1573 */ 1574 return (rack->rc_tp->t_maxpeakrate); 1575 } 1576 #endif 1577 return (bw); 1578 } 1579 } 1580 1581 static uint16_t 1582 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1583 { 1584 if (rack->use_fixed_rate) { 1585 return (100); 1586 } else if (rack->in_probe_rtt && (rsm == NULL)) 1587 return(rack->r_ctl.rack_per_of_gp_probertt); 1588 else if ((IN_RECOVERY(rack->rc_tp->t_flags) && 1589 rack->r_ctl.rack_per_of_gp_rec)) { 1590 if (rsm) { 1591 /* a retransmission always use the recovery rate */ 1592 return(rack->r_ctl.rack_per_of_gp_rec); 1593 } else if (rack->rack_rec_nonrxt_use_cr) { 1594 /* Directed to use the configured rate */ 1595 goto configured_rate; 1596 } else if (rack->rack_no_prr && 1597 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1598 /* No PRR, lets just use the b/w estimate only */ 1599 return(100); 1600 } else { 1601 /* 1602 * Here we may have a non-retransmit but we 1603 * have no overrides, so just use the recovery 1604 * rate (prr is in effect). 1605 */ 1606 return(rack->r_ctl.rack_per_of_gp_rec); 1607 } 1608 } 1609 configured_rate: 1610 /* For the configured rate we look at our cwnd vs the ssthresh */ 1611 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1612 return (rack->r_ctl.rack_per_of_gp_ss); 1613 else 1614 return(rack->r_ctl.rack_per_of_gp_ca); 1615 } 1616 1617 static uint64_t 1618 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) 1619 { 1620 /* 1621 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 1622 */ 1623 uint64_t bw_est; 1624 uint64_t gain; 1625 1626 gain = (uint64_t)rack_get_output_gain(rack, rsm); 1627 bw_est = bw * gain; 1628 bw_est /= (uint64_t)100; 1629 /* Never fall below the minimum (def 64kbps) */ 1630 if (bw_est < RACK_MIN_BW) 1631 bw_est = RACK_MIN_BW; 1632 return (bw_est); 1633 } 1634 1635 static void 1636 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 1637 { 1638 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1639 union tcp_log_stackspecific log; 1640 struct timeval tv; 1641 1642 if ((mod != 1) && (rack_verbose_logging == 0)) { 1643 /* 1644 * We get 3 values currently for mod 1645 * 1 - We are retransmitting and this tells the reason. 1646 * 2 - We are clearing a dup-ack count. 1647 * 3 - We are incrementing a dup-ack count. 1648 * 1649 * The clear/increment are only logged 1650 * if you have BBverbose on. 
1651 */ 1652 return; 1653 } 1654 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1655 log.u_bbr.flex1 = tsused; 1656 log.u_bbr.flex2 = thresh; 1657 log.u_bbr.flex3 = rsm->r_flags; 1658 log.u_bbr.flex4 = rsm->r_dupack; 1659 log.u_bbr.flex5 = rsm->r_start; 1660 log.u_bbr.flex6 = rsm->r_end; 1661 log.u_bbr.flex8 = mod; 1662 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1663 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1664 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1665 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1666 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1667 &rack->rc_inp->inp_socket->so_rcv, 1668 &rack->rc_inp->inp_socket->so_snd, 1669 BBR_LOG_SETTINGS_CHG, 0, 1670 0, &log, false, &tv); 1671 } 1672 } 1673 1674 1675 1676 static void 1677 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 1678 { 1679 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1680 union tcp_log_stackspecific log; 1681 struct timeval tv; 1682 1683 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1684 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 1685 log.u_bbr.flex2 = to * 1000; 1686 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1687 log.u_bbr.flex4 = slot; 1688 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 1689 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1690 log.u_bbr.flex7 = rack->rc_in_persist; 1691 log.u_bbr.flex8 = which; 1692 if (rack->rack_no_prr) 1693 log.u_bbr.pkts_out = 0; 1694 else 1695 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1696 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1697 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1698 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1699 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1700 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1701 &rack->rc_inp->inp_socket->so_rcv, 1702 &rack->rc_inp->inp_socket->so_snd, 1703 BBR_LOG_TIMERSTAR, 0, 1704 0, &log, false, &tv); 1705 } 1706 } 1707 1708 static void 1709 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 1710 { 1711 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1712 union tcp_log_stackspecific log; 1713 struct timeval tv; 1714 1715 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1716 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1717 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1718 log.u_bbr.flex8 = to_num; 1719 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 1720 log.u_bbr.flex2 = rack->rc_rack_rtt; 1721 if (rsm == NULL) 1722 log.u_bbr.flex3 = 0; 1723 else 1724 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 1725 if (rack->rack_no_prr) 1726 log.u_bbr.flex5 = 0; 1727 else 1728 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1729 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1730 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1731 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1732 &rack->rc_inp->inp_socket->so_rcv, 1733 &rack->rc_inp->inp_socket->so_snd, 1734 BBR_LOG_RTO, 0, 1735 0, &log, false, &tv); 1736 } 1737 } 1738 1739 static void 1740 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 1741 struct rack_sendmap *rsm, int conf) 1742 { 1743 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1744 union tcp_log_stackspecific log; 1745 struct timeval tv; 1746 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1747 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1748 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1749 log.u_bbr.flex1 = t; 1750 log.u_bbr.flex2 = len; 1751 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; 1752 log.u_bbr.flex4 = 
rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; 1753 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; 1754 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 1755 log.u_bbr.flex7 = conf; 1756 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; 1757 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 1758 if (rack->rack_no_prr) 1759 log.u_bbr.pkts_out = 0; 1760 else 1761 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1762 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1763 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; 1764 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 1765 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1766 if (rsm) { 1767 log.u_bbr.pkt_epoch = rsm->r_start; 1768 log.u_bbr.lost = rsm->r_end; 1769 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 1770 } else { 1771 1772 /* Its a SYN */ 1773 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 1774 log.u_bbr.lost = 0; 1775 log.u_bbr.cwnd_gain = 0; 1776 } 1777 /* Write out general bits of interest rrs here */ 1778 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 1779 log.u_bbr.use_lt_bw <<= 1; 1780 log.u_bbr.use_lt_bw |= rack->forced_ack; 1781 log.u_bbr.use_lt_bw <<= 1; 1782 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 1783 log.u_bbr.use_lt_bw <<= 1; 1784 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 1785 log.u_bbr.use_lt_bw <<= 1; 1786 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 1787 log.u_bbr.use_lt_bw <<= 1; 1788 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 1789 log.u_bbr.use_lt_bw <<= 1; 1790 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 1791 log.u_bbr.use_lt_bw <<= 1; 1792 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 1793 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 1794 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 1795 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 1796 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 1797 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 1798 TCP_LOG_EVENTP(tp, NULL, 1799 &rack->rc_inp->inp_socket->so_rcv, 1800 &rack->rc_inp->inp_socket->so_snd, 1801 BBR_LOG_BBRRTT, 0, 1802 0, &log, false, &tv); 1803 } 1804 } 1805 1806 static void 1807 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 1808 { 1809 /* 1810 * Log the rtt sample we are 1811 * applying to the srtt algorithm in 1812 * useconds. 
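	 * The rtt argument arrives in milliseconds and is scaled to
	 * microseconds below (flex1 = rtt * 1000) before it is logged.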
1813 */ 1814 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1815 union tcp_log_stackspecific log; 1816 struct timeval tv; 1817 1818 /* Convert our ms to a microsecond */ 1819 memset(&log, 0, sizeof(log)); 1820 log.u_bbr.flex1 = rtt * 1000; 1821 log.u_bbr.flex2 = rack->r_ctl.ack_count; 1822 log.u_bbr.flex3 = rack->r_ctl.sack_count; 1823 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 1824 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 1825 log.u_bbr.flex8 = rack->sack_attack_disable; 1826 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1827 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1828 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1829 &rack->rc_inp->inp_socket->so_rcv, 1830 &rack->rc_inp->inp_socket->so_snd, 1831 TCP_LOG_RTT, 0, 1832 0, &log, false, &tv); 1833 } 1834 } 1835 1836 1837 static inline void 1838 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 1839 { 1840 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 1841 union tcp_log_stackspecific log; 1842 struct timeval tv; 1843 1844 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1845 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1846 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1847 log.u_bbr.flex1 = line; 1848 log.u_bbr.flex2 = tick; 1849 log.u_bbr.flex3 = tp->t_maxunacktime; 1850 log.u_bbr.flex4 = tp->t_acktime; 1851 log.u_bbr.flex8 = event; 1852 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1853 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1854 TCP_LOG_EVENTP(tp, NULL, 1855 &rack->rc_inp->inp_socket->so_rcv, 1856 &rack->rc_inp->inp_socket->so_snd, 1857 BBR_LOG_PROGRESS, 0, 1858 0, &log, false, &tv); 1859 } 1860 } 1861 1862 static void 1863 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 1864 { 1865 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1866 union tcp_log_stackspecific log; 1867 1868 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1869 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1870 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1871 log.u_bbr.flex1 = slot; 1872 if (rack->rack_no_prr) 1873 log.u_bbr.flex2 = 0; 1874 else 1875 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 1876 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 1877 log.u_bbr.flex8 = rack->rc_in_persist; 1878 log.u_bbr.timeStamp = cts; 1879 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1880 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1881 &rack->rc_inp->inp_socket->so_rcv, 1882 &rack->rc_inp->inp_socket->so_snd, 1883 BBR_LOG_BBRSND, 0, 1884 0, &log, false, tv); 1885 } 1886 } 1887 1888 static void 1889 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1890 { 1891 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1892 union tcp_log_stackspecific log; 1893 struct timeval tv; 1894 1895 memset(&log, 0, sizeof(log)); 1896 log.u_bbr.flex1 = did_out; 1897 log.u_bbr.flex2 = nxt_pkt; 1898 log.u_bbr.flex3 = way_out; 1899 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1900 if (rack->rack_no_prr) 1901 log.u_bbr.flex5 = 0; 1902 else 1903 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1904 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 1905 log.u_bbr.flex7 = rack->r_wanted_output; 1906 log.u_bbr.flex8 = rack->rc_in_persist; 1907 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1908 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1909 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1910 
TCP_LOG_EVENTP(rack->rc_tp, NULL, 1911 &rack->rc_inp->inp_socket->so_rcv, 1912 &rack->rc_inp->inp_socket->so_snd, 1913 BBR_LOG_DOSEG_DONE, 0, 1914 0, &log, false, &tv); 1915 } 1916 } 1917 1918 static void 1919 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) 1920 { 1921 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1922 union tcp_log_stackspecific log; 1923 struct timeval tv; 1924 uint32_t cts; 1925 1926 memset(&log, 0, sizeof(log)); 1927 cts = tcp_get_usecs(&tv); 1928 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 1929 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 1930 log.u_bbr.flex4 = len; 1931 log.u_bbr.flex5 = orig_len; 1932 log.u_bbr.flex6 = rack->r_ctl.rc_sacked; 1933 log.u_bbr.flex7 = mod; 1934 log.u_bbr.flex8 = frm; 1935 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1936 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1937 TCP_LOG_EVENTP(tp, NULL, 1938 &tp->t_inpcb->inp_socket->so_rcv, 1939 &tp->t_inpcb->inp_socket->so_snd, 1940 TCP_HDWR_TLS, 0, 1941 0, &log, false, &tv); 1942 } 1943 } 1944 1945 static void 1946 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 1947 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 1948 { 1949 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1950 union tcp_log_stackspecific log; 1951 struct timeval tv; 1952 1953 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1954 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1955 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1956 log.u_bbr.flex1 = slot; 1957 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1958 log.u_bbr.flex4 = reason; 1959 if (rack->rack_no_prr) 1960 log.u_bbr.flex5 = 0; 1961 else 1962 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1963 log.u_bbr.flex7 = hpts_calling; 1964 log.u_bbr.flex8 = rack->rc_in_persist; 1965 log.u_bbr.lt_epoch = cwnd_to_use; 1966 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1967 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1968 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1969 &rack->rc_inp->inp_socket->so_rcv, 1970 &rack->rc_inp->inp_socket->so_snd, 1971 BBR_LOG_JUSTRET, 0, 1972 tlen, &log, false, &tv); 1973 } 1974 } 1975 1976 static void 1977 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 1978 struct timeval *tv, uint32_t flags_on_entry) 1979 { 1980 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1981 union tcp_log_stackspecific log; 1982 1983 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1984 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1985 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1986 log.u_bbr.flex1 = line; 1987 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 1988 log.u_bbr.flex3 = flags_on_entry; 1989 log.u_bbr.flex4 = us_cts; 1990 if (rack->rack_no_prr) 1991 log.u_bbr.flex5 = 0; 1992 else 1993 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1994 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1995 log.u_bbr.flex7 = hpts_removed; 1996 log.u_bbr.flex8 = 1; 1997 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 1998 log.u_bbr.timeStamp = us_cts; 1999 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2000 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2001 &rack->rc_inp->inp_socket->so_rcv, 2002 &rack->rc_inp->inp_socket->so_snd, 2003 BBR_LOG_TIMERCANC, 0, 2004 0, &log, false, tv); 2005 } 2006 } 2007 2008 static void 2009 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2010 uint32_t flex1, uint32_t flex2, 2011 uint32_t flex3, uint32_t flex4, 2012 uint32_t flex5, uint32_t 
flex6, 2013 uint16_t flex7, uint8_t mod) 2014 { 2015 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2016 union tcp_log_stackspecific log; 2017 struct timeval tv; 2018 2019 if (mod == 1) { 2020 /* No you can't use 1, its for the real to cancel */ 2021 return; 2022 } 2023 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2024 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2025 log.u_bbr.flex1 = flex1; 2026 log.u_bbr.flex2 = flex2; 2027 log.u_bbr.flex3 = flex3; 2028 log.u_bbr.flex4 = flex4; 2029 log.u_bbr.flex5 = flex5; 2030 log.u_bbr.flex6 = flex6; 2031 log.u_bbr.flex7 = flex7; 2032 log.u_bbr.flex8 = mod; 2033 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2034 &rack->rc_inp->inp_socket->so_rcv, 2035 &rack->rc_inp->inp_socket->so_snd, 2036 BBR_LOG_TIMERCANC, 0, 2037 0, &log, false, &tv); 2038 } 2039 } 2040 2041 static void 2042 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2043 { 2044 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2045 union tcp_log_stackspecific log; 2046 struct timeval tv; 2047 2048 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2049 log.u_bbr.flex1 = timers; 2050 log.u_bbr.flex2 = ret; 2051 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2052 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2053 log.u_bbr.flex5 = cts; 2054 if (rack->rack_no_prr) 2055 log.u_bbr.flex6 = 0; 2056 else 2057 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2058 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2059 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2060 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2061 &rack->rc_inp->inp_socket->so_rcv, 2062 &rack->rc_inp->inp_socket->so_snd, 2063 BBR_LOG_TO_PROCESS, 0, 2064 0, &log, false, &tv); 2065 } 2066 } 2067 2068 static void 2069 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2070 { 2071 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2072 union tcp_log_stackspecific log; 2073 struct timeval tv; 2074 2075 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2076 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2077 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2078 if (rack->rack_no_prr) 2079 log.u_bbr.flex3 = 0; 2080 else 2081 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2082 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2083 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2084 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2085 log.u_bbr.flex8 = frm; 2086 log.u_bbr.pkts_out = orig_cwnd; 2087 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2088 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2089 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2090 &rack->rc_inp->inp_socket->so_rcv, 2091 &rack->rc_inp->inp_socket->so_snd, 2092 BBR_LOG_BBRUPD, 0, 2093 0, &log, false, &tv); 2094 } 2095 } 2096 2097 #ifdef NETFLIX_EXP_DETECTION 2098 static void 2099 rack_log_sad(struct tcp_rack *rack, int event) 2100 { 2101 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2102 union tcp_log_stackspecific log; 2103 struct timeval tv; 2104 2105 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2106 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2107 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2108 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2109 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2110 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2111 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2112 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2113 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2114 log.u_bbr.lt_epoch |= rack->do_detection; 2115 log.u_bbr.applimited = tcp_map_minimum; 2116 log.u_bbr.flex7 = rack->sack_attack_disable; 2117 
log.u_bbr.flex8 = event; 2118 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2119 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2120 log.u_bbr.delivered = tcp_sad_decay_val; 2121 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2122 &rack->rc_inp->inp_socket->so_rcv, 2123 &rack->rc_inp->inp_socket->so_snd, 2124 TCP_SAD_DETECTION, 0, 2125 0, &log, false, &tv); 2126 } 2127 } 2128 #endif 2129 2130 static void 2131 rack_counter_destroy(void) 2132 { 2133 counter_u64_free(rack_ack_total); 2134 counter_u64_free(rack_express_sack); 2135 counter_u64_free(rack_sack_total); 2136 counter_u64_free(rack_move_none); 2137 counter_u64_free(rack_move_some); 2138 counter_u64_free(rack_sack_attacks_detected); 2139 counter_u64_free(rack_sack_attacks_reversed); 2140 counter_u64_free(rack_sack_used_next_merge); 2141 counter_u64_free(rack_sack_used_prev_merge); 2142 counter_u64_free(rack_badfr); 2143 counter_u64_free(rack_badfr_bytes); 2144 counter_u64_free(rack_rtm_prr_retran); 2145 counter_u64_free(rack_rtm_prr_newdata); 2146 counter_u64_free(rack_timestamp_mismatch); 2147 counter_u64_free(rack_find_high); 2148 counter_u64_free(rack_reorder_seen); 2149 counter_u64_free(rack_tlp_tot); 2150 counter_u64_free(rack_tlp_newdata); 2151 counter_u64_free(rack_tlp_retran); 2152 counter_u64_free(rack_tlp_retran_bytes); 2153 counter_u64_free(rack_tlp_retran_fail); 2154 counter_u64_free(rack_to_tot); 2155 counter_u64_free(rack_to_arm_rack); 2156 counter_u64_free(rack_to_arm_tlp); 2157 counter_u64_free(rack_calc_zero); 2158 counter_u64_free(rack_calc_nonzero); 2159 counter_u64_free(rack_paced_segments); 2160 counter_u64_free(rack_unpaced_segments); 2161 counter_u64_free(rack_saw_enobuf); 2162 counter_u64_free(rack_saw_enetunreach); 2163 counter_u64_free(rack_to_alloc); 2164 counter_u64_free(rack_to_alloc_hard); 2165 counter_u64_free(rack_to_alloc_emerg); 2166 counter_u64_free(rack_to_alloc_limited); 2167 counter_u64_free(rack_alloc_limited_conns); 2168 counter_u64_free(rack_split_limited); 2169 counter_u64_free(rack_sack_proc_all); 2170 counter_u64_free(rack_sack_proc_restart); 2171 counter_u64_free(rack_sack_proc_short); 2172 counter_u64_free(rack_enter_tlp_calc); 2173 counter_u64_free(rack_used_tlpmethod); 2174 counter_u64_free(rack_used_tlpmethod2); 2175 counter_u64_free(rack_sack_skipped_acked); 2176 counter_u64_free(rack_sack_splits); 2177 counter_u64_free(rack_progress_drops); 2178 counter_u64_free(rack_input_idle_reduces); 2179 counter_u64_free(rack_collapsed_win); 2180 counter_u64_free(rack_tlp_does_nada); 2181 counter_u64_free(rack_try_scwnd); 2182 counter_u64_free(rack_per_timer_hole); 2183 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2184 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2185 } 2186 2187 static struct rack_sendmap * 2188 rack_alloc(struct tcp_rack *rack) 2189 { 2190 struct rack_sendmap *rsm; 2191 2192 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2193 if (rsm) { 2194 rack->r_ctl.rc_num_maps_alloced++; 2195 counter_u64_add(rack_to_alloc, 1); 2196 return (rsm); 2197 } 2198 if (rack->rc_free_cnt) { 2199 counter_u64_add(rack_to_alloc_emerg, 1); 2200 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2201 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2202 rack->rc_free_cnt--; 2203 return (rsm); 2204 } 2205 return (NULL); 2206 } 2207 2208 static struct rack_sendmap * 2209 rack_alloc_full_limit(struct tcp_rack *rack) 2210 { 2211 if ((V_tcp_map_entries_limit > 0) && 2212 (rack->do_detection == 0) && 2213 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2214 
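		/*
		 * The per-connection sendmap entry limit has been reached
		 * (and attack detection is not overriding it); refuse the
		 * allocation, count the refusal, and report the connection
		 * only once.
		 */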
counter_u64_add(rack_to_alloc_limited, 1); 2215 if (!rack->alloc_limit_reported) { 2216 rack->alloc_limit_reported = 1; 2217 counter_u64_add(rack_alloc_limited_conns, 1); 2218 } 2219 return (NULL); 2220 } 2221 return (rack_alloc(rack)); 2222 } 2223 2224 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2225 static struct rack_sendmap * 2226 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2227 { 2228 struct rack_sendmap *rsm; 2229 2230 if (limit_type) { 2231 /* currently there is only one limit type */ 2232 if (V_tcp_map_split_limit > 0 && 2233 (rack->do_detection == 0) && 2234 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2235 counter_u64_add(rack_split_limited, 1); 2236 if (!rack->alloc_limit_reported) { 2237 rack->alloc_limit_reported = 1; 2238 counter_u64_add(rack_alloc_limited_conns, 1); 2239 } 2240 return (NULL); 2241 } 2242 } 2243 2244 /* allocate and mark in the limit type, if set */ 2245 rsm = rack_alloc(rack); 2246 if (rsm != NULL && limit_type) { 2247 rsm->r_limit_type = limit_type; 2248 rack->r_ctl.rc_num_split_allocs++; 2249 } 2250 return (rsm); 2251 } 2252 2253 static void 2254 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2255 { 2256 if (rsm->r_flags & RACK_APP_LIMITED) { 2257 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2258 rack->r_ctl.rc_app_limited_cnt--; 2259 } 2260 } 2261 if (rsm->r_limit_type) { 2262 /* currently there is only one limit type */ 2263 rack->r_ctl.rc_num_split_allocs--; 2264 } 2265 if (rsm == rack->r_ctl.rc_first_appl) { 2266 if (rack->r_ctl.rc_app_limited_cnt == 0) 2267 rack->r_ctl.rc_first_appl = NULL; 2268 else { 2269 /* Follow the next one out */ 2270 struct rack_sendmap fe; 2271 2272 fe.r_start = rsm->r_nseq_appl; 2273 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2274 } 2275 } 2276 if (rsm == rack->r_ctl.rc_resend) 2277 rack->r_ctl.rc_resend = NULL; 2278 if (rsm == rack->r_ctl.rc_rsm_at_retran) 2279 rack->r_ctl.rc_rsm_at_retran = NULL; 2280 if (rsm == rack->r_ctl.rc_end_appl) 2281 rack->r_ctl.rc_end_appl = NULL; 2282 if (rack->r_ctl.rc_tlpsend == rsm) 2283 rack->r_ctl.rc_tlpsend = NULL; 2284 if (rack->r_ctl.rc_sacklast == rsm) 2285 rack->r_ctl.rc_sacklast = NULL; 2286 if (rack->rc_free_cnt < rack_free_cache) { 2287 memset(rsm, 0, sizeof(struct rack_sendmap)); 2288 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 2289 rsm->r_limit_type = 0; 2290 rack->rc_free_cnt++; 2291 return; 2292 } 2293 rack->r_ctl.rc_num_maps_alloced--; 2294 uma_zfree(rack_zone, rsm); 2295 } 2296 2297 static uint32_t 2298 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 2299 { 2300 uint64_t srtt, bw, len, tim; 2301 uint32_t segsiz, def_len, minl; 2302 2303 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2304 def_len = rack_def_data_window * segsiz; 2305 if (rack->rc_gp_filled == 0) { 2306 /* 2307 * We have no measurement (IW is in flight?) so 2308 * we can only guess using our data_window sysctl 2309 * value (usually 100MSS). 2310 */ 2311 return (def_len); 2312 } 2313 /* 2314 * Now we have a number of factors to consider. 2315 * 2316 * 1) We have a desired BDP which is usually 2317 * at least 2. 2318 * 2) We have a minimum number of rtt's usually 1 SRTT 2319 * but we allow it too to be more. 2320 * 3) We want to make sure a measurement last N useconds (if 2321 * we have set rack_min_measure_usec. 2322 * 2323 * We handle the first concern here by trying to create a data 2324 * window of max(rack_def_data_window, DesiredBDP). 
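	 * (For example, at a measured 1.5 MB/s with a 40 ms SRTT the
	 * BDP is 60 KB, so a goal of 2 BDPs yields a 120 KB window,
	 * which is then rounded up to a multiple of the MSS below.)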
	 * The second concern we handle by not letting the measurement
	 * window end normally until at least the required SRTT's
	 * have gone by, which is done further below in
	 * rack_enough_for_measurement(). Finally, the third concern
	 * we also handle here by calculating how long that time
	 * would take at the current BW and then returning the
	 * max of our first calculation and that length. Note
	 * that if rack_min_measure_usec is 0, we don't deal
	 * with concern 3. Also, for both concerns 1 and 3, an
	 * application-limited period could end the measurement
	 * earlier.
	 *
	 * So let's calculate the BDP with the "known" b/w, using
	 * the SRTT as our rtt, and then multiply it by the
	 * goal.
	 */
	bw = rack_get_bw(rack);
	srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
	len = bw * srtt;
	len /= (uint64_t)HPTS_USEC_IN_SEC;
	len *= max(1, rack_goal_bdp);
	/* Now we need to round up to the nearest MSS */
	len = roundup(len, segsiz);
	if (rack_min_measure_usec) {
		/* Now calculate our min length for this b/w */
		tim = rack_min_measure_usec;
		minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
		if (minl == 0)
			minl = 1;
		minl = roundup(minl, segsiz);
		if (len < minl)
			len = minl;
	}
	/*
	 * Now if we have a very small window we want to allow the
	 * measurement window itself to stay small. This happens on
	 * low b/w connections and we don't want to span huge numbers
	 * of rtt's between measurements.
	 *
	 * We basically include 2 over our "MIN window" so
	 * that the measurement can be shortened (possibly) by
	 * an ack'ed packet.
	 */
	if (len < def_len)
		return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
	else
		return (max((uint32_t)len, def_len));

}

static int
rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack)
{
	uint32_t tim, srtts, segsiz;

	/*
	 * Has enough time passed for the GP measurement to be valid?
	 */
	if ((tp->snd_max == tp->snd_una) ||
	    (th_ack == tp->snd_max)) {
		/* All is acked */
		return (1);
	}
	if (SEQ_LT(th_ack, tp->gput_seq)) {
		/* Not enough bytes yet */
		return (0);
	}
	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	if (SEQ_LT(th_ack, tp->gput_ack) &&
	    ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
		/* Not enough bytes yet */
		return (0);
	}
	if (rack->r_ctl.rc_first_appl &&
	    (rack->r_ctl.rc_first_appl->r_start == th_ack)) {
		/*
		 * We are up to the app-limited point;
		 * we have to measure irrespective of the time.
		 */
		return (1);
	}
	/*
	 * Now what about time?
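	 * We require that at least rack_min_srtts worth of the current
	 * goodput srtt have elapsed since the measurement was started
	 * (tp->gput_ts); that is checked just below.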
	 */
	srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
	tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
	if (tim >= srtts) {
		return (1);
	}
	/* Nope, not even a full SRTT has passed */
	return (0);
}


static void
rack_log_timely(struct tcp_rack *rack,
    uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
    uint64_t up_bnd, int line, uint8_t method)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = logged;
		log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_incr;
		log.u_bbr.flex2 <<= 4;
		log.u_bbr.flex2 |= rack->rc_gp_bwred;
		log.u_bbr.flex3 = rack->rc_gp_incr;
		log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
		log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
		log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
		log.u_bbr.flex7 = rack->rc_gp_bwred;
		log.u_bbr.flex8 = method;
		log.u_bbr.cur_del_rate = cur_bw;
		log.u_bbr.delRate = low_bnd;
		log.u_bbr.bw_inuse = up_bnd;
		log.u_bbr.rttProp = rack_get_bw(rack);
		log.u_bbr.pkt_epoch = line;
		log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
		log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
		log.u_bbr.cwnd_gain <<= 1;
		log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
		log.u_bbr.lost = rack->r_ctl.rc_loss_count;
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_TIMELY_WORK, 0,
		    0, &log, false, &tv);
	}
}

static int
rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
{
	/*
	 * Before we increase we need to know if
	 * the estimate just made was less than
	 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
	 *
	 * If we already are pacing at a fast enough
	 * rate to push us faster there is no sense in
	 * increasing.
	 *
	 * We first calculate our actual pacing rate (ss or ca multiplier
	 * times our cur_bw).
	 *
	 * Then we take the last measured rate and multiply by our
	 * maximum pacing overage to give us a max allowable rate.
	 *
	 * If our act_rate is smaller than our max allowable rate
	 * then we should increase. Else we should hold steady.
	 *
	 */
	uint64_t act_rate, max_allow_rate;

	if (rack_timely_no_stopping)
		return (1);

	if ((cur_bw == 0) || (last_bw_est == 0)) {
		/*
		 * Initial startup case or
		 * everything is acked case.
		 */
		rack_log_timely(rack, mult, cur_bw, 0, 0,
		    __LINE__, 9);
		return (1);
	}
	if (mult <= 100) {
		/*
		 * We can always pace at or slightly above our rate.
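		 * A multiplier of 100 or below means we are not pacing
		 * beyond the measured b/w, so raising the multiplier is
		 * always permitted here.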
		 */
		rack_log_timely(rack, mult, cur_bw, 0, 0,
		    __LINE__, 9);
		return (1);
	}
	act_rate = cur_bw * (uint64_t)mult;
	act_rate /= 100;
	max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
	max_allow_rate /= 100;
	if (act_rate < max_allow_rate) {
		/*
		 * Here the rate we are actually pacing at
		 * is smaller than 10% above our last measurement.
		 * This means we are pacing below what we would
		 * like to try to achieve (plus some wiggle room).
		 */
		rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
		    __LINE__, 9);
		return (1);
	} else {
		/*
		 * Here we are already pacing at least rack_max_per_above (10%)
		 * above what we are getting back. This indicates most likely
		 * that we are being limited (cwnd/rwnd/app) and can't
		 * get any more b/w. There is no sense in trying to
		 * raise the pacing rate; it is not speeding us up
		 * and we are already pacing faster than we are getting.
		 */
		rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
		    __LINE__, 8);
		return (0);
	}
}

static void
rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
{
	/*
	 * When we drag bottom, we want to assure
	 * that no multiplier is below 1.0; if one is,
	 * we want to restore it to at least that.
	 */
	if (rack->r_ctl.rack_per_of_gp_rec < 100) {
		/* This is unlikely; we usually do not touch recovery */
		rack->r_ctl.rack_per_of_gp_rec = 100;
	}
	if (rack->r_ctl.rack_per_of_gp_ca < 100) {
		rack->r_ctl.rack_per_of_gp_ca = 100;
	}
	if (rack->r_ctl.rack_per_of_gp_ss < 100) {
		rack->r_ctl.rack_per_of_gp_ss = 100;
	}
}

static void
rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
{
	if (rack->r_ctl.rack_per_of_gp_ca > 100) {
		rack->r_ctl.rack_per_of_gp_ca = 100;
	}
	if (rack->r_ctl.rack_per_of_gp_ss > 100) {
		rack->r_ctl.rack_per_of_gp_ss = 100;
	}
}

static void
rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
{
	int32_t calc, logged, plus;

	logged = 0;

	if (override) {
		/*
		 * override is passed when we are
		 * losing b/w and making one last
		 * gasp at trying not to lose out
		 * to a new-reno flow.
		 */
		goto extra_boost;
	}
	/* In classic timely we boost by 5x if we have 5 increases in a row, let's not */
	if (rack->rc_gp_incr &&
	    ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
		/*
		 * Reset and get 5 strokes more before the boost. Note
		 * that the count is 0 based so we have to add one.
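		 * At the boost point (extra_boost below) the increase is
		 * rack_gp_increase_per scaled by RACK_TIMELY_CNT_BOOST,
		 * applied in a single step.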
2594 */ 2595 extra_boost: 2596 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 2597 rack->rc_gp_timely_inc_cnt = 0; 2598 } else 2599 plus = (uint32_t)rack_gp_increase_per; 2600 /* Must be at least 1% increase for true timely increases */ 2601 if ((plus < 1) && 2602 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 2603 plus = 1; 2604 if (rack->rc_gp_saw_rec && 2605 (rack->rc_gp_no_rec_chg == 0) && 2606 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2607 rack->r_ctl.rack_per_of_gp_rec)) { 2608 /* We have been in recovery ding it too */ 2609 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 2610 if (calc > 0xffff) 2611 calc = 0xffff; 2612 logged |= 1; 2613 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 2614 if (rack_per_upper_bound_ss && 2615 (rack->rc_dragged_bottom == 0) && 2616 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 2617 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 2618 } 2619 if (rack->rc_gp_saw_ca && 2620 (rack->rc_gp_saw_ss == 0) && 2621 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2622 rack->r_ctl.rack_per_of_gp_ca)) { 2623 /* In CA */ 2624 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 2625 if (calc > 0xffff) 2626 calc = 0xffff; 2627 logged |= 2; 2628 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 2629 if (rack_per_upper_bound_ca && 2630 (rack->rc_dragged_bottom == 0) && 2631 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 2632 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 2633 } 2634 if (rack->rc_gp_saw_ss && 2635 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2636 rack->r_ctl.rack_per_of_gp_ss)) { 2637 /* In SS */ 2638 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 2639 if (calc > 0xffff) 2640 calc = 0xffff; 2641 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 2642 if (rack_per_upper_bound_ss && 2643 (rack->rc_dragged_bottom == 0) && 2644 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 2645 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 2646 logged |= 4; 2647 } 2648 if (logged && 2649 (rack->rc_gp_incr == 0)){ 2650 /* Go into increment mode */ 2651 rack->rc_gp_incr = 1; 2652 rack->rc_gp_timely_inc_cnt = 0; 2653 } 2654 if (rack->rc_gp_incr && 2655 logged && 2656 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 2657 rack->rc_gp_timely_inc_cnt++; 2658 } 2659 rack_log_timely(rack, logged, plus, 0, 0, 2660 __LINE__, 1); 2661 } 2662 2663 static uint32_t 2664 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 2665 { 2666 /* 2667 * norm_grad = rtt_diff / minrtt; 2668 * new_per = curper * (1 - B * norm_grad) 2669 * 2670 * B = rack_gp_decrease_per (default 10%) 2671 * rtt_dif = input var current rtt-diff 2672 * curper = input var current percentage 2673 * minrtt = from rack filter 2674 * 2675 */ 2676 uint64_t perf; 2677 2678 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2679 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 2680 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 2681 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 2682 (uint64_t)1000000)) / 2683 (uint64_t)1000000); 2684 if (perf > curper) { 2685 /* TSNH */ 2686 perf = curper - 1; 2687 } 2688 return ((uint32_t)perf); 2689 } 2690 2691 static uint32_t 2692 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 2693 { 2694 /* 2695 * highrttthresh 2696 * result = curper * (1 - (B * ( 1 - ------ )) 2697 * gp_srtt 2698 * 2699 * B = rack_gp_decrease_per (default 10%) 2700 * highrttthresh = filter_min * rack_gp_rtt_maxmul 2701 */ 2702 uint64_t perf; 2703 uint32_t 
highrttthresh; 2704 2705 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 2706 2707 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2708 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 2709 ((uint64_t)highrttthresh * (uint64_t)1000000) / 2710 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 2711 return (perf); 2712 } 2713 2714 2715 static void 2716 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 2717 { 2718 uint64_t logvar, logvar2, logvar3; 2719 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 2720 2721 if (rack->rc_gp_incr) { 2722 /* Turn off increment counting */ 2723 rack->rc_gp_incr = 0; 2724 rack->rc_gp_timely_inc_cnt = 0; 2725 } 2726 ss_red = ca_red = rec_red = 0; 2727 logged = 0; 2728 /* Calculate the reduction value */ 2729 if (rtt_diff < 0) { 2730 rtt_diff *= -1; 2731 } 2732 /* Must be at least 1% reduction */ 2733 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 2734 /* We have been in recovery ding it too */ 2735 if (timely_says == 2) { 2736 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 2737 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2738 if (alt < new_per) 2739 val = alt; 2740 else 2741 val = new_per; 2742 } else 2743 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2744 if (rack->r_ctl.rack_per_of_gp_rec > val) { 2745 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 2746 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 2747 } else { 2748 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2749 rec_red = 0; 2750 } 2751 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 2752 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2753 logged |= 1; 2754 } 2755 if (rack->rc_gp_saw_ss) { 2756 /* Sent in SS */ 2757 if (timely_says == 2) { 2758 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 2759 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2760 if (alt < new_per) 2761 val = alt; 2762 else 2763 val = new_per; 2764 } else 2765 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 2766 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 2767 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 2768 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 2769 } else { 2770 ss_red = new_per; 2771 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2772 logvar = new_per; 2773 logvar <<= 32; 2774 logvar |= alt; 2775 logvar2 = (uint32_t)rtt; 2776 logvar2 <<= 32; 2777 logvar2 |= (uint32_t)rtt_diff; 2778 logvar3 = rack_gp_rtt_maxmul; 2779 logvar3 <<= 32; 2780 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2781 rack_log_timely(rack, timely_says, 2782 logvar2, logvar3, 2783 logvar, __LINE__, 10); 2784 } 2785 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 2786 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2787 logged |= 4; 2788 } else if (rack->rc_gp_saw_ca) { 2789 /* Sent in CA */ 2790 if (timely_says == 2) { 2791 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 2792 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2793 if (alt < new_per) 2794 val = alt; 2795 else 2796 val = new_per; 2797 } else 2798 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 2799 if (rack->r_ctl.rack_per_of_gp_ca > val) { 2800 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 2801 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 2802 } else { 2803 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2804 ca_red = 0; 2805 logvar = new_per; 2806 logvar <<= 32; 2807 logvar |= alt; 2808 logvar2 = (uint32_t)rtt; 2809 logvar2 <<= 32; 2810 logvar2 |= (uint32_t)rtt_diff; 2811 logvar3 = rack_gp_rtt_maxmul; 2812 logvar3 <<= 32; 2813 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2814 rack_log_timely(rack, timely_says, 2815 logvar2, logvar3, 2816 logvar, __LINE__, 10); 2817 } 2818 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 2819 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2820 logged |= 2; 2821 } 2822 if (rack->rc_gp_timely_dec_cnt < 0x7) { 2823 rack->rc_gp_timely_dec_cnt++; 2824 if (rack_timely_dec_clear && 2825 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 2826 rack->rc_gp_timely_dec_cnt = 0; 2827 } 2828 logvar = ss_red; 2829 logvar <<= 32; 2830 logvar |= ca_red; 2831 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 2832 __LINE__, 2); 2833 } 2834 2835 static void 2836 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 2837 uint32_t rtt, uint32_t line, uint8_t reas) 2838 { 2839 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2840 union tcp_log_stackspecific log; 2841 struct timeval tv; 2842 2843 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2844 log.u_bbr.flex1 = line; 2845 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 2846 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 2847 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2848 log.u_bbr.flex5 = rtt; 2849 log.u_bbr.flex6 = rack->rc_highly_buffered; 2850 log.u_bbr.flex6 <<= 1; 2851 log.u_bbr.flex6 |= rack->forced_ack; 2852 log.u_bbr.flex6 <<= 1; 2853 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 2854 log.u_bbr.flex6 <<= 1; 2855 log.u_bbr.flex6 |= rack->in_probe_rtt; 2856 log.u_bbr.flex6 <<= 1; 2857 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 2858 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 2859 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 2860 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 2861 log.u_bbr.flex8 = reas; 2862 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2863 log.u_bbr.delRate = rack_get_bw(rack); 2864 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 2865 log.u_bbr.cur_del_rate <<= 32; 2866 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 2867 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 2868 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2869 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2870 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2871 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2872 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 2873 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 2874 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2875 log.u_bbr.rttProp = us_cts; 2876 log.u_bbr.rttProp <<= 32; 2877 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 2878 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2879 &rack->rc_inp->inp_socket->so_rcv, 2880 &rack->rc_inp->inp_socket->so_snd, 2881 BBR_LOG_RTT_SHRINKS, 0, 2882 0, &log, false, &rack->r_ctl.act_rcv_time); 2883 } 2884 } 2885 2886 static void 2887 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 2888 { 2889 uint64_t bwdp; 2890 2891 bwdp = rack_get_bw(rack); 2892 bwdp *= (uint64_t)rtt; 2893 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 2894 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 2895 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
	    rack_timely_min_segs)) {
		/*
		 * A window protocol must be able to have 4 packets
		 * outstanding as the floor in order to function
		 * (especially considering delayed ack :D).
		 */
		rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
	}
}

static void
rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
{
	/**
	 * ProbeRTT is a bit different in rack_pacing than in
	 * BBR. It is like BBR in that it uses the lowering of
	 * the RTT as a signal that we saw something new, and
	 * counts from there to decide how long to wait between
	 * probes. But it differs in that it is quite simple: it
	 * does not play with the cwnd or wait until we get down
	 * to N segments outstanding and hold that for
	 * 200ms. Instead it just sets the pacing reduction
	 * rate to a set percentage (70 by default) and holds
	 * that for a number of recent GP Srtt's.
	 */
	uint32_t segsiz;

	if (rack->rc_gp_dyn_mul == 0)
		return;

	if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
		/* We are idle */
		return;
	}
	if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
	    SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
		/*
		 * Stop the goodput measurement now; the idea here is
		 * that future measurements made with in_probe_rtt set
		 * won't register if they are not greater, so we want
		 * to capture what info (if any) is available now.
		 */
		rack_do_goodput_measurement(rack->rc_tp, rack,
		    rack->rc_tp->snd_una, __LINE__);
	}
	rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
	rack->r_ctl.rc_time_probertt_entered = us_cts;
	segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
	    rack->r_ctl.rc_pace_min_segs);
	rack->in_probe_rtt = 1;
	rack->measure_saw_probe_rtt = 1;
	rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
	rack->r_ctl.rc_time_probertt_starts = 0;
	rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
	if (rack_probertt_use_min_rtt_entry)
		rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
	else
		rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
	rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
	    __LINE__, RACK_RTTS_ENTERPROBE);
}

static void
rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
{
	struct rack_sendmap *rsm;
	uint32_t segsiz;

	segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
	    rack->r_ctl.rc_pace_min_segs);
	rack->in_probe_rtt = 0;
	if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
	    SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
		/*
		 * Stop the goodput measurement now; the idea here is
		 * that future measurements made with in_probe_rtt set
		 * won't register if they are not greater, so we want
		 * to capture what info (if any) is available now.
		 */
		rack_do_goodput_measurement(rack->rc_tp, rack,
		    rack->rc_tp->snd_una, __LINE__);
	} else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
		/*
		 * We don't have enough data to make a measurement.
		 * So let's just stop here and start over after exiting
		 * probe-rtt. We probably are not interested in
		 * the results anyway.
		 */
		rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
	}
	/*
	 * Measurements through the current snd_max are going
	 * to be limited by the slower pacing rate.
	 *
	 * We need to mark these as app-limited so we
	 * don't collapse the b/w.
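	 * The newest sendmap entry is tagged RACK_APP_LIMITED below
	 * and linked onto the app-limited chain via r_nseq_appl.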
2993 */ 2994 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 2995 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 2996 if (rack->r_ctl.rc_app_limited_cnt == 0) 2997 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 2998 else { 2999 /* 3000 * Go out to the end app limited and mark 3001 * this new one as next and move the end_appl up 3002 * to this guy. 3003 */ 3004 if (rack->r_ctl.rc_end_appl) 3005 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3006 rack->r_ctl.rc_end_appl = rsm; 3007 } 3008 rsm->r_flags |= RACK_APP_LIMITED; 3009 rack->r_ctl.rc_app_limited_cnt++; 3010 } 3011 /* 3012 * Now, we need to examine our pacing rate multipliers. 3013 * If its under 100%, we need to kick it back up to 3014 * 100%. We also don't let it be over our "max" above 3015 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3016 * Note setting clamp_atexit_prtt to 0 has the effect 3017 * of setting CA/SS to 100% always at exit (which is 3018 * the default behavior). 3019 */ 3020 if (rack_probertt_clear_is) { 3021 rack->rc_gp_incr = 0; 3022 rack->rc_gp_bwred = 0; 3023 rack->rc_gp_timely_inc_cnt = 0; 3024 rack->rc_gp_timely_dec_cnt = 0; 3025 } 3026 /* Do we do any clamping at exit? */ 3027 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3028 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3029 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3030 } 3031 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3032 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3033 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3034 } 3035 /* 3036 * Lets set rtt_diff to 0, so that we will get a "boost" 3037 * after exiting. 3038 */ 3039 rack->r_ctl.rc_rtt_diff = 0; 3040 3041 /* Clear all flags so we start fresh */ 3042 rack->rc_tp->t_bytes_acked = 0; 3043 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3044 /* 3045 * If configured to, set the cwnd and ssthresh to 3046 * our targets. 3047 */ 3048 if (rack_probe_rtt_sets_cwnd) { 3049 uint64_t ebdp; 3050 uint32_t setto; 3051 3052 /* Set ssthresh so we get into CA once we hit our target */ 3053 if (rack_probertt_use_min_rtt_exit == 1) { 3054 /* Set to min rtt */ 3055 rack_set_prtt_target(rack, segsiz, 3056 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3057 } else if (rack_probertt_use_min_rtt_exit == 2) { 3058 /* Set to current gp rtt */ 3059 rack_set_prtt_target(rack, segsiz, 3060 rack->r_ctl.rc_gp_srtt); 3061 } else if (rack_probertt_use_min_rtt_exit == 3) { 3062 /* Set to entry gp rtt */ 3063 rack_set_prtt_target(rack, segsiz, 3064 rack->r_ctl.rc_entry_gp_rtt); 3065 } else { 3066 uint64_t sum; 3067 uint32_t setval; 3068 3069 sum = rack->r_ctl.rc_entry_gp_rtt; 3070 sum *= 10; 3071 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3072 if (sum >= 20) { 3073 /* 3074 * A highly buffered path needs 3075 * cwnd space for timely to work. 3076 * Lets set things up as if 3077 * we are heading back here again. 3078 */ 3079 setval = rack->r_ctl.rc_entry_gp_rtt; 3080 } else if (sum >= 15) { 3081 /* 3082 * Lets take the smaller of the 3083 * two since we are just somewhat 3084 * buffered. 3085 */ 3086 setval = rack->r_ctl.rc_gp_srtt; 3087 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3088 setval = rack->r_ctl.rc_entry_gp_rtt; 3089 } else { 3090 /* 3091 * Here we are not highly buffered 3092 * and should pick the min we can to 3093 * keep from causing loss. 
3094 */ 3095 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3096 } 3097 rack_set_prtt_target(rack, segsiz, 3098 setval); 3099 } 3100 if (rack_probe_rtt_sets_cwnd > 1) { 3101 /* There is a percentage here to boost */ 3102 ebdp = rack->r_ctl.rc_target_probertt_flight; 3103 ebdp *= rack_probe_rtt_sets_cwnd; 3104 ebdp /= 100; 3105 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3106 } else 3107 setto = rack->r_ctl.rc_target_probertt_flight; 3108 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3109 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3110 /* Enforce a min */ 3111 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3112 } 3113 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3114 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3115 } 3116 rack_log_rtt_shrinks(rack, us_cts, 3117 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3118 __LINE__, RACK_RTTS_EXITPROBE); 3119 /* Clear times last so log has all the info */ 3120 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3121 rack->r_ctl.rc_time_probertt_entered = us_cts; 3122 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3123 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3124 } 3125 3126 static void 3127 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3128 { 3129 /* Check in on probe-rtt */ 3130 if (rack->rc_gp_filled == 0) { 3131 /* We do not do p-rtt unless we have gp measurements */ 3132 return; 3133 } 3134 if (rack->in_probe_rtt) { 3135 uint64_t no_overflow; 3136 uint32_t endtime, must_stay; 3137 3138 if (rack->r_ctl.rc_went_idle_time && 3139 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3140 /* 3141 * We went idle during prtt, just exit now. 3142 */ 3143 rack_exit_probertt(rack, us_cts); 3144 } else if (rack_probe_rtt_safety_val && 3145 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3146 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3147 /* 3148 * Probe RTT safety value triggered! 3149 */ 3150 rack_log_rtt_shrinks(rack, us_cts, 3151 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3152 __LINE__, RACK_RTTS_SAFETY); 3153 rack_exit_probertt(rack, us_cts); 3154 } 3155 /* Calculate the max we will wait */ 3156 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3157 if (rack->rc_highly_buffered) 3158 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3159 /* Calculate the min we must wait */ 3160 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3161 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3162 TSTMP_LT(us_cts, endtime)) { 3163 uint32_t calc; 3164 /* Do we lower more? 
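 * An illustrative walk-through (values assumed, not necessarily the
 * sysctl defaults): say rack_per_of_gp_probertt is 70,
 * rack_per_of_gp_probertt_reduce is 10 and rack_per_of_gp_lowthresh
 * is 40. One gp_srtt after entering probe-rtt the calculation below
 * gives calc = 1, so the pacing percentage drops to 70 - 10 = 60;
 * after two srtts it is 50, after three it is 40, and from then on
 * the 40 percent floor holds.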
*/ 3165 no_exit: 3166 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3167 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3168 else 3169 calc = 0; 3170 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3171 if (calc) { 3172 /* Maybe */ 3173 calc *= rack_per_of_gp_probertt_reduce; 3174 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3175 /* Limit it too */ 3176 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3177 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3178 } 3179 /* We must reach target or the time set */ 3180 return; 3181 } 3182 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3183 if ((TSTMP_LT(us_cts, must_stay) && 3184 rack->rc_highly_buffered) || 3185 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3186 rack->r_ctl.rc_target_probertt_flight)) { 3187 /* We are not past the must_stay time */ 3188 goto no_exit; 3189 } 3190 rack_log_rtt_shrinks(rack, us_cts, 3191 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3192 __LINE__, RACK_RTTS_REACHTARGET); 3193 rack->r_ctl.rc_time_probertt_starts = us_cts; 3194 if (rack->r_ctl.rc_time_probertt_starts == 0) 3195 rack->r_ctl.rc_time_probertt_starts = 1; 3196 /* Restore back to our rate we want to pace at in prtt */ 3197 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3198 } 3199 /* 3200 * Setup our end time, some number of gp_srtts plus 200ms. 3201 */ 3202 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3203 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3204 if (rack_probertt_gpsrtt_cnt_div) 3205 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3206 else 3207 endtime = 0; 3208 endtime += rack_min_probertt_hold; 3209 endtime += rack->r_ctl.rc_time_probertt_starts; 3210 if (TSTMP_GEQ(us_cts, endtime)) { 3211 /* yes, exit probertt */ 3212 rack_exit_probertt(rack, us_cts); 3213 } 3214 3215 } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3216 /* Go into probertt, its been too long since we went lower */ 3217 rack_enter_probertt(rack, us_cts); 3218 } 3219 } 3220 3221 static void 3222 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3223 uint32_t rtt, int32_t rtt_diff) 3224 { 3225 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3226 uint32_t losses; 3227 3228 if ((rack->rc_gp_dyn_mul == 0) || 3229 (rack->use_fixed_rate) || 3230 (rack->in_probe_rtt) || 3231 (rack->rc_always_pace == 0)) { 3232 /* No dynamic GP multipler in play */ 3233 return; 3234 } 3235 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3236 cur_bw = rack_get_bw(rack); 3237 /* Calculate our up and down range */ 3238 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3239 up_bnd /= 100; 3240 up_bnd += rack->r_ctl.last_gp_comp_bw; 3241 3242 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3243 subfr /= 100; 3244 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3245 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3246 /* 3247 * This is the case where our RTT is above 3248 * the max target and we have been configured 3249 * to just do timely no bonus up stuff in that case. 3250 * 3251 * There are two configurations, set to 1, and we 3252 * just do timely if we are over our max. If its 3253 * set above 1 then we slam the multipliers down 3254 * to 100 and then decrement per timely. 
3255 */ 3256 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3257 __LINE__, 3); 3258 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3259 rack_validate_multipliers_at_or_below_100(rack); 3260 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3261 } else if ((last_bw_est < low_bnd) && !losses) { 3262 /* 3263 * We are decreasing this is a bit complicated this 3264 * means we are loosing ground. This could be 3265 * because another flow entered and we are competing 3266 * for b/w with it. This will push the RTT up which 3267 * makes timely unusable unless we want to get shoved 3268 * into a corner and just be backed off (the age 3269 * old problem with delay based CC). 3270 * 3271 * On the other hand if it was a route change we 3272 * would like to stay somewhat contained and not 3273 * blow out the buffers. 3274 */ 3275 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3276 __LINE__, 3); 3277 rack->r_ctl.last_gp_comp_bw = cur_bw; 3278 if (rack->rc_gp_bwred == 0) { 3279 /* Go into reduction counting */ 3280 rack->rc_gp_bwred = 1; 3281 rack->rc_gp_timely_dec_cnt = 0; 3282 } 3283 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3284 (timely_says == 0)) { 3285 /* 3286 * Push another time with a faster pacing 3287 * to try to gain back (we include override to 3288 * get a full raise factor). 3289 */ 3290 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3291 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3292 (timely_says == 0) || 3293 (rack_down_raise_thresh == 0)) { 3294 /* 3295 * Do an override up in b/w if we were 3296 * below the threshold or if the threshold 3297 * is zero we always do the raise. 3298 */ 3299 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3300 } else { 3301 /* Log it stays the same */ 3302 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3303 __LINE__, 11); 3304 3305 } 3306 rack->rc_gp_timely_dec_cnt++; 3307 /* We are not incrementing really no-count */ 3308 rack->rc_gp_incr = 0; 3309 rack->rc_gp_timely_inc_cnt = 0; 3310 } else { 3311 /* 3312 * Lets just use the RTT 3313 * information and give up 3314 * pushing. 3315 */ 3316 goto use_timely; 3317 } 3318 } else if ((timely_says != 2) && 3319 !losses && 3320 (last_bw_est > up_bnd)) { 3321 /* 3322 * We are increasing b/w lets keep going, updating 3323 * our b/w and ignoring any timely input, unless 3324 * of course we are at our max raise (if there is one). 3325 */ 3326 3327 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3328 __LINE__, 3); 3329 rack->r_ctl.last_gp_comp_bw = cur_bw; 3330 if (rack->rc_gp_saw_ss && 3331 rack_per_upper_bound_ss && 3332 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3333 /* 3334 * In cases where we can't go higher 3335 * we should just use timely. 3336 */ 3337 goto use_timely; 3338 } 3339 if (rack->rc_gp_saw_ca && 3340 rack_per_upper_bound_ca && 3341 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3342 /* 3343 * In cases where we can't go higher 3344 * we should just use timely. 
3345 */ 3346 goto use_timely; 3347 } 3348 rack->rc_gp_bwred = 0; 3349 rack->rc_gp_timely_dec_cnt = 0; 3350 /* You get a set number of pushes if timely is trying to reduce */ 3351 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 3352 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3353 } else { 3354 /* Log it stays the same */ 3355 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 3356 __LINE__, 12); 3357 3358 } 3359 return; 3360 } else { 3361 /* 3362 * We are staying between the lower and upper range bounds 3363 * so use timely to decide. 3364 */ 3365 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3366 __LINE__, 3); 3367 use_timely: 3368 if (timely_says) { 3369 rack->rc_gp_incr = 0; 3370 rack->rc_gp_timely_inc_cnt = 0; 3371 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 3372 !losses && 3373 (last_bw_est < low_bnd)) { 3374 /* We are loosing ground */ 3375 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3376 rack->rc_gp_timely_dec_cnt++; 3377 /* We are not incrementing really no-count */ 3378 rack->rc_gp_incr = 0; 3379 rack->rc_gp_timely_inc_cnt = 0; 3380 } else 3381 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3382 } else { 3383 rack->rc_gp_bwred = 0; 3384 rack->rc_gp_timely_dec_cnt = 0; 3385 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3386 } 3387 } 3388 } 3389 3390 static int32_t 3391 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 3392 { 3393 int32_t timely_says; 3394 uint64_t log_mult, log_rtt_a_diff; 3395 3396 log_rtt_a_diff = rtt; 3397 log_rtt_a_diff <<= 32; 3398 log_rtt_a_diff |= (uint32_t)rtt_diff; 3399 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 3400 rack_gp_rtt_maxmul)) { 3401 /* Reduce the b/w multipler */ 3402 timely_says = 2; 3403 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3404 log_mult <<= 32; 3405 log_mult |= prev_rtt; 3406 rack_log_timely(rack, timely_says, log_mult, 3407 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3408 log_rtt_a_diff, __LINE__, 4); 3409 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 3410 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 3411 max(rack_gp_rtt_mindiv , 1)))) { 3412 /* Increase the b/w multipler */ 3413 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 3414 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 3415 max(rack_gp_rtt_mindiv , 1)); 3416 log_mult <<= 32; 3417 log_mult |= prev_rtt; 3418 timely_says = 0; 3419 rack_log_timely(rack, timely_says, log_mult , 3420 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3421 log_rtt_a_diff, __LINE__, 5); 3422 } else { 3423 /* 3424 * Use a gradient to find it the timely gradient 3425 * is: 3426 * grad = rc_rtt_diff / min_rtt; 3427 * 3428 * anything below or equal to 0 will be 3429 * a increase indication. Anything above 3430 * zero is a decrease. Note we take care 3431 * of the actual gradient calculation 3432 * in the reduction (its not needed for 3433 * increase). 
3434 */ 3435 log_mult = prev_rtt; 3436 if (rtt_diff <= 0) { 3437 /* 3438 * Rttdiff is less than zero, increase the 3439 * b/w multipler (its 0 or negative) 3440 */ 3441 timely_says = 0; 3442 rack_log_timely(rack, timely_says, log_mult, 3443 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 3444 } else { 3445 /* Reduce the b/w multipler */ 3446 timely_says = 1; 3447 rack_log_timely(rack, timely_says, log_mult, 3448 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 3449 } 3450 } 3451 return (timely_says); 3452 } 3453 3454 static void 3455 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 3456 tcp_seq th_ack, int line) 3457 { 3458 uint64_t tim, bytes_ps, ltim, stim, utim; 3459 uint32_t segsiz, bytes, reqbytes, us_cts; 3460 int32_t gput, new_rtt_diff, timely_says; 3461 3462 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3463 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3464 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 3465 tim = us_cts - tp->gput_ts; 3466 else 3467 tim = 0; 3468 3469 if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) 3470 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 3471 else 3472 stim = 0; 3473 /* 3474 * Use the larger of the send time or ack time. This prevents us 3475 * from being influenced by ack artifacts to come up with too 3476 * high of measurement. Note that since we are spanning over many more 3477 * bytes in most of our measurements hopefully that is less likely to 3478 * occur. 3479 */ 3480 if (tim > stim) 3481 utim = max(tim, 1); 3482 else 3483 utim = max(stim, 1); 3484 /* Lets validate utim */ 3485 ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); 3486 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 3487 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 3488 if ((tim == 0) && (stim == 0)) { 3489 /* 3490 * Invalid measurement time, maybe 3491 * all on one ack/one send? 3492 */ 3493 bytes = 0; 3494 bytes_ps = 0; 3495 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3496 0, 0, 0, 10, __LINE__, NULL); 3497 goto skip_measurement; 3498 } 3499 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 3500 /* We never made a us_rtt measurement? */ 3501 bytes = 0; 3502 bytes_ps = 0; 3503 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3504 0, 0, 0, 10, __LINE__, NULL); 3505 goto skip_measurement; 3506 } 3507 /* 3508 * Calculate the maximum possible b/w this connection 3509 * could have. We base our calculation on the lowest 3510 * rtt we have seen during the measurement and the 3511 * largest rwnd the client has given us in that time. This 3512 * forms a BDP that is the maximum that we could ever 3513 * get to the client. Anything larger is not valid. 3514 * 3515 * I originally had code here that rejected measurements 3516 * where the time was less than 1/2 the latest us_rtt. 3517 * But after thinking on that I realized its wrong since 3518 * say you had a 150Mbps or even 1Gbps link, and you 3519 * were a long way away.. example I am in Europe (100ms rtt) 3520 * talking to my 1Gbps link in S.C. Now measuring say 150,000 3521 * bytes my time would be 1.2ms, and yet my rtt would say 3522 * the measurement was invalid the time was < 50ms. The 3523 * same thing is true for 150Mb (8ms of time). 3524 * 3525 * A better way I realized is to look at what the maximum 3526 * the connection could possibly do. This is gated on 3527 * the lowest RTT we have seen and the highest rwnd. 
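 *
 * An illustrative calculation (numbers assumed, not taken from a
 * trace): with rc_gp_high_rwnd = 1,000,000 bytes and rc_gp_lowrtt =
 * 10,000 usecs (10 ms), this cap works out to
 * 1,000,000 * 1,000,000 / 10,000 = 100,000,000 bytes per second,
 * roughly 800 Mbps. A measurement that computes higher than the cap
 * is clamped back down to it further below.
 *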
3528 * We should in theory never exceed that, if we are 3529 * then something on the path is storing up packets 3530 * and then feeding them all at once to our endpoint 3531 * messing up our measurement. 3532 */ 3533 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 3534 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 3535 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 3536 if (SEQ_LT(th_ack, tp->gput_seq)) { 3537 /* No measurement can be made */ 3538 bytes = 0; 3539 bytes_ps = 0; 3540 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3541 0, 0, 0, 10, __LINE__, NULL); 3542 goto skip_measurement; 3543 } else 3544 bytes = (th_ack - tp->gput_seq); 3545 bytes_ps = (uint64_t)bytes; 3546 /* 3547 * Don't measure a b/w for pacing unless we have gotten at least 3548 * an initial windows worth of data in this measurement interval. 3549 * 3550 * Small numbers of bytes get badly influenced by delayed ack and 3551 * other artifacts. Note we take the initial window or our 3552 * defined minimum GP (defaulting to 10 which hopefully is the 3553 * IW). 3554 */ 3555 if (rack->rc_gp_filled == 0) { 3556 /* 3557 * The initial estimate is special. We 3558 * have blasted out an IW worth of packets 3559 * without a real valid ack ts results. We 3560 * then setup the app_limited_needs_set flag, 3561 * this should get the first ack in (probably 2 3562 * MSS worth) to be recorded as the timestamp. 3563 * We thus allow a smaller number of bytes i.e. 3564 * IW - 2MSS. 3565 */ 3566 reqbytes -= (2 * segsiz); 3567 /* Also lets fill previous for our first measurement to be neutral */ 3568 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3569 } 3570 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 3571 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3572 rack->r_ctl.rc_app_limited_cnt, 3573 0, 0, 10, __LINE__, NULL); 3574 goto skip_measurement; 3575 } 3576 /* 3577 * We now need to calculate the Timely like status so 3578 * we can update (possibly) the b/w multipliers. 3579 */ 3580 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 3581 if (rack->rc_gp_filled == 0) { 3582 /* No previous reading */ 3583 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 3584 } else { 3585 if (rack->measure_saw_probe_rtt == 0) { 3586 /* 3587 * We don't want a probertt to be counted 3588 * since it will be negative incorrectly. We 3589 * expect to be reducing the RTT when we 3590 * pace at a slower rate. 3591 */ 3592 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 3593 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 3594 } 3595 } 3596 timely_says = rack_make_timely_judgement(rack, 3597 rack->r_ctl.rc_gp_srtt, 3598 rack->r_ctl.rc_rtt_diff, 3599 rack->r_ctl.rc_prev_gp_srtt 3600 ); 3601 bytes_ps *= HPTS_USEC_IN_SEC; 3602 bytes_ps /= utim; 3603 if (bytes_ps > rack->r_ctl.last_max_bw) { 3604 /* 3605 * Something is on path playing 3606 * since this b/w is not possible based 3607 * on our BDP (highest rwnd and lowest rtt 3608 * we saw in the measurement window). 3609 * 3610 * Another option here would be to 3611 * instead skip the measurement. 
*/
3613 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
3614 bytes_ps, rack->r_ctl.last_max_bw, 0,
3615 11, __LINE__, NULL);
3616 bytes_ps = rack->r_ctl.last_max_bw;
3617 }
3618 /* We store gp for b/w in bytes per second */
3619 if (rack->rc_gp_filled == 0) {
3620 /* Initial measurement */
3621 if (bytes_ps) {
3622 rack->r_ctl.gp_bw = bytes_ps;
3623 rack->rc_gp_filled = 1;
3624 rack->r_ctl.num_avg = 1;
3625 rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
3626 } else {
3627 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
3628 rack->r_ctl.rc_app_limited_cnt,
3629 0, 0, 10, __LINE__, NULL);
3630 }
3631 if (rack->rc_inp->inp_in_hpts &&
3632 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
3633 /*
3634 * Ok we can't trust the pacer in this case
3635 * where we transition from un-paced to paced.
3636 * Or for that matter when the burst mitigation
3637 * was making a wild guess and got it wrong.
3638 * Stop the pacer and clear up all the aggregate
3639 * delays etc.
3640 */
3641 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3642 rack->r_ctl.rc_hpts_flags = 0;
3643 rack->r_ctl.rc_last_output_to = 0;
3644 }
3645 } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) {
3646 /* Still a small number, run an average */
3647 rack->r_ctl.gp_bw += bytes_ps;
3648 rack->r_ctl.num_avg++;
3649 if (rack->r_ctl.num_avg >= RACK_REQ_AVG) {
3650 /* We have collected enough to move forward */
3651 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg;
3652 }
3653 } else {
3654 /*
3655 * We want to take 1/wma of the goodput and add it in to 7/8th
3656 * of the old value weighted by the srtt. So if your measurement
3657 * period is say 2 SRTT's long you would get 1/4 as the
3658 * value; if it was more like 1/2 SRTT then you would get 1/16th.
3659 *
3660 * But we must be careful not to take too much i.e. if the
3661 * srtt is say 20ms and the measurement is taken over
3662 * 400ms our weight would be 400/20 i.e. 20. On the
3663 * other hand if we get a measurement over 1ms with a
3664 * 10ms rtt we only want to take a much smaller portion.
3665 */
3666 uint64_t resid_bw, subpart, addpart, srtt;
3667 
3668 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
3669 if (srtt == 0) {
3670 /*
3671 * Strange, why did t_srtt go back to zero?
3672 */
3673 if (rack->r_ctl.rc_rack_min_rtt)
3674 srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC);
3675 else
3676 srtt = HPTS_USEC_IN_MSEC;
3677 }
3678 /*
3679 * XXXrrs: Note for reviewers, in playing with
3680 * dynamic pacing I discovered that this GP calculation
3681 * as done originally leads to some undesired results.
3682 * Basically you can get longer measurements contributing
3683 * too much to the WMA. Thus I changed it so that if you are doing
3684 * dynamic adjustments we only do the apportioned adjustment
3685 * if we have a very small (time wise) measurement. Longer
3686 * measurements just get their weight (defaulting to 1/8)
3687 * added to the WMA. We may want to think about changing
3688 * this to always do that for both sides i.e. dynamic
3689 * and non-dynamic... but considering lots of folks
3690 * were playing with this I did not want to change the
3691 * calculation per se without your thoughts... Lawrence?
3692 * Peter??
3693 */
3694 if (rack->rc_gp_dyn_mul == 0) {
3695 subpart = rack->r_ctl.gp_bw * utim;
3696 subpart /= (srtt * 8);
3697 if (subpart < (rack->r_ctl.gp_bw / 2)) {
3698 /*
3699 * The b/w update takes no more
3700 * away than 1/2 our running total
3701 * so factor it in.
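 * Spelled out as a sketch of the assignments below (non-dynamic
 * case): subpart = gp_bw * utim / (srtt * 8), addpart =
 * bytes_ps * utim / (srtt * 8), and gp_bw becomes
 * (gp_bw - subpart) + addpart; when the weight would reach 1/2 or
 * more, both parts fall back to a flat one half (the else clause
 * below).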
3702 */ 3703 addpart = bytes_ps * utim; 3704 addpart /= (srtt * 8); 3705 } else { 3706 /* 3707 * Don't allow a single measurement 3708 * to account for more than 1/2 of the 3709 * WMA. This could happen on a retransmission 3710 * where utim becomes huge compared to 3711 * srtt (multiple retransmissions when using 3712 * the sending rate which factors in all the 3713 * transmissions from the first one). 3714 */ 3715 subpart = rack->r_ctl.gp_bw / 2; 3716 addpart = bytes_ps / 2; 3717 } 3718 resid_bw = rack->r_ctl.gp_bw - subpart; 3719 rack->r_ctl.gp_bw = resid_bw + addpart; 3720 } else { 3721 if ((utim / srtt) <= 1) { 3722 /* 3723 * The b/w update was over a small period 3724 * of time. The idea here is to prevent a small 3725 * measurement time period from counting 3726 * too much. So we scale it based on the 3727 * time so it attributes less than 1/rack_wma_divisor 3728 * of its measurement. 3729 */ 3730 subpart = rack->r_ctl.gp_bw * utim; 3731 subpart /= (srtt * rack_wma_divisor); 3732 addpart = bytes_ps * utim; 3733 addpart /= (srtt * rack_wma_divisor); 3734 } else { 3735 /* 3736 * The scaled measurement was long 3737 * enough so lets just add in the 3738 * portion of the measurment i.e. 1/rack_wma_divisor 3739 */ 3740 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 3741 addpart = bytes_ps / rack_wma_divisor; 3742 } 3743 if ((rack->measure_saw_probe_rtt == 0) || 3744 (bytes_ps > rack->r_ctl.gp_bw)) { 3745 /* 3746 * For probe-rtt we only add it in 3747 * if its larger, all others we just 3748 * add in. 3749 */ 3750 resid_bw = rack->r_ctl.gp_bw - subpart; 3751 rack->r_ctl.gp_bw = resid_bw + addpart; 3752 } 3753 } 3754 } 3755 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 3756 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 3757 rack_update_multiplier(rack, timely_says, bytes_ps, 3758 rack->r_ctl.rc_gp_srtt, 3759 rack->r_ctl.rc_rtt_diff); 3760 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 3761 rack_get_bw(rack), 3, line, NULL); 3762 /* reset the gp srtt and setup the new prev */ 3763 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3764 /* Record the lost count for the next measurement */ 3765 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 3766 /* 3767 * We restart our diffs based on the gpsrtt in the 3768 * measurement window. 3769 */ 3770 rack->rc_gp_rtt_set = 0; 3771 rack->rc_gp_saw_rec = 0; 3772 rack->rc_gp_saw_ca = 0; 3773 rack->rc_gp_saw_ss = 0; 3774 rack->rc_dragged_bottom = 0; 3775 skip_measurement: 3776 3777 #ifdef STATS 3778 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3779 gput); 3780 /* 3781 * XXXLAS: This is a temporary hack, and should be 3782 * chained off VOI_TCP_GPUT when stats(9) grows an 3783 * API to deal with chained VOIs. 3784 */ 3785 if (tp->t_stats_gput_prev > 0) 3786 stats_voi_update_abs_s32(tp->t_stats, 3787 VOI_TCP_GPUT_ND, 3788 ((gput - tp->t_stats_gput_prev) * 100) / 3789 tp->t_stats_gput_prev); 3790 #endif 3791 tp->t_flags &= ~TF_GPUTINPROG; 3792 tp->t_stats_gput_prev = gput; 3793 /* 3794 * Now are we app limited now and there is space from where we 3795 * were to where we want to go? 3796 * 3797 * We don't do the other case i.e. non-applimited here since 3798 * the next send will trigger us picking up the missing data. 
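 *
 * A sketch of the check below with assumed numbers: if th_ack is
 * 1000, rc_first_appl->r_start is 90000 and
 * rack_get_measure_window() returns 60000, there is well over an
 * initial window of app-limited data ahead of us, so a new
 * measurement is armed running from gput_seq = 1000 to
 * gput_ack = 61000. If r_start were only 31000 the measurement
 * would instead be capped at the app-limited point, i.e.
 * gput_ack = 31000.
 *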
3799 */ 3800 if (rack->r_ctl.rc_first_appl && 3801 TCPS_HAVEESTABLISHED(tp->t_state) && 3802 rack->r_ctl.rc_app_limited_cnt && 3803 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 3804 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 3805 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3806 /* 3807 * Yep there is enough outstanding to make a measurement here. 3808 */ 3809 struct rack_sendmap *rsm, fe; 3810 3811 tp->t_flags |= TF_GPUTINPROG; 3812 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 3813 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 3814 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3815 rack->app_limited_needs_set = 0; 3816 tp->gput_seq = th_ack; 3817 if (rack->in_probe_rtt) 3818 rack->measure_saw_probe_rtt = 1; 3819 else if ((rack->measure_saw_probe_rtt) && 3820 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 3821 rack->measure_saw_probe_rtt = 0; 3822 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 3823 /* There is a full window to gain info from */ 3824 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 3825 } else { 3826 /* We can only measure up to the applimited point */ 3827 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 3828 } 3829 /* 3830 * Now we need to find the timestamp of the send at tp->gput_seq 3831 * for the send based measurement. 3832 */ 3833 fe.r_start = tp->gput_seq; 3834 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3835 if (rsm) { 3836 /* Ok send-based limit is set */ 3837 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 3838 /* 3839 * Move back to include the earlier part 3840 * so our ack time lines up right (this may 3841 * make an overlapping measurement but thats 3842 * ok). 3843 */ 3844 tp->gput_seq = rsm->r_start; 3845 } 3846 if (rsm->r_flags & RACK_ACKED) 3847 tp->gput_ts = rsm->r_ack_arrival; 3848 else 3849 rack->app_limited_needs_set = 1; 3850 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 3851 } else { 3852 /* 3853 * If we don't find the rsm due to some 3854 * send-limit set the current time, which 3855 * basically disables the send-limit. 
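 * (Setting the output timestamp to "now" means the send-based
 * elapsed time can no longer run ahead of the ack-based one, so it
 * stops acting as a brake on the bandwidth estimate; that is the
 * sense in which the send-limit is disabled.)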
3856 */ 3857 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 3858 } 3859 rack_log_pacing_delay_calc(rack, 3860 tp->gput_seq, 3861 tp->gput_ack, 3862 (uint64_t)rsm, 3863 tp->gput_ts, 3864 rack->r_ctl.rc_app_limited_cnt, 3865 9, 3866 __LINE__, NULL); 3867 } 3868 } 3869 3870 /* 3871 * CC wrapper hook functions 3872 */ 3873 static void 3874 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 3875 uint16_t type, int32_t recovery) 3876 { 3877 INP_WLOCK_ASSERT(tp->t_inpcb); 3878 tp->ccv->nsegs = nsegs; 3879 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 3880 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 3881 uint32_t max; 3882 3883 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 3884 if (tp->ccv->bytes_this_ack > max) { 3885 tp->ccv->bytes_this_ack = max; 3886 } 3887 } 3888 if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) 3889 tp->ccv->flags |= CCF_CWND_LIMITED; 3890 else 3891 tp->ccv->flags &= ~CCF_CWND_LIMITED; 3892 #ifdef STATS 3893 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 3894 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 3895 #endif 3896 if ((tp->t_flags & TF_GPUTINPROG) && 3897 rack_enough_for_measurement(tp, rack, th->th_ack)) { 3898 /* Measure the Goodput */ 3899 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); 3900 #ifdef NETFLIX_PEAKRATE 3901 if ((type == CC_ACK) && 3902 (tp->t_maxpeakrate)) { 3903 /* 3904 * We update t_peakrate_thr. This gives us roughly 3905 * one update per round trip time. Note 3906 * it will only be used if pace_always is off i.e 3907 * we don't do this for paced flows. 3908 */ 3909 tcp_update_peakrate_thr(tp); 3910 } 3911 #endif 3912 } 3913 if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { 3914 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 3915 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 3916 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 3917 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 3918 tp->ccv->flags |= CCF_ABC_SENTAWND; 3919 } 3920 } else { 3921 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3922 tp->t_bytes_acked = 0; 3923 } 3924 if (CC_ALGO(tp)->ack_received != NULL) { 3925 /* XXXLAS: Find a way to live without this */ 3926 tp->ccv->curack = th->th_ack; 3927 CC_ALGO(tp)->ack_received(tp->ccv, type); 3928 } 3929 #ifdef STATS 3930 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 3931 #endif 3932 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 3933 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 3934 } 3935 #ifdef NETFLIX_PEAKRATE 3936 /* we enforce max peak rate if it is set and we are not pacing */ 3937 if ((rack->rc_always_pace == 0) && 3938 tp->t_peakrate_thr && 3939 (tp->snd_cwnd > tp->t_peakrate_thr)) { 3940 tp->snd_cwnd = tp->t_peakrate_thr; 3941 } 3942 #endif 3943 } 3944 3945 static void 3946 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 3947 { 3948 struct tcp_rack *rack; 3949 3950 rack = (struct tcp_rack *)tp->t_fb_ptr; 3951 INP_WLOCK_ASSERT(tp->t_inpcb); 3952 /* 3953 * If we are doing PRR and have enough 3954 * room to send <or> we are pacing and prr 3955 * is disabled we will want to see if we 3956 * can send data (by setting r_wanted_output to 3957 * true). 
3958 */ 3959 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 3960 rack->rack_no_prr) 3961 rack->r_wanted_output = 1; 3962 } 3963 3964 static void 3965 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 3966 { 3967 struct tcp_rack *rack; 3968 uint32_t orig_cwnd; 3969 3970 3971 orig_cwnd = tp->snd_cwnd; 3972 INP_WLOCK_ASSERT(tp->t_inpcb); 3973 rack = (struct tcp_rack *)tp->t_fb_ptr; 3974 if (rack->rc_not_backing_off == 0) { 3975 /* only alert CC if we alerted when we entered */ 3976 if (CC_ALGO(tp)->post_recovery != NULL) { 3977 tp->ccv->curack = th->th_ack; 3978 CC_ALGO(tp)->post_recovery(tp->ccv); 3979 } 3980 if (tp->snd_cwnd > tp->snd_ssthresh) { 3981 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 3982 tp->snd_cwnd = tp->snd_ssthresh; 3983 } 3984 } 3985 if ((rack->rack_no_prr == 0) && 3986 (rack->r_ctl.rc_prr_sndcnt > 0)) { 3987 /* Suck the next prr cnt back into cwnd */ 3988 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 3989 rack->r_ctl.rc_prr_sndcnt = 0; 3990 rack_log_to_prr(rack, 1, 0); 3991 } 3992 rack_log_to_prr(rack, 14, orig_cwnd); 3993 tp->snd_recover = tp->snd_una; 3994 EXIT_RECOVERY(tp->t_flags); 3995 } 3996 3997 static void 3998 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 3999 { 4000 struct tcp_rack *rack; 4001 4002 INP_WLOCK_ASSERT(tp->t_inpcb); 4003 4004 rack = (struct tcp_rack *)tp->t_fb_ptr; 4005 switch (type) { 4006 case CC_NDUPACK: 4007 tp->t_flags &= ~TF_WASFRECOVERY; 4008 tp->t_flags &= ~TF_WASCRECOVERY; 4009 if (!IN_FASTRECOVERY(tp->t_flags)) { 4010 rack->r_ctl.rc_prr_delivered = 0; 4011 rack->r_ctl.rc_prr_out = 0; 4012 if (rack->rack_no_prr == 0) { 4013 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4014 rack_log_to_prr(rack, 2, 0); 4015 } 4016 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4017 tp->snd_recover = tp->snd_max; 4018 if (tp->t_flags2 & TF2_ECN_PERMIT) 4019 tp->t_flags2 |= TF2_ECN_SND_CWR; 4020 } 4021 break; 4022 case CC_ECN: 4023 if (!IN_CONGRECOVERY(tp->t_flags) || 4024 /* 4025 * Allow ECN reaction on ACK to CWR, if 4026 * that data segment was also CE marked. 4027 */ 4028 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 4029 EXIT_CONGRECOVERY(tp->t_flags); 4030 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4031 tp->snd_recover = tp->snd_max + 1; 4032 if (tp->t_flags2 & TF2_ECN_PERMIT) 4033 tp->t_flags2 |= TF2_ECN_SND_CWR; 4034 } 4035 break; 4036 case CC_RTO: 4037 tp->t_dupacks = 0; 4038 tp->t_bytes_acked = 0; 4039 EXIT_RECOVERY(tp->t_flags); 4040 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4041 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4042 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4043 if (tp->t_flags2 & TF2_ECN_PERMIT) 4044 tp->t_flags2 |= TF2_ECN_SND_CWR; 4045 break; 4046 case CC_RTO_ERR: 4047 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4048 /* RTO was unnecessary, so reset everything. */ 4049 tp->snd_cwnd = tp->snd_cwnd_prev; 4050 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4051 tp->snd_recover = tp->snd_recover_prev; 4052 if (tp->t_flags & TF_WASFRECOVERY) { 4053 ENTER_FASTRECOVERY(tp->t_flags); 4054 tp->t_flags &= ~TF_WASFRECOVERY; 4055 } 4056 if (tp->t_flags & TF_WASCRECOVERY) { 4057 ENTER_CONGRECOVERY(tp->t_flags); 4058 tp->t_flags &= ~TF_WASCRECOVERY; 4059 } 4060 tp->snd_nxt = tp->snd_max; 4061 tp->t_badrxtwin = 0; 4062 break; 4063 } 4064 /* 4065 * If we are below our max rtt, don't 4066 * signal the CC control to change things. 4067 * instead set it up so that we are in 4068 * recovery but not going to back off. 
4069 */ 4070 4071 if (rack->rc_highly_buffered) { 4072 /* 4073 * Do we use the higher rtt for 4074 * our threshold to not backoff (like CDG)? 4075 */ 4076 uint32_t rtt_mul, rtt_div; 4077 4078 if (rack_use_max_for_nobackoff) { 4079 rtt_mul = (rack_gp_rtt_maxmul - 1); 4080 rtt_div = 1; 4081 } else { 4082 rtt_mul = rack_gp_rtt_minmul; 4083 rtt_div = max(rack_gp_rtt_mindiv , 1); 4084 } 4085 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + 4086 ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / 4087 rtt_div))) { 4088 /* below our min threshold */ 4089 rack->rc_not_backing_off = 1; 4090 ENTER_RECOVERY(rack->rc_tp->t_flags); 4091 rack_log_rtt_shrinks(rack, 0, 4092 rtt_mul, 4093 rtt_div, 4094 RACK_RTTS_NOBACKOFF); 4095 return; 4096 } 4097 } 4098 rack->rc_not_backing_off = 0; 4099 if (CC_ALGO(tp)->cong_signal != NULL) { 4100 if (th != NULL) 4101 tp->ccv->curack = th->th_ack; 4102 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4103 } 4104 } 4105 4106 4107 4108 static inline void 4109 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4110 { 4111 uint32_t i_cwnd; 4112 4113 INP_WLOCK_ASSERT(tp->t_inpcb); 4114 4115 #ifdef NETFLIX_STATS 4116 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4117 if (tp->t_state == TCPS_ESTABLISHED) 4118 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4119 #endif 4120 if (CC_ALGO(tp)->after_idle != NULL) 4121 CC_ALGO(tp)->after_idle(tp->ccv); 4122 4123 if (tp->snd_cwnd == 1) 4124 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4125 else 4126 i_cwnd = rc_init_window(rack); 4127 4128 /* 4129 * Being idle is no differnt than the initial window. If the cc 4130 * clamps it down below the initial window raise it to the initial 4131 * window. 4132 */ 4133 if (tp->snd_cwnd < i_cwnd) { 4134 tp->snd_cwnd = i_cwnd; 4135 } 4136 } 4137 4138 4139 /* 4140 * Indicate whether this ack should be delayed. We can delay the ack if 4141 * following conditions are met: 4142 * - There is no delayed ack timer in progress. 4143 * - Our last ack wasn't a 0-sized window. We never want to delay 4144 * the ack that opens up a 0-sized window. 4145 * - LRO wasn't used for this segment. We make sure by checking that the 4146 * segment size is not larger than the MSS. 4147 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4148 * connection. 4149 */ 4150 #define DELAY_ACK(tp, tlen) \ 4151 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4152 ((tp->t_flags & TF_DELACK) == 0) && \ 4153 (tlen <= tp->t_maxseg) && \ 4154 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4155 4156 static struct rack_sendmap * 4157 rack_find_lowest_rsm(struct tcp_rack *rack) 4158 { 4159 struct rack_sendmap *rsm; 4160 4161 /* 4162 * Walk the time-order transmitted list looking for an rsm that is 4163 * not acked. This will be the one that was sent the longest time 4164 * ago that is still outstanding. 4165 */ 4166 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4167 if (rsm->r_flags & RACK_ACKED) { 4168 continue; 4169 } 4170 goto finish; 4171 } 4172 finish: 4173 return (rsm); 4174 } 4175 4176 static struct rack_sendmap * 4177 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 4178 { 4179 struct rack_sendmap *prsm; 4180 4181 /* 4182 * Walk the sequence order list backward until we hit and arrive at 4183 * the highest seq not acked. In theory when this is called it 4184 * should be the last segment (which it was not). 
4185 */ 4186 counter_u64_add(rack_find_high, 1); 4187 prsm = rsm; 4188 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 4189 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 4190 continue; 4191 } 4192 return (prsm); 4193 } 4194 return (NULL); 4195 } 4196 4197 4198 static uint32_t 4199 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 4200 { 4201 int32_t lro; 4202 uint32_t thresh; 4203 4204 /* 4205 * lro is the flag we use to determine if we have seen reordering. 4206 * If it gets set we have seen reordering. The reorder logic either 4207 * works in one of two ways: 4208 * 4209 * If reorder-fade is configured, then we track the last time we saw 4210 * re-ordering occur. If we reach the point where enough time as 4211 * passed we no longer consider reordering has occuring. 4212 * 4213 * Or if reorder-face is 0, then once we see reordering we consider 4214 * the connection to alway be subject to reordering and just set lro 4215 * to 1. 4216 * 4217 * In the end if lro is non-zero we add the extra time for 4218 * reordering in. 4219 */ 4220 if (srtt == 0) 4221 srtt = 1; 4222 if (rack->r_ctl.rc_reorder_ts) { 4223 if (rack->r_ctl.rc_reorder_fade) { 4224 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 4225 lro = cts - rack->r_ctl.rc_reorder_ts; 4226 if (lro == 0) { 4227 /* 4228 * No time as passed since the last 4229 * reorder, mark it as reordering. 4230 */ 4231 lro = 1; 4232 } 4233 } else { 4234 /* Negative time? */ 4235 lro = 0; 4236 } 4237 if (lro > rack->r_ctl.rc_reorder_fade) { 4238 /* Turn off reordering seen too */ 4239 rack->r_ctl.rc_reorder_ts = 0; 4240 lro = 0; 4241 } 4242 } else { 4243 /* Reodering does not fade */ 4244 lro = 1; 4245 } 4246 } else { 4247 lro = 0; 4248 } 4249 thresh = srtt + rack->r_ctl.rc_pkt_delay; 4250 if (lro) { 4251 /* It must be set, if not you get 1/4 rtt */ 4252 if (rack->r_ctl.rc_reorder_shift) 4253 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 4254 else 4255 thresh += (srtt >> 2); 4256 } else { 4257 thresh += 1; 4258 } 4259 /* We don't let the rack timeout be above a RTO */ 4260 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 4261 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 4262 } 4263 /* And we don't want it above the RTO max either */ 4264 if (thresh > rack_rto_max) { 4265 thresh = rack_rto_max; 4266 } 4267 return (thresh); 4268 } 4269 4270 static uint32_t 4271 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 4272 struct rack_sendmap *rsm, uint32_t srtt) 4273 { 4274 struct rack_sendmap *prsm; 4275 uint32_t thresh, len; 4276 int segsiz; 4277 4278 if (srtt == 0) 4279 srtt = 1; 4280 if (rack->r_ctl.rc_tlp_threshold) 4281 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 4282 else 4283 thresh = (srtt * 2); 4284 4285 /* Get the previous sent packet, if any */ 4286 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4287 counter_u64_add(rack_enter_tlp_calc, 1); 4288 len = rsm->r_end - rsm->r_start; 4289 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 4290 /* Exactly like the ID */ 4291 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 4292 uint32_t alt_thresh; 4293 /* 4294 * Compensate for delayed-ack with the d-ack time. 
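 * An example with assumed values: if srtt is 40 (ms) and
 * rack_delayed_ack_time is 200, the alternate threshold below is
 * 40 + 20 + 200 = 260, which replaces a base thresh of, say,
 * 2 * srtt = 80 (still subject to the RTO, rto_max and TLP minimum
 * clamps applied at the end of this function).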
4295 */ 4296 counter_u64_add(rack_used_tlpmethod, 1); 4297 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4298 if (alt_thresh > thresh) 4299 thresh = alt_thresh; 4300 } 4301 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 4302 /* 2.1 behavior */ 4303 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 4304 if (prsm && (len <= segsiz)) { 4305 /* 4306 * Two packets outstanding, thresh should be (2*srtt) + 4307 * possible inter-packet delay (if any). 4308 */ 4309 uint32_t inter_gap = 0; 4310 int idx, nidx; 4311 4312 counter_u64_add(rack_used_tlpmethod, 1); 4313 idx = rsm->r_rtr_cnt - 1; 4314 nidx = prsm->r_rtr_cnt - 1; 4315 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4316 /* Yes it was sent later (or at the same time) */ 4317 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4318 } 4319 thresh += inter_gap; 4320 } else if (len <= segsiz) { 4321 /* 4322 * Possibly compensate for delayed-ack. 4323 */ 4324 uint32_t alt_thresh; 4325 4326 counter_u64_add(rack_used_tlpmethod2, 1); 4327 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4328 if (alt_thresh > thresh) 4329 thresh = alt_thresh; 4330 } 4331 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 4332 /* 2.2 behavior */ 4333 if (len <= segsiz) { 4334 uint32_t alt_thresh; 4335 /* 4336 * Compensate for delayed-ack with the d-ack time. 4337 */ 4338 counter_u64_add(rack_used_tlpmethod, 1); 4339 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4340 if (alt_thresh > thresh) 4341 thresh = alt_thresh; 4342 } 4343 } 4344 /* Not above an RTO */ 4345 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 4346 thresh = TICKS_2_MSEC(tp->t_rxtcur); 4347 } 4348 /* Not above a RTO max */ 4349 if (thresh > rack_rto_max) { 4350 thresh = rack_rto_max; 4351 } 4352 /* Apply user supplied min TLP */ 4353 if (thresh < rack_tlp_min) { 4354 thresh = rack_tlp_min; 4355 } 4356 return (thresh); 4357 } 4358 4359 static uint32_t 4360 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 4361 { 4362 /* 4363 * We want the rack_rtt which is the 4364 * last rtt we measured. However if that 4365 * does not exist we fallback to the srtt (which 4366 * we probably will never do) and then as a last 4367 * resort we use RACK_INITIAL_RTO if no srtt is 4368 * yet set. 4369 */ 4370 if (rack->rc_rack_rtt) 4371 return(rack->rc_rack_rtt); 4372 else if (tp->t_srtt == 0) 4373 return(RACK_INITIAL_RTO); 4374 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 4375 } 4376 4377 static struct rack_sendmap * 4378 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 4379 { 4380 /* 4381 * Check to see that we don't need to fall into recovery. We will 4382 * need to do so if our oldest transmit is past the time we should 4383 * have had an ack. 
4384 */ 4385 struct tcp_rack *rack; 4386 struct rack_sendmap *rsm; 4387 int32_t idx; 4388 uint32_t srtt, thresh; 4389 4390 rack = (struct tcp_rack *)tp->t_fb_ptr; 4391 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 4392 return (NULL); 4393 } 4394 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4395 if (rsm == NULL) 4396 return (NULL); 4397 4398 if (rsm->r_flags & RACK_ACKED) { 4399 rsm = rack_find_lowest_rsm(rack); 4400 if (rsm == NULL) 4401 return (NULL); 4402 } 4403 idx = rsm->r_rtr_cnt - 1; 4404 srtt = rack_grab_rtt(tp, rack); 4405 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 4406 if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { 4407 return (NULL); 4408 } 4409 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 4410 return (NULL); 4411 } 4412 /* Ok if we reach here we are over-due and this guy can be sent */ 4413 if (IN_RECOVERY(tp->t_flags) == 0) { 4414 /* 4415 * For the one that enters us into recovery record undo 4416 * info. 4417 */ 4418 rack->r_ctl.rc_rsm_start = rsm->r_start; 4419 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4420 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4421 } 4422 rack_cong_signal(tp, NULL, CC_NDUPACK); 4423 return (rsm); 4424 } 4425 4426 static uint32_t 4427 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 4428 { 4429 int32_t t; 4430 int32_t tt; 4431 uint32_t ret_val; 4432 4433 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 4434 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 4435 rack_persist_min, rack_persist_max); 4436 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 4437 tp->t_rxtshift++; 4438 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 4439 ret_val = (uint32_t)tt; 4440 return (ret_val); 4441 } 4442 4443 static uint32_t 4444 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 4445 { 4446 /* 4447 * Start the FR timer, we do this based on getting the first one in 4448 * the rc_tmap. Note that if its NULL we must stop the timer. in all 4449 * events we need to stop the running timer (if its running) before 4450 * starting the new one. 
4451 */ 4452 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 4453 uint32_t srtt_cur; 4454 int32_t idx; 4455 int32_t is_tlp_timer = 0; 4456 struct rack_sendmap *rsm; 4457 4458 if (rack->t_timers_stopped) { 4459 /* All timers have been stopped none are to run */ 4460 return (0); 4461 } 4462 if (rack->rc_in_persist) { 4463 /* We can't start any timer in persists */ 4464 return (rack_get_persists_timer_val(tp, rack)); 4465 } 4466 rack->rc_on_min_to = 0; 4467 if ((tp->t_state < TCPS_ESTABLISHED) || 4468 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 4469 goto activate_rxt; 4470 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4471 if ((rsm == NULL) || sup_rack) { 4472 /* Nothing on the send map */ 4473 activate_rxt: 4474 time_since_sent = 0; 4475 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4476 if (rsm) { 4477 idx = rsm->r_rtr_cnt - 1; 4478 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4479 tstmp_touse = rsm->r_tim_lastsent[idx]; 4480 else 4481 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4482 if (TSTMP_GT(cts, tstmp_touse)) 4483 time_since_sent = cts - tstmp_touse; 4484 } 4485 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4486 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 4487 to = TICKS_2_MSEC(tp->t_rxtcur); 4488 if (to > time_since_sent) 4489 to -= time_since_sent; 4490 else 4491 to = rack->r_ctl.rc_min_to; 4492 if (to == 0) 4493 to = 1; 4494 return (to); 4495 } 4496 return (0); 4497 } 4498 if (rsm->r_flags & RACK_ACKED) { 4499 rsm = rack_find_lowest_rsm(rack); 4500 if (rsm == NULL) { 4501 /* No lowest? */ 4502 goto activate_rxt; 4503 } 4504 } 4505 if (rack->sack_attack_disable) { 4506 /* 4507 * We don't want to do 4508 * any TLP's if you are an attacker. 4509 * Though if you are doing what 4510 * is expected you may still have 4511 * SACK-PASSED marks. 4512 */ 4513 goto activate_rxt; 4514 } 4515 /* Convert from ms to usecs */ 4516 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 4517 if ((tp->t_flags & TF_SENTFIN) && 4518 ((tp->snd_max - tp->snd_una) == 1) && 4519 (rsm->r_flags & RACK_HAS_FIN)) { 4520 /* 4521 * We don't start a rack timer if all we have is a 4522 * FIN outstanding. 4523 */ 4524 goto activate_rxt; 4525 } 4526 if ((rack->use_rack_rr == 0) && 4527 (IN_RECOVERY(tp->t_flags)) && 4528 (rack->rack_no_prr == 0) && 4529 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 4530 /* 4531 * We are not cheating, in recovery and 4532 * not enough ack's to yet get our next 4533 * retransmission out. 4534 * 4535 * Note that classified attackers do not 4536 * get to use the rack-cheat. 4537 */ 4538 goto activate_tlp; 4539 } 4540 srtt = rack_grab_rtt(tp, rack); 4541 thresh = rack_calc_thresh_rack(rack, srtt, cts); 4542 idx = rsm->r_rtr_cnt - 1; 4543 exp = rsm->r_tim_lastsent[idx] + thresh; 4544 if (SEQ_GEQ(exp, cts)) { 4545 to = exp - cts; 4546 if (to < rack->r_ctl.rc_min_to) { 4547 to = rack->r_ctl.rc_min_to; 4548 if (rack->r_rr_config == 3) 4549 rack->rc_on_min_to = 1; 4550 } 4551 } else { 4552 to = rack->r_ctl.rc_min_to; 4553 if (rack->r_rr_config == 3) 4554 rack->rc_on_min_to = 1; 4555 } 4556 } else { 4557 /* Ok we need to do a TLP not RACK */ 4558 activate_tlp: 4559 if ((rack->rc_tlp_in_progress != 0) && 4560 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 4561 /* 4562 * The previous send was a TLP and we have sent 4563 * N TLP's without sending new data. 
4564 */ 4565 goto activate_rxt; 4566 } 4567 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 4568 if (rsm == NULL) { 4569 /* We found no rsm to TLP with. */ 4570 goto activate_rxt; 4571 } 4572 if (rsm->r_flags & RACK_HAS_FIN) { 4573 /* If its a FIN we dont do TLP */ 4574 rsm = NULL; 4575 goto activate_rxt; 4576 } 4577 idx = rsm->r_rtr_cnt - 1; 4578 time_since_sent = 0; 4579 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4580 tstmp_touse = rsm->r_tim_lastsent[idx]; 4581 else 4582 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4583 if (TSTMP_GT(cts, tstmp_touse)) 4584 time_since_sent = cts - tstmp_touse; 4585 is_tlp_timer = 1; 4586 if (tp->t_srtt) { 4587 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 4588 srtt = TICKS_2_MSEC(srtt_cur); 4589 } else 4590 srtt = RACK_INITIAL_RTO; 4591 /* 4592 * If the SRTT is not keeping up and the 4593 * rack RTT has spiked we want to use 4594 * the last RTT not the smoothed one. 4595 */ 4596 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) 4597 srtt = rack_grab_rtt(tp, rack); 4598 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 4599 if (thresh > time_since_sent) 4600 to = thresh - time_since_sent; 4601 else { 4602 to = rack->r_ctl.rc_min_to; 4603 rack_log_alt_to_to_cancel(rack, 4604 thresh, /* flex1 */ 4605 time_since_sent, /* flex2 */ 4606 tstmp_touse, /* flex3 */ 4607 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 4608 rsm->r_tim_lastsent[idx], 4609 srtt, 4610 idx, 99); 4611 } 4612 if (to > TCPTV_REXMTMAX) { 4613 /* 4614 * If the TLP time works out to larger than the max 4615 * RTO lets not do TLP.. just RTO. 4616 */ 4617 goto activate_rxt; 4618 } 4619 } 4620 if (is_tlp_timer == 0) { 4621 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 4622 } else { 4623 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 4624 } 4625 if (to == 0) 4626 to = 1; 4627 return (to); 4628 } 4629 4630 static void 4631 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4632 { 4633 if (rack->rc_in_persist == 0) { 4634 if (tp->t_flags & TF_GPUTINPROG) { 4635 /* 4636 * Stop the goodput now, the calling of the 4637 * measurement function clears the flag. 4638 */ 4639 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 4640 } 4641 #ifdef NETFLIX_SHARED_CWND 4642 if (rack->r_ctl.rc_scw) { 4643 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4644 rack->rack_scwnd_is_idle = 1; 4645 } 4646 #endif 4647 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 4648 if (rack->r_ctl.rc_went_idle_time == 0) 4649 rack->r_ctl.rc_went_idle_time = 1; 4650 rack_timer_cancel(tp, rack, cts, __LINE__); 4651 tp->t_rxtshift = 0; 4652 rack->rc_in_persist = 1; 4653 } 4654 } 4655 4656 static void 4657 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4658 { 4659 if (rack->rc_inp->inp_in_hpts) { 4660 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4661 rack->r_ctl.rc_hpts_flags = 0; 4662 } 4663 #ifdef NETFLIX_SHARED_CWND 4664 if (rack->r_ctl.rc_scw) { 4665 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4666 rack->rack_scwnd_is_idle = 0; 4667 } 4668 #endif 4669 if (rack->rc_gp_dyn_mul && 4670 (rack->use_fixed_rate == 0) && 4671 (rack->rc_always_pace)) { 4672 /* 4673 * Do we count this as if a probe-rtt just 4674 * finished? 
4675 */ 4676 uint32_t time_idle, idle_min; 4677 4678 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 4679 idle_min = rack_min_probertt_hold; 4680 if (rack_probertt_gpsrtt_cnt_div) { 4681 uint64_t extra; 4682 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 4683 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 4684 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 4685 idle_min += (uint32_t)extra; 4686 } 4687 if (time_idle >= idle_min) { 4688 /* Yes, we count it as a probe-rtt. */ 4689 uint32_t us_cts; 4690 4691 us_cts = tcp_get_usecs(NULL); 4692 if (rack->in_probe_rtt == 0) { 4693 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4694 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 4695 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 4696 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 4697 } else { 4698 rack_exit_probertt(rack, us_cts); 4699 } 4700 } 4701 4702 } 4703 rack->rc_in_persist = 0; 4704 rack->r_ctl.rc_went_idle_time = 0; 4705 tp->t_rxtshift = 0; 4706 rack->r_ctl.rc_agg_delayed = 0; 4707 rack->r_early = 0; 4708 rack->r_late = 0; 4709 rack->r_ctl.rc_agg_early = 0; 4710 } 4711 4712 static void 4713 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 4714 struct hpts_diag *diag, struct timeval *tv) 4715 { 4716 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 4717 union tcp_log_stackspecific log; 4718 4719 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4720 log.u_bbr.flex1 = diag->p_nxt_slot; 4721 log.u_bbr.flex2 = diag->p_cur_slot; 4722 log.u_bbr.flex3 = diag->slot_req; 4723 log.u_bbr.flex4 = diag->inp_hptsslot; 4724 log.u_bbr.flex5 = diag->slot_remaining; 4725 log.u_bbr.flex6 = diag->need_new_to; 4726 log.u_bbr.flex7 = diag->p_hpts_active; 4727 log.u_bbr.flex8 = diag->p_on_min_sleep; 4728 /* Hijack other fields as needed */ 4729 log.u_bbr.epoch = diag->have_slept; 4730 log.u_bbr.lt_epoch = diag->yet_to_sleep; 4731 log.u_bbr.pkts_out = diag->co_ret; 4732 log.u_bbr.applimited = diag->hpts_sleep_time; 4733 log.u_bbr.delivered = diag->p_prev_slot; 4734 log.u_bbr.inflight = diag->p_runningtick; 4735 log.u_bbr.bw_inuse = diag->wheel_tick; 4736 log.u_bbr.rttProp = diag->wheel_cts; 4737 log.u_bbr.timeStamp = cts; 4738 log.u_bbr.delRate = diag->maxticks; 4739 log.u_bbr.cur_del_rate = diag->p_curtick; 4740 log.u_bbr.cur_del_rate <<= 32; 4741 log.u_bbr.cur_del_rate |= diag->p_lasttick; 4742 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4743 &rack->rc_inp->inp_socket->so_rcv, 4744 &rack->rc_inp->inp_socket->so_snd, 4745 BBR_LOG_HPTSDIAG, 0, 4746 0, &log, false, tv); 4747 } 4748 4749 } 4750 4751 static void 4752 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 4753 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 4754 { 4755 struct hpts_diag diag; 4756 struct inpcb *inp; 4757 struct timeval tv; 4758 uint32_t delayed_ack = 0; 4759 uint32_t hpts_timeout; 4760 uint8_t stopped; 4761 uint32_t left = 0; 4762 uint32_t us_cts; 4763 4764 inp = tp->t_inpcb; 4765 if ((tp->t_state == TCPS_CLOSED) || 4766 (tp->t_state == TCPS_LISTEN)) { 4767 return; 4768 } 4769 if (inp->inp_in_hpts) { 4770 /* Already on the pacer */ 4771 return; 4772 } 4773 stopped = rack->rc_tmr_stopped; 4774 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 4775 left = rack->r_ctl.rc_timer_exp - cts; 4776 } 4777 rack->r_ctl.rc_timer_exp = 0; 4778 rack->r_ctl.rc_hpts_flags = 0; 4779 us_cts = tcp_get_usecs(&tv); 4780 /* Now early/late accounting */ 4781 if (rack->r_early) { 4782 /* 4783 * We have a early carry over set, 
4784 * we can always add more time so we 4785 * can always make this compensation. 4786 */ 4787 slot += rack->r_ctl.rc_agg_early; 4788 rack->r_early = 0; 4789 rack->r_ctl.rc_agg_early = 0; 4790 } 4791 if (rack->r_late) { 4792 /* 4793 * This is harder, we can 4794 * compensate some but it 4795 * really depends on what 4796 * the current pacing time is. 4797 */ 4798 if (rack->r_ctl.rc_agg_delayed >= slot) { 4799 /* 4800 * We can't compensate for it all. 4801 * And we have to have some time 4802 * on the clock. We always have a min 4803 * 10 slots (10 x 10 i.e. 100 usecs). 4804 */ 4805 if (slot <= HPTS_TICKS_PER_USEC) { 4806 /* We gain delay */ 4807 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 4808 slot = HPTS_TICKS_PER_USEC; 4809 } else { 4810 /* We take off some */ 4811 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 4812 slot = HPTS_TICKS_PER_USEC; 4813 } 4814 } else { 4815 4816 slot -= rack->r_ctl.rc_agg_delayed; 4817 rack->r_ctl.rc_agg_delayed = 0; 4818 /* Make sure we have 100 useconds at minimum */ 4819 if (slot < HPTS_TICKS_PER_USEC) { 4820 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 4821 slot = HPTS_TICKS_PER_USEC; 4822 } 4823 if (rack->r_ctl.rc_agg_delayed == 0) 4824 rack->r_late = 0; 4825 } 4826 } 4827 if (slot) { 4828 /* We are pacing too */ 4829 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 4830 } 4831 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 4832 #ifdef NETFLIX_EXP_DETECTION 4833 if (rack->sack_attack_disable && 4834 (slot < tcp_sad_pacing_interval)) { 4835 /* 4836 * We have a potential attacker on 4837 * the line. We have possibly some 4838 * (or now) pacing time set. We want to 4839 * slow down the processing of sacks by some 4840 * amount (if it is an attacker). Set the default 4841 * slot for attackers in place (unless the orginal 4842 * interval is longer). Its stored in 4843 * micro-seconds, so lets convert to msecs. 4844 */ 4845 slot = tcp_sad_pacing_interval; 4846 } 4847 #endif 4848 if (tp->t_flags & TF_DELACK) { 4849 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 4850 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 4851 } 4852 if (delayed_ack && ((hpts_timeout == 0) || 4853 (delayed_ack < hpts_timeout))) 4854 hpts_timeout = delayed_ack; 4855 else 4856 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4857 /* 4858 * If no timers are going to run and we will fall off the hptsi 4859 * wheel, we resort to a keep-alive timer if its configured. 4860 */ 4861 if ((hpts_timeout == 0) && 4862 (slot == 0)) { 4863 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4864 (tp->t_state <= TCPS_CLOSING)) { 4865 /* 4866 * Ok we have no timer (persists, rack, tlp, rxt or 4867 * del-ack), we don't have segments being paced. So 4868 * all that is left is the keepalive timer. 4869 */ 4870 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 4871 /* Get the established keep-alive time */ 4872 hpts_timeout = TP_KEEPIDLE(tp); 4873 } else { 4874 /* Get the initial setup keep-alive time */ 4875 hpts_timeout = TP_KEEPINIT(tp); 4876 } 4877 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 4878 if (rack->in_probe_rtt) { 4879 /* 4880 * We want to instead not wake up a long time from 4881 * now but to wake up about the time we would 4882 * exit probe-rtt and initiate a keep-alive ack. 4883 * This will get us out of probe-rtt and update 4884 * our min-rtt. 
4885 */ 4886 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); 4887 } 4888 } 4889 } 4890 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 4891 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 4892 /* 4893 * RACK, TLP, persists and RXT timers all are restartable 4894 * based on actions input .. i.e we received a packet (ack 4895 * or sack) and that changes things (rw, or snd_una etc). 4896 * Thus we can restart them with a new value. For 4897 * keep-alive, delayed_ack we keep track of what was left 4898 * and restart the timer with a smaller value. 4899 */ 4900 if (left < hpts_timeout) 4901 hpts_timeout = left; 4902 } 4903 if (hpts_timeout) { 4904 /* 4905 * Hack alert for now we can't time-out over 2,147,483 4906 * seconds (a bit more than 596 hours), which is probably ok 4907 * :). 4908 */ 4909 if (hpts_timeout > 0x7ffffffe) 4910 hpts_timeout = 0x7ffffffe; 4911 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 4912 } 4913 if ((rack->rc_gp_filled == 0) && 4914 (hpts_timeout < slot) && 4915 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 4916 /* 4917 * We have no good estimate yet for the 4918 * old clunky burst mitigation or the 4919 * real pacing. And the tlp or rxt is smaller 4920 * than the pacing calculation. Lets not 4921 * pace that long since we know the calculation 4922 * so far is not accurate. 4923 */ 4924 slot = hpts_timeout; 4925 } 4926 rack->r_ctl.last_pacing_time = slot; 4927 if (slot) { 4928 rack->r_ctl.rc_last_output_to = us_cts + slot; 4929 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4930 if ((rack->rc_gp_filled == 0) || 4931 rack->pacing_longer_than_rtt) { 4932 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 4933 } else { 4934 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4935 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 4936 (rack->r_rr_config != 3)) 4937 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4938 else 4939 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4940 } 4941 } 4942 if ((rack->use_rack_rr) && 4943 (rack->r_rr_config < 2) && 4944 ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { 4945 /* 4946 * Arrange for the hpts to kick back in after the 4947 * t-o if the t-o does not cause a send. 
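 * The unit difference matters here: hpts_timeout is treated as
 * milliseconds (HPTS_MS_TO_SLOTS) while slot is in microseconds
 * (HPTS_USEC_TO_SLOTS), which is why the comparison above scales
 * hpts_timeout by HPTS_USEC_IN_MSEC before comparing it to slot.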
4948 */ 4949 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4950 __LINE__, &diag); 4951 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4952 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4953 } else { 4954 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 4955 __LINE__, &diag); 4956 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4957 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 4958 } 4959 } else if (hpts_timeout) { 4960 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4961 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 4962 /* For a rack timer, don't wake us */ 4963 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4964 if (rack->r_rr_config != 3) 4965 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4966 else 4967 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4968 } else { 4969 /* All other timers wake us up */ 4970 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 4971 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4972 } 4973 } 4974 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4975 __LINE__, &diag); 4976 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4977 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4978 } else { 4979 /* No timer starting */ 4980 #ifdef INVARIANTS 4981 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 4982 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 4983 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 4984 } 4985 #endif 4986 } 4987 rack->rc_tmr_stopped = 0; 4988 if (slot) 4989 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 4990 } 4991 4992 /* 4993 * RACK Timer, here we simply do logging and house keeping. 4994 * the normal rack_output() function will call the 4995 * appropriate thing to check if we need to do a RACK retransmit. 4996 * We return 1, saying don't proceed with rack_output only 4997 * when all timers have been stopped (destroyed PCB?). 4998 */ 4999 static int 5000 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5001 { 5002 /* 5003 * This timer simply provides an internal trigger to send out data. 5004 * The check_recovery_mode call will see if there are needed 5005 * retransmissions, if so we will enter fast-recovery. The output 5006 * call may or may not do the same thing depending on sysctl 5007 * settings. 5008 */ 5009 struct rack_sendmap *rsm; 5010 int32_t recovery; 5011 5012 if (tp->t_timers->tt_flags & TT_STOPPED) { 5013 return (1); 5014 } 5015 recovery = IN_RECOVERY(tp->t_flags); 5016 counter_u64_add(rack_to_tot, 1); 5017 if (rack->r_state && (rack->r_state != tp->t_state)) 5018 rack_set_state(tp, rack); 5019 rack->rc_on_min_to = 0; 5020 rsm = rack_check_recovery_mode(tp, cts); 5021 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5022 if (rsm) { 5023 uint32_t rtt; 5024 5025 rack->r_ctl.rc_resend = rsm; 5026 if (rack->use_rack_rr) { 5027 /* 5028 * Don't accumulate extra pacing delay 5029 * we are allowing the rack timer to 5030 * over-ride pacing i.e. rrr takes precedence 5031 * if the pacing interval is longer than the rrr 5032 * time (in other words we get the min pacing 5033 * time versus rrr pacing time). 
5034 */ 5035 rack->r_timer_override = 1; 5036 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5037 } 5038 rtt = rack->rc_rack_rtt; 5039 if (rtt == 0) 5040 rtt = 1; 5041 if (rack->rack_no_prr == 0) { 5042 if ((recovery == 0) && 5043 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5044 /* 5045 * The rack-timeout that enter's us into recovery 5046 * will force out one MSS and set us up so that we 5047 * can do one more send in 2*rtt (transitioning the 5048 * rack timeout into a rack-tlp). 5049 */ 5050 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5051 rack->r_timer_override = 1; 5052 rack_log_to_prr(rack, 3, 0); 5053 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 5054 rack->use_rack_rr) { 5055 /* 5056 * When a rack timer goes, if the rack rr is 5057 * on, arrange it so we can send a full segment 5058 * overriding prr (though we pay a price for this 5059 * for future new sends). 5060 */ 5061 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5062 rack_log_to_prr(rack, 4, 0); 5063 } 5064 } 5065 } 5066 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5067 if (rsm == NULL) { 5068 /* restart a timer and return 1 */ 5069 rack_start_hpts_timer(rack, tp, cts, 5070 0, 0, 0); 5071 return (1); 5072 } 5073 return (0); 5074 } 5075 5076 static __inline void 5077 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5078 struct rack_sendmap *rsm, uint32_t start) 5079 { 5080 int idx; 5081 5082 nrsm->r_start = start; 5083 nrsm->r_end = rsm->r_end; 5084 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5085 nrsm->r_flags = rsm->r_flags; 5086 nrsm->r_dupack = rsm->r_dupack; 5087 nrsm->usec_orig_send = rsm->usec_orig_send; 5088 nrsm->r_rtr_bytes = 0; 5089 rsm->r_end = nrsm->r_start; 5090 nrsm->r_just_ret = rsm->r_just_ret; 5091 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 5092 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 5093 } 5094 } 5095 5096 static struct rack_sendmap * 5097 rack_merge_rsm(struct tcp_rack *rack, 5098 struct rack_sendmap *l_rsm, 5099 struct rack_sendmap *r_rsm) 5100 { 5101 /* 5102 * We are merging two ack'd RSM's, 5103 * the l_rsm is on the left (lower seq 5104 * values) and the r_rsm is on the right 5105 * (higher seq value). The simplest way 5106 * to merge these is to move the right 5107 * one into the left. I don't think there 5108 * is any reason we need to try to find 5109 * the oldest (or last oldest retransmitted). 5110 */ 5111 struct rack_sendmap *rm; 5112 5113 l_rsm->r_end = r_rsm->r_end; 5114 if (l_rsm->r_dupack < r_rsm->r_dupack) 5115 l_rsm->r_dupack = r_rsm->r_dupack; 5116 if (r_rsm->r_rtr_bytes) 5117 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 5118 if (r_rsm->r_in_tmap) { 5119 /* This really should not happen */ 5120 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 5121 r_rsm->r_in_tmap = 0; 5122 } 5123 5124 /* Now the flags */ 5125 if (r_rsm->r_flags & RACK_HAS_FIN) 5126 l_rsm->r_flags |= RACK_HAS_FIN; 5127 if (r_rsm->r_flags & RACK_TLP) 5128 l_rsm->r_flags |= RACK_TLP; 5129 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 5130 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 5131 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 5132 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 5133 /* 5134 * If both are app-limited then let the 5135 * free lower the count. If right is app 5136 * limited and left is not, transfer. 
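 * As a simple illustration of the merge itself: an l_rsm covering
 * sequences [1, 6) joined with an r_rsm covering [6, 11) leaves one
 * map entry covering [1, 11); only the flag, dup-ack and limit-type
 * bookkeeping done here needs any care.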
5137 */ 5138 l_rsm->r_flags |= RACK_APP_LIMITED; 5139 r_rsm->r_flags &= ~RACK_APP_LIMITED; 5140 if (r_rsm == rack->r_ctl.rc_first_appl) 5141 rack->r_ctl.rc_first_appl = l_rsm; 5142 } 5143 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 5144 #ifdef INVARIANTS 5145 if (rm != r_rsm) { 5146 panic("removing head in rack:%p rsm:%p rm:%p", 5147 rack, r_rsm, rm); 5148 } 5149 #endif 5150 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 5151 /* Transfer the split limit to the map we free */ 5152 r_rsm->r_limit_type = l_rsm->r_limit_type; 5153 l_rsm->r_limit_type = 0; 5154 } 5155 rack_free(rack, r_rsm); 5156 return(l_rsm); 5157 } 5158 5159 /* 5160 * TLP Timer, here we simply setup what segment we want to 5161 * have the TLP expire on, the normal rack_output() will then 5162 * send it out. 5163 * 5164 * We return 1, saying don't proceed with rack_output only 5165 * when all timers have been stopped (destroyed PCB?). 5166 */ 5167 static int 5168 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5169 { 5170 /* 5171 * Tail Loss Probe. 5172 */ 5173 struct rack_sendmap *rsm = NULL; 5174 struct rack_sendmap *insret; 5175 struct socket *so; 5176 uint32_t amm, old_prr_snd = 0; 5177 uint32_t out, avail; 5178 int collapsed_win = 0; 5179 5180 if (tp->t_timers->tt_flags & TT_STOPPED) { 5181 return (1); 5182 } 5183 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5184 /* Its not time yet */ 5185 return (0); 5186 } 5187 if (ctf_progress_timeout_check(tp, true)) { 5188 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5189 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5190 return (1); 5191 } 5192 /* 5193 * A TLP timer has expired. We have been idle for 2 rtts. So we now 5194 * need to figure out how to force a full MSS segment out. 5195 */ 5196 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 5197 counter_u64_add(rack_tlp_tot, 1); 5198 if (rack->r_state && (rack->r_state != tp->t_state)) 5199 rack_set_state(tp, rack); 5200 so = tp->t_inpcb->inp_socket; 5201 avail = sbavail(&so->so_snd); 5202 out = tp->snd_max - tp->snd_una; 5203 if (out > tp->snd_wnd) { 5204 /* special case, we need a retransmission */ 5205 collapsed_win = 1; 5206 goto need_retran; 5207 } 5208 /* 5209 * Check our send oldest always settings, and if 5210 * there is an oldest to send jump to the need_retran. 5211 */ 5212 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 5213 goto need_retran; 5214 5215 if (avail > out) { 5216 /* New data is available */ 5217 amm = avail - out; 5218 if (amm > ctf_fixed_maxseg(tp)) { 5219 amm = ctf_fixed_maxseg(tp); 5220 if ((amm + out) > tp->snd_wnd) { 5221 /* We are rwnd limited */ 5222 goto need_retran; 5223 } 5224 } else if (amm < ctf_fixed_maxseg(tp)) { 5225 /* not enough to fill a MTU */ 5226 goto need_retran; 5227 } 5228 if (IN_RECOVERY(tp->t_flags)) { 5229 /* Unlikely */ 5230 if (rack->rack_no_prr == 0) { 5231 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 5232 if (out + amm <= tp->snd_wnd) { 5233 rack->r_ctl.rc_prr_sndcnt = amm; 5234 rack_log_to_prr(rack, 4, 0); 5235 } 5236 } else 5237 goto need_retran; 5238 } else { 5239 /* Set the send-new override */ 5240 if (out + amm <= tp->snd_wnd) 5241 rack->r_ctl.rc_tlp_new_data = amm; 5242 else 5243 goto need_retran; 5244 } 5245 rack->r_ctl.rc_tlpsend = NULL; 5246 counter_u64_add(rack_tlp_newdata, 1); 5247 goto send; 5248 } 5249 need_retran: 5250 /* 5251 * Ok we need to arrange the last un-acked segment to be re-sent, or 5252 * optionally the first un-acked segment. 
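 * With rack_always_send_oldest set we take the head of the transmit
 * map (the oldest send); otherwise we take the highest sequence entry
 * in the rb-tree and, if it is already ACKed or carries the FIN, walk
 * back to the highest non-acked entry via rack_find_high_nonack().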
5253 */ 5254 if (collapsed_win == 0) { 5255 if (rack_always_send_oldest) 5256 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5257 else { 5258 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5259 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 5260 rsm = rack_find_high_nonack(rack, rsm); 5261 } 5262 } 5263 if (rsm == NULL) { 5264 counter_u64_add(rack_tlp_does_nada, 1); 5265 #ifdef TCP_BLACKBOX 5266 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5267 #endif 5268 goto out; 5269 } 5270 } else { 5271 /* 5272 * We must find the last segment 5273 * that was acceptable by the client. 5274 */ 5275 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5276 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 5277 /* Found one */ 5278 break; 5279 } 5280 } 5281 if (rsm == NULL) { 5282 /* None? if so send the first */ 5283 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5284 if (rsm == NULL) { 5285 counter_u64_add(rack_tlp_does_nada, 1); 5286 #ifdef TCP_BLACKBOX 5287 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5288 #endif 5289 goto out; 5290 } 5291 } 5292 } 5293 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 5294 /* 5295 * We need to split this the last segment in two. 5296 */ 5297 struct rack_sendmap *nrsm; 5298 5299 5300 nrsm = rack_alloc_full_limit(rack); 5301 if (nrsm == NULL) { 5302 /* 5303 * No memory to split, we will just exit and punt 5304 * off to the RXT timer. 5305 */ 5306 counter_u64_add(rack_tlp_does_nada, 1); 5307 goto out; 5308 } 5309 rack_clone_rsm(rack, nrsm, rsm, 5310 (rsm->r_end - ctf_fixed_maxseg(tp))); 5311 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5312 #ifdef INVARIANTS 5313 if (insret != NULL) { 5314 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5315 nrsm, insret, rack, rsm); 5316 } 5317 #endif 5318 if (rsm->r_in_tmap) { 5319 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5320 nrsm->r_in_tmap = 1; 5321 } 5322 rsm->r_flags &= (~RACK_HAS_FIN); 5323 rsm = nrsm; 5324 } 5325 rack->r_ctl.rc_tlpsend = rsm; 5326 send: 5327 rack->r_timer_override = 1; 5328 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5329 return (0); 5330 out: 5331 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5332 return (0); 5333 } 5334 5335 /* 5336 * Delayed ack Timer, here we simply need to setup the 5337 * ACK_NOW flag and remove the DELACK flag. From there 5338 * the output routine will send the ack out. 5339 * 5340 * We only return 1, saying don't proceed, if all timers 5341 * are stopped (destroyed PCB?). 5342 */ 5343 static int 5344 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5345 { 5346 if (tp->t_timers->tt_flags & TT_STOPPED) { 5347 return (1); 5348 } 5349 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 5350 tp->t_flags &= ~TF_DELACK; 5351 tp->t_flags |= TF_ACKNOW; 5352 KMOD_TCPSTAT_INC(tcps_delack); 5353 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5354 return (0); 5355 } 5356 5357 /* 5358 * Persists timer, here we simply send the 5359 * same thing as a keepalive will. 5360 * the one byte send. 5361 * 5362 * We only return 1, saying don't proceed, if all timers 5363 * are stopped (destroyed PCB?). 
5364 */ 5365 static int 5366 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5367 { 5368 struct tcptemp *t_template; 5369 struct inpcb *inp; 5370 int32_t retval = 1; 5371 5372 inp = tp->t_inpcb; 5373 5374 if (tp->t_timers->tt_flags & TT_STOPPED) { 5375 return (1); 5376 } 5377 if (rack->rc_in_persist == 0) 5378 return (0); 5379 if (ctf_progress_timeout_check(tp, false)) { 5380 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5381 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5382 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5383 return (1); 5384 } 5385 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 5386 /* 5387 * Persistence timer into zero window. Force a byte to be output, if 5388 * possible. 5389 */ 5390 KMOD_TCPSTAT_INC(tcps_persisttimeo); 5391 /* 5392 * Hack: if the peer is dead/unreachable, we do not time out if the 5393 * window is closed. After a full backoff, drop the connection if 5394 * the idle time (no responses to probes) reaches the maximum 5395 * backoff that we would use if retransmitting. 5396 */ 5397 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 5398 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 5399 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 5400 KMOD_TCPSTAT_INC(tcps_persistdrop); 5401 retval = 1; 5402 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5403 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5404 goto out; 5405 } 5406 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 5407 tp->snd_una == tp->snd_max) 5408 rack_exit_persist(tp, rack, cts); 5409 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 5410 /* 5411 * If the user has closed the socket then drop a persisting 5412 * connection after a much reduced timeout. 5413 */ 5414 if (tp->t_state > TCPS_CLOSE_WAIT && 5415 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 5416 retval = 1; 5417 KMOD_TCPSTAT_INC(tcps_persistdrop); 5418 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5419 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5420 goto out; 5421 } 5422 t_template = tcpip_maketemplate(rack->rc_inp); 5423 if (t_template) { 5424 /* only set it if we were answered */ 5425 if (rack->forced_ack == 0) { 5426 rack->forced_ack = 1; 5427 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5428 } 5429 tcp_respond(tp, t_template->tt_ipgen, 5430 &t_template->tt_t, (struct mbuf *)NULL, 5431 tp->rcv_nxt, tp->snd_una - 1, 0); 5432 /* This sends an ack */ 5433 if (tp->t_flags & TF_DELACK) 5434 tp->t_flags &= ~TF_DELACK; 5435 free(t_template, M_TEMP); 5436 } 5437 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5438 tp->t_rxtshift++; 5439 out: 5440 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 5441 rack_start_hpts_timer(rack, tp, cts, 5442 0, 0, 0); 5443 return (retval); 5444 } 5445 5446 /* 5447 * If a keepalive goes off, we had no other timers 5448 * happening. We always return 1 here since this 5449 * routine either drops the connection or sends 5450 * out a segment with respond. 5451 */ 5452 static int 5453 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5454 { 5455 struct tcptemp *t_template; 5456 struct inpcb *inp; 5457 5458 if (tp->t_timers->tt_flags & TT_STOPPED) { 5459 return (1); 5460 } 5461 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 5462 inp = tp->t_inpcb; 5463 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 5464 /* 5465 * Keep-alive timer went off; send something or drop connection if 5466 * idle for too long. 
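 * The connection is dropped once the idle time exceeds
 * TP_KEEPIDLE(tp) + TP_MAXIDLE(tp); below that threshold we emit a
 * probe segment instead.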
5467 */ 5468 KMOD_TCPSTAT_INC(tcps_keeptimeo); 5469 if (tp->t_state < TCPS_ESTABLISHED) 5470 goto dropit; 5471 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5472 tp->t_state <= TCPS_CLOSING) { 5473 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 5474 goto dropit; 5475 /* 5476 * Send a packet designed to force a response if the peer is 5477 * up and reachable: either an ACK if the connection is 5478 * still alive, or an RST if the peer has closed the 5479 * connection due to timeout or reboot. Using sequence 5480 * number tp->snd_una-1 causes the transmitted zero-length 5481 * segment to lie outside the receive window; by the 5482 * protocol spec, this requires the correspondent TCP to 5483 * respond. 5484 */ 5485 KMOD_TCPSTAT_INC(tcps_keepprobe); 5486 t_template = tcpip_maketemplate(inp); 5487 if (t_template) { 5488 if (rack->forced_ack == 0) { 5489 rack->forced_ack = 1; 5490 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5491 } 5492 tcp_respond(tp, t_template->tt_ipgen, 5493 &t_template->tt_t, (struct mbuf *)NULL, 5494 tp->rcv_nxt, tp->snd_una - 1, 0); 5495 free(t_template, M_TEMP); 5496 } 5497 } 5498 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 5499 return (1); 5500 dropit: 5501 KMOD_TCPSTAT_INC(tcps_keepdrops); 5502 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 5503 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5504 return (1); 5505 } 5506 5507 /* 5508 * Retransmit helper function, clear up all the ack 5509 * flags and take care of important book keeping. 5510 */ 5511 static void 5512 rack_remxt_tmr(struct tcpcb *tp) 5513 { 5514 /* 5515 * The retransmit timer went off, all sack'd blocks must be 5516 * un-acked. 5517 */ 5518 struct rack_sendmap *rsm, *trsm = NULL; 5519 struct tcp_rack *rack; 5520 int32_t cnt = 0; 5521 5522 rack = (struct tcp_rack *)tp->t_fb_ptr; 5523 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 5524 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 5525 if (rack->r_state && (rack->r_state != tp->t_state)) 5526 rack_set_state(tp, rack); 5527 /* 5528 * Ideally we would like to be able to 5529 * mark SACK-PASS on anything not acked here. 5530 * However, if we do that we would burst out 5531 * all that data 1ms apart. This would be unwise, 5532 * so for now we will just let the normal rxt timer 5533 * and tlp timer take care of it. 
5534 */ 5535 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5536 if (rsm->r_flags & RACK_ACKED) { 5537 cnt++; 5538 rsm->r_dupack = 0; 5539 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5540 if (rsm->r_in_tmap == 0) { 5541 /* We must re-add it back to the tlist */ 5542 if (trsm == NULL) { 5543 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5544 } else { 5545 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 5546 } 5547 rsm->r_in_tmap = 1; 5548 } 5549 } 5550 trsm = rsm; 5551 if (rsm->r_flags & RACK_ACKED) 5552 rsm->r_flags |= RACK_WAS_ACKED; 5553 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 5554 } 5555 /* Clear the count (we just un-acked them) */ 5556 rack->r_ctl.rc_sacked = 0; 5557 rack->r_ctl.rc_agg_delayed = 0; 5558 rack->r_early = 0; 5559 rack->r_ctl.rc_agg_early = 0; 5560 rack->r_late = 0; 5561 /* Clear the tlp rtx mark */ 5562 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5563 rack->r_ctl.rc_prr_sndcnt = 0; 5564 rack_log_to_prr(rack, 6, 0); 5565 rack->r_timer_override = 1; 5566 } 5567 5568 static void 5569 rack_cc_conn_init(struct tcpcb *tp) 5570 { 5571 struct tcp_rack *rack; 5572 5573 5574 rack = (struct tcp_rack *)tp->t_fb_ptr; 5575 cc_conn_init(tp); 5576 /* 5577 * We want a chance to stay in slowstart as 5578 * we create a connection. TCP spec says that 5579 * initially ssthresh is infinite. For our 5580 * purposes that is the snd_wnd. 5581 */ 5582 if (tp->snd_ssthresh < tp->snd_wnd) { 5583 tp->snd_ssthresh = tp->snd_wnd; 5584 } 5585 /* 5586 * We also want to assure a IW worth of 5587 * data can get inflight. 5588 */ 5589 if (rc_init_window(rack) < tp->snd_cwnd) 5590 tp->snd_cwnd = rc_init_window(rack); 5591 } 5592 5593 /* 5594 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 5595 * we will setup to retransmit the lowest seq number outstanding. 5596 */ 5597 static int 5598 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5599 { 5600 int32_t rexmt; 5601 struct inpcb *inp; 5602 int32_t retval = 0; 5603 bool isipv6; 5604 5605 inp = tp->t_inpcb; 5606 if (tp->t_timers->tt_flags & TT_STOPPED) { 5607 return (1); 5608 } 5609 if (ctf_progress_timeout_check(tp, false)) { 5610 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5611 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5612 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5613 return (1); 5614 } 5615 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 5616 if (TCPS_HAVEESTABLISHED(tp->t_state) && 5617 (tp->snd_una == tp->snd_max)) { 5618 /* Nothing outstanding .. nothing to do */ 5619 return (0); 5620 } 5621 /* 5622 * Retransmission timer went off. Message has not been acked within 5623 * retransmit interval. Back off to a longer retransmit interval 5624 * and retransmit one segment. 5625 */ 5626 rack_remxt_tmr(tp); 5627 if ((rack->r_ctl.rc_resend == NULL) || 5628 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 5629 /* 5630 * If the rwnd collapsed on 5631 * the one we are retransmitting 5632 * it does not count against the 5633 * rxt count. 5634 */ 5635 tp->t_rxtshift++; 5636 } 5637 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5638 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5639 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5640 retval = 1; 5641 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5642 tcp_set_inp_to_drop(rack->rc_inp, 5643 (tp->t_softerror ? 
(uint16_t) tp->t_softerror : ETIMEDOUT)); 5644 goto out; 5645 } 5646 if (tp->t_state == TCPS_SYN_SENT) { 5647 /* 5648 * If the SYN was retransmitted, indicate CWND to be limited 5649 * to 1 segment in cc_conn_init(). 5650 */ 5651 tp->snd_cwnd = 1; 5652 } else if (tp->t_rxtshift == 1) { 5653 /* 5654 * first retransmit; record ssthresh and cwnd so they can be 5655 * recovered if this turns out to be a "bad" retransmit. A 5656 * retransmit is considered "bad" if an ACK for this segment 5657 * is received within RTT/2 interval; the assumption here is 5658 * that the ACK was already in flight. See "On Estimating 5659 * End-to-End Network Path Properties" by Allman and Paxson 5660 * for more details. 5661 */ 5662 tp->snd_cwnd_prev = tp->snd_cwnd; 5663 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5664 tp->snd_recover_prev = tp->snd_recover; 5665 if (IN_FASTRECOVERY(tp->t_flags)) 5666 tp->t_flags |= TF_WASFRECOVERY; 5667 else 5668 tp->t_flags &= ~TF_WASFRECOVERY; 5669 if (IN_CONGRECOVERY(tp->t_flags)) 5670 tp->t_flags |= TF_WASCRECOVERY; 5671 else 5672 tp->t_flags &= ~TF_WASCRECOVERY; 5673 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5674 tp->t_flags |= TF_PREVVALID; 5675 } else 5676 tp->t_flags &= ~TF_PREVVALID; 5677 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5678 if ((tp->t_state == TCPS_SYN_SENT) || 5679 (tp->t_state == TCPS_SYN_RECEIVED)) 5680 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 5681 else 5682 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5683 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5684 max(MSEC_2_TICKS(rack_rto_min), rexmt), 5685 MSEC_2_TICKS(rack_rto_max)); 5686 /* 5687 * We enter the path for PLMTUD if connection is established or, if 5688 * connection is FIN_WAIT_1 status, reason for the last is that if 5689 * amount of data we send is very small, we could send it in couple 5690 * of packets and process straight to FIN. In that case we won't 5691 * catch ESTABLISHED state. 5692 */ 5693 #ifdef INET6 5694 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5695 #else 5696 isipv6 = false; 5697 #endif 5698 if (((V_tcp_pmtud_blackhole_detect == 1) || 5699 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5700 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5701 ((tp->t_state == TCPS_ESTABLISHED) || 5702 (tp->t_state == TCPS_FIN_WAIT_1))) { 5703 5704 /* 5705 * Idea here is that at each stage of mtu probe (usually, 5706 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5707 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5708 * should take care of that. 5709 */ 5710 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5711 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5712 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5713 tp->t_rxtshift % 2 == 0)) { 5714 /* 5715 * Enter Path MTU Black-hole Detection mechanism: - 5716 * Disable Path MTU Discovery (IP "DF" bit). - 5717 * Reduce MTU to lower value than what we negotiated 5718 * with peer. 5719 */ 5720 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5721 /* Record that we may have found a black hole. */ 5722 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5723 /* Keep track of previous MSS. */ 5724 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5725 } 5726 5727 /* 5728 * Reduce the MSS to blackhole value or to the 5729 * default in an attempt to retransmit. 5730 */ 5731 #ifdef INET6 5732 if (isipv6 && 5733 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5734 /* Use the sysctl tuneable blackhole MSS. 
*/ 5735 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5736 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5737 } else if (isipv6) { 5738 /* Use the default MSS. */ 5739 tp->t_maxseg = V_tcp_v6mssdflt; 5740 /* 5741 * Disable Path MTU Discovery when we switch 5742 * to minmss. 5743 */ 5744 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5745 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5746 } 5747 #endif 5748 #if defined(INET6) && defined(INET) 5749 else 5750 #endif 5751 #ifdef INET 5752 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5753 /* Use the sysctl tuneable blackhole MSS. */ 5754 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5755 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5756 } else { 5757 /* Use the default MSS. */ 5758 tp->t_maxseg = V_tcp_mssdflt; 5759 /* 5760 * Disable Path MTU Discovery when we switch 5761 * to minmss. 5762 */ 5763 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5764 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5765 } 5766 #endif 5767 } else { 5768 /* 5769 * If further retransmissions are still unsuccessful 5770 * with a lowered MTU, maybe this isn't a blackhole 5771 * and we restore the previous MSS and blackhole 5772 * detection flags. The limit '6' is determined by 5773 * giving each probe stage (1448, 1188, 524) 2 5774 * chances to recover. 5775 */ 5776 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5777 (tp->t_rxtshift >= 6)) { 5778 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5779 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5780 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5781 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5782 } 5783 } 5784 } 5785 /* 5786 * If we backed off this far, our srtt estimate is probably bogus. 5787 * Clobber it so we'll take the next rtt measurement as our srtt; 5788 * move the current srtt into rttvar to keep the current retransmit 5789 * times until then. 5790 */ 5791 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5792 #ifdef INET6 5793 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 5794 in6_losing(tp->t_inpcb); 5795 else 5796 #endif 5797 in_losing(tp->t_inpcb); 5798 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5799 tp->t_srtt = 0; 5800 } 5801 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5802 tp->snd_recover = tp->snd_max; 5803 tp->t_flags |= TF_ACKNOW; 5804 tp->t_rtttime = 0; 5805 rack_cong_signal(tp, NULL, CC_RTO); 5806 out: 5807 return (retval); 5808 } 5809 5810 static int 5811 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 5812 { 5813 int32_t ret = 0; 5814 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5815 5816 if (timers == 0) { 5817 return (0); 5818 } 5819 if (tp->t_state == TCPS_LISTEN) { 5820 /* no timers on listen sockets */ 5821 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5822 return (0); 5823 return (1); 5824 } 5825 if ((timers & PACE_TMR_RACK) && 5826 rack->rc_on_min_to) { 5827 /* 5828 * For the rack timer when we 5829 * are on a min-timeout (which means rrr_conf = 3) 5830 * we don't want to check the timer. It may 5831 * be going off for a pace and thats ok we 5832 * want to send the retransmit (if its ready). 5833 * 5834 * If its on a normal rack timer (non-min) then 5835 * we will check if its expired. 
5836 */ 5837 goto skip_time_check; 5838 } 5839 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5840 uint32_t left; 5841 5842 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5843 ret = -1; 5844 rack_log_to_processing(rack, cts, ret, 0); 5845 return (0); 5846 } 5847 if (hpts_calling == 0) { 5848 /* 5849 * A user send or queued mbuf (sack) has called us? We 5850 * return 0 and let the pacing guards 5851 * deal with it if they should or 5852 * should not cause a send. 5853 */ 5854 ret = -2; 5855 rack_log_to_processing(rack, cts, ret, 0); 5856 return (0); 5857 } 5858 /* 5859 * Ok our timer went off early and we are not paced false 5860 * alarm, go back to sleep. 5861 */ 5862 ret = -3; 5863 left = rack->r_ctl.rc_timer_exp - cts; 5864 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 5865 rack_log_to_processing(rack, cts, ret, left); 5866 return (1); 5867 } 5868 skip_time_check: 5869 rack->rc_tmr_stopped = 0; 5870 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5871 if (timers & PACE_TMR_DELACK) { 5872 ret = rack_timeout_delack(tp, rack, cts); 5873 } else if (timers & PACE_TMR_RACK) { 5874 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5875 ret = rack_timeout_rack(tp, rack, cts); 5876 } else if (timers & PACE_TMR_TLP) { 5877 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5878 ret = rack_timeout_tlp(tp, rack, cts); 5879 } else if (timers & PACE_TMR_RXT) { 5880 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5881 ret = rack_timeout_rxt(tp, rack, cts); 5882 } else if (timers & PACE_TMR_PERSIT) { 5883 ret = rack_timeout_persist(tp, rack, cts); 5884 } else if (timers & PACE_TMR_KEEP) { 5885 ret = rack_timeout_keepalive(tp, rack, cts); 5886 } 5887 rack_log_to_processing(rack, cts, ret, timers); 5888 return (ret); 5889 } 5890 5891 static void 5892 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 5893 { 5894 struct timeval tv; 5895 uint32_t us_cts, flags_on_entry; 5896 uint8_t hpts_removed = 0; 5897 5898 5899 flags_on_entry = rack->r_ctl.rc_hpts_flags; 5900 us_cts = tcp_get_usecs(&tv); 5901 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 5902 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 5903 ((tp->snd_max - tp->snd_una) == 0))) { 5904 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5905 hpts_removed = 1; 5906 /* If we were not delayed cancel out the flag. */ 5907 if ((tp->snd_max - tp->snd_una) == 0) 5908 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5909 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5910 } 5911 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5912 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5913 if (rack->rc_inp->inp_in_hpts && 5914 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 5915 /* 5916 * Canceling timer's when we have no output being 5917 * paced. We also must remove ourselves from the 5918 * hpts. 
5919 */ 5920 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5921 hpts_removed = 1; 5922 } 5923 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 5924 } 5925 if (hpts_removed == 0) 5926 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5927 } 5928 5929 static void 5930 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 5931 { 5932 return; 5933 } 5934 5935 static int 5936 rack_stopall(struct tcpcb *tp) 5937 { 5938 struct tcp_rack *rack; 5939 rack = (struct tcp_rack *)tp->t_fb_ptr; 5940 rack->t_timers_stopped = 1; 5941 return (0); 5942 } 5943 5944 static void 5945 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 5946 { 5947 return; 5948 } 5949 5950 static int 5951 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 5952 { 5953 return (0); 5954 } 5955 5956 static void 5957 rack_stop_all_timers(struct tcpcb *tp) 5958 { 5959 struct tcp_rack *rack; 5960 5961 /* 5962 * Assure no timers are running. 5963 */ 5964 if (tcp_timer_active(tp, TT_PERSIST)) { 5965 /* We enter in persists, set the flag appropriately */ 5966 rack = (struct tcp_rack *)tp->t_fb_ptr; 5967 rack->rc_in_persist = 1; 5968 } 5969 tcp_timer_suspend(tp, TT_PERSIST); 5970 tcp_timer_suspend(tp, TT_REXMT); 5971 tcp_timer_suspend(tp, TT_KEEP); 5972 tcp_timer_suspend(tp, TT_DELACK); 5973 } 5974 5975 static void 5976 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 5977 struct rack_sendmap *rsm, uint32_t ts) 5978 { 5979 int32_t idx; 5980 5981 rsm->r_rtr_cnt++; 5982 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5983 rsm->r_dupack = 0; 5984 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 5985 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 5986 rsm->r_flags |= RACK_OVERMAX; 5987 } 5988 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 5989 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 5990 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 5991 } 5992 idx = rsm->r_rtr_cnt - 1; 5993 rsm->r_tim_lastsent[idx] = ts; 5994 if (rsm->r_flags & RACK_ACKED) { 5995 /* Problably MTU discovery messing with us */ 5996 rsm->r_flags &= ~RACK_ACKED; 5997 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 5998 } 5999 if (rsm->r_in_tmap) { 6000 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6001 rsm->r_in_tmap = 0; 6002 } 6003 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6004 rsm->r_in_tmap = 1; 6005 if (rsm->r_flags & RACK_SACK_PASSED) { 6006 /* We have retransmitted due to the SACK pass */ 6007 rsm->r_flags &= ~RACK_SACK_PASSED; 6008 rsm->r_flags |= RACK_WAS_SACKPASS; 6009 } 6010 } 6011 6012 6013 static uint32_t 6014 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 6015 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) 6016 { 6017 /* 6018 * We (re-)transmitted starting at rsm->r_start for some length 6019 * (possibly less than r_end. 6020 */ 6021 struct rack_sendmap *nrsm, *insret; 6022 uint32_t c_end; 6023 int32_t len; 6024 6025 len = *lenp; 6026 c_end = rsm->r_start + len; 6027 if (SEQ_GEQ(c_end, rsm->r_end)) { 6028 /* 6029 * We retransmitted the whole piece or more than the whole 6030 * slopping into the next rsm. 6031 */ 6032 rack_update_rsm(tp, rack, rsm, ts); 6033 if (c_end == rsm->r_end) { 6034 *lenp = 0; 6035 return (0); 6036 } else { 6037 int32_t act_len; 6038 6039 /* Hangs over the end return whats left */ 6040 act_len = rsm->r_end - rsm->r_start; 6041 *lenp = (len - act_len); 6042 return (rsm->r_end); 6043 } 6044 /* We don't get out of this block. 
*/ 6045 } 6046 /* 6047 * Here we retransmitted less than the whole thing which means we 6048 * have to split this into what was transmitted and what was not. 6049 */ 6050 nrsm = rack_alloc_full_limit(rack); 6051 if (nrsm == NULL) { 6052 /* 6053 * We can't get memory, so lets not proceed. 6054 */ 6055 *lenp = 0; 6056 return (0); 6057 } 6058 /* 6059 * So here we are going to take the original rsm and make it what we 6060 * retransmitted. nrsm will be the tail portion we did not 6061 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 6062 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 6063 * 1, 6 and the new piece will be 6, 11. 6064 */ 6065 rack_clone_rsm(rack, nrsm, rsm, c_end); 6066 nrsm->r_dupack = 0; 6067 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 6068 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6069 #ifdef INVARIANTS 6070 if (insret != NULL) { 6071 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6072 nrsm, insret, rack, rsm); 6073 } 6074 #endif 6075 if (rsm->r_in_tmap) { 6076 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6077 nrsm->r_in_tmap = 1; 6078 } 6079 rsm->r_flags &= (~RACK_HAS_FIN); 6080 rack_update_rsm(tp, rack, rsm, ts); 6081 *lenp = 0; 6082 return (0); 6083 } 6084 6085 6086 static void 6087 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 6088 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 6089 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) 6090 { 6091 struct tcp_rack *rack; 6092 struct rack_sendmap *rsm, *nrsm, *insret, fe; 6093 register uint32_t snd_max, snd_una; 6094 6095 /* 6096 * Add to the RACK log of packets in flight or retransmitted. If 6097 * there is a TS option we will use the TS echoed, if not we will 6098 * grab a TS. 6099 * 6100 * Retransmissions will increment the count and move the ts to its 6101 * proper place. Note that if options do not include TS's then we 6102 * won't be able to effectively use the ACK for an RTT on a retran. 6103 * 6104 * Notes about r_start and r_end. Lets consider a send starting at 6105 * sequence 1 for 10 bytes. In such an example the r_start would be 6106 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 6107 * This means that r_end is actually the first sequence for the next 6108 * slot (11). 6109 * 6110 */ 6111 /* 6112 * If err is set what do we do XXXrrs? should we not add the thing? 6113 * -- i.e. return if err != 0 or should we pretend we sent it? -- 6114 * i.e. proceed with add ** do this for now. 6115 */ 6116 INP_WLOCK_ASSERT(tp->t_inpcb); 6117 if (err) 6118 /* 6119 * We don't log errors -- we could but snd_max does not 6120 * advance in this case either. 6121 */ 6122 return; 6123 6124 if (th_flags & TH_RST) { 6125 /* 6126 * We don't log resets and we return immediately from 6127 * sending 6128 */ 6129 return; 6130 } 6131 rack = (struct tcp_rack *)tp->t_fb_ptr; 6132 snd_una = tp->snd_una; 6133 if (SEQ_LEQ((seq_out + len), snd_una)) { 6134 /* Are sending an old segment to induce an ack (keep-alive)? */ 6135 return; 6136 } 6137 if (SEQ_LT(seq_out, snd_una)) { 6138 /* huh? should we panic? */ 6139 uint32_t end; 6140 6141 end = seq_out + len; 6142 seq_out = snd_una; 6143 if (SEQ_GEQ(end, seq_out)) 6144 len = end - seq_out; 6145 else 6146 len = 0; 6147 } 6148 snd_max = tp->snd_max; 6149 if (th_flags & (TH_SYN | TH_FIN)) { 6150 /* 6151 * The call to rack_log_output is made before bumping 6152 * snd_max. 
This means we can record one extra byte on a SYN 6153 * or FIN if seq_out is adding more on and a FIN is present 6154 * (and we are not resending). 6155 */ 6156 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 6157 len++; 6158 if (th_flags & TH_FIN) 6159 len++; 6160 if (SEQ_LT(snd_max, tp->snd_nxt)) { 6161 /* 6162 * The add/update as not been done for the FIN/SYN 6163 * yet. 6164 */ 6165 snd_max = tp->snd_nxt; 6166 } 6167 } 6168 if (len == 0) { 6169 /* We don't log zero window probes */ 6170 return; 6171 } 6172 rack->r_ctl.rc_time_last_sent = ts; 6173 if (IN_RECOVERY(tp->t_flags)) { 6174 rack->r_ctl.rc_prr_out += len; 6175 } 6176 /* First question is it a retransmission or new? */ 6177 if (seq_out == snd_max) { 6178 /* Its new */ 6179 again: 6180 rsm = rack_alloc(rack); 6181 if (rsm == NULL) { 6182 /* 6183 * Hmm out of memory and the tcb got destroyed while 6184 * we tried to wait. 6185 */ 6186 return; 6187 } 6188 if (th_flags & TH_FIN) { 6189 rsm->r_flags = RACK_HAS_FIN; 6190 } else { 6191 rsm->r_flags = 0; 6192 } 6193 rsm->r_tim_lastsent[0] = ts; 6194 rsm->r_rtr_cnt = 1; 6195 rsm->r_rtr_bytes = 0; 6196 rsm->usec_orig_send = us_cts; 6197 if (th_flags & TH_SYN) { 6198 /* The data space is one beyond snd_una */ 6199 rsm->r_flags |= RACK_HAS_SIN; 6200 rsm->r_start = seq_out + 1; 6201 rsm->r_end = rsm->r_start + (len - 1); 6202 } else { 6203 /* Normal case */ 6204 rsm->r_start = seq_out; 6205 rsm->r_end = rsm->r_start + len; 6206 } 6207 rsm->r_dupack = 0; 6208 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6209 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6210 #ifdef INVARIANTS 6211 if (insret != NULL) { 6212 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6213 nrsm, insret, rack, rsm); 6214 } 6215 #endif 6216 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6217 rsm->r_in_tmap = 1; 6218 /* 6219 * Special case detection, is there just a single 6220 * packet outstanding when we are not in recovery? 6221 * 6222 * If this is true mark it so. 6223 */ 6224 if ((IN_RECOVERY(tp->t_flags) == 0) && 6225 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 6226 struct rack_sendmap *prsm; 6227 6228 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6229 if (prsm) 6230 prsm->r_one_out_nr = 1; 6231 } 6232 return; 6233 } 6234 /* 6235 * If we reach here its a retransmission and we need to find it. 6236 */ 6237 memset(&fe, 0, sizeof(fe)); 6238 more: 6239 if (hintrsm && (hintrsm->r_start == seq_out)) { 6240 rsm = hintrsm; 6241 hintrsm = NULL; 6242 } else { 6243 /* No hints sorry */ 6244 rsm = NULL; 6245 } 6246 if ((rsm) && (rsm->r_start == seq_out)) { 6247 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6248 if (len == 0) { 6249 return; 6250 } else { 6251 goto more; 6252 } 6253 } 6254 /* Ok it was not the last pointer go through it the hard way. 
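 * The hard way is an rb-tree lookup keyed on r_start using the
 * on-stack 'fe' entry; if seq_out lands in the middle of an existing
 * entry we split that entry with rack_clone_rsm() and keep going until
 * the whole (re)transmitted range has been accounted for.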
*/ 6255 refind: 6256 fe.r_start = seq_out; 6257 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6258 if (rsm) { 6259 if (rsm->r_start == seq_out) { 6260 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6261 if (len == 0) { 6262 return; 6263 } else { 6264 goto refind; 6265 } 6266 } 6267 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6268 /* Transmitted within this piece */ 6269 /* 6270 * Ok we must split off the front and then let the 6271 * update do the rest. 6272 */ 6273 nrsm = rack_alloc_full_limit(rack); 6274 if (nrsm == NULL) { 6275 rack_update_rsm(tp, rack, rsm, ts); 6276 return; 6277 } 6278 /* 6279 * Copy rsm to nrsm and then trim the front of rsm 6280 * so it does not include this part. 6281 */ 6282 rack_clone_rsm(rack, nrsm, rsm, seq_out); 6283 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6284 #ifdef INVARIANTS 6285 if (insret != NULL) { 6286 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6287 nrsm, insret, rack, rsm); 6288 } 6289 #endif 6290 if (rsm->r_in_tmap) { 6291 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6292 nrsm->r_in_tmap = 1; 6293 } 6294 rsm->r_flags &= (~RACK_HAS_FIN); 6295 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 6296 if (len == 0) { 6297 return; 6298 } else if (len > 0) 6299 goto refind; 6300 } 6301 } 6302 /* 6303 * Hmm, not found in the map; did they retransmit both old data 6304 * and on into the new? 6305 */ 6306 if (seq_out == tp->snd_max) { 6307 goto again; 6308 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6309 #ifdef INVARIANTS 6310 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6311 seq_out, len, tp->snd_una, tp->snd_max); 6312 printf("Starting Dump of all rack entries\n"); 6313 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6314 printf("rsm:%p start:%u end:%u\n", 6315 rsm, rsm->r_start, rsm->r_end); 6316 } 6317 printf("Dump complete\n"); 6318 panic("seq_out not found rack:%p tp:%p", 6319 rack, tp); 6320 #endif 6321 } else { 6322 #ifdef INVARIANTS 6323 /* 6324 * Hmm, beyond sndmax? (only if we are using the new rtt-pack 6325 * flag) 6326 */ 6327 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6328 seq_out, len, tp->snd_max, tp); 6329 #endif 6330 } 6331 } 6332 6333 /* 6334 * Record one of the RTT updates from an ack into 6335 * our sample structure. 6336 */ 6337 6338 static void 6339 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 6340 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 6341 { 6342 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6343 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 6344 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 6345 } 6346 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6347 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 6348 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 6349 } 6350 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 6351 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 6352 rack->r_ctl.rc_gp_lowrtt = us_rtt; 6353 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 6354 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 6355 } 6356 if ((confidence == 1) && 6357 ((rsm == NULL) || 6358 (rsm->r_just_ret) || 6359 (rsm->r_one_out_nr && 6360 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 6361 /* 6362 * If the rsm had a just-return 6363 * hit on it then we can't trust the 6364 * rtt measurement for buffer determination. 6365 * Note that a confidence of 2 indicates 6366 * SACK'd which overrides the r_just_ret or 6367 * the r_one_out_nr.
If it was a CUM-ACK and 6368 * we had only two outstanding, but get an 6369 * ack for only 1. Then that also lowers our 6370 * confidence. 6371 */ 6372 confidence = 0; 6373 } 6374 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6375 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 6376 if (rack->r_ctl.rack_rs.confidence == 0) { 6377 /* 6378 * We take anything with no current confidence 6379 * saved. 6380 */ 6381 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6382 rack->r_ctl.rack_rs.confidence = confidence; 6383 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6384 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 6385 /* 6386 * Once we have a confident number, 6387 * we can update it with a smaller 6388 * value since this confident number 6389 * may include the DSACK time until 6390 * the next segment (the second one) arrived. 6391 */ 6392 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6393 rack->r_ctl.rack_rs.confidence = confidence; 6394 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6395 } 6396 6397 } 6398 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 6399 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 6400 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 6401 rack->r_ctl.rack_rs.rs_rtt_cnt++; 6402 } 6403 6404 /* 6405 * Collect new round-trip time estimate 6406 * and update averages and current timeout. 6407 */ 6408 static void 6409 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 6410 { 6411 int32_t delta; 6412 uint32_t o_srtt, o_var; 6413 int32_t hrtt_up = 0; 6414 int32_t rtt; 6415 6416 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 6417 /* No valid sample */ 6418 return; 6419 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 6420 /* We are to use the lowest RTT seen in a single ack */ 6421 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 6422 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 6423 /* We are to use the highest RTT seen in a single ack */ 6424 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 6425 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 6426 /* We are to use the average RTT seen in a single ack */ 6427 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 6428 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 6429 } else { 6430 #ifdef INVARIANTS 6431 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 6432 #endif 6433 return; 6434 } 6435 if (rtt == 0) 6436 rtt = 1; 6437 if (rack->rc_gp_rtt_set == 0) { 6438 /* 6439 * With no RTT we have to accept 6440 * even one we are not confident of. 6441 */ 6442 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 6443 rack->rc_gp_rtt_set = 1; 6444 } else if (rack->r_ctl.rack_rs.confidence) { 6445 /* update the running gp srtt */ 6446 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 6447 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 6448 } 6449 if (rack->r_ctl.rack_rs.confidence) { 6450 /* 6451 * record the low and high for highly buffered path computation, 6452 * we only do this if we are confident (not a retransmission). 6453 */ 6454 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 6455 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6456 hrtt_up = 1; 6457 } 6458 if (rack->rc_highly_buffered == 0) { 6459 /* 6460 * Currently once we declare a path has 6461 * highly buffered there is no going 6462 * back, which may be a problem... 
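 * The check below fires when the ratio of the highest to the lowest
 * us-rtt we have recorded exceeds rack_hbp_thresh; purely as an
 * illustration, with a threshold of 8 a path that has seen a 160 ms
 * high against a 20 ms low would be flagged.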
6463 */ 6464 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 6465 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 6466 rack->r_ctl.rc_highest_us_rtt, 6467 rack->r_ctl.rc_lowest_us_rtt, 6468 RACK_RTTS_SEEHBP); 6469 rack->rc_highly_buffered = 1; 6470 } 6471 } 6472 } 6473 if ((rack->r_ctl.rack_rs.confidence) || 6474 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 6475 /* 6476 * If we are highly confident of it <or> it was 6477 * never retransmitted we accept it as the last us_rtt. 6478 */ 6479 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6480 /* The lowest rtt can be set if its was not retransmited */ 6481 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 6482 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6483 if (rack->r_ctl.rc_lowest_us_rtt == 0) 6484 rack->r_ctl.rc_lowest_us_rtt = 1; 6485 } 6486 } 6487 rack_log_rtt_sample(rack, rtt); 6488 o_srtt = tp->t_srtt; 6489 o_var = tp->t_rttvar; 6490 rack = (struct tcp_rack *)tp->t_fb_ptr; 6491 if (tp->t_srtt != 0) { 6492 /* 6493 * srtt is stored as fixed point with 5 bits after the 6494 * binary point (i.e., scaled by 8). The following magic is 6495 * equivalent to the smoothing algorithm in rfc793 with an 6496 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 6497 * Adjust rtt to origin 0. 6498 */ 6499 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 6500 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 6501 6502 tp->t_srtt += delta; 6503 if (tp->t_srtt <= 0) 6504 tp->t_srtt = 1; 6505 6506 /* 6507 * We accumulate a smoothed rtt variance (actually, a 6508 * smoothed mean difference), then set the retransmit timer 6509 * to smoothed rtt + 4 times the smoothed variance. rttvar 6510 * is stored as fixed point with 4 bits after the binary 6511 * point (scaled by 16). The following is equivalent to 6512 * rfc793 smoothing with an alpha of .75 (rttvar = 6513 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 6514 * wired-in beta. 6515 */ 6516 if (delta < 0) 6517 delta = -delta; 6518 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 6519 tp->t_rttvar += delta; 6520 if (tp->t_rttvar <= 0) 6521 tp->t_rttvar = 1; 6522 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 6523 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6524 } else { 6525 /* 6526 * No rtt measurement yet - use the unsmoothed rtt. Set the 6527 * variance to half the rtt (so our first retransmit happens 6528 * at 3*rtt). 6529 */ 6530 tp->t_srtt = rtt << TCP_RTT_SHIFT; 6531 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 6532 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6533 } 6534 KMOD_TCPSTAT_INC(tcps_rttupdated); 6535 tp->t_rttupdated++; 6536 #ifdef STATS 6537 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 6538 #endif 6539 tp->t_rxtshift = 0; 6540 6541 /* 6542 * the retransmit should happen at rtt + 4 * rttvar. Because of the 6543 * way we do the smoothing, srtt and rttvar will each average +1/2 6544 * tick of bias. When we compute the retransmit timer, we want 1/2 6545 * tick of rounding and 1 extra tick because of +-1/2 tick 6546 * uncertainty in the firing of the timer. The bias will give us 6547 * exactly the 1.5 tick we need. But, because the bias is 6548 * statistical, we have to test that we don't drop below the minimum 6549 * feasible timer (which is 2 ticks). 
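 * Concretely, the TCPT_RANGESET() below clamps TCP_REXMTVAL(tp)
 * (roughly srtt + 4 * rttvar once the fixed-point scaling is removed)
 * to no less than max(rack_rto_min in ticks, rtt + 2) and no more than
 * rack_rto_max in ticks.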
6550 */ 6551 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 6552 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 6553 tp->t_softerror = 0; 6554 } 6555 6556 static void 6557 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 6558 uint32_t t, uint32_t cts) 6559 { 6560 /* 6561 * For this RSM, we acknowledged the data from a previous 6562 * transmission, not the last one we made. This means we did a false 6563 * retransmit. 6564 */ 6565 struct tcp_rack *rack; 6566 6567 if (rsm->r_flags & RACK_HAS_FIN) { 6568 /* 6569 * The FIN is often sent multiple times when we 6570 * have everything outstanding ack'd. We ignore this case 6571 * since it's over now. 6572 */ 6573 return; 6574 } 6575 if (rsm->r_flags & RACK_TLP) { 6576 /* 6577 * We expect TLP's to have this occur. 6578 */ 6579 return; 6580 } 6581 rack = (struct tcp_rack *)tp->t_fb_ptr; 6582 /* should we undo cc changes and exit recovery? */ 6583 if (IN_RECOVERY(tp->t_flags)) { 6584 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 6585 /* 6586 * Undo what we ratcheted down and exit recovery if 6587 * possible 6588 */ 6589 EXIT_RECOVERY(tp->t_flags); 6590 tp->snd_recover = tp->snd_una; 6591 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 6592 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 6593 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 6594 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 6595 } 6596 } 6597 if (rsm->r_flags & RACK_WAS_SACKPASS) { 6598 /* 6599 * We retransmitted based on a sack and the earlier 6600 * retransmission ack'd it - re-ordering is occurring. 6601 */ 6602 counter_u64_add(rack_reorder_seen, 1); 6603 rack->r_ctl.rc_reorder_ts = cts; 6604 } 6605 counter_u64_add(rack_badfr, 1); 6606 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 6607 } 6608 6609 static void 6610 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 6611 { 6612 /* 6613 * Apply the inbound us-rtt to the filter at us_cts. 6614 */ 6615 uint32_t old_rtt; 6616 6617 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 6618 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 6619 us_rtt, us_cts); 6620 if (rack->r_ctl.last_pacing_time && 6621 rack->rc_gp_dyn_mul && 6622 (rack->r_ctl.last_pacing_time > us_rtt)) 6623 rack->pacing_longer_than_rtt = 1; 6624 else 6625 rack->pacing_longer_than_rtt = 0; 6626 if (old_rtt > us_rtt) { 6627 /* We just hit a new lower rtt time */ 6628 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 6629 __LINE__, RACK_RTTS_NEWRTT); 6630 /* 6631 * Only count it if it's lower than what we saw within our 6632 * calculated range. 6633 */ 6634 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 6635 if (rack_probertt_lower_within && 6636 rack->rc_gp_dyn_mul && 6637 (rack->use_fixed_rate == 0) && 6638 (rack->rc_always_pace)) { 6639 /* 6640 * We are seeing a new lower rtt very close 6641 * to the time that we would have entered probe-rtt. 6642 * This is probably due to the fact that a peer flow 6643 * has entered probe-rtt. Let's go in now too.
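 * rack_probertt_lower_within is applied as a percentage of
 * rack_time_between_probertt (note the divide by 100 below); only when
 * we are already within that last slice of the interval do we enter
 * probe-rtt early.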
6644 */ 6645 uint32_t val; 6646 6647 val = rack_probertt_lower_within * rack_time_between_probertt; 6648 val /= 100; 6649 if ((rack->in_probe_rtt == 0) && 6650 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 6651 rack_enter_probertt(rack, us_cts); 6652 } 6653 } 6654 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6655 } 6656 } 6657 } 6658 6659 static int 6660 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 6661 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 6662 { 6663 int32_t i; 6664 uint32_t t, len_acked; 6665 6666 if ((rsm->r_flags & RACK_ACKED) || 6667 (rsm->r_flags & RACK_WAS_ACKED)) 6668 /* Already done */ 6669 return (0); 6670 6671 if (ack_type == CUM_ACKED) { 6672 if (SEQ_GT(th_ack, rsm->r_end)) 6673 len_acked = rsm->r_end - rsm->r_start; 6674 else 6675 len_acked = th_ack - rsm->r_start; 6676 } else 6677 len_acked = rsm->r_end - rsm->r_start; 6678 if (rsm->r_rtr_cnt == 1) { 6679 uint32_t us_rtt; 6680 6681 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6682 if ((int)t <= 0) 6683 t = 1; 6684 if (!tp->t_rttlow || tp->t_rttlow > t) 6685 tp->t_rttlow = t; 6686 if (!rack->r_ctl.rc_rack_min_rtt || 6687 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6688 rack->r_ctl.rc_rack_min_rtt = t; 6689 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6690 rack->r_ctl.rc_rack_min_rtt = 1; 6691 } 6692 } 6693 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; 6694 if (us_rtt == 0) 6695 us_rtt = 1; 6696 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 6697 if (ack_type == SACKED) 6698 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 6699 else { 6700 /* 6701 * For cum-ack we are only confident if what 6702 * is being acked is included in a measurement. 6703 * Otherwise it could be an idle period that 6704 * includes Delayed-ack time. 6705 */ 6706 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 6707 (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); 6708 } 6709 if ((rsm->r_flags & RACK_TLP) && 6710 (!IN_RECOVERY(tp->t_flags))) { 6711 /* Segment was a TLP and our retrans matched */ 6712 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 6713 rack->r_ctl.rc_rsm_start = tp->snd_max; 6714 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 6715 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 6716 rack_cong_signal(tp, NULL, CC_NDUPACK); 6717 /* 6718 * When we enter recovery we need to assure 6719 * we send one packet. 6720 */ 6721 if (rack->rack_no_prr == 0) { 6722 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6723 rack_log_to_prr(rack, 7, 0); 6724 } 6725 } 6726 } 6727 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6728 /* New more recent rack_tmit_time */ 6729 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6730 rack->rc_rack_rtt = t; 6731 } 6732 return (1); 6733 } 6734 /* 6735 * We clear the soft/rxtshift since we got an ack. 6736 * There is no assurance we will call the commit() function 6737 * so we need to clear these to avoid incorrect handling. 6738 */ 6739 tp->t_rxtshift = 0; 6740 tp->t_softerror = 0; 6741 if ((to->to_flags & TOF_TS) && 6742 (ack_type == CUM_ACKED) && 6743 (to->to_tsecr) && 6744 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 6745 /* 6746 * Now which timestamp does it match? In this block the ACK 6747 * must be coming from a previous transmission. 
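* The loop below compares the echoed timestamp (to->to_tsecr) against the
* recorded send times in r_tim_lastsent[]; the entry that matches tells us
* which of the (re)transmissions this ACK is really for, so the RTT sample is
* taken against the correct send time.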
6748 */ 6749 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6750 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 6751 t = cts - rsm->r_tim_lastsent[i]; 6752 if ((int)t <= 0) 6753 t = 1; 6754 if ((i + 1) < rsm->r_rtr_cnt) { 6755 /* Likely */ 6756 rack_earlier_retran(tp, rsm, t, cts); 6757 } 6758 if (!tp->t_rttlow || tp->t_rttlow > t) 6759 tp->t_rttlow = t; 6760 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6761 rack->r_ctl.rc_rack_min_rtt = t; 6762 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6763 rack->r_ctl.rc_rack_min_rtt = 1; 6764 } 6765 } 6766 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 6767 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6768 /* New more recent rack_tmit_time */ 6769 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6770 rack->rc_rack_rtt = t; 6771 } 6772 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, 6773 rsm->r_rtr_cnt); 6774 return (1); 6775 } 6776 } 6777 goto ts_not_found; 6778 } else { 6779 /* 6780 * OK, it's a SACK block that we retransmitted, or a Windows 6781 * machine without timestamps. We can tell nothing from the 6782 * timestamp since it is either not there or it reflects the time the peer 6783 * last received a segment that moved its cum-ack point forward. 6784 */ 6785 ts_not_found: 6786 i = rsm->r_rtr_cnt - 1; 6787 t = cts - rsm->r_tim_lastsent[i]; 6788 if ((int)t <= 0) 6789 t = 1; 6790 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6791 /* 6792 * We retransmitted and the ack came back in less 6793 * than the smallest rtt we have observed. We most 6794 * likely did an improper retransmit as outlined in 6795 * 4.2 Step 3 point 2 in the rack-draft. 6796 */ 6797 i = rsm->r_rtr_cnt - 2; 6798 t = cts - rsm->r_tim_lastsent[i]; 6799 rack_earlier_retran(tp, rsm, t, cts); 6800 } else if (rack->r_ctl.rc_rack_min_rtt) { 6801 /* 6802 * We retransmitted it and the retransmit did the 6803 * job. 6804 */ 6805 if (!rack->r_ctl.rc_rack_min_rtt || 6806 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6807 rack->r_ctl.rc_rack_min_rtt = t; 6808 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6809 rack->r_ctl.rc_rack_min_rtt = 1; 6810 } 6811 } 6812 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 6813 /* New more recent rack_tmit_time */ 6814 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 6815 rack->rc_rack_rtt = t; 6816 } 6817 return (1); 6818 } 6819 } 6820 return (0); 6821 } 6822 6823 /* 6824 * Mark the SACK_PASSED flag on all entries sent prior to rsm. 6825 */ 6826 static void 6827 rack_log_sack_passed(struct tcpcb *tp, 6828 struct tcp_rack *rack, struct rack_sendmap *rsm) 6829 { 6830 struct rack_sendmap *nrsm; 6831 6832 nrsm = rsm; 6833 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 6834 rack_head, r_tnext) { 6835 if (nrsm == rsm) { 6836 /* Skip the original segment; it is acked */ 6837 continue; 6838 } 6839 if (nrsm->r_flags & RACK_ACKED) { 6840 /* 6841 * Skip ack'd segments, though we 6842 * should not see these, since tmap 6843 * should not have ack'd segments. 6844 */ 6845 continue; 6846 } 6847 if (nrsm->r_flags & RACK_SACK_PASSED) { 6848 /* 6849 * We found one that is already marked 6850 * passed, we have been here before and 6851 * so all others below this are marked.
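* (RACK_SACK_PASSED records that data sent after this segment has been
* SACKed while this segment has not; the loss-detection logic later treats
* such a segment as lost once the reordering window has expired.)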
6852 */ 6853 break; 6854 } 6855 nrsm->r_flags |= RACK_SACK_PASSED; 6856 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 6857 } 6858 } 6859 6860 static void 6861 rack_need_set_test(struct tcpcb *tp, 6862 struct tcp_rack *rack, 6863 struct rack_sendmap *rsm, 6864 tcp_seq th_ack, 6865 int line, 6866 int use_which) 6867 { 6868 6869 if ((tp->t_flags & TF_GPUTINPROG) && 6870 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6871 /* 6872 * We were app limited, and this ack 6873 * butts up or goes beyond the point where we want 6874 * to start our next measurement. We need 6875 * to record the new gput_ts as here and 6876 * possibly update the start sequence. 6877 */ 6878 uint32_t seq, ts; 6879 6880 if (rsm->r_rtr_cnt > 1) { 6881 /* 6882 * This is a retransmit, can we 6883 * really make any assessment at this 6884 * point? We are not really sure of 6885 * the timestamp, is it this or the 6886 * previous transmission? 6887 * 6888 * Lets wait for something better that 6889 * is not retransmitted. 6890 */ 6891 return; 6892 } 6893 seq = tp->gput_seq; 6894 ts = tp->gput_ts; 6895 rack->app_limited_needs_set = 0; 6896 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 6897 /* Do we start at a new end? */ 6898 if ((use_which == RACK_USE_BEG) && 6899 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 6900 /* 6901 * When we get an ACK that just eats 6902 * up some of the rsm, we set RACK_USE_BEG 6903 * since whats at r_start (i.e. th_ack) 6904 * is left unacked and thats where the 6905 * measurement not starts. 6906 */ 6907 tp->gput_seq = rsm->r_start; 6908 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6909 } 6910 if ((use_which == RACK_USE_END) && 6911 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6912 /* 6913 * We use the end when the cumack 6914 * is moving forward and completely 6915 * deleting the rsm passed so basically 6916 * r_end holds th_ack. 6917 * 6918 * For SACK's we also want to use the end 6919 * since this piece just got sacked and 6920 * we want to target anything after that 6921 * in our measurement. 6922 */ 6923 tp->gput_seq = rsm->r_end; 6924 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6925 } 6926 if (use_which == RACK_USE_END_OR_THACK) { 6927 /* 6928 * special case for ack moving forward, 6929 * not a sack, we need to move all the 6930 * way up to where this ack cum-ack moves 6931 * to. 6932 */ 6933 if (SEQ_GT(th_ack, rsm->r_end)) 6934 tp->gput_seq = th_ack; 6935 else 6936 tp->gput_seq = rsm->r_end; 6937 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6938 } 6939 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 6940 /* 6941 * We moved beyond this guy's range, re-calculate 6942 * the new end point. 6943 */ 6944 if (rack->rc_gp_filled == 0) { 6945 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 6946 } else { 6947 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 6948 } 6949 } 6950 /* 6951 * We are moving the goal post, we may be able to clear the 6952 * measure_saw_probe_rtt flag. 6953 */ 6954 if ((rack->in_probe_rtt == 0) && 6955 (rack->measure_saw_probe_rtt) && 6956 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 6957 rack->measure_saw_probe_rtt = 0; 6958 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 6959 seq, tp->gput_seq, 0, 5, line, NULL); 6960 if (rack->rc_gp_filled && 6961 ((tp->gput_ack - tp->gput_seq) < 6962 max(rc_init_window(rack), (MIN_GP_WIN * 6963 ctf_fixed_maxseg(tp))))) { 6964 /* 6965 * There is no sense of continuing this measurement 6966 * because its too small to gain us anything we 6967 * trust. 
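* (The cut-off here is the larger of the initial window and MIN_GP_WIN
* segments; a goodput sample smaller than that is likely dominated by noise.)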
Skip it and that way we can start a new 6968 * measurement quicker. 6969 */ 6970 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 6971 0, 0, 0, 6, __LINE__, NULL); 6972 tp->t_flags &= ~TF_GPUTINPROG; 6973 } 6974 } 6975 } 6976 6977 static uint32_t 6978 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 6979 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 6980 { 6981 uint32_t start, end, changed = 0; 6982 struct rack_sendmap stack_map; 6983 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 6984 int32_t used_ref = 1; 6985 int moved = 0; 6986 6987 start = sack->start; 6988 end = sack->end; 6989 rsm = *prsm; 6990 memset(&fe, 0, sizeof(fe)); 6991 do_rest_ofb: 6992 if ((rsm == NULL) || 6993 (SEQ_LT(end, rsm->r_start)) || 6994 (SEQ_GEQ(start, rsm->r_end)) || 6995 (SEQ_LT(start, rsm->r_start))) { 6996 /* 6997 * We are not in the right spot, 6998 * find the correct spot in the tree. 6999 */ 7000 used_ref = 0; 7001 fe.r_start = start; 7002 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7003 moved++; 7004 } 7005 if (rsm == NULL) { 7006 /* TSNH */ 7007 goto out; 7008 } 7009 /* Ok we have an ACK for some piece of this rsm */ 7010 if (rsm->r_start != start) { 7011 if ((rsm->r_flags & RACK_ACKED) == 0) { 7012 /** 7013 * Need to split this in two pieces the before and after, 7014 * the before remains in the map, the after must be 7015 * added. In other words we have: 7016 * rsm |--------------| 7017 * sackblk |-------> 7018 * rsm will become 7019 * rsm |---| 7020 * and nrsm will be the sacked piece 7021 * nrsm |----------| 7022 * 7023 * But before we start down that path lets 7024 * see if the sack spans over on top of 7025 * the next guy and it is already sacked. 7026 */ 7027 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7028 if (next && (next->r_flags & RACK_ACKED) && 7029 SEQ_GEQ(end, next->r_start)) { 7030 /** 7031 * So the next one is already acked, and 7032 * we can thus by hookery use our stack_map 7033 * to reflect the piece being sacked and 7034 * then adjust the two tree entries moving 7035 * the start and ends around. So we start like: 7036 * rsm |------------| (not-acked) 7037 * next |-----------| (acked) 7038 * sackblk |--------> 7039 * We want to end like so: 7040 * rsm |------| (not-acked) 7041 * next |-----------------| (acked) 7042 * nrsm |-----| 7043 * Where nrsm is a temporary stack piece we 7044 * use to update all the gizmos. 7045 */ 7046 /* Copy up our fudge block */ 7047 nrsm = &stack_map; 7048 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7049 /* Now adjust our tree blocks */ 7050 rsm->r_end = start; 7051 next->r_start = start; 7052 /* Clear out the dup ack count of the remainder */ 7053 rsm->r_dupack = 0; 7054 rsm->r_just_ret = 0; 7055 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7056 /* Now lets make sure our fudge block is right */ 7057 nrsm->r_start = start; 7058 /* Now lets update all the stats and such */ 7059 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7060 if (rack->app_limited_needs_set) 7061 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7062 changed += (nrsm->r_end - nrsm->r_start); 7063 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7064 if (nrsm->r_flags & RACK_SACK_PASSED) { 7065 counter_u64_add(rack_reorder_seen, 1); 7066 rack->r_ctl.rc_reorder_ts = cts; 7067 } 7068 /* 7069 * Now we want to go up from rsm (the 7070 * one left un-acked) to the next one 7071 * in the tmap. 
We do this so when 7072 * we walk backwards we include marking 7073 * sack-passed on rsm (The one passed in 7074 * is skipped since it is generally called 7075 * on something sacked before removing it 7076 * from the tmap). 7077 */ 7078 if (rsm->r_in_tmap) { 7079 nrsm = TAILQ_NEXT(rsm, r_tnext); 7080 /* 7081 * Now that we have the next 7082 * one walk backwards from there. 7083 */ 7084 if (nrsm && nrsm->r_in_tmap) 7085 rack_log_sack_passed(tp, rack, nrsm); 7086 } 7087 /* Now are we done? */ 7088 if (SEQ_LT(end, next->r_end) || 7089 (end == next->r_end)) { 7090 /* Done with block */ 7091 goto out; 7092 } 7093 counter_u64_add(rack_sack_used_next_merge, 1); 7094 /* Postion for the next block */ 7095 start = next->r_end; 7096 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 7097 if (rsm == NULL) 7098 goto out; 7099 } else { 7100 /** 7101 * We can't use any hookery here, so we 7102 * need to split the map. We enter like 7103 * so: 7104 * rsm |--------| 7105 * sackblk |-----> 7106 * We will add the new block nrsm and 7107 * that will be the new portion, and then 7108 * fall through after reseting rsm. So we 7109 * split and look like this: 7110 * rsm |----| 7111 * sackblk |-----> 7112 * nrsm |---| 7113 * We then fall through reseting 7114 * rsm to nrsm, so the next block 7115 * picks it up. 7116 */ 7117 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7118 if (nrsm == NULL) { 7119 /* 7120 * failed XXXrrs what can we do but loose the sack 7121 * info? 7122 */ 7123 goto out; 7124 } 7125 counter_u64_add(rack_sack_splits, 1); 7126 rack_clone_rsm(rack, nrsm, rsm, start); 7127 rsm->r_just_ret = 0; 7128 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7129 #ifdef INVARIANTS 7130 if (insret != NULL) { 7131 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7132 nrsm, insret, rack, rsm); 7133 } 7134 #endif 7135 if (rsm->r_in_tmap) { 7136 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7137 nrsm->r_in_tmap = 1; 7138 } 7139 rsm->r_flags &= (~RACK_HAS_FIN); 7140 /* Position us to point to the new nrsm that starts the sack blk */ 7141 rsm = nrsm; 7142 } 7143 } else { 7144 /* Already sacked this piece */ 7145 counter_u64_add(rack_sack_skipped_acked, 1); 7146 moved++; 7147 if (end == rsm->r_end) { 7148 /* Done with block */ 7149 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7150 goto out; 7151 } else if (SEQ_LT(end, rsm->r_end)) { 7152 /* A partial sack to a already sacked block */ 7153 moved++; 7154 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7155 goto out; 7156 } else { 7157 /* 7158 * The end goes beyond this guy 7159 * repostion the start to the 7160 * next block. 7161 */ 7162 start = rsm->r_end; 7163 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7164 if (rsm == NULL) 7165 goto out; 7166 } 7167 } 7168 } 7169 if (SEQ_GEQ(end, rsm->r_end)) { 7170 /** 7171 * The end of this block is either beyond this guy or right 7172 * at this guy. I.e.: 7173 * rsm --- |-----| 7174 * end |-----| 7175 * <or> 7176 * end |---------| 7177 */ 7178 if ((rsm->r_flags & RACK_ACKED) == 0) { 7179 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7180 changed += (rsm->r_end - rsm->r_start); 7181 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7182 if (rsm->r_in_tmap) /* should be true */ 7183 rack_log_sack_passed(tp, rack, rsm); 7184 /* Is Reordering occuring? 
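* (If this rsm had been passed over by SACKs of later data yet is now being
* SACKed or acked itself, the data was delivered out of order rather than
* lost, so we record a reordering event and the time we saw it.)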
*/ 7185 if (rsm->r_flags & RACK_SACK_PASSED) { 7186 rsm->r_flags &= ~RACK_SACK_PASSED; 7187 counter_u64_add(rack_reorder_seen, 1); 7188 rack->r_ctl.rc_reorder_ts = cts; 7189 } 7190 if (rack->app_limited_needs_set) 7191 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7192 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7193 rsm->r_flags |= RACK_ACKED; 7194 rsm->r_flags &= ~RACK_TLP; 7195 if (rsm->r_in_tmap) { 7196 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7197 rsm->r_in_tmap = 0; 7198 } 7199 } else { 7200 counter_u64_add(rack_sack_skipped_acked, 1); 7201 moved++; 7202 } 7203 if (end == rsm->r_end) { 7204 /* This block only - done, setup for next */ 7205 goto out; 7206 } 7207 /* 7208 * There is more not coverend by this rsm move on 7209 * to the next block in the RB tree. 7210 */ 7211 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7212 start = rsm->r_end; 7213 rsm = nrsm; 7214 if (rsm == NULL) 7215 goto out; 7216 goto do_rest_ofb; 7217 } 7218 /** 7219 * The end of this sack block is smaller than 7220 * our rsm i.e.: 7221 * rsm --- |-----| 7222 * end |--| 7223 */ 7224 if ((rsm->r_flags & RACK_ACKED) == 0) { 7225 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7226 if (prev && (prev->r_flags & RACK_ACKED)) { 7227 /** 7228 * Goal, we want the right remainder of rsm to shrink 7229 * in place and span from (rsm->r_start = end) to rsm->r_end. 7230 * We want to expand prev to go all the way 7231 * to prev->r_end <- end. 7232 * so in the tree we have before: 7233 * prev |--------| (acked) 7234 * rsm |-------| (non-acked) 7235 * sackblk |-| 7236 * We churn it so we end up with 7237 * prev |----------| (acked) 7238 * rsm |-----| (non-acked) 7239 * nrsm |-| (temporary) 7240 */ 7241 nrsm = &stack_map; 7242 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7243 prev->r_end = end; 7244 rsm->r_start = end; 7245 /* Now adjust nrsm (stack copy) to be 7246 * the one that is the small 7247 * piece that was "sacked". 7248 */ 7249 nrsm->r_end = end; 7250 rsm->r_dupack = 0; 7251 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7252 /* 7253 * Now nrsm is our new little piece 7254 * that is acked (which was merged 7255 * to prev). Update the rtt and changed 7256 * based on that. Also check for reordering. 7257 */ 7258 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7259 if (rack->app_limited_needs_set) 7260 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7261 changed += (nrsm->r_end - nrsm->r_start); 7262 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7263 if (nrsm->r_flags & RACK_SACK_PASSED) { 7264 counter_u64_add(rack_reorder_seen, 1); 7265 rack->r_ctl.rc_reorder_ts = cts; 7266 } 7267 rsm = prev; 7268 counter_u64_add(rack_sack_used_prev_merge, 1); 7269 } else { 7270 /** 7271 * This is the case where our previous 7272 * block is not acked either, so we must 7273 * split the block in two. 7274 */ 7275 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7276 if (nrsm == NULL) { 7277 /* failed rrs what can we do but loose the sack info? */ 7278 goto out; 7279 } 7280 /** 7281 * In this case nrsm becomes 7282 * nrsm->r_start = end; 7283 * nrsm->r_end = rsm->r_end; 7284 * which is un-acked. 7285 * <and> 7286 * rsm->r_end = nrsm->r_start; 7287 * i.e. the remaining un-acked 7288 * piece is left on the left 7289 * hand side. 
7290 * 7291 * So we start like this 7292 * rsm |----------| (not acked) 7293 * sackblk |---| 7294 * build it so we have 7295 * rsm |---| (acked) 7296 * nrsm |------| (not acked) 7297 */ 7298 counter_u64_add(rack_sack_splits, 1); 7299 rack_clone_rsm(rack, nrsm, rsm, end); 7300 rsm->r_flags &= (~RACK_HAS_FIN); 7301 rsm->r_just_ret = 0; 7302 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7303 #ifdef INVARIANTS 7304 if (insret != NULL) { 7305 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7306 nrsm, insret, rack, rsm); 7307 } 7308 #endif 7309 if (rsm->r_in_tmap) { 7310 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7311 nrsm->r_in_tmap = 1; 7312 } 7313 nrsm->r_dupack = 0; 7314 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7315 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7316 changed += (rsm->r_end - rsm->r_start); 7317 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7318 if (rsm->r_in_tmap) /* should be true */ 7319 rack_log_sack_passed(tp, rack, rsm); 7320 /* Is Reordering occuring? */ 7321 if (rsm->r_flags & RACK_SACK_PASSED) { 7322 rsm->r_flags &= ~RACK_SACK_PASSED; 7323 counter_u64_add(rack_reorder_seen, 1); 7324 rack->r_ctl.rc_reorder_ts = cts; 7325 } 7326 if (rack->app_limited_needs_set) 7327 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7328 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7329 rsm->r_flags |= RACK_ACKED; 7330 rsm->r_flags &= ~RACK_TLP; 7331 if (rsm->r_in_tmap) { 7332 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7333 rsm->r_in_tmap = 0; 7334 } 7335 } 7336 } else if (start != end){ 7337 /* 7338 * The block was already acked. 7339 */ 7340 counter_u64_add(rack_sack_skipped_acked, 1); 7341 moved++; 7342 } 7343 out: 7344 if (rsm && (rsm->r_flags & RACK_ACKED)) { 7345 /* 7346 * Now can we merge where we worked 7347 * with either the previous or 7348 * next block? 7349 */ 7350 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7351 while (next) { 7352 if (next->r_flags & RACK_ACKED) { 7353 /* yep this and next can be merged */ 7354 rsm = rack_merge_rsm(rack, rsm, next); 7355 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7356 } else 7357 break; 7358 } 7359 /* Now what about the previous? */ 7360 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7361 while (prev) { 7362 if (prev->r_flags & RACK_ACKED) { 7363 /* yep the previous and this can be merged */ 7364 rsm = rack_merge_rsm(rack, prev, rsm); 7365 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7366 } else 7367 break; 7368 } 7369 } 7370 if (used_ref == 0) { 7371 counter_u64_add(rack_sack_proc_all, 1); 7372 } else { 7373 counter_u64_add(rack_sack_proc_short, 1); 7374 } 7375 /* Save off the next one for quick reference. */ 7376 if (rsm) 7377 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7378 else 7379 nrsm = NULL; 7380 *prsm = rack->r_ctl.rc_sacklast = nrsm; 7381 /* Pass back the moved. 
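* moved counts how many times we had to reposition over blocks that were
* already SACKed rather than marking new data; the caller folds it into the
* sack_moved_extra / sack_count bookkeeping used by the SACK-attack
* heuristics.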
*/ 7382 *moved_two = moved; 7383 return (changed); 7384 } 7385 7386 static void inline 7387 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 7388 { 7389 struct rack_sendmap *tmap; 7390 7391 tmap = NULL; 7392 while (rsm && (rsm->r_flags & RACK_ACKED)) { 7393 /* Its no longer sacked, mark it so */ 7394 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7395 #ifdef INVARIANTS 7396 if (rsm->r_in_tmap) { 7397 panic("rack:%p rsm:%p flags:0x%x in tmap?", 7398 rack, rsm, rsm->r_flags); 7399 } 7400 #endif 7401 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 7402 /* Rebuild it into our tmap */ 7403 if (tmap == NULL) { 7404 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7405 tmap = rsm; 7406 } else { 7407 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7408 tmap = rsm; 7409 } 7410 tmap->r_in_tmap = 1; 7411 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7412 } 7413 /* 7414 * Now lets possibly clear the sack filter so we start 7415 * recognizing sacks that cover this area. 7416 */ 7417 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 7418 7419 } 7420 7421 static void 7422 rack_do_decay(struct tcp_rack *rack) 7423 { 7424 struct timeval res; 7425 7426 #define timersub(tvp, uvp, vvp) \ 7427 do { \ 7428 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 7429 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 7430 if ((vvp)->tv_usec < 0) { \ 7431 (vvp)->tv_sec--; \ 7432 (vvp)->tv_usec += 1000000; \ 7433 } \ 7434 } while (0) 7435 7436 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 7437 #undef timersub 7438 7439 rack->r_ctl.input_pkt++; 7440 if ((rack->rc_in_persist) || 7441 (res.tv_sec >= 1) || 7442 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 7443 /* 7444 * Check for decay of non-SAD, 7445 * we want all SAD detection metrics to 7446 * decay 1/4 per second (or more) passed. 7447 */ 7448 uint32_t pkt_delta; 7449 7450 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 7451 /* Update our saved tracking values */ 7452 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 7453 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 7454 /* Now do we escape without decay? */ 7455 #ifdef NETFLIX_EXP_DETECTION 7456 if (rack->rc_in_persist || 7457 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 7458 (pkt_delta < tcp_sad_low_pps)){ 7459 /* 7460 * We don't decay idle connections 7461 * or ones that have a low input pps. 
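* Otherwise the ack/sack/move counters are each scaled down by
* ctf_decay_count() using tcp_sad_decay_val, which is how the roughly
* 1/4-per-second decay described above is applied.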
7462 */ 7463 return; 7464 } 7465 /* Decay the counters */ 7466 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 7467 tcp_sad_decay_val); 7468 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 7469 tcp_sad_decay_val); 7470 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 7471 tcp_sad_decay_val); 7472 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 7473 tcp_sad_decay_val); 7474 #endif 7475 } 7476 } 7477 7478 static void 7479 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 7480 { 7481 uint32_t changed, entered_recovery = 0; 7482 struct tcp_rack *rack; 7483 struct rack_sendmap *rsm, *rm; 7484 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7485 register uint32_t th_ack; 7486 int32_t i, j, k, num_sack_blks = 0; 7487 uint32_t cts, acked, ack_point, sack_changed = 0; 7488 int loop_start = 0, moved_two = 0; 7489 uint32_t tsused; 7490 7491 7492 INP_WLOCK_ASSERT(tp->t_inpcb); 7493 if (th->th_flags & TH_RST) { 7494 /* We don't log resets */ 7495 return; 7496 } 7497 rack = (struct tcp_rack *)tp->t_fb_ptr; 7498 cts = tcp_ts_getticks(); 7499 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7500 changed = 0; 7501 th_ack = th->th_ack; 7502 if (rack->sack_attack_disable == 0) 7503 rack_do_decay(rack); 7504 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 7505 /* 7506 * You only get credit for 7507 * MSS and greater (and you get extra 7508 * credit for larger cum-ack moves). 7509 */ 7510 int ac; 7511 7512 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 7513 rack->r_ctl.ack_count += ac; 7514 counter_u64_add(rack_ack_total, ac); 7515 } 7516 if (rack->r_ctl.ack_count > 0xfff00000) { 7517 /* 7518 * reduce the number to keep us under 7519 * a uint32_t. 7520 */ 7521 rack->r_ctl.ack_count /= 2; 7522 rack->r_ctl.sack_count /= 2; 7523 } 7524 if (SEQ_GT(th_ack, tp->snd_una)) { 7525 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 7526 tp->t_acktime = ticks; 7527 } 7528 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 7529 changed = th_ack - rsm->r_start; 7530 if (changed) { 7531 /* 7532 * The ACK point is advancing to th_ack, we must drop off 7533 * the packets in the rack log and calculate any eligble 7534 * RTT's. 7535 */ 7536 rack->r_wanted_output = 1; 7537 more: 7538 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7539 if (rsm == NULL) { 7540 if ((th_ack - 1) == tp->iss) { 7541 /* 7542 * For the SYN incoming case we will not 7543 * have called tcp_output for the sending of 7544 * the SYN, so there will be no map. All 7545 * other cases should probably be a panic. 7546 */ 7547 goto proc_sack; 7548 } 7549 if (tp->t_flags & TF_SENTFIN) { 7550 /* if we send a FIN we will not hav a map */ 7551 goto proc_sack; 7552 } 7553 #ifdef INVARIANTS 7554 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 7555 tp, 7556 th, tp->t_state, rack, 7557 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 7558 #endif 7559 goto proc_sack; 7560 } 7561 if (SEQ_LT(th_ack, rsm->r_start)) { 7562 /* Huh map is missing this */ 7563 #ifdef INVARIANTS 7564 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 7565 rsm->r_start, 7566 th_ack, tp->t_state, rack->r_state); 7567 #endif 7568 goto proc_sack; 7569 } 7570 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 7571 /* Now do we consume the whole thing? */ 7572 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7573 /* Its all consumed. 
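* The cumulative ACK covers this entire rsm: below we settle its accounting
* (retransmitted-hole bytes, previously SACKed bytes), remove it from the RB
* tree and the tmap, free it back to the zone, and if th_ack reaches past
* r_end we loop back to 'more' for the next map entry.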
*/ 7574 uint32_t left; 7575 uint8_t newly_acked; 7576 7577 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7578 rsm->r_rtr_bytes = 0; 7579 /* Record the time of highest cumack sent */ 7580 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7581 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7582 #ifdef INVARIANTS 7583 if (rm != rsm) { 7584 panic("removing head in rack:%p rsm:%p rm:%p", 7585 rack, rsm, rm); 7586 } 7587 #endif 7588 if (rsm->r_in_tmap) { 7589 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7590 rsm->r_in_tmap = 0; 7591 } 7592 newly_acked = 1; 7593 if (rsm->r_flags & RACK_ACKED) { 7594 /* 7595 * It was acked on the scoreboard -- remove 7596 * it from total 7597 */ 7598 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7599 newly_acked = 0; 7600 } else if (rsm->r_flags & RACK_SACK_PASSED) { 7601 /* 7602 * There are segments ACKED on the 7603 * scoreboard further up. We are seeing 7604 * reordering. 7605 */ 7606 rsm->r_flags &= ~RACK_SACK_PASSED; 7607 counter_u64_add(rack_reorder_seen, 1); 7608 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7609 rsm->r_flags |= RACK_ACKED; 7610 rack->r_ctl.rc_reorder_ts = cts; 7611 } 7612 left = th_ack - rsm->r_end; 7613 if (rack->app_limited_needs_set && newly_acked) 7614 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 7615 /* Free back to zone */ 7616 rack_free(rack, rsm); 7617 if (left) { 7618 goto more; 7619 } 7620 goto proc_sack; 7621 } 7622 if (rsm->r_flags & RACK_ACKED) { 7623 /* 7624 * It was acked on the scoreboard -- remove it from 7625 * total for the part being cum-acked. 7626 */ 7627 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7628 } 7629 /* 7630 * Clear the dup ack count for 7631 * the piece that remains. 7632 */ 7633 rsm->r_dupack = 0; 7634 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7635 if (rsm->r_rtr_bytes) { 7636 /* 7637 * It was retransmitted adjust the 7638 * sack holes for what was acked. 7639 */ 7640 int ack_am; 7641 7642 ack_am = (th_ack - rsm->r_start); 7643 if (ack_am >= rsm->r_rtr_bytes) { 7644 rack->r_ctl.rc_holes_rxt -= ack_am; 7645 rsm->r_rtr_bytes -= ack_am; 7646 } 7647 } 7648 /* 7649 * Update where the piece starts and record 7650 * the time of send of highest cumack sent. 7651 */ 7652 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7653 rsm->r_start = th_ack; 7654 if (rack->app_limited_needs_set) 7655 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 7656 7657 } 7658 proc_sack: 7659 /* Check for reneging */ 7660 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7661 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 7662 /* 7663 * The peer has moved snd_una up to 7664 * the edge of this send, i.e. one 7665 * that it had previously acked. The only 7666 * way that can be true if the peer threw 7667 * away data (space issues) that it had 7668 * previously sacked (else it would have 7669 * given us snd_una up to (rsm->r_end). 7670 * We need to undo the acked markings here. 7671 * 7672 * Note we have to look to make sure th_ack is 7673 * our rsm->r_start in case we get an old ack 7674 * where th_ack is behind snd_una. 
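* rack_peer_reneges() below clears RACK_ACKED (and the SACK-passed flags) on
* the affected entries, links them back into the tmap so they are treated as
* outstanding again, and clears the sack filter from th_ack forward.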
7675 */ 7676 rack_peer_reneges(rack, rsm, th->th_ack); 7677 } 7678 if ((to->to_flags & TOF_SACK) == 0) { 7679 /* We are done nothing left */ 7680 goto out; 7681 } 7682 /* Sack block processing */ 7683 if (SEQ_GT(th_ack, tp->snd_una)) 7684 ack_point = th_ack; 7685 else 7686 ack_point = tp->snd_una; 7687 for (i = 0; i < to->to_nsacks; i++) { 7688 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7689 &sack, sizeof(sack)); 7690 sack.start = ntohl(sack.start); 7691 sack.end = ntohl(sack.end); 7692 if (SEQ_GT(sack.end, sack.start) && 7693 SEQ_GT(sack.start, ack_point) && 7694 SEQ_LT(sack.start, tp->snd_max) && 7695 SEQ_GT(sack.end, ack_point) && 7696 SEQ_LEQ(sack.end, tp->snd_max)) { 7697 sack_blocks[num_sack_blks] = sack; 7698 num_sack_blks++; 7699 #ifdef NETFLIX_STATS 7700 } else if (SEQ_LEQ(sack.start, th_ack) && 7701 SEQ_LEQ(sack.end, th_ack)) { 7702 /* 7703 * Its a D-SACK block. 7704 */ 7705 tcp_record_dsack(sack.start, sack.end); 7706 #endif 7707 } 7708 7709 } 7710 /* 7711 * Sort the SACK blocks so we can update the rack scoreboard with 7712 * just one pass. 7713 */ 7714 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 7715 num_sack_blks, th->th_ack); 7716 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 7717 if (num_sack_blks == 0) { 7718 /* Nothing to sack (DSACKs?) */ 7719 goto out_with_totals; 7720 } 7721 if (num_sack_blks < 2) { 7722 /* Only one, we don't need to sort */ 7723 goto do_sack_work; 7724 } 7725 /* Sort the sacks */ 7726 for (i = 0; i < num_sack_blks; i++) { 7727 for (j = i + 1; j < num_sack_blks; j++) { 7728 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7729 sack = sack_blocks[i]; 7730 sack_blocks[i] = sack_blocks[j]; 7731 sack_blocks[j] = sack; 7732 } 7733 } 7734 } 7735 /* 7736 * Now are any of the sack block ends the same (yes some 7737 * implementations send these)? 7738 */ 7739 again: 7740 if (num_sack_blks == 0) 7741 goto out_with_totals; 7742 if (num_sack_blks > 1) { 7743 for (i = 0; i < num_sack_blks; i++) { 7744 for (j = i + 1; j < num_sack_blks; j++) { 7745 if (sack_blocks[i].end == sack_blocks[j].end) { 7746 /* 7747 * Ok these two have the same end we 7748 * want the smallest end and then 7749 * throw away the larger and start 7750 * again. 7751 */ 7752 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7753 /* 7754 * The second block covers 7755 * more area use that 7756 */ 7757 sack_blocks[i].start = sack_blocks[j].start; 7758 } 7759 /* 7760 * Now collapse out the dup-sack and 7761 * lower the count 7762 */ 7763 for (k = (j + 1); k < num_sack_blks; k++) { 7764 sack_blocks[j].start = sack_blocks[k].start; 7765 sack_blocks[j].end = sack_blocks[k].end; 7766 j++; 7767 } 7768 num_sack_blks--; 7769 goto again; 7770 } 7771 } 7772 } 7773 } 7774 do_sack_work: 7775 /* 7776 * First lets look to see if 7777 * we have retransmitted and 7778 * can use the transmit next? 7779 */ 7780 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7781 if (rsm && 7782 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 7783 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 7784 /* 7785 * We probably did the FR and the next 7786 * SACK in continues as we would expect. 
7787 */ 7788 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 7789 if (acked) { 7790 rack->r_wanted_output = 1; 7791 changed += acked; 7792 sack_changed += acked; 7793 } 7794 if (num_sack_blks == 1) { 7795 /* 7796 * This is what we would expect from 7797 * a normal implementation to happen 7798 * after we have retransmitted the FR, 7799 * i.e the sack-filter pushes down 7800 * to 1 block and the next to be retransmitted 7801 * is the sequence in the sack block (has more 7802 * are acked). Count this as ACK'd data to boost 7803 * up the chances of recovering any false positives. 7804 */ 7805 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 7806 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 7807 counter_u64_add(rack_express_sack, 1); 7808 if (rack->r_ctl.ack_count > 0xfff00000) { 7809 /* 7810 * reduce the number to keep us under 7811 * a uint32_t. 7812 */ 7813 rack->r_ctl.ack_count /= 2; 7814 rack->r_ctl.sack_count /= 2; 7815 } 7816 goto out_with_totals; 7817 } else { 7818 /* 7819 * Start the loop through the 7820 * rest of blocks, past the first block. 7821 */ 7822 moved_two = 0; 7823 loop_start = 1; 7824 } 7825 } 7826 /* Its a sack of some sort */ 7827 rack->r_ctl.sack_count++; 7828 if (rack->r_ctl.sack_count > 0xfff00000) { 7829 /* 7830 * reduce the number to keep us under 7831 * a uint32_t. 7832 */ 7833 rack->r_ctl.ack_count /= 2; 7834 rack->r_ctl.sack_count /= 2; 7835 } 7836 counter_u64_add(rack_sack_total, 1); 7837 if (rack->sack_attack_disable) { 7838 /* An attacker disablement is in place */ 7839 if (num_sack_blks > 1) { 7840 rack->r_ctl.sack_count += (num_sack_blks - 1); 7841 rack->r_ctl.sack_moved_extra++; 7842 counter_u64_add(rack_move_some, 1); 7843 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 7844 rack->r_ctl.sack_moved_extra /= 2; 7845 rack->r_ctl.sack_noextra_move /= 2; 7846 } 7847 } 7848 goto out; 7849 } 7850 rsm = rack->r_ctl.rc_sacklast; 7851 for (i = loop_start; i < num_sack_blks; i++) { 7852 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 7853 if (acked) { 7854 rack->r_wanted_output = 1; 7855 changed += acked; 7856 sack_changed += acked; 7857 } 7858 if (moved_two) { 7859 /* 7860 * If we did not get a SACK for at least a MSS and 7861 * had to move at all, or if we moved more than our 7862 * threshold, it counts against the "extra" move. 7863 */ 7864 rack->r_ctl.sack_moved_extra += moved_two; 7865 counter_u64_add(rack_move_some, 1); 7866 } else { 7867 /* 7868 * else we did not have to move 7869 * any more than we would expect. 7870 */ 7871 rack->r_ctl.sack_noextra_move++; 7872 counter_u64_add(rack_move_none, 1); 7873 } 7874 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 7875 /* 7876 * If the SACK was not a full MSS then 7877 * we add to sack_count the number of 7878 * MSS's (or possibly more than 7879 * a MSS if its a TSO send) we had to skip by. 7880 */ 7881 rack->r_ctl.sack_count += moved_two; 7882 counter_u64_add(rack_sack_total, moved_two); 7883 } 7884 /* 7885 * Now we need to setup for the next 7886 * round. First we make sure we won't 7887 * exceed the size of our uint32_t on 7888 * the various counts, and then clear out 7889 * moved_two. 
7890 */ 7891 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 7892 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 7893 rack->r_ctl.sack_moved_extra /= 2; 7894 rack->r_ctl.sack_noextra_move /= 2; 7895 } 7896 if (rack->r_ctl.sack_count > 0xfff00000) { 7897 rack->r_ctl.ack_count /= 2; 7898 rack->r_ctl.sack_count /= 2; 7899 } 7900 moved_two = 0; 7901 } 7902 out_with_totals: 7903 if (num_sack_blks > 1) { 7904 /* 7905 * You get an extra stroke if 7906 * you have more than one sack-blk, this 7907 * could be where we are skipping forward 7908 * and the sack-filter is still working, or 7909 * it could be an attacker constantly 7910 * moving us. 7911 */ 7912 rack->r_ctl.sack_moved_extra++; 7913 counter_u64_add(rack_move_some, 1); 7914 } 7915 out: 7916 #ifdef NETFLIX_EXP_DETECTION 7917 if ((rack->do_detection || tcp_force_detection) && 7918 tcp_sack_to_ack_thresh && 7919 tcp_sack_to_move_thresh && 7920 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 7921 /* 7922 * We have thresholds set to find 7923 * possible attackers and disable sack. 7924 * Check them. 7925 */ 7926 uint64_t ackratio, moveratio, movetotal; 7927 7928 /* Log detecting */ 7929 rack_log_sad(rack, 1); 7930 ackratio = (uint64_t)(rack->r_ctl.sack_count); 7931 ackratio *= (uint64_t)(1000); 7932 if (rack->r_ctl.ack_count) 7933 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 7934 else { 7935 /* We really should not hit here */ 7936 ackratio = 1000; 7937 } 7938 if ((rack->sack_attack_disable == 0) && 7939 (ackratio > rack_highest_sack_thresh_seen)) 7940 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 7941 movetotal = rack->r_ctl.sack_moved_extra; 7942 movetotal += rack->r_ctl.sack_noextra_move; 7943 moveratio = rack->r_ctl.sack_moved_extra; 7944 moveratio *= (uint64_t)1000; 7945 if (movetotal) 7946 moveratio /= movetotal; 7947 else { 7948 /* No moves, thats pretty good */ 7949 moveratio = 0; 7950 } 7951 if ((rack->sack_attack_disable == 0) && 7952 (moveratio > rack_highest_move_thresh_seen)) 7953 rack_highest_move_thresh_seen = (uint32_t)moveratio; 7954 if (rack->sack_attack_disable == 0) { 7955 if ((ackratio > tcp_sack_to_ack_thresh) && 7956 (moveratio > tcp_sack_to_move_thresh)) { 7957 /* Disable sack processing */ 7958 rack->sack_attack_disable = 1; 7959 if (rack->r_rep_attack == 0) { 7960 rack->r_rep_attack = 1; 7961 counter_u64_add(rack_sack_attacks_detected, 1); 7962 } 7963 if (tcp_attack_on_turns_on_logging) { 7964 /* 7965 * Turn on logging, used for debugging 7966 * false positives. 
7967 */ 7968 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 7969 } 7970 /* Clamp the cwnd at flight size */ 7971 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 7972 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 7973 rack_log_sad(rack, 2); 7974 } 7975 } else { 7976 /* We are sack-disabled check for false positives */ 7977 if ((ackratio <= tcp_restoral_thresh) || 7978 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 7979 rack->sack_attack_disable = 0; 7980 rack_log_sad(rack, 3); 7981 /* Restart counting */ 7982 rack->r_ctl.sack_count = 0; 7983 rack->r_ctl.sack_moved_extra = 0; 7984 rack->r_ctl.sack_noextra_move = 1; 7985 rack->r_ctl.ack_count = max(1, 7986 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 7987 7988 if (rack->r_rep_reverse == 0) { 7989 rack->r_rep_reverse = 1; 7990 counter_u64_add(rack_sack_attacks_reversed, 1); 7991 } 7992 /* Restore the cwnd */ 7993 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 7994 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 7995 } 7996 } 7997 } 7998 #endif 7999 if (changed) { 8000 /* Something changed cancel the rack timer */ 8001 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8002 } 8003 tsused = tcp_ts_getticks(); 8004 rsm = tcp_rack_output(tp, rack, tsused); 8005 if ((!IN_RECOVERY(tp->t_flags)) && 8006 rsm) { 8007 /* Enter recovery */ 8008 rack->r_ctl.rc_rsm_start = rsm->r_start; 8009 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 8010 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 8011 entered_recovery = 1; 8012 rack_cong_signal(tp, NULL, CC_NDUPACK); 8013 /* 8014 * When we enter recovery we need to assure we send 8015 * one packet. 8016 */ 8017 if (rack->rack_no_prr == 0) { 8018 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 8019 rack_log_to_prr(rack, 8, 0); 8020 } 8021 rack->r_timer_override = 1; 8022 rack->r_early = 0; 8023 rack->r_ctl.rc_agg_early = 0; 8024 } else if (IN_RECOVERY(tp->t_flags) && 8025 rsm && 8026 (rack->r_rr_config == 3)) { 8027 /* 8028 * Assure we can output and we get no 8029 * remembered pace time except the retransmit. 
8030 */ 8031 rack->r_timer_override = 1; 8032 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8033 rack->r_ctl.rc_resend = rsm; 8034 } 8035 if (IN_RECOVERY(tp->t_flags) && 8036 (rack->rack_no_prr == 0) && 8037 (entered_recovery == 0)) { 8038 /* Deal with PRR here (in recovery only) */ 8039 uint32_t pipe, snd_una; 8040 8041 rack->r_ctl.rc_prr_delivered += changed; 8042 /* Compute prr_sndcnt */ 8043 if (SEQ_GT(tp->snd_una, th_ack)) { 8044 snd_una = tp->snd_una; 8045 } else { 8046 snd_una = th_ack; 8047 } 8048 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 8049 if (pipe > tp->snd_ssthresh) { 8050 long sndcnt; 8051 8052 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 8053 if (rack->r_ctl.rc_prr_recovery_fs > 0) 8054 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 8055 else { 8056 rack->r_ctl.rc_prr_sndcnt = 0; 8057 rack_log_to_prr(rack, 9, 0); 8058 sndcnt = 0; 8059 } 8060 sndcnt++; 8061 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 8062 sndcnt -= rack->r_ctl.rc_prr_out; 8063 else 8064 sndcnt = 0; 8065 rack->r_ctl.rc_prr_sndcnt = sndcnt; 8066 rack_log_to_prr(rack, 10, 0); 8067 } else { 8068 uint32_t limit; 8069 8070 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 8071 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 8072 else 8073 limit = 0; 8074 if (changed > limit) 8075 limit = changed; 8076 limit += ctf_fixed_maxseg(tp); 8077 if (tp->snd_ssthresh > pipe) { 8078 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 8079 rack_log_to_prr(rack, 11, 0); 8080 } else { 8081 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 8082 rack_log_to_prr(rack, 12, 0); 8083 } 8084 } 8085 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 8086 ((rack->rc_inp->inp_in_hpts == 0) && 8087 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 8088 /* 8089 * If you are pacing output you don't want 8090 * to override. 8091 */ 8092 rack->r_early = 0; 8093 rack->r_ctl.rc_agg_early = 0; 8094 rack->r_timer_override = 1; 8095 } 8096 } 8097 } 8098 8099 static void 8100 rack_strike_dupack(struct tcp_rack *rack) 8101 { 8102 struct rack_sendmap *rsm; 8103 8104 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8105 if (rsm && (rsm->r_dupack < 0xff)) { 8106 rsm->r_dupack++; 8107 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 8108 rack->r_wanted_output = 1; 8109 rack->r_timer_override = 1; 8110 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 8111 } else { 8112 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 8113 } 8114 } 8115 } 8116 8117 static void 8118 rack_check_bottom_drag(struct tcpcb *tp, 8119 struct tcp_rack *rack, 8120 struct socket *so, int32_t acked) 8121 { 8122 uint32_t segsiz, minseg; 8123 8124 segsiz = ctf_fixed_maxseg(tp); 8125 minseg = segsiz; 8126 8127 if (tp->snd_max == tp->snd_una) { 8128 /* 8129 * We are doing dynamic pacing and we are way 8130 * under. Basically everything got acked while 8131 * we were still waiting on the pacer to expire. 8132 * 8133 * This means we need to boost the b/w in 8134 * addition to any earlier boosting of 8135 * the multipler. 8136 */ 8137 rack->rc_dragged_bottom = 1; 8138 rack_validate_multipliers_at_or_above100(rack); 8139 /* 8140 * Lets use the segment bytes acked plus 8141 * the lowest RTT seen as the basis to 8142 * form a b/w estimate. This will be off 8143 * due to the fact that the true estimate 8144 * should be around 1/2 the time of the RTT 8145 * but we can settle for that. 
8146 */ 8147 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 8148 acked) { 8149 uint64_t bw, calc_bw, rtt; 8150 8151 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8152 bw = acked; 8153 calc_bw = bw * 1000000; 8154 calc_bw /= rtt; 8155 if (rack->r_ctl.last_max_bw && 8156 (rack->r_ctl.last_max_bw < calc_bw)) { 8157 /* 8158 * If we have a last calculated max bw 8159 * enforce it. 8160 */ 8161 calc_bw = rack->r_ctl.last_max_bw; 8162 } 8163 /* now plop it in */ 8164 if (rack->rc_gp_filled == 0) { 8165 if (calc_bw > ONE_POINT_TWO_MEG) { 8166 /* 8167 * If we have no measurement 8168 * don't let us set in more than 8169 * 1.2Mbps. If we are still too 8170 * low after pacing with this we 8171 * will hopefully have a max b/w 8172 * available to sanity check things. 8173 */ 8174 calc_bw = ONE_POINT_TWO_MEG; 8175 } 8176 rack->r_ctl.rc_rtt_diff = 0; 8177 rack->r_ctl.gp_bw = calc_bw; 8178 rack->rc_gp_filled = 1; 8179 rack->r_ctl.num_avg = RACK_REQ_AVG; 8180 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8181 } else if (calc_bw > rack->r_ctl.gp_bw) { 8182 rack->r_ctl.rc_rtt_diff = 0; 8183 rack->r_ctl.num_avg = RACK_REQ_AVG; 8184 rack->r_ctl.gp_bw = calc_bw; 8185 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8186 } else 8187 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8188 /* 8189 * For acks over 1mss we do an extra boost to simulate 8190 * where we would get 2 acks (we want 110 for the mul). 8191 */ 8192 if (acked > segsiz) 8193 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8194 } else { 8195 /* 8196 * Huh, this should not be, settle 8197 * for just an old increase. 8198 */ 8199 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8200 } 8201 } else if ((IN_RECOVERY(tp->t_flags) == 0) && 8202 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 8203 minseg)) && 8204 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 8205 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 8206 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 8207 (segsiz * rack_req_segs))) { 8208 /* 8209 * We are doing dynamic GP pacing and 8210 * only about 1 MSS or less of bytes 8211 * is still left outstanding. We are still pacing away, 8212 * and there is data that could be sent. This 8213 * means we are inserting delayed-ack time into 8214 * our measurements because we are pacing too slowly. 8215 */ 8216 rack_validate_multipliers_at_or_above100(rack); 8217 rack->rc_dragged_bottom = 1; 8218 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8219 } 8220 } 8221 8222 /* 8223 * A return value of 1 means we do not need to call rack_process_data(); 8224 * a return value of 0 means rack_process_data() can be called. 8225 * For ret_val, if it is 0 the TCP is locked; if it is non-zero 8226 * it is unlocked and probably unsafe to touch the TCB.
8227 */ 8228 static int 8229 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8230 struct tcpcb *tp, struct tcpopt *to, 8231 uint32_t tiwin, int32_t tlen, 8232 int32_t * ofia, int32_t thflags, int32_t * ret_val) 8233 { 8234 int32_t ourfinisacked = 0; 8235 int32_t nsegs, acked_amount; 8236 int32_t acked; 8237 struct mbuf *mfree; 8238 struct tcp_rack *rack; 8239 int32_t under_pacing = 0; 8240 int32_t recovery = 0; 8241 8242 rack = (struct tcp_rack *)tp->t_fb_ptr; 8243 if (SEQ_GT(th->th_ack, tp->snd_max)) { 8244 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 8245 rack->r_wanted_output = 1; 8246 return (1); 8247 } 8248 if (rack->rc_gp_filled && 8249 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 8250 under_pacing = 1; 8251 } 8252 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 8253 if (rack->rc_in_persist) 8254 tp->t_rxtshift = 0; 8255 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 8256 rack_strike_dupack(rack); 8257 rack_log_ack(tp, to, th); 8258 } 8259 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8260 /* 8261 * Old ack, behind (or duplicate to) the last one rcv'd 8262 * Note: Should mark reordering is occuring! We should also 8263 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 8264 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 8265 * retran and> ack 3 8266 */ 8267 return (0); 8268 } 8269 /* 8270 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 8271 * something we sent. 8272 */ 8273 if (tp->t_flags & TF_NEEDSYN) { 8274 /* 8275 * T/TCP: Connection was half-synchronized, and our SYN has 8276 * been ACK'd (so connection is now fully synchronized). Go 8277 * to non-starred state, increment snd_una for ACK of SYN, 8278 * and check if we can do window scaling. 8279 */ 8280 tp->t_flags &= ~TF_NEEDSYN; 8281 tp->snd_una++; 8282 /* Do window scaling? */ 8283 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8284 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8285 tp->rcv_scale = tp->request_r_scale; 8286 /* Send window already scaled. */ 8287 } 8288 } 8289 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8290 INP_WLOCK_ASSERT(tp->t_inpcb); 8291 8292 acked = BYTES_THIS_ACK(tp, th); 8293 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 8294 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8295 /* 8296 * If we just performed our first retransmit, and the ACK arrives 8297 * within our recovery window, then it was a mistake to do the 8298 * retransmit in the first place. Recover our original cwnd and 8299 * ssthresh, and proceed to transmit where we left off. 8300 */ 8301 if (tp->t_flags & TF_PREVVALID) { 8302 tp->t_flags &= ~TF_PREVVALID; 8303 if (tp->t_rxtshift == 1 && 8304 (int)(ticks - tp->t_badrxtwin) < 0) 8305 rack_cong_signal(tp, th, CC_RTO_ERR); 8306 } 8307 if (acked) { 8308 /* assure we are not backed off */ 8309 tp->t_rxtshift = 0; 8310 rack->rc_tlp_in_progress = 0; 8311 rack->r_ctl.rc_tlp_cnt_out = 0; 8312 /* 8313 * If it is the RXT timer we want to 8314 * stop it, so we can restart a TLP. 8315 */ 8316 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 8317 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8318 #ifdef NETFLIX_HTTP_LOGGING 8319 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 8320 #endif 8321 } 8322 /* 8323 * If we have a timestamp reply, update smoothed round trip time. If 8324 * no timestamp is present but transmit timer is running and timed 8325 * sequence number was acked, update smoothed round trip time. 
Since 8326 * we now have an rtt measurement, cancel the timer backoff (cf., 8327 * Phil Karn's retransmit alg.). Recompute the initial retransmit 8328 * timer. 8329 * 8330 * Some boxes send broken timestamp replies during the SYN+ACK 8331 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8332 * and blow up the retransmit timer. 8333 */ 8334 /* 8335 * If all outstanding data is acked, stop retransmit timer and 8336 * remember to restart (more output or persist). If there is more 8337 * data to be acked, restart retransmit timer, using current 8338 * (possibly backed-off) value. 8339 */ 8340 if (acked == 0) { 8341 if (ofia) 8342 *ofia = ourfinisacked; 8343 return (0); 8344 } 8345 if (rack->r_ctl.rc_early_recovery) { 8346 if (IN_RECOVERY(tp->t_flags)) { 8347 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8348 (SEQ_LT(th->th_ack, tp->snd_max))) { 8349 tcp_rack_partialack(tp, th); 8350 } else { 8351 rack_post_recovery(tp, th); 8352 recovery = 1; 8353 } 8354 } 8355 } 8356 /* 8357 * Let the congestion control algorithm update congestion control 8358 * related information. This typically means increasing the 8359 * congestion window. 8360 */ 8361 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 8362 SOCKBUF_LOCK(&so->so_snd); 8363 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 8364 tp->snd_wnd -= acked_amount; 8365 mfree = sbcut_locked(&so->so_snd, acked_amount); 8366 if ((sbused(&so->so_snd) == 0) && 8367 (acked > acked_amount) && 8368 (tp->t_state >= TCPS_FIN_WAIT_1) && 8369 (tp->t_flags & TF_SENTFIN)) { 8370 /* 8371 * We must be sure our fin 8372 * was sent and acked (we can be 8373 * in FIN_WAIT_1 without having 8374 * sent the fin). 8375 */ 8376 ourfinisacked = 1; 8377 } 8378 /* NB: sowwakeup_locked() does an implicit unlock. */ 8379 sowwakeup_locked(so); 8380 m_freem(mfree); 8381 if (rack->r_ctl.rc_early_recovery == 0) { 8382 if (IN_RECOVERY(tp->t_flags)) { 8383 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8384 (SEQ_LT(th->th_ack, tp->snd_max))) { 8385 tcp_rack_partialack(tp, th); 8386 } else { 8387 rack_post_recovery(tp, th); 8388 } 8389 } 8390 } 8391 tp->snd_una = th->th_ack; 8392 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 8393 tp->snd_recover = tp->snd_una; 8394 8395 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 8396 tp->snd_nxt = tp->snd_una; 8397 } 8398 if (under_pacing && 8399 (rack->use_fixed_rate == 0) && 8400 (rack->in_probe_rtt == 0) && 8401 rack->rc_gp_dyn_mul && 8402 rack->rc_always_pace) { 8403 /* Check if we are dragging bottom */ 8404 rack_check_bottom_drag(tp, rack, so, acked); 8405 } 8406 if (tp->snd_una == tp->snd_max) { 8407 /* Nothing left outstanding */ 8408 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 8409 if (rack->r_ctl.rc_went_idle_time == 0) 8410 rack->r_ctl.rc_went_idle_time = 1; 8411 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 8412 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8413 tp->t_acktime = 0; 8414 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8415 /* Set need output so persist might get set */ 8416 rack->r_wanted_output = 1; 8417 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8418 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8419 (sbavail(&so->so_snd) == 0) && 8420 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 8421 /* 8422 * The socket was gone and the 8423 * peer sent data, time to 8424 * reset him. 
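* Below we log the reset reason, close the TCB, and send a RST via
* ctf_do_dropwithreset(); the return value of 1 tells the caller the
* connection is gone and must not be touched again.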
8425 */ 8426 *ret_val = 1; 8427 /* tcp_close will kill the inp pre-log the Reset */ 8428 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 8429 tp = tcp_close(tp); 8430 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 8431 return (1); 8432 8433 } 8434 } 8435 if (ofia) 8436 *ofia = ourfinisacked; 8437 return (0); 8438 } 8439 8440 static void 8441 rack_collapsed_window(struct tcp_rack *rack) 8442 { 8443 /* 8444 * Now we must walk the 8445 * send map and divide the 8446 * ones left stranded. These 8447 * guys can't cause us to abort 8448 * the connection and are really 8449 * "unsent". However if a buggy 8450 * client actually did keep some 8451 * of the data i.e. collapsed the win 8452 * and refused to ack and then opened 8453 * the win and acked that data. We would 8454 * get into an ack war, the simplier 8455 * method then of just pretending we 8456 * did not send those segments something 8457 * won't work. 8458 */ 8459 struct rack_sendmap *rsm, *nrsm, fe, *insret; 8460 tcp_seq max_seq; 8461 8462 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 8463 memset(&fe, 0, sizeof(fe)); 8464 fe.r_start = max_seq; 8465 /* Find the first seq past or at maxseq */ 8466 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8467 if (rsm == NULL) { 8468 /* Nothing to do strange */ 8469 rack->rc_has_collapsed = 0; 8470 return; 8471 } 8472 /* 8473 * Now do we need to split at 8474 * the collapse point? 8475 */ 8476 if (SEQ_GT(max_seq, rsm->r_start)) { 8477 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8478 if (nrsm == NULL) { 8479 /* We can't get a rsm, mark all? */ 8480 nrsm = rsm; 8481 goto no_split; 8482 } 8483 /* Clone it */ 8484 rack_clone_rsm(rack, nrsm, rsm, max_seq); 8485 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8486 #ifdef INVARIANTS 8487 if (insret != NULL) { 8488 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8489 nrsm, insret, rack, rsm); 8490 } 8491 #endif 8492 if (rsm->r_in_tmap) { 8493 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8494 nrsm->r_in_tmap = 1; 8495 } 8496 /* 8497 * Set in the new RSM as the 8498 * collapsed starting point 8499 */ 8500 rsm = nrsm; 8501 } 8502 no_split: 8503 counter_u64_add(rack_collapsed_win, 1); 8504 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 8505 nrsm->r_flags |= RACK_RWND_COLLAPSED; 8506 rack->rc_has_collapsed = 1; 8507 } 8508 } 8509 8510 static void 8511 rack_un_collapse_window(struct tcp_rack *rack) 8512 { 8513 struct rack_sendmap *rsm; 8514 8515 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 8516 if (rsm->r_flags & RACK_RWND_COLLAPSED) 8517 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8518 else 8519 break; 8520 } 8521 rack->rc_has_collapsed = 0; 8522 } 8523 8524 static void 8525 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 8526 int32_t tlen, int32_t tfo_syn) 8527 { 8528 if (DELAY_ACK(tp, tlen) || tfo_syn) { 8529 if (rack->rc_dack_mode && 8530 (tlen > 500) && 8531 (rack->rc_dack_toggle == 1)) { 8532 goto no_delayed_ack; 8533 } 8534 rack_timer_cancel(tp, rack, 8535 rack->r_ctl.rc_rcvtime, __LINE__); 8536 tp->t_flags |= TF_DELACK; 8537 } else { 8538 no_delayed_ack: 8539 rack->r_wanted_output = 1; 8540 tp->t_flags |= TF_ACKNOW; 8541 if (rack->rc_dack_mode) { 8542 if (tp->t_flags & TF_DELACK) 8543 rack->rc_dack_toggle = 1; 8544 else 8545 rack->rc_dack_toggle = 0; 8546 } 8547 } 8548 } 8549 /* 8550 * Return value of 1, the TCB is unlocked and most 8551 * likely gone, return value of 0, the TCP is still 8552 * locked. 
8553 */ 8554 static int 8555 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8556 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8557 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8558 { 8559 /* 8560 * Update window information. Don't look at window if no ACK: TAC's 8561 * send garbage on first SYN. 8562 */ 8563 int32_t nsegs; 8564 int32_t tfo_syn; 8565 struct tcp_rack *rack; 8566 8567 rack = (struct tcp_rack *)tp->t_fb_ptr; 8568 INP_WLOCK_ASSERT(tp->t_inpcb); 8569 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8570 if ((thflags & TH_ACK) && 8571 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8572 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8573 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8574 /* keep track of pure window updates */ 8575 if (tlen == 0 && 8576 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8577 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8578 tp->snd_wnd = tiwin; 8579 tp->snd_wl1 = th->th_seq; 8580 tp->snd_wl2 = th->th_ack; 8581 if (tp->snd_wnd > tp->max_sndwnd) 8582 tp->max_sndwnd = tp->snd_wnd; 8583 rack->r_wanted_output = 1; 8584 } else if (thflags & TH_ACK) { 8585 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8586 tp->snd_wnd = tiwin; 8587 tp->snd_wl1 = th->th_seq; 8588 tp->snd_wl2 = th->th_ack; 8589 } 8590 } 8591 if (tp->snd_wnd < ctf_outstanding(tp)) 8592 /* The peer collapsed the window */ 8593 rack_collapsed_window(rack); 8594 else if (rack->rc_has_collapsed) 8595 rack_un_collapse_window(rack); 8596 /* Was persist timer active and now we have window space? */ 8597 if ((rack->rc_in_persist != 0) && 8598 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8599 rack->r_ctl.rc_pace_min_segs))) { 8600 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8601 tp->snd_nxt = tp->snd_max; 8602 /* Make sure we output to start the timer */ 8603 rack->r_wanted_output = 1; 8604 } 8605 /* Do we enter persists? */ 8606 if ((rack->rc_in_persist == 0) && 8607 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8608 TCPS_HAVEESTABLISHED(tp->t_state) && 8609 (tp->snd_max == tp->snd_una) && 8610 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8611 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8612 /* 8613 * Here the rwnd is less than 8614 * the pacing size, we are established, 8615 * nothing is outstanding, and there is 8616 * data to send. Enter persists. 8617 */ 8618 tp->snd_nxt = tp->snd_una; 8619 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8620 } 8621 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8622 m_freem(m); 8623 return (0); 8624 } 8625 /* 8626 * don't process the URG bit, ignore them drag 8627 * along the up. 8628 */ 8629 tp->rcv_up = tp->rcv_nxt; 8630 INP_WLOCK_ASSERT(tp->t_inpcb); 8631 8632 /* 8633 * Process the segment text, merging it into the TCP sequencing 8634 * queue, and arranging for acknowledgment of receipt if necessary. 8635 * This process logically involves adjusting tp->rcv_wnd as data is 8636 * presented to the user (this happens in tcp_usrreq.c, case 8637 * PRU_RCVD). If a FIN has already been received on this connection 8638 * then we just ignore the text. 
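 *
 * In simplified terms (the code below also deals with the shared
 * socket-buffer accounting under NETFLIX_SB_LIMITS and with the
 * first-byte timestamps): when the segment is the next one expected,
 * the reassembly queue is empty and the connection is established
 * (or this is a TFO SYN with data), the payload is appended straight
 * to so_rcv and the delayed-ACK decision is left to
 * rack_handle_delayed_ack(); otherwise the segment is handed to
 * tcp_reass() and TF_ACKNOW is set so the sender learns about the
 * hole immediately.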
8639 */ 8640 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8641 IS_FASTOPEN(tp->t_flags)); 8642 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8643 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8644 tcp_seq save_start = th->th_seq; 8645 tcp_seq save_rnxt = tp->rcv_nxt; 8646 int save_tlen = tlen; 8647 8648 m_adj(m, drop_hdrlen); /* delayed header drop */ 8649 /* 8650 * Insert segment which includes th into TCP reassembly 8651 * queue with control block tp. Set thflags to whether 8652 * reassembly now includes a segment with FIN. This handles 8653 * the common case inline (segment is the next to be 8654 * received on an established connection, and the queue is 8655 * empty), avoiding linkage into and removal from the queue 8656 * and repetition of various conversions. Set DELACK for 8657 * segments received in order, but ack immediately when 8658 * segments are out of order (so fast retransmit can work). 8659 */ 8660 if (th->th_seq == tp->rcv_nxt && 8661 SEGQ_EMPTY(tp) && 8662 (TCPS_HAVEESTABLISHED(tp->t_state) || 8663 tfo_syn)) { 8664 #ifdef NETFLIX_SB_LIMITS 8665 u_int mcnt, appended; 8666 8667 if (so->so_rcv.sb_shlim) { 8668 mcnt = m_memcnt(m); 8669 appended = 0; 8670 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8671 CFO_NOSLEEP, NULL) == false) { 8672 counter_u64_add(tcp_sb_shlim_fails, 1); 8673 m_freem(m); 8674 return (0); 8675 } 8676 } 8677 #endif 8678 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 8679 tp->rcv_nxt += tlen; 8680 if (tlen && 8681 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8682 (tp->t_fbyte_in == 0)) { 8683 tp->t_fbyte_in = ticks; 8684 if (tp->t_fbyte_in == 0) 8685 tp->t_fbyte_in = 1; 8686 if (tp->t_fbyte_out && tp->t_fbyte_in) 8687 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8688 } 8689 thflags = th->th_flags & TH_FIN; 8690 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8691 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8692 SOCKBUF_LOCK(&so->so_rcv); 8693 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8694 m_freem(m); 8695 } else 8696 #ifdef NETFLIX_SB_LIMITS 8697 appended = 8698 #endif 8699 sbappendstream_locked(&so->so_rcv, m, 0); 8700 /* NB: sorwakeup_locked() does an implicit unlock. */ 8701 sorwakeup_locked(so); 8702 #ifdef NETFLIX_SB_LIMITS 8703 if (so->so_rcv.sb_shlim && appended != mcnt) 8704 counter_fo_release(so->so_rcv.sb_shlim, 8705 mcnt - appended); 8706 #endif 8707 } else { 8708 /* 8709 * XXX: Due to the header drop above "th" is 8710 * theoretically invalid by now. Fortunately 8711 * m_adj() doesn't actually frees any mbufs when 8712 * trimming from the head. 8713 */ 8714 tcp_seq temp = save_start; 8715 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8716 tp->t_flags |= TF_ACKNOW; 8717 } 8718 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 8719 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8720 /* 8721 * DSACK actually handled in the fastpath 8722 * above. 8723 */ 8724 RACK_OPTS_INC(tcp_sack_path_1); 8725 tcp_update_sack_list(tp, save_start, 8726 save_start + save_tlen); 8727 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8728 if ((tp->rcv_numsacks >= 1) && 8729 (tp->sackblks[0].end == save_start)) { 8730 /* 8731 * Partial overlap, recorded at todrop 8732 * above. 8733 */ 8734 RACK_OPTS_INC(tcp_sack_path_2a); 8735 tcp_update_sack_list(tp, 8736 tp->sackblks[0].start, 8737 tp->sackblks[0].end); 8738 } else { 8739 RACK_OPTS_INC(tcp_sack_path_2b); 8740 tcp_update_dsack_list(tp, save_start, 8741 save_start + save_tlen); 8742 } 8743 } else if (tlen >= save_tlen) { 8744 /* Update of sackblks. 
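 * Note that this branch and the one after it both report the
 * duplicate span back through tcp_update_dsack_list(); the only
 * difference is whether the D-SACK block covers the whole original
 * segment (save_tlen) or just the still-duplicate portion left in
 * tlen after reassembly.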
*/ 8745 RACK_OPTS_INC(tcp_sack_path_3); 8746 tcp_update_dsack_list(tp, save_start, 8747 save_start + save_tlen); 8748 } else if (tlen > 0) { 8749 RACK_OPTS_INC(tcp_sack_path_4); 8750 tcp_update_dsack_list(tp, save_start, 8751 save_start + tlen); 8752 } 8753 } 8754 } else { 8755 m_freem(m); 8756 thflags &= ~TH_FIN; 8757 } 8758 8759 /* 8760 * If FIN is received ACK the FIN and let the user know that the 8761 * connection is closing. 8762 */ 8763 if (thflags & TH_FIN) { 8764 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8765 socantrcvmore(so); 8766 /* 8767 * If connection is half-synchronized (ie NEEDSYN 8768 * flag on) then delay ACK, so it may be piggybacked 8769 * when SYN is sent. Otherwise, since we received a 8770 * FIN then no more input can be expected, send ACK 8771 * now. 8772 */ 8773 if (tp->t_flags & TF_NEEDSYN) { 8774 rack_timer_cancel(tp, rack, 8775 rack->r_ctl.rc_rcvtime, __LINE__); 8776 tp->t_flags |= TF_DELACK; 8777 } else { 8778 tp->t_flags |= TF_ACKNOW; 8779 } 8780 tp->rcv_nxt++; 8781 } 8782 switch (tp->t_state) { 8783 8784 /* 8785 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8786 * CLOSE_WAIT state. 8787 */ 8788 case TCPS_SYN_RECEIVED: 8789 tp->t_starttime = ticks; 8790 /* FALLTHROUGH */ 8791 case TCPS_ESTABLISHED: 8792 rack_timer_cancel(tp, rack, 8793 rack->r_ctl.rc_rcvtime, __LINE__); 8794 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8795 break; 8796 8797 /* 8798 * If still in FIN_WAIT_1 STATE FIN has not been 8799 * acked so enter the CLOSING state. 8800 */ 8801 case TCPS_FIN_WAIT_1: 8802 rack_timer_cancel(tp, rack, 8803 rack->r_ctl.rc_rcvtime, __LINE__); 8804 tcp_state_change(tp, TCPS_CLOSING); 8805 break; 8806 8807 /* 8808 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8809 * starting the time-wait timer, turning off the 8810 * other standard timers. 8811 */ 8812 case TCPS_FIN_WAIT_2: 8813 rack_timer_cancel(tp, rack, 8814 rack->r_ctl.rc_rcvtime, __LINE__); 8815 tcp_twstart(tp); 8816 return (1); 8817 } 8818 } 8819 /* 8820 * Return any desired output. 8821 */ 8822 if ((tp->t_flags & TF_ACKNOW) || 8823 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 8824 rack->r_wanted_output = 1; 8825 } 8826 INP_WLOCK_ASSERT(tp->t_inpcb); 8827 return (0); 8828 } 8829 8830 /* 8831 * Here nothing is really faster, its just that we 8832 * have broken out the fast-data path also just like 8833 * the fast-ack. 8834 */ 8835 static int 8836 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8837 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8838 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8839 { 8840 int32_t nsegs; 8841 int32_t newsize = 0; /* automatic sockbuf scaling */ 8842 struct tcp_rack *rack; 8843 #ifdef NETFLIX_SB_LIMITS 8844 u_int mcnt, appended; 8845 #endif 8846 #ifdef TCPDEBUG 8847 /* 8848 * The size of tcp_saveipgen must be the size of the max ip header, 8849 * now IPv6. 8850 */ 8851 u_char tcp_saveipgen[IP6_HDR_LEN]; 8852 struct tcphdr tcp_savetcp; 8853 short ostate = 0; 8854 8855 #endif 8856 /* 8857 * If last ACK falls within this segment's sequence numbers, record 8858 * the timestamp. NOTE that the test is modified according to the 8859 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
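 *
 * Before getting there the function falls back to the slow path
 * (returns 0) unless this is the textbook in-order data case;
 * roughly, all of the following must hold:
 *
 *     th_seq == rcv_nxt        (next expected segment)
 *     snd_nxt == snd_max       (we are not retransmitting)
 *     tiwin matches snd_wnd    (no window change to process)
 *     no TF_NEEDSYN / TF_NEEDFIN pending
 *     the timestamp, if any, passes PAWS
 *     th_ack == snd_una        (pure data, nothing newly acked)
 *     tlen fits within sbspace(&so->so_rcv)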
8860 */ 8861 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 8862 return (0); 8863 } 8864 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8865 return (0); 8866 } 8867 if (tiwin && tiwin != tp->snd_wnd) { 8868 return (0); 8869 } 8870 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8871 return (0); 8872 } 8873 if (__predict_false((to->to_flags & TOF_TS) && 8874 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8875 return (0); 8876 } 8877 if (__predict_false((th->th_ack != tp->snd_una))) { 8878 return (0); 8879 } 8880 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8881 return (0); 8882 } 8883 if ((to->to_flags & TOF_TS) != 0 && 8884 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8885 tp->ts_recent_age = tcp_ts_getticks(); 8886 tp->ts_recent = to->to_tsval; 8887 } 8888 rack = (struct tcp_rack *)tp->t_fb_ptr; 8889 /* 8890 * This is a pure, in-sequence data packet with nothing on the 8891 * reassembly queue and we have enough buffer space to take it. 8892 */ 8893 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8894 8895 #ifdef NETFLIX_SB_LIMITS 8896 if (so->so_rcv.sb_shlim) { 8897 mcnt = m_memcnt(m); 8898 appended = 0; 8899 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8900 CFO_NOSLEEP, NULL) == false) { 8901 counter_u64_add(tcp_sb_shlim_fails, 1); 8902 m_freem(m); 8903 return (1); 8904 } 8905 } 8906 #endif 8907 /* Clean receiver SACK report if present */ 8908 if (tp->rcv_numsacks) 8909 tcp_clean_sackreport(tp); 8910 KMOD_TCPSTAT_INC(tcps_preddat); 8911 tp->rcv_nxt += tlen; 8912 if (tlen && 8913 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8914 (tp->t_fbyte_in == 0)) { 8915 tp->t_fbyte_in = ticks; 8916 if (tp->t_fbyte_in == 0) 8917 tp->t_fbyte_in = 1; 8918 if (tp->t_fbyte_out && tp->t_fbyte_in) 8919 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8920 } 8921 /* 8922 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 8923 */ 8924 tp->snd_wl1 = th->th_seq; 8925 /* 8926 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 8927 */ 8928 tp->rcv_up = tp->rcv_nxt; 8929 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8930 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8931 #ifdef TCPDEBUG 8932 if (so->so_options & SO_DEBUG) 8933 tcp_trace(TA_INPUT, ostate, tp, 8934 (void *)tcp_saveipgen, &tcp_savetcp, 0); 8935 #endif 8936 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 8937 8938 /* Add data to socket buffer. */ 8939 SOCKBUF_LOCK(&so->so_rcv); 8940 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8941 m_freem(m); 8942 } else { 8943 /* 8944 * Set new socket buffer size. Give up when limit is 8945 * reached. 8946 */ 8947 if (newsize) 8948 if (!sbreserve_locked(&so->so_rcv, 8949 newsize, so, NULL)) 8950 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 8951 m_adj(m, drop_hdrlen); /* delayed header drop */ 8952 #ifdef NETFLIX_SB_LIMITS 8953 appended = 8954 #endif 8955 sbappendstream_locked(&so->so_rcv, m, 0); 8956 ctf_calc_rwin(so, tp); 8957 } 8958 /* NB: sorwakeup_locked() does an implicit unlock. */ 8959 sorwakeup_locked(so); 8960 #ifdef NETFLIX_SB_LIMITS 8961 if (so->so_rcv.sb_shlim && mcnt != appended) 8962 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 8963 #endif 8964 rack_handle_delayed_ack(tp, rack, tlen, 0); 8965 if (tp->snd_una == tp->snd_max) 8966 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8967 return (1); 8968 } 8969 8970 /* 8971 * This subfunction is used to try to highly optimize the 8972 * fast path. We again allow window updates that are 8973 * in sequence to remain in the fast-path. We also add 8974 * in the __predict's to attempt to help the compiler. 
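 * In short, only the simplest case is handled here: a pure ACK that
 * advances snd_una but stays within snd_max, with a non-zero window,
 * no pending SYN or FIN, a timestamp that is not behind, no recovery
 * in progress and no SACK holes on the scoreboard.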
8975 * Note that if we return a 0, then we can *not* process 8976 * it and the caller should push the packet into the 8977 * slow-path. 8978 */ 8979 static int 8980 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8981 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8982 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 8983 { 8984 int32_t acked; 8985 int32_t nsegs; 8986 #ifdef TCPDEBUG 8987 /* 8988 * The size of tcp_saveipgen must be the size of the max ip header, 8989 * now IPv6. 8990 */ 8991 u_char tcp_saveipgen[IP6_HDR_LEN]; 8992 struct tcphdr tcp_savetcp; 8993 short ostate = 0; 8994 #endif 8995 int32_t under_pacing = 0; 8996 struct tcp_rack *rack; 8997 8998 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8999 /* Old ack, behind (or duplicate to) the last one rcv'd */ 9000 return (0); 9001 } 9002 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 9003 /* Above what we have sent? */ 9004 return (0); 9005 } 9006 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 9007 /* We are retransmitting */ 9008 return (0); 9009 } 9010 if (__predict_false(tiwin == 0)) { 9011 /* zero window */ 9012 return (0); 9013 } 9014 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 9015 /* We need a SYN or a FIN, unlikely.. */ 9016 return (0); 9017 } 9018 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 9019 /* Timestamp is behind .. old ack with seq wrap? */ 9020 return (0); 9021 } 9022 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 9023 /* Still recovering */ 9024 return (0); 9025 } 9026 rack = (struct tcp_rack *)tp->t_fb_ptr; 9027 if (rack->r_ctl.rc_sacked) { 9028 /* We have sack holes on our scoreboard */ 9029 return (0); 9030 } 9031 /* Ok if we reach here, we can process a fast-ack */ 9032 if (rack->rc_gp_filled && 9033 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9034 under_pacing = 1; 9035 } 9036 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9037 rack_log_ack(tp, to, th); 9038 /* Did the window get updated? */ 9039 if (tiwin != tp->snd_wnd) { 9040 tp->snd_wnd = tiwin; 9041 tp->snd_wl1 = th->th_seq; 9042 if (tp->snd_wnd > tp->max_sndwnd) 9043 tp->max_sndwnd = tp->snd_wnd; 9044 } 9045 /* Do we exit persists? */ 9046 if ((rack->rc_in_persist != 0) && 9047 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 9048 rack->r_ctl.rc_pace_min_segs))) { 9049 rack_exit_persist(tp, rack, cts); 9050 } 9051 /* Do we enter persists? */ 9052 if ((rack->rc_in_persist == 0) && 9053 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 9054 TCPS_HAVEESTABLISHED(tp->t_state) && 9055 (tp->snd_max == tp->snd_una) && 9056 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 9057 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 9058 /* 9059 * Here the rwnd is less than 9060 * the pacing size, we are established, 9061 * nothing is outstanding, and there is 9062 * data to send. Enter persists. 9063 */ 9064 tp->snd_nxt = tp->snd_una; 9065 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 9066 } 9067 /* 9068 * If last ACK falls within this segment's sequence numbers, record 9069 * the timestamp. NOTE that the test is modified according to the 9070 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 9071 */ 9072 if ((to->to_flags & TOF_TS) != 0 && 9073 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 9074 tp->ts_recent_age = tcp_ts_getticks(); 9075 tp->ts_recent = to->to_tsval; 9076 } 9077 /* 9078 * This is a pure ack for outstanding data. 
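 * What follows is bookkeeping, loosely: count the predicted ack,
 * undo a spurious retransmit if TF_PREVVALID shows the ack beat the
 * bad-retransmit window, drop the acked bytes from so_snd, let the
 * congestion control module grow the window, advance snd_una and
 * snd_wl2, and wake any writer blocked on socket-buffer space.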
9079 */ 9080 KMOD_TCPSTAT_INC(tcps_predack); 9081 9082 /* 9083 * "bad retransmit" recovery. 9084 */ 9085 if (tp->t_flags & TF_PREVVALID) { 9086 tp->t_flags &= ~TF_PREVVALID; 9087 if (tp->t_rxtshift == 1 && 9088 (int)(ticks - tp->t_badrxtwin) < 0) 9089 rack_cong_signal(tp, th, CC_RTO_ERR); 9090 } 9091 /* 9092 * Recalculate the transmit timer / rtt. 9093 * 9094 * Some boxes send broken timestamp replies during the SYN+ACK 9095 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9096 * and blow up the retransmit timer. 9097 */ 9098 acked = BYTES_THIS_ACK(tp, th); 9099 9100 #ifdef TCP_HHOOK 9101 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 9102 hhook_run_tcp_est_in(tp, th, to); 9103 #endif 9104 9105 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9106 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9107 sbdrop(&so->so_snd, acked); 9108 if (acked) { 9109 /* assure we are not backed off */ 9110 tp->t_rxtshift = 0; 9111 rack->rc_tlp_in_progress = 0; 9112 rack->r_ctl.rc_tlp_cnt_out = 0; 9113 /* 9114 * If it is the RXT timer we want to 9115 * stop it, so we can restart a TLP. 9116 */ 9117 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9118 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9119 #ifdef NETFLIX_HTTP_LOGGING 9120 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9121 #endif 9122 } 9123 /* 9124 * Let the congestion control algorithm update congestion control 9125 * related information. This typically means increasing the 9126 * congestion window. 9127 */ 9128 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 9129 9130 tp->snd_una = th->th_ack; 9131 if (tp->snd_wnd < ctf_outstanding(tp)) { 9132 /* The peer collapsed the window */ 9133 rack_collapsed_window(rack); 9134 } else if (rack->rc_has_collapsed) 9135 rack_un_collapse_window(rack); 9136 9137 /* 9138 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 9139 */ 9140 tp->snd_wl2 = th->th_ack; 9141 tp->t_dupacks = 0; 9142 m_freem(m); 9143 /* ND6_HINT(tp); *//* Some progress has been made. */ 9144 9145 /* 9146 * If all outstanding data are acked, stop retransmit timer, 9147 * otherwise restart timer using current (possibly backed-off) 9148 * value. If process is waiting for space, wakeup/selwakeup/signal. 9149 * If data are ready to send, let tcp_output decide between more 9150 * output or persist. 9151 */ 9152 #ifdef TCPDEBUG 9153 if (so->so_options & SO_DEBUG) 9154 tcp_trace(TA_INPUT, ostate, tp, 9155 (void *)tcp_saveipgen, 9156 &tcp_savetcp, 0); 9157 #endif 9158 if (under_pacing && 9159 (rack->use_fixed_rate == 0) && 9160 (rack->in_probe_rtt == 0) && 9161 rack->rc_gp_dyn_mul && 9162 rack->rc_always_pace) { 9163 /* Check if we are dragging bottom */ 9164 rack_check_bottom_drag(tp, rack, so, acked); 9165 } 9166 if (tp->snd_una == tp->snd_max) { 9167 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9168 if (rack->r_ctl.rc_went_idle_time == 0) 9169 rack->r_ctl.rc_went_idle_time = 1; 9170 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9171 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9172 tp->t_acktime = 0; 9173 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9174 } 9175 /* Wake up the socket if we have room to write more */ 9176 sowwakeup(so); 9177 if (sbavail(&so->so_snd)) { 9178 rack->r_wanted_output = 1; 9179 } 9180 return (1); 9181 } 9182 9183 /* 9184 * Return value of 1, the TCB is unlocked and most 9185 * likely gone, return value of 0, the TCP is still 9186 * locked. 
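 *
 * For the SYN_SENT handler below, the first gate is the ACK
 * acceptability test, which in simplified form is:
 *
 *     if ((thflags & TH_ACK) &&
 *         (SEQ_LEQ(th->th_ack, tp->iss) ||
 *          SEQ_GT(th->th_ack, tp->snd_max)))
 *             drop the segment and respond with a reset;
 *
 * that is, the peer has to be acking something we actually sent
 * after our ISS.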
9187 */ 9188 static int 9189 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 9190 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9191 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9192 { 9193 int32_t ret_val = 0; 9194 int32_t todrop; 9195 int32_t ourfinisacked = 0; 9196 struct tcp_rack *rack; 9197 9198 ctf_calc_rwin(so, tp); 9199 /* 9200 * If the state is SYN_SENT: if seg contains an ACK, but not for our 9201 * SYN, drop the input. if seg contains a RST, then drop the 9202 * connection. if seg does not contain SYN, then drop it. Otherwise 9203 * this is an acceptable SYN segment initialize tp->rcv_nxt and 9204 * tp->irs if seg contains ack then advance tp->snd_una if seg 9205 * contains an ECE and ECN support is enabled, the stream is ECN 9206 * capable. if SYN has been acked change to ESTABLISHED else 9207 * SYN_RCVD state arrange for segment to be acked (eventually) 9208 * continue processing rest of data/controls. 9209 */ 9210 if ((thflags & TH_ACK) && 9211 (SEQ_LEQ(th->th_ack, tp->iss) || 9212 SEQ_GT(th->th_ack, tp->snd_max))) { 9213 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9214 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9215 return (1); 9216 } 9217 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 9218 TCP_PROBE5(connect__refused, NULL, tp, 9219 mtod(m, const char *), tp, th); 9220 tp = tcp_drop(tp, ECONNREFUSED); 9221 ctf_do_drop(m, tp); 9222 return (1); 9223 } 9224 if (thflags & TH_RST) { 9225 ctf_do_drop(m, tp); 9226 return (1); 9227 } 9228 if (!(thflags & TH_SYN)) { 9229 ctf_do_drop(m, tp); 9230 return (1); 9231 } 9232 tp->irs = th->th_seq; 9233 tcp_rcvseqinit(tp); 9234 rack = (struct tcp_rack *)tp->t_fb_ptr; 9235 if (thflags & TH_ACK) { 9236 int tfo_partial = 0; 9237 9238 KMOD_TCPSTAT_INC(tcps_connects); 9239 soisconnected(so); 9240 #ifdef MAC 9241 mac_socketpeer_set_from_mbuf(m, so); 9242 #endif 9243 /* Do window scaling on this connection? */ 9244 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9245 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9246 tp->rcv_scale = tp->request_r_scale; 9247 } 9248 tp->rcv_adv += min(tp->rcv_wnd, 9249 TCP_MAXWIN << tp->rcv_scale); 9250 /* 9251 * If not all the data that was sent in the TFO SYN 9252 * has been acked, resend the remainder right away. 9253 */ 9254 if (IS_FASTOPEN(tp->t_flags) && 9255 (tp->snd_una != tp->snd_max)) { 9256 tp->snd_nxt = th->th_ack; 9257 tfo_partial = 1; 9258 } 9259 /* 9260 * If there's data, delay ACK; if there's also a FIN ACKNOW 9261 * will be turned on later. 9262 */ 9263 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 9264 rack_timer_cancel(tp, rack, 9265 rack->r_ctl.rc_rcvtime, __LINE__); 9266 tp->t_flags |= TF_DELACK; 9267 } else { 9268 rack->r_wanted_output = 1; 9269 tp->t_flags |= TF_ACKNOW; 9270 rack->rc_dack_toggle = 0; 9271 } 9272 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 9273 (V_tcp_do_ecn == 1)) { 9274 tp->t_flags2 |= TF2_ECN_PERMIT; 9275 KMOD_TCPSTAT_INC(tcps_ecn_shs); 9276 } 9277 if (SEQ_GT(th->th_ack, tp->snd_una)) { 9278 /* 9279 * We advance snd_una for the 9280 * fast open case. If th_ack is 9281 * acknowledging data beyond 9282 * snd_una we can't just call 9283 * ack-processing since the 9284 * data stream in our send-map 9285 * will start at snd_una + 1 (one 9286 * beyond the SYN). If its just 9287 * equal we don't need to do that 9288 * and there is no send_map. 9289 */ 9290 tp->snd_una++; 9291 } 9292 /* 9293 * Received <SYN,ACK> in SYN_SENT[*] state. 
Transitions: 9294 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 9295 */ 9296 tp->t_starttime = ticks; 9297 if (tp->t_flags & TF_NEEDFIN) { 9298 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9299 tp->t_flags &= ~TF_NEEDFIN; 9300 thflags &= ~TH_SYN; 9301 } else { 9302 tcp_state_change(tp, TCPS_ESTABLISHED); 9303 TCP_PROBE5(connect__established, NULL, tp, 9304 mtod(m, const char *), tp, th); 9305 rack_cc_conn_init(tp); 9306 } 9307 } else { 9308 /* 9309 * Received initial SYN in SYN-SENT[*] state => simultaneous 9310 * open. If segment contains CC option and there is a 9311 * cached CC, apply TAO test. If it succeeds, connection is * 9312 * half-synchronized. Otherwise, do 3-way handshake: 9313 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 9314 * there was no CC option, clear cached CC value. 9315 */ 9316 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 9317 tcp_state_change(tp, TCPS_SYN_RECEIVED); 9318 } 9319 INP_WLOCK_ASSERT(tp->t_inpcb); 9320 /* 9321 * Advance th->th_seq to correspond to first data byte. If data, 9322 * trim to stay within window, dropping FIN if necessary. 9323 */ 9324 th->th_seq++; 9325 if (tlen > tp->rcv_wnd) { 9326 todrop = tlen - tp->rcv_wnd; 9327 m_adj(m, -todrop); 9328 tlen = tp->rcv_wnd; 9329 thflags &= ~TH_FIN; 9330 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 9331 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 9332 } 9333 tp->snd_wl1 = th->th_seq - 1; 9334 tp->rcv_up = th->th_seq; 9335 /* 9336 * Client side of transaction: already sent SYN and data. If the 9337 * remote host used T/TCP to validate the SYN, our data will be 9338 * ACK'd; if so, enter normal data segment processing in the middle 9339 * of step 5, ack processing. Otherwise, goto step 6. 9340 */ 9341 if (thflags & TH_ACK) { 9342 /* For syn-sent we need to possibly update the rtt */ 9343 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9344 uint32_t t; 9345 9346 t = tcp_ts_getticks() - to->to_tsecr; 9347 if (!tp->t_rttlow || tp->t_rttlow > t) 9348 tp->t_rttlow = t; 9349 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9350 tcp_rack_xmit_timer_commit(rack, tp); 9351 } 9352 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 9353 return (ret_val); 9354 /* We may have changed to FIN_WAIT_1 above */ 9355 if (tp->t_state == TCPS_FIN_WAIT_1) { 9356 /* 9357 * In FIN_WAIT_1 STATE in addition to the processing 9358 * for the ESTABLISHED state if our FIN is now 9359 * acknowledged then enter FIN_WAIT_2. 9360 */ 9361 if (ourfinisacked) { 9362 /* 9363 * If we can't receive any more data, then 9364 * closing user can proceed. Starting the 9365 * timer is contrary to the specification, 9366 * but if we don't get a FIN we'll hang 9367 * forever. 9368 * 9369 * XXXjl: we should release the tp also, and 9370 * use a compressed state. 9371 */ 9372 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9373 soisdisconnected(so); 9374 tcp_timer_activate(tp, TT_2MSL, 9375 (tcp_fast_finwait2_recycle ? 9376 tcp_finwait2_timeout : 9377 TP_MAXIDLE(tp))); 9378 } 9379 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9380 } 9381 } 9382 } 9383 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9384 tiwin, thflags, nxt_pkt)); 9385 } 9386 9387 /* 9388 * Return value of 1, the TCB is unlocked and most 9389 * likely gone, return value of 0, the TCP is still 9390 * locked. 
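 *
 * As in the SYN_SENT path above, the handler below feeds an RTT
 * sample taken from the timestamp echo into the rack measurement
 * machinery before the regular ACK processing runs; schematically
 * (using the same constants as the code):
 *
 *     t = tcp_ts_getticks() - to->to_tsecr;
 *     tcp_rack_xmit_timer(rack, t + 1, 1, t * HPTS_USEC_IN_MSEC,
 *         0, NULL, 2);
 *     tcp_rack_xmit_timer_commit(rack, tp);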
9391 */ 9392 static int 9393 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 9394 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9395 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9396 { 9397 struct tcp_rack *rack; 9398 int32_t ret_val = 0; 9399 int32_t ourfinisacked = 0; 9400 9401 ctf_calc_rwin(so, tp); 9402 if ((thflags & TH_ACK) && 9403 (SEQ_LEQ(th->th_ack, tp->snd_una) || 9404 SEQ_GT(th->th_ack, tp->snd_max))) { 9405 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9406 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9407 return (1); 9408 } 9409 rack = (struct tcp_rack *)tp->t_fb_ptr; 9410 if (IS_FASTOPEN(tp->t_flags)) { 9411 /* 9412 * When a TFO connection is in SYN_RECEIVED, the 9413 * only valid packets are the initial SYN, a 9414 * retransmit/copy of the initial SYN (possibly with 9415 * a subset of the original data), a valid ACK, a 9416 * FIN, or a RST. 9417 */ 9418 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9419 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9420 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9421 return (1); 9422 } else if (thflags & TH_SYN) { 9423 /* non-initial SYN is ignored */ 9424 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9425 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9426 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9427 ctf_do_drop(m, NULL); 9428 return (0); 9429 } 9430 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9431 ctf_do_drop(m, NULL); 9432 return (0); 9433 } 9434 } 9435 if ((thflags & TH_RST) || 9436 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9437 return (ctf_process_rst(m, th, so, tp)); 9438 /* 9439 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9440 * it's less than ts_recent, drop it. 9441 */ 9442 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9443 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9444 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9445 return (ret_val); 9446 } 9447 /* 9448 * In the SYN-RECEIVED state, validate that the packet belongs to 9449 * this connection before trimming the data to fit the receive 9450 * window. Check the sequence number versus IRS since we know the 9451 * sequence numbers haven't wrapped. This is a partial fix for the 9452 * "LAND" DoS attack. 9453 */ 9454 if (SEQ_LT(th->th_seq, tp->irs)) { 9455 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9456 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9457 return (1); 9458 } 9459 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9460 return (ret_val); 9461 } 9462 /* 9463 * If last ACK falls within this segment's sequence numbers, record 9464 * its timestamp. NOTE: 1) That the test incorporates suggestions 9465 * from the latest proposal of the tcplw@cray.com list (Braden 9466 * 1993/04/26). 2) That updating only on newer timestamps interferes 9467 * with our earlier PAWS tests, so this check should be solely 9468 * predicated on the sequence space of this segment. 3) That we 9469 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9470 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9471 * SEG.Len, This modified check allows us to overcome RFC1323's 9472 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9473 * p.869. In such cases, we can still calculate the RTT correctly 9474 * when RCV.NXT == Last.ACK.Sent. 
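 *
 * Put differently, ts_recent is refreshed when, in sketch form
 * (matching the test just below):
 *
 *     a timestamp option is present and
 *     SEQ_LEQ(th->th_seq, tp->last_ack_sent) and
 *     SEQ_LEQ(tp->last_ack_sent,
 *         th->th_seq + tlen + (1 if SYN or FIN is set, else 0))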
9475 */ 9476 if ((to->to_flags & TOF_TS) != 0 && 9477 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9478 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9479 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9480 tp->ts_recent_age = tcp_ts_getticks(); 9481 tp->ts_recent = to->to_tsval; 9482 } 9483 tp->snd_wnd = tiwin; 9484 /* 9485 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9486 * is on (half-synchronized state), then queue data for later 9487 * processing; else drop segment and return. 9488 */ 9489 if ((thflags & TH_ACK) == 0) { 9490 if (IS_FASTOPEN(tp->t_flags)) { 9491 rack_cc_conn_init(tp); 9492 } 9493 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9494 tiwin, thflags, nxt_pkt)); 9495 } 9496 KMOD_TCPSTAT_INC(tcps_connects); 9497 soisconnected(so); 9498 /* Do window scaling? */ 9499 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9500 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9501 tp->rcv_scale = tp->request_r_scale; 9502 } 9503 /* 9504 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9505 * FIN-WAIT-1 9506 */ 9507 tp->t_starttime = ticks; 9508 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9509 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9510 tp->t_tfo_pending = NULL; 9511 } 9512 if (tp->t_flags & TF_NEEDFIN) { 9513 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9514 tp->t_flags &= ~TF_NEEDFIN; 9515 } else { 9516 tcp_state_change(tp, TCPS_ESTABLISHED); 9517 TCP_PROBE5(accept__established, NULL, tp, 9518 mtod(m, const char *), tp, th); 9519 /* 9520 * TFO connections call cc_conn_init() during SYN 9521 * processing. Calling it again here for such connections 9522 * is not harmless as it would undo the snd_cwnd reduction 9523 * that occurs when a TFO SYN|ACK is retransmitted. 9524 */ 9525 if (!IS_FASTOPEN(tp->t_flags)) 9526 rack_cc_conn_init(tp); 9527 } 9528 /* 9529 * Account for the ACK of our SYN prior to 9530 * regular ACK processing below, except for 9531 * simultaneous SYN, which is handled later. 9532 */ 9533 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9534 tp->snd_una++; 9535 /* 9536 * If segment contains data or ACK, will call tcp_reass() later; if 9537 * not, do so now to pass queued data to user. 9538 */ 9539 if (tlen == 0 && (thflags & TH_FIN) == 0) 9540 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9541 (struct mbuf *)0); 9542 tp->snd_wl1 = th->th_seq - 1; 9543 /* For syn-recv we need to possibly update the rtt */ 9544 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9545 uint32_t t; 9546 9547 t = tcp_ts_getticks() - to->to_tsecr; 9548 if (!tp->t_rttlow || tp->t_rttlow > t) 9549 tp->t_rttlow = t; 9550 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9551 tcp_rack_xmit_timer_commit(rack, tp); 9552 } 9553 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9554 return (ret_val); 9555 } 9556 if (tp->t_state == TCPS_FIN_WAIT_1) { 9557 /* We could have went to FIN_WAIT_1 (or EST) above */ 9558 /* 9559 * In FIN_WAIT_1 STATE in addition to the processing for the 9560 * ESTABLISHED state if our FIN is now acknowledged then 9561 * enter FIN_WAIT_2. 9562 */ 9563 if (ourfinisacked) { 9564 /* 9565 * If we can't receive any more data, then closing 9566 * user can proceed. Starting the timer is contrary 9567 * to the specification, but if we don't get a FIN 9568 * we'll hang forever. 9569 * 9570 * XXXjl: we should release the tp also, and use a 9571 * compressed state. 
9572 */ 9573 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9574 soisdisconnected(so); 9575 tcp_timer_activate(tp, TT_2MSL, 9576 (tcp_fast_finwait2_recycle ? 9577 tcp_finwait2_timeout : 9578 TP_MAXIDLE(tp))); 9579 } 9580 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9581 } 9582 } 9583 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9584 tiwin, thflags, nxt_pkt)); 9585 } 9586 9587 /* 9588 * Return value of 1, the TCB is unlocked and most 9589 * likely gone, return value of 0, the TCP is still 9590 * locked. 9591 */ 9592 static int 9593 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9594 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9595 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9596 { 9597 int32_t ret_val = 0; 9598 struct tcp_rack *rack; 9599 9600 /* 9601 * Header prediction: check for the two common cases of a 9602 * uni-directional data xfer. If the packet has no control flags, 9603 * is in-sequence, the window didn't change and we're not 9604 * retransmitting, it's a candidate. If the length is zero and the 9605 * ack moved forward, we're the sender side of the xfer. Just free 9606 * the data acked & wake any higher level process that was blocked 9607 * waiting for space. If the length is non-zero and the ack didn't 9608 * move, we're the receiver side. If we're getting packets in-order 9609 * (the reassembly queue is empty), add the data toc The socket 9610 * buffer and note that we need a delayed ack. Make sure that the 9611 * hidden state-flags are also off. Since we check for 9612 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 9613 */ 9614 rack = (struct tcp_rack *)tp->t_fb_ptr; 9615 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9616 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 9617 __predict_true(SEGQ_EMPTY(tp)) && 9618 __predict_true(th->th_seq == tp->rcv_nxt)) { 9619 if (tlen == 0) { 9620 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9621 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 9622 return (0); 9623 } 9624 } else { 9625 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9626 tiwin, nxt_pkt, iptos)) { 9627 return (0); 9628 } 9629 } 9630 } 9631 ctf_calc_rwin(so, tp); 9632 9633 if ((thflags & TH_RST) || 9634 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9635 return (ctf_process_rst(m, th, so, tp)); 9636 9637 /* 9638 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9639 * synchronized state. 9640 */ 9641 if (thflags & TH_SYN) { 9642 ctf_challenge_ack(m, th, tp, &ret_val); 9643 return (ret_val); 9644 } 9645 /* 9646 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9647 * it's less than ts_recent, drop it. 9648 */ 9649 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9650 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9651 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9652 return (ret_val); 9653 } 9654 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9655 return (ret_val); 9656 } 9657 /* 9658 * If last ACK falls within this segment's sequence numbers, record 9659 * its timestamp. NOTE: 1) That the test incorporates suggestions 9660 * from the latest proposal of the tcplw@cray.com list (Braden 9661 * 1993/04/26). 2) That updating only on newer timestamps interferes 9662 * with our earlier PAWS tests, so this check should be solely 9663 * predicated on the sequence space of this segment. 
3) That we 9664 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9665 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9666 * SEG.Len, This modified check allows us to overcome RFC1323's 9667 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9668 * p.869. In such cases, we can still calculate the RTT correctly 9669 * when RCV.NXT == Last.ACK.Sent. 9670 */ 9671 if ((to->to_flags & TOF_TS) != 0 && 9672 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9673 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9674 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9675 tp->ts_recent_age = tcp_ts_getticks(); 9676 tp->ts_recent = to->to_tsval; 9677 } 9678 /* 9679 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9680 * is on (half-synchronized state), then queue data for later 9681 * processing; else drop segment and return. 9682 */ 9683 if ((thflags & TH_ACK) == 0) { 9684 if (tp->t_flags & TF_NEEDSYN) { 9685 9686 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9687 tiwin, thflags, nxt_pkt)); 9688 9689 } else if (tp->t_flags & TF_ACKNOW) { 9690 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9691 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 9692 return (ret_val); 9693 } else { 9694 ctf_do_drop(m, NULL); 9695 return (0); 9696 } 9697 } 9698 /* 9699 * Ack processing. 9700 */ 9701 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9702 return (ret_val); 9703 } 9704 if (sbavail(&so->so_snd)) { 9705 if (ctf_progress_timeout_check(tp, true)) { 9706 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 9707 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9708 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9709 return (1); 9710 } 9711 } 9712 /* State changes only happen in rack_process_data() */ 9713 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9714 tiwin, thflags, nxt_pkt)); 9715 } 9716 9717 /* 9718 * Return value of 1, the TCB is unlocked and most 9719 * likely gone, return value of 0, the TCP is still 9720 * locked. 9721 */ 9722 static int 9723 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9724 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9725 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9726 { 9727 int32_t ret_val = 0; 9728 9729 ctf_calc_rwin(so, tp); 9730 if ((thflags & TH_RST) || 9731 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9732 return (ctf_process_rst(m, th, so, tp)); 9733 /* 9734 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9735 * synchronized state. 9736 */ 9737 if (thflags & TH_SYN) { 9738 ctf_challenge_ack(m, th, tp, &ret_val); 9739 return (ret_val); 9740 } 9741 /* 9742 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9743 * it's less than ts_recent, drop it. 9744 */ 9745 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9746 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9747 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9748 return (ret_val); 9749 } 9750 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9751 return (ret_val); 9752 } 9753 /* 9754 * If last ACK falls within this segment's sequence numbers, record 9755 * its timestamp. NOTE: 1) That the test incorporates suggestions 9756 * from the latest proposal of the tcplw@cray.com list (Braden 9757 * 1993/04/26). 
2) That updating only on newer timestamps interferes 9758 * with our earlier PAWS tests, so this check should be solely 9759 * predicated on the sequence space of this segment. 3) That we 9760 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9761 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9762 * SEG.Len, This modified check allows us to overcome RFC1323's 9763 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9764 * p.869. In such cases, we can still calculate the RTT correctly 9765 * when RCV.NXT == Last.ACK.Sent. 9766 */ 9767 if ((to->to_flags & TOF_TS) != 0 && 9768 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9769 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9770 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9771 tp->ts_recent_age = tcp_ts_getticks(); 9772 tp->ts_recent = to->to_tsval; 9773 } 9774 /* 9775 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9776 * is on (half-synchronized state), then queue data for later 9777 * processing; else drop segment and return. 9778 */ 9779 if ((thflags & TH_ACK) == 0) { 9780 if (tp->t_flags & TF_NEEDSYN) { 9781 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9782 tiwin, thflags, nxt_pkt)); 9783 9784 } else if (tp->t_flags & TF_ACKNOW) { 9785 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9786 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9787 return (ret_val); 9788 } else { 9789 ctf_do_drop(m, NULL); 9790 return (0); 9791 } 9792 } 9793 /* 9794 * Ack processing. 9795 */ 9796 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9797 return (ret_val); 9798 } 9799 if (sbavail(&so->so_snd)) { 9800 if (ctf_progress_timeout_check(tp, true)) { 9801 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9802 tp, tick, PROGRESS_DROP, __LINE__); 9803 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9804 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9805 return (1); 9806 } 9807 } 9808 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9809 tiwin, thflags, nxt_pkt)); 9810 } 9811 9812 static int 9813 rack_check_data_after_close(struct mbuf *m, 9814 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 9815 { 9816 struct tcp_rack *rack; 9817 9818 rack = (struct tcp_rack *)tp->t_fb_ptr; 9819 if (rack->rc_allow_data_af_clo == 0) { 9820 close_now: 9821 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9822 /* tcp_close will kill the inp pre-log the Reset */ 9823 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9824 tp = tcp_close(tp); 9825 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9826 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9827 return (1); 9828 } 9829 if (sbavail(&so->so_snd) == 0) 9830 goto close_now; 9831 /* Ok we allow data that is ignored and a followup reset */ 9832 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9833 tp->rcv_nxt = th->th_seq + *tlen; 9834 tp->t_flags2 |= TF2_DROP_AF_DATA; 9835 rack->r_wanted_output = 1; 9836 *tlen = 0; 9837 return (0); 9838 } 9839 9840 /* 9841 * Return value of 1, the TCB is unlocked and most 9842 * likely gone, return value of 0, the TCP is still 9843 * locked. 
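 *
 * rack_do_fin_wait_1() and the later-state handlers below all share
 * the same skeleton, roughly: RST (or FIN treated as a RST)
 * handling, a challenge ACK for any SYN, the PAWS check,
 * ctf_drop_checks(), the data-after-close guard, recording
 * ts_recent, rack_process_ack(), the state-specific reaction once
 * our FIN is acked, the progress timeout check and finally
 * rack_process_data().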
9844 */ 9845 static int 9846 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9847 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9848 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9849 { 9850 int32_t ret_val = 0; 9851 int32_t ourfinisacked = 0; 9852 9853 ctf_calc_rwin(so, tp); 9854 9855 if ((thflags & TH_RST) || 9856 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9857 return (ctf_process_rst(m, th, so, tp)); 9858 /* 9859 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9860 * synchronized state. 9861 */ 9862 if (thflags & TH_SYN) { 9863 ctf_challenge_ack(m, th, tp, &ret_val); 9864 return (ret_val); 9865 } 9866 /* 9867 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9868 * it's less than ts_recent, drop it. 9869 */ 9870 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9871 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9872 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9873 return (ret_val); 9874 } 9875 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9876 return (ret_val); 9877 } 9878 /* 9879 * If new data are received on a connection after the user processes 9880 * are gone, then RST the other end. 9881 */ 9882 if ((so->so_state & SS_NOFDREF) && tlen) { 9883 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9884 return (1); 9885 } 9886 /* 9887 * If last ACK falls within this segment's sequence numbers, record 9888 * its timestamp. NOTE: 1) That the test incorporates suggestions 9889 * from the latest proposal of the tcplw@cray.com list (Braden 9890 * 1993/04/26). 2) That updating only on newer timestamps interferes 9891 * with our earlier PAWS tests, so this check should be solely 9892 * predicated on the sequence space of this segment. 3) That we 9893 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9894 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9895 * SEG.Len, This modified check allows us to overcome RFC1323's 9896 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9897 * p.869. In such cases, we can still calculate the RTT correctly 9898 * when RCV.NXT == Last.ACK.Sent. 9899 */ 9900 if ((to->to_flags & TOF_TS) != 0 && 9901 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9902 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9903 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9904 tp->ts_recent_age = tcp_ts_getticks(); 9905 tp->ts_recent = to->to_tsval; 9906 } 9907 /* 9908 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9909 * is on (half-synchronized state), then queue data for later 9910 * processing; else drop segment and return. 9911 */ 9912 if ((thflags & TH_ACK) == 0) { 9913 if (tp->t_flags & TF_NEEDSYN) { 9914 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9915 tiwin, thflags, nxt_pkt)); 9916 } else if (tp->t_flags & TF_ACKNOW) { 9917 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9918 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9919 return (ret_val); 9920 } else { 9921 ctf_do_drop(m, NULL); 9922 return (0); 9923 } 9924 } 9925 /* 9926 * Ack processing. 9927 */ 9928 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9929 return (ret_val); 9930 } 9931 if (ourfinisacked) { 9932 /* 9933 * If we can't receive any more data, then closing user can 9934 * proceed. Starting the timer is contrary to the 9935 * specification, but if we don't get a FIN we'll hang 9936 * forever. 
9937 * 9938 * XXXjl: we should release the tp also, and use a 9939 * compressed state. 9940 */ 9941 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9942 soisdisconnected(so); 9943 tcp_timer_activate(tp, TT_2MSL, 9944 (tcp_fast_finwait2_recycle ? 9945 tcp_finwait2_timeout : 9946 TP_MAXIDLE(tp))); 9947 } 9948 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9949 } 9950 if (sbavail(&so->so_snd)) { 9951 if (ctf_progress_timeout_check(tp, true)) { 9952 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9953 tp, tick, PROGRESS_DROP, __LINE__); 9954 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9955 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9956 return (1); 9957 } 9958 } 9959 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9960 tiwin, thflags, nxt_pkt)); 9961 } 9962 9963 /* 9964 * Return value of 1, the TCB is unlocked and most 9965 * likely gone, return value of 0, the TCP is still 9966 * locked. 9967 */ 9968 static int 9969 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 9970 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9971 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9972 { 9973 int32_t ret_val = 0; 9974 int32_t ourfinisacked = 0; 9975 9976 ctf_calc_rwin(so, tp); 9977 9978 if ((thflags & TH_RST) || 9979 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9980 return (ctf_process_rst(m, th, so, tp)); 9981 /* 9982 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9983 * synchronized state. 9984 */ 9985 if (thflags & TH_SYN) { 9986 ctf_challenge_ack(m, th, tp, &ret_val); 9987 return (ret_val); 9988 } 9989 /* 9990 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9991 * it's less than ts_recent, drop it. 9992 */ 9993 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9994 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9995 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9996 return (ret_val); 9997 } 9998 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9999 return (ret_val); 10000 } 10001 /* 10002 * If new data are received on a connection after the user processes 10003 * are gone, then RST the other end. 10004 */ 10005 if ((so->so_state & SS_NOFDREF) && tlen) { 10006 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10007 return (1); 10008 } 10009 /* 10010 * If last ACK falls within this segment's sequence numbers, record 10011 * its timestamp. NOTE: 1) That the test incorporates suggestions 10012 * from the latest proposal of the tcplw@cray.com list (Braden 10013 * 1993/04/26). 2) That updating only on newer timestamps interferes 10014 * with our earlier PAWS tests, so this check should be solely 10015 * predicated on the sequence space of this segment. 3) That we 10016 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10017 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10018 * SEG.Len, This modified check allows us to overcome RFC1323's 10019 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10020 * p.869. In such cases, we can still calculate the RTT correctly 10021 * when RCV.NXT == Last.ACK.Sent. 
10022 */ 10023 if ((to->to_flags & TOF_TS) != 0 && 10024 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10025 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10026 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10027 tp->ts_recent_age = tcp_ts_getticks(); 10028 tp->ts_recent = to->to_tsval; 10029 } 10030 /* 10031 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10032 * is on (half-synchronized state), then queue data for later 10033 * processing; else drop segment and return. 10034 */ 10035 if ((thflags & TH_ACK) == 0) { 10036 if (tp->t_flags & TF_NEEDSYN) { 10037 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10038 tiwin, thflags, nxt_pkt)); 10039 } else if (tp->t_flags & TF_ACKNOW) { 10040 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10041 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 10042 return (ret_val); 10043 } else { 10044 ctf_do_drop(m, NULL); 10045 return (0); 10046 } 10047 } 10048 /* 10049 * Ack processing. 10050 */ 10051 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10052 return (ret_val); 10053 } 10054 if (ourfinisacked) { 10055 tcp_twstart(tp); 10056 m_freem(m); 10057 return (1); 10058 } 10059 if (sbavail(&so->so_snd)) { 10060 if (ctf_progress_timeout_check(tp, true)) { 10061 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10062 tp, tick, PROGRESS_DROP, __LINE__); 10063 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10064 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10065 return (1); 10066 } 10067 } 10068 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10069 tiwin, thflags, nxt_pkt)); 10070 } 10071 10072 /* 10073 * Return value of 1, the TCB is unlocked and most 10074 * likely gone, return value of 0, the TCP is still 10075 * locked. 10076 */ 10077 static int 10078 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10079 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10080 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10081 { 10082 int32_t ret_val = 0; 10083 int32_t ourfinisacked = 0; 10084 10085 ctf_calc_rwin(so, tp); 10086 10087 if ((thflags & TH_RST) || 10088 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10089 return (ctf_process_rst(m, th, so, tp)); 10090 /* 10091 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10092 * synchronized state. 10093 */ 10094 if (thflags & TH_SYN) { 10095 ctf_challenge_ack(m, th, tp, &ret_val); 10096 return (ret_val); 10097 } 10098 /* 10099 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10100 * it's less than ts_recent, drop it. 10101 */ 10102 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10103 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10104 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10105 return (ret_val); 10106 } 10107 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10108 return (ret_val); 10109 } 10110 /* 10111 * If new data are received on a connection after the user processes 10112 * are gone, then RST the other end. 10113 */ 10114 if ((so->so_state & SS_NOFDREF) && tlen) { 10115 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10116 return (1); 10117 } 10118 /* 10119 * If last ACK falls within this segment's sequence numbers, record 10120 * its timestamp. NOTE: 1) That the test incorporates suggestions 10121 * from the latest proposal of the tcplw@cray.com list (Braden 10122 * 1993/04/26). 
2) That updating only on newer timestamps interferes 10123 * with our earlier PAWS tests, so this check should be solely 10124 * predicated on the sequence space of this segment. 3) That we 10125 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10126 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10127 * SEG.Len, This modified check allows us to overcome RFC1323's 10128 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10129 * p.869. In such cases, we can still calculate the RTT correctly 10130 * when RCV.NXT == Last.ACK.Sent. 10131 */ 10132 if ((to->to_flags & TOF_TS) != 0 && 10133 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10134 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10135 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10136 tp->ts_recent_age = tcp_ts_getticks(); 10137 tp->ts_recent = to->to_tsval; 10138 } 10139 /* 10140 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10141 * is on (half-synchronized state), then queue data for later 10142 * processing; else drop segment and return. 10143 */ 10144 if ((thflags & TH_ACK) == 0) { 10145 if (tp->t_flags & TF_NEEDSYN) { 10146 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10147 tiwin, thflags, nxt_pkt)); 10148 } else if (tp->t_flags & TF_ACKNOW) { 10149 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10150 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10151 return (ret_val); 10152 } else { 10153 ctf_do_drop(m, NULL); 10154 return (0); 10155 } 10156 } 10157 /* 10158 * case TCPS_LAST_ACK: Ack processing. 10159 */ 10160 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10161 return (ret_val); 10162 } 10163 if (ourfinisacked) { 10164 tp = tcp_close(tp); 10165 ctf_do_drop(m, tp); 10166 return (1); 10167 } 10168 if (sbavail(&so->so_snd)) { 10169 if (ctf_progress_timeout_check(tp, true)) { 10170 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10171 tp, tick, PROGRESS_DROP, __LINE__); 10172 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10173 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10174 return (1); 10175 } 10176 } 10177 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10178 tiwin, thflags, nxt_pkt)); 10179 } 10180 10181 10182 /* 10183 * Return value of 1, the TCB is unlocked and most 10184 * likely gone, return value of 0, the TCP is still 10185 * locked. 10186 */ 10187 static int 10188 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 10189 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10190 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10191 { 10192 int32_t ret_val = 0; 10193 int32_t ourfinisacked = 0; 10194 10195 ctf_calc_rwin(so, tp); 10196 10197 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 10198 if ((thflags & TH_RST) || 10199 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10200 return (ctf_process_rst(m, th, so, tp)); 10201 /* 10202 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10203 * synchronized state. 10204 */ 10205 if (thflags & TH_SYN) { 10206 ctf_challenge_ack(m, th, tp, &ret_val); 10207 return (ret_val); 10208 } 10209 /* 10210 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10211 * it's less than ts_recent, drop it. 
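 * In sketch form the test below is:
 *
 *     if ((to->to_flags & TOF_TS) && tp->ts_recent &&
 *         TSTMP_LT(to->to_tsval, tp->ts_recent))
 *             hand the segment to ctf_ts_check(), which decides
 *             whether this really is a PAWS drop;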
10212 */ 10213 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10214 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10215 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10216 return (ret_val); 10217 } 10218 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10219 return (ret_val); 10220 } 10221 /* 10222 * If new data are received on a connection after the user processes 10223 * are gone, then RST the other end. 10224 */ 10225 if ((so->so_state & SS_NOFDREF) && 10226 tlen) { 10227 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10228 return (1); 10229 } 10230 /* 10231 * If last ACK falls within this segment's sequence numbers, record 10232 * its timestamp. NOTE: 1) That the test incorporates suggestions 10233 * from the latest proposal of the tcplw@cray.com list (Braden 10234 * 1993/04/26). 2) That updating only on newer timestamps interferes 10235 * with our earlier PAWS tests, so this check should be solely 10236 * predicated on the sequence space of this segment. 3) That we 10237 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10238 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10239 * SEG.Len, This modified check allows us to overcome RFC1323's 10240 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10241 * p.869. In such cases, we can still calculate the RTT correctly 10242 * when RCV.NXT == Last.ACK.Sent. 10243 */ 10244 if ((to->to_flags & TOF_TS) != 0 && 10245 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10246 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10247 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10248 tp->ts_recent_age = tcp_ts_getticks(); 10249 tp->ts_recent = to->to_tsval; 10250 } 10251 /* 10252 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10253 * is on (half-synchronized state), then queue data for later 10254 * processing; else drop segment and return. 10255 */ 10256 if ((thflags & TH_ACK) == 0) { 10257 if (tp->t_flags & TF_NEEDSYN) { 10258 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10259 tiwin, thflags, nxt_pkt)); 10260 } else if (tp->t_flags & TF_ACKNOW) { 10261 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10262 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10263 return (ret_val); 10264 } else { 10265 ctf_do_drop(m, NULL); 10266 return (0); 10267 } 10268 } 10269 /* 10270 * Ack processing. 
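 * Note that in FIN_WAIT_2 our FIN has already been acknowledged, so
 * unlike the CLOSING/LAST_ACK handlers above there is no
 * ourfinisacked transition to make here; we fall through to the
 * progress check and normal data processing.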
10271 */ 10272 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10273 return (ret_val); 10274 } 10275 if (sbavail(&so->so_snd)) { 10276 if (ctf_progress_timeout_check(tp, true)) { 10277 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10278 tp, tick, PROGRESS_DROP, __LINE__); 10279 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10280 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10281 return (1); 10282 } 10283 } 10284 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10285 tiwin, thflags, nxt_pkt)); 10286 } 10287 10288 static void inline 10289 rack_clear_rate_sample(struct tcp_rack *rack) 10290 { 10291 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 10292 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 10293 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 10294 } 10295 10296 static void 10297 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) 10298 { 10299 uint64_t bw_est, rate_wanted; 10300 int chged = 0; 10301 uint32_t user_max; 10302 10303 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 10304 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 10305 chged = 1; 10306 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 10307 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 10308 if (user_max != rack->r_ctl.rc_pace_max_segs) 10309 chged = 1; 10310 } 10311 if (rack->rc_force_max_seg) { 10312 rack->r_ctl.rc_pace_max_segs = user_max; 10313 } else if (rack->use_fixed_rate) { 10314 bw_est = rack_get_bw(rack); 10315 if ((rack->r_ctl.crte == NULL) || 10316 (bw_est != rack->r_ctl.crte->rate)) { 10317 rack->r_ctl.rc_pace_max_segs = user_max; 10318 } else { 10319 /* We are pacing right at the hardware rate */ 10320 uint32_t segsiz; 10321 10322 segsiz = min(ctf_fixed_maxseg(tp), 10323 rack->r_ctl.rc_pace_min_segs); 10324 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 10325 bw_est, segsiz, 0, 10326 rack->r_ctl.crte, NULL); 10327 } 10328 } else if (rack->rc_always_pace) { 10329 if (rack->r_ctl.gp_bw || 10330 #ifdef NETFLIX_PEAKRATE 10331 rack->rc_tp->t_maxpeakrate || 10332 #endif 10333 rack->r_ctl.init_rate) { 10334 /* We have a rate of some sort set */ 10335 uint32_t orig; 10336 10337 bw_est = rack_get_bw(rack); 10338 orig = rack->r_ctl.rc_pace_max_segs; 10339 rate_wanted = rack_get_output_bw(rack, bw_est, NULL); 10340 if (rate_wanted) { 10341 /* We have something */ 10342 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 10343 rate_wanted, 10344 ctf_fixed_maxseg(rack->rc_tp)); 10345 } else 10346 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 10347 if (orig != rack->r_ctl.rc_pace_max_segs) 10348 chged = 1; 10349 } else if ((rack->r_ctl.gp_bw == 0) && 10350 (rack->r_ctl.rc_pace_max_segs == 0)) { 10351 /* 10352 * If we have nothing limit us to bursting 10353 * out IW sized pieces. 10354 */ 10355 chged = 1; 10356 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 10357 } 10358 } 10359 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 10360 chged = 1; 10361 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 10362 } 10363 if (chged) 10364 rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); 10365 } 10366 10367 static int 10368 rack_init(struct tcpcb *tp) 10369 { 10370 struct tcp_rack *rack = NULL; 10371 struct rack_sendmap *insret; 10372 uint32_t iwin, snt, us_cts; 10373 10374 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 10375 if (tp->t_fb_ptr == NULL) { 10376 /* 10377 * We need to allocate memory but cant. 
The INP and INP_INFO 10378 * locks and they are recusive (happens during setup. So a 10379 * scheme to drop the locks fails :( 10380 * 10381 */ 10382 return (ENOMEM); 10383 } 10384 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 10385 10386 rack = (struct tcp_rack *)tp->t_fb_ptr; 10387 RB_INIT(&rack->r_ctl.rc_mtree); 10388 TAILQ_INIT(&rack->r_ctl.rc_free); 10389 TAILQ_INIT(&rack->r_ctl.rc_tmap); 10390 rack->rc_tp = tp; 10391 if (tp->t_inpcb) { 10392 rack->rc_inp = tp->t_inpcb; 10393 } 10394 /* Probably not needed but lets be sure */ 10395 rack_clear_rate_sample(rack); 10396 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 10397 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 10398 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 10399 if (use_rack_rr) 10400 rack->use_rack_rr = 1; 10401 if (V_tcp_delack_enabled) 10402 tp->t_delayed_ack = 1; 10403 else 10404 tp->t_delayed_ack = 0; 10405 if (rack_enable_shared_cwnd) 10406 rack->rack_enable_scwnd = 1; 10407 rack->rc_user_set_max_segs = rack_hptsi_segments; 10408 rack->rc_force_max_seg = 0; 10409 if (rack_use_imac_dack) 10410 rack->rc_dack_mode = 1; 10411 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 10412 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 10413 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 10414 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 10415 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 10416 rack->r_ctl.rc_early_recovery = rack_early_recovery; 10417 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 10418 rack->r_ctl.rc_highest_us_rtt = 0; 10419 if (rack_disable_prr) 10420 rack->rack_no_prr = 1; 10421 if (rack_gp_no_rec_chg) 10422 rack->rc_gp_no_rec_chg = 1; 10423 rack->rc_always_pace = rack_pace_every_seg; 10424 if (rack_enable_mqueue_for_nonpaced) 10425 rack->r_mbuf_queue = 1; 10426 else 10427 rack->r_mbuf_queue = 0; 10428 if (rack->r_mbuf_queue || rack->rc_always_pace) 10429 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 10430 else 10431 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10432 rack_set_pace_segments(tp, rack, __LINE__); 10433 if (rack_limits_scwnd) 10434 rack->r_limit_scw = 1; 10435 else 10436 rack->r_limit_scw = 0; 10437 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 10438 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 10439 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 10440 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 10441 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 10442 rack->r_ctl.rc_min_to = rack_min_to; 10443 microuptime(&rack->r_ctl.act_rcv_time); 10444 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10445 rack->r_running_late = 0; 10446 rack->r_running_early = 0; 10447 rack->rc_init_win = rack_default_init_window; 10448 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 10449 if (rack_do_dyn_mul) { 10450 /* When dynamic adjustment is on CA needs to start at 100% */ 10451 rack->rc_gp_dyn_mul = 1; 10452 if (rack_do_dyn_mul >= 100) 10453 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 10454 } else 10455 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 10456 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 10457 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 10458 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 10459 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 10460 rack_probertt_filter_life); 10461 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10462 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 10463 rack->r_ctl.rc_time_of_last_probertt = us_cts; 
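/*
 * Seed the probe-rtt bookkeeping with the connection start time
 * (us_cts is derived from act_rcv_time sampled just above), so
 * rack_check_probe_rtt() measures its intervals from connection
 * init rather than from time zero.
 */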
10464 rack->r_ctl.rc_time_probertt_starts = 0; 10465 /* Do we force on detection? */ 10466 #ifdef NETFLIX_EXP_DETECTION 10467 if (tcp_force_detection) 10468 rack->do_detection = 1; 10469 else 10470 #endif 10471 rack->do_detection = 0; 10472 if (rack_non_rxt_use_cr) 10473 rack->rack_rec_nonrxt_use_cr = 1; 10474 if (tp->snd_una != tp->snd_max) { 10475 /* Create a send map for the current outstanding data */ 10476 struct rack_sendmap *rsm; 10477 10478 rsm = rack_alloc(rack); 10479 if (rsm == NULL) { 10480 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10481 tp->t_fb_ptr = NULL; 10482 return (ENOMEM); 10483 } 10484 rsm->r_flags = RACK_OVERMAX; 10485 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 10486 rsm->r_rtr_cnt = 1; 10487 rsm->r_rtr_bytes = 0; 10488 rsm->r_start = tp->snd_una; 10489 rsm->r_end = tp->snd_max; 10490 rsm->usec_orig_send = us_cts; 10491 rsm->r_dupack = 0; 10492 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10493 #ifdef INVARIANTS 10494 if (insret != NULL) { 10495 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 10496 insret, rack, rsm); 10497 } 10498 #endif 10499 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10500 rsm->r_in_tmap = 1; 10501 } 10502 /* Cancel the GP measurement in progress */ 10503 tp->t_flags &= ~TF_GPUTINPROG; 10504 if (SEQ_GT(tp->snd_max, tp->iss)) 10505 snt = tp->snd_max - tp->iss; 10506 else 10507 snt = 0; 10508 iwin = rc_init_window(rack); 10509 if (snt < iwin) { 10510 /* We are not past the initial window 10511 * so we need to make sure cwnd is 10512 * correct. 10513 */ 10514 if (tp->snd_cwnd < iwin) 10515 tp->snd_cwnd = iwin; 10516 /* 10517 * If we are within the initial window 10518 * we want ssthresh to be unlimited. Setting 10519 * it to the rwnd (which the default stack does 10520 * and older racks) is not really a good idea 10521 * since we want to be in SS and grow both the 10522 * cwnd and the rwnd (via dynamic rwnd growth). If 10523 * we set it to the rwnd then as the peer grows its 10524 * rwnd we will be stuck in CA and never hit SS. 10525 * 10526 * Its far better to raise it up high (this takes the 10527 * risk that there as been a loss already, probably 10528 * we should have an indicator in all stacks of loss 10529 * but we don't), but considering the normal use this 10530 * is a risk worth taking. The consequences of not 10531 * hitting SS are far worse than going one more time 10532 * into it early on (before we have sent even a IW). 10533 * It is highly unlikely that we will have had a loss 10534 * before getting the IW out. 10535 */ 10536 tp->snd_ssthresh = 0xffffffff; 10537 } 10538 rack_stop_all_timers(tp); 10539 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10540 rack_log_rtt_shrinks(rack, us_cts, 0, 10541 __LINE__, RACK_RTTS_INIT); 10542 return (0); 10543 } 10544 10545 static int 10546 rack_handoff_ok(struct tcpcb *tp) 10547 { 10548 if ((tp->t_state == TCPS_CLOSED) || 10549 (tp->t_state == TCPS_LISTEN)) { 10550 /* Sure no problem though it may not stick */ 10551 return (0); 10552 } 10553 if ((tp->t_state == TCPS_SYN_SENT) || 10554 (tp->t_state == TCPS_SYN_RECEIVED)) { 10555 /* 10556 * We really don't know you have to get to ESTAB or beyond 10557 * to tell. 10558 */ 10559 return (EAGAIN); 10560 } 10561 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 10562 return (0); 10563 } 10564 /* 10565 * If we reach here we don't do SACK on this connection so we can 10566 * never do rack. 
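 * (Either the peer never agreed to SACK or it was disabled locally;
 * with rack_sack_not_required clear the loss-detection machinery
 * cannot function, so the handoff is refused and the connection
 * stays on its current stack.)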
10567 */ 10568 return (EINVAL); 10569 } 10570 10571 static void 10572 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10573 { 10574 if (tp->t_fb_ptr) { 10575 struct tcp_rack *rack; 10576 struct rack_sendmap *rsm, *nrsm, *rm; 10577 10578 rack = (struct tcp_rack *)tp->t_fb_ptr; 10579 #ifdef NETFLIX_SHARED_CWND 10580 if (rack->r_ctl.rc_scw) { 10581 uint32_t limit; 10582 10583 if (rack->r_limit_scw) 10584 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 10585 else 10586 limit = 0; 10587 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 10588 rack->r_ctl.rc_scw_index, 10589 limit); 10590 rack->r_ctl.rc_scw = NULL; 10591 } 10592 #endif 10593 /* rack does not use force data but other stacks may clear it */ 10594 tp->t_flags &= ~TF_FORCEDATA; 10595 if (tp->t_inpcb) { 10596 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10597 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10598 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 10599 } 10600 #ifdef TCP_BLACKBOX 10601 tcp_log_flowend(tp); 10602 #endif 10603 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 10604 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10605 #ifdef INVARIANTS 10606 if (rm != rsm) { 10607 panic("At fini, rack:%p rsm:%p rm:%p", 10608 rack, rsm, rm); 10609 } 10610 #endif 10611 uma_zfree(rack_zone, rsm); 10612 } 10613 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10614 while (rsm) { 10615 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 10616 uma_zfree(rack_zone, rsm); 10617 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10618 } 10619 rack->rc_free_cnt = 0; 10620 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10621 tp->t_fb_ptr = NULL; 10622 } 10623 /* Cancel the GP measurement in progress */ 10624 tp->t_flags &= ~TF_GPUTINPROG; 10625 /* Make sure snd_nxt is correctly set */ 10626 tp->snd_nxt = tp->snd_max; 10627 } 10628 10629 10630 static void 10631 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 10632 { 10633 switch (tp->t_state) { 10634 case TCPS_SYN_SENT: 10635 rack->r_state = TCPS_SYN_SENT; 10636 rack->r_substate = rack_do_syn_sent; 10637 break; 10638 case TCPS_SYN_RECEIVED: 10639 rack->r_state = TCPS_SYN_RECEIVED; 10640 rack->r_substate = rack_do_syn_recv; 10641 break; 10642 case TCPS_ESTABLISHED: 10643 rack_set_pace_segments(tp, rack, __LINE__); 10644 rack->r_state = TCPS_ESTABLISHED; 10645 rack->r_substate = rack_do_established; 10646 break; 10647 case TCPS_CLOSE_WAIT: 10648 rack->r_state = TCPS_CLOSE_WAIT; 10649 rack->r_substate = rack_do_close_wait; 10650 break; 10651 case TCPS_FIN_WAIT_1: 10652 rack->r_state = TCPS_FIN_WAIT_1; 10653 rack->r_substate = rack_do_fin_wait_1; 10654 break; 10655 case TCPS_CLOSING: 10656 rack->r_state = TCPS_CLOSING; 10657 rack->r_substate = rack_do_closing; 10658 break; 10659 case TCPS_LAST_ACK: 10660 rack->r_state = TCPS_LAST_ACK; 10661 rack->r_substate = rack_do_lastack; 10662 break; 10663 case TCPS_FIN_WAIT_2: 10664 rack->r_state = TCPS_FIN_WAIT_2; 10665 rack->r_substate = rack_do_fin_wait_2; 10666 break; 10667 case TCPS_LISTEN: 10668 case TCPS_CLOSED: 10669 case TCPS_TIME_WAIT: 10670 default: 10671 break; 10672 }; 10673 } 10674 10675 10676 static void 10677 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 10678 { 10679 /* 10680 * We received an ack, and then did not 10681 * call send or were bounced out due to the 10682 * hpts was running. Now a timer is up as well, is 10683 * it the right timer? 
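 * Roughly: the persist, retransmit, delayed-ACK and keepalive timers
 * are accepted when their preconditions still hold, and any of
 * RACK/TLP/RXT is fine while data is outstanding.  Anything else is
 * treated as stale; we pull the connection off the hpts (crediting
 * any remaining early time), cancel the old timer and start a fresh
 * one below.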
10684 */ 10685 struct rack_sendmap *rsm; 10686 int tmr_up; 10687 10688 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 10689 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 10690 return; 10691 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10692 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 10693 (tmr_up == PACE_TMR_RXT)) { 10694 /* Should be an RXT */ 10695 return; 10696 } 10697 if (rsm == NULL) { 10698 /* Nothing outstanding? */ 10699 if (tp->t_flags & TF_DELACK) { 10700 if (tmr_up == PACE_TMR_DELACK) 10701 /* We are supposed to have delayed ack up and we do */ 10702 return; 10703 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 10704 /* 10705 * if we hit enobufs then we would expect the possiblity 10706 * of nothing outstanding and the RXT up (and the hptsi timer). 10707 */ 10708 return; 10709 } else if (((V_tcp_always_keepalive || 10710 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 10711 (tp->t_state <= TCPS_CLOSING)) && 10712 (tmr_up == PACE_TMR_KEEP) && 10713 (tp->snd_max == tp->snd_una)) { 10714 /* We should have keep alive up and we do */ 10715 return; 10716 } 10717 } 10718 if (SEQ_GT(tp->snd_max, tp->snd_una) && 10719 ((tmr_up == PACE_TMR_TLP) || 10720 (tmr_up == PACE_TMR_RACK) || 10721 (tmr_up == PACE_TMR_RXT))) { 10722 /* 10723 * Either a Rack, TLP or RXT is fine if we 10724 * have outstanding data. 10725 */ 10726 return; 10727 } else if (tmr_up == PACE_TMR_DELACK) { 10728 /* 10729 * If the delayed ack was going to go off 10730 * before the rtx/tlp/rack timer were going to 10731 * expire, then that would be the timer in control. 10732 * Note we don't check the time here trusting the 10733 * code is correct. 10734 */ 10735 return; 10736 } 10737 /* 10738 * Ok the timer originally started is not what we want now. 10739 * We will force the hpts to be stopped if any, and restart 10740 * with the slot set to what was in the saved slot. 10741 */ 10742 if (rack->rc_inp->inp_in_hpts) { 10743 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 10744 uint32_t us_cts; 10745 10746 us_cts = tcp_get_usecs(NULL); 10747 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 10748 rack->r_early = 1; 10749 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 10750 } 10751 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 10752 } 10753 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 10754 } 10755 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10756 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10757 } 10758 10759 static int 10760 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 10761 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 10762 int32_t nxt_pkt, struct timeval *tv) 10763 { 10764 int32_t thflags, retval, did_out = 0; 10765 int32_t way_out = 0; 10766 uint32_t cts; 10767 uint32_t tiwin; 10768 struct timespec ts; 10769 struct tcpopt to; 10770 struct tcp_rack *rack; 10771 struct rack_sendmap *rsm; 10772 int32_t prev_state = 0; 10773 uint32_t us_cts; 10774 /* 10775 * tv passed from common code is from either M_TSTMP_LRO or 10776 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The 10777 * rack_pacing stack assumes tv always refers to 'now', so we overwrite 10778 * tv here to guarantee that. 
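 * (The mbuf's own timestamp is still honoured below when setting
 * act_rcv_time; only the 'now' used for cts and pacing decisions is
 * refreshed.  tcp_tv_to_mssectick() then reduces tv to millisecond
 * ticks, roughly tv_sec * 1000 + tv_usec / 1000.)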
10779 */ 10780 if (m->m_flags & M_TSTMP_LRO) 10781 tcp_get_usecs(tv); 10782 10783 cts = tcp_tv_to_mssectick(tv); 10784 rack = (struct tcp_rack *)tp->t_fb_ptr; 10785 10786 if ((m->m_flags & M_TSTMP) || 10787 (m->m_flags & M_TSTMP_LRO)) { 10788 mbuf_tstmp2timespec(m, &ts); 10789 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 10790 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 10791 } else 10792 rack->r_ctl.act_rcv_time = *tv; 10793 kern_prefetch(rack, &prev_state); 10794 prev_state = 0; 10795 thflags = th->th_flags; 10796 10797 NET_EPOCH_ASSERT(); 10798 INP_WLOCK_ASSERT(tp->t_inpcb); 10799 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 10800 __func__)); 10801 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 10802 __func__)); 10803 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 10804 union tcp_log_stackspecific log; 10805 struct timeval ltv; 10806 #ifdef NETFLIX_HTTP_LOGGING 10807 struct http_sendfile_track *http_req; 10808 10809 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10810 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 10811 } else { 10812 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 10813 } 10814 #endif 10815 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 10816 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 10817 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 10818 if (rack->rack_no_prr == 0) 10819 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 10820 else 10821 log.u_bbr.flex1 = 0; 10822 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 10823 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10824 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 10825 log.u_bbr.flex3 = m->m_flags; 10826 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 10827 if (m->m_flags & M_TSTMP) { 10828 /* Record the hardware timestamp if present */ 10829 mbuf_tstmp2timespec(m, &ts); 10830 ltv.tv_sec = ts.tv_sec; 10831 ltv.tv_usec = ts.tv_nsec / 1000; 10832 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 10833 } else if (m->m_flags & M_TSTMP_LRO) { 10834 /* Record the LRO the arrival timestamp */ 10835 mbuf_tstmp2timespec(m, &ts); 10836 ltv.tv_sec = ts.tv_sec; 10837 ltv.tv_usec = ts.tv_nsec / 1000; 10838 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 10839 } 10840 log.u_bbr.timeStamp = tcp_get_usecs(<v); 10841 /* Log the rcv time */ 10842 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 10843 #ifdef NETFLIX_HTTP_LOGGING 10844 log.u_bbr.applimited = tp->t_http_closed; 10845 log.u_bbr.applimited <<= 8; 10846 log.u_bbr.applimited |= tp->t_http_open; 10847 log.u_bbr.applimited <<= 8; 10848 log.u_bbr.applimited |= tp->t_http_req; 10849 if (http_req) { 10850 /* Copy out any client req info */ 10851 /* seconds */ 10852 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 10853 /* useconds */ 10854 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 10855 log.u_bbr.rttProp = http_req->timestamp; 10856 log.u_bbr.cur_del_rate = http_req->start; 10857 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 10858 log.u_bbr.flex8 |= 1; 10859 } else { 10860 log.u_bbr.flex8 |= 2; 10861 log.u_bbr.bw_inuse = http_req->end; 10862 } 10863 log.u_bbr.flex6 = http_req->start_seq; 10864 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 10865 log.u_bbr.flex8 |= 4; 10866 log.u_bbr.epoch = http_req->end_seq; 10867 } 10868 } 10869 #endif 10870 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 10871 tlen, &log, true, <v); 10872 } 10873 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 10874 way_out = 4; 10875 retval = 0; 10876 goto done_with_input; 10877 } 10878 /* 
10879 * If a segment with the ACK-bit set arrives in the SYN-SENT state 10880 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 10881 */ 10882 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 10883 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 10884 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10885 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10886 return(1); 10887 } 10888 /* 10889 * Segment received on connection. Reset idle time and keep-alive 10890 * timer. XXX: This should be done after segment validation to 10891 * ignore broken/spoofed segs. 10892 */ 10893 if (tp->t_idle_reduce && 10894 (tp->snd_max == tp->snd_una) && 10895 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 10896 counter_u64_add(rack_input_idle_reduces, 1); 10897 rack_cc_after_idle(rack, tp); 10898 } 10899 tp->t_rcvtime = ticks; 10900 /* 10901 * Unscale the window into a 32-bit value. For the SYN_SENT state 10902 * the scale is zero. 10903 */ 10904 tiwin = th->th_win << tp->snd_scale; 10905 #ifdef STATS 10906 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 10907 #endif 10908 if (tiwin > rack->r_ctl.rc_high_rwnd) 10909 rack->r_ctl.rc_high_rwnd = tiwin; 10910 /* 10911 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 10912 * this to occur after we've validated the segment. 10913 */ 10914 if (tp->t_flags2 & TF2_ECN_PERMIT) { 10915 if (thflags & TH_CWR) { 10916 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 10917 tp->t_flags |= TF_ACKNOW; 10918 } 10919 switch (iptos & IPTOS_ECN_MASK) { 10920 case IPTOS_ECN_CE: 10921 tp->t_flags2 |= TF2_ECN_SND_ECE; 10922 KMOD_TCPSTAT_INC(tcps_ecn_ce); 10923 break; 10924 case IPTOS_ECN_ECT0: 10925 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 10926 break; 10927 case IPTOS_ECN_ECT1: 10928 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 10929 break; 10930 } 10931 10932 /* Process a packet differently from RFC3168. */ 10933 cc_ecnpkt_handler(tp, th, iptos); 10934 10935 /* Congestion experienced. */ 10936 if (thflags & TH_ECE) { 10937 rack_cong_signal(tp, th, CC_ECN); 10938 } 10939 } 10940 /* 10941 * Parse options on any incoming segment. 10942 */ 10943 tcp_dooptions(&to, (u_char *)(th + 1), 10944 (th->th_off << 2) - sizeof(struct tcphdr), 10945 (thflags & TH_SYN) ? TO_SYN : 0); 10946 10947 /* 10948 * If echoed timestamp is later than the current time, fall back to 10949 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 10950 * were used when this connection was established. 10951 */ 10952 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 10953 to.to_tsecr -= tp->ts_offset; 10954 if (TSTMP_GT(to.to_tsecr, cts)) 10955 to.to_tsecr = 0; 10956 } 10957 10958 /* 10959 * If its the first time in we need to take care of options and 10960 * verify we can do SACK for rack! 10961 */ 10962 if (rack->r_state == 0) { 10963 /* Should be init'd by rack_init() */ 10964 KASSERT(rack->rc_inp != NULL, 10965 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 10966 if (rack->rc_inp == NULL) { 10967 rack->rc_inp = tp->t_inpcb; 10968 } 10969 10970 /* 10971 * Process options only when we get SYN/ACK back. The SYN 10972 * case for incoming connections is handled in tcp_syncache. 10973 * According to RFC1323 the window field in a SYN (i.e., a 10974 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 10975 * this is traditional behavior, may need to be cleaned up. 
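 * For example, with to.to_wscale of 7 a later window advertisement
 * of 65535 will be interpreted as 65535 << 7 (about 8MB), while the
 * window carried in this SYN/ACK itself is taken verbatim below.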
10976 */ 10977 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 10978 /* Handle parallel SYN for ECN */ 10979 if (!(thflags & TH_ACK) && 10980 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 10981 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 10982 tp->t_flags2 |= TF2_ECN_PERMIT; 10983 tp->t_flags2 |= TF2_ECN_SND_ECE; 10984 TCPSTAT_INC(tcps_ecn_shs); 10985 } 10986 if ((to.to_flags & TOF_SCALE) && 10987 (tp->t_flags & TF_REQ_SCALE)) { 10988 tp->t_flags |= TF_RCVD_SCALE; 10989 tp->snd_scale = to.to_wscale; 10990 } else 10991 tp->t_flags &= ~TF_REQ_SCALE; 10992 /* 10993 * Initial send window. It will be updated with the 10994 * next incoming segment to the scaled value. 10995 */ 10996 tp->snd_wnd = th->th_win; 10997 if ((to.to_flags & TOF_TS) && 10998 (tp->t_flags & TF_REQ_TSTMP)) { 10999 tp->t_flags |= TF_RCVD_TSTMP; 11000 tp->ts_recent = to.to_tsval; 11001 tp->ts_recent_age = cts; 11002 } else 11003 tp->t_flags &= ~TF_REQ_TSTMP; 11004 if (to.to_flags & TOF_MSS) 11005 tcp_mss(tp, to.to_mss); 11006 if ((tp->t_flags & TF_SACK_PERMIT) && 11007 (to.to_flags & TOF_SACKPERM) == 0) 11008 tp->t_flags &= ~TF_SACK_PERMIT; 11009 if (IS_FASTOPEN(tp->t_flags)) { 11010 if (to.to_flags & TOF_FASTOPEN) { 11011 uint16_t mss; 11012 11013 if (to.to_flags & TOF_MSS) 11014 mss = to.to_mss; 11015 else 11016 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 11017 mss = TCP6_MSS; 11018 else 11019 mss = TCP_MSS; 11020 tcp_fastopen_update_cache(tp, mss, 11021 to.to_tfo_len, to.to_tfo_cookie); 11022 } else 11023 tcp_fastopen_disable_path(tp); 11024 } 11025 } 11026 /* 11027 * At this point we are at the initial call. Here we decide 11028 * if we are doing RACK or not. We do this by seeing if 11029 * TF_SACK_PERMIT is set and the sack-not-required is clear. 11030 * The code now does do dup-ack counting so if you don't 11031 * switch back you won't get rack & TLP, but you will still 11032 * get this stack. 11033 */ 11034 11035 if ((rack_sack_not_required == 0) && 11036 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 11037 tcp_switch_back_to_default(tp); 11038 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11039 tlen, iptos); 11040 return (1); 11041 } 11042 /* Set the flag */ 11043 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11044 tcp_set_hpts(tp->t_inpcb); 11045 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 11046 } 11047 if (thflags & TH_FIN) 11048 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11049 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11050 if ((rack->rc_gp_dyn_mul) && 11051 (rack->use_fixed_rate == 0) && 11052 (rack->rc_always_pace)) { 11053 /* Check in on probertt */ 11054 rack_check_probe_rtt(rack, us_cts); 11055 } 11056 if (rack->forced_ack) { 11057 uint32_t us_rtt; 11058 11059 /* 11060 * A persist or keep-alive was forced out, update our 11061 * min rtt time. Note we do not worry about lost 11062 * retransmissions since KEEP-ALIVES and persists 11063 * are usually way long on times of sending (though 11064 * if we were really paranoid or worried we could 11065 * at least use timestamps if available to validate). 11066 */ 11067 rack->forced_ack = 0; 11068 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 11069 if (us_rtt == 0) 11070 us_rtt = 1; 11071 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 11072 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 11073 } 11074 /* 11075 * This is the one exception case where we set the rack state 11076 * always. 
All other times (timers etc) we must have a rack-state 11077 * set (so we assure we have done the checks above for SACK). 11078 */ 11079 rack->r_ctl.rc_rcvtime = cts; 11080 if (rack->r_state != tp->t_state) 11081 rack_set_state(tp, rack); 11082 if (SEQ_GT(th->th_ack, tp->snd_una) && 11083 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 11084 kern_prefetch(rsm, &prev_state); 11085 prev_state = rack->r_state; 11086 rack_clear_rate_sample(rack); 11087 retval = (*rack->r_substate) (m, th, so, 11088 tp, &to, drop_hdrlen, 11089 tlen, tiwin, thflags, nxt_pkt, iptos); 11090 #ifdef INVARIANTS 11091 if ((retval == 0) && 11092 (tp->t_inpcb == NULL)) { 11093 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11094 retval, tp, prev_state); 11095 } 11096 #endif 11097 if (retval == 0) { 11098 /* 11099 * If retval is 1 the tcb is unlocked and most likely the tp 11100 * is gone. 11101 */ 11102 INP_WLOCK_ASSERT(tp->t_inpcb); 11103 if ((rack->rc_gp_dyn_mul) && 11104 (rack->rc_always_pace) && 11105 (rack->use_fixed_rate == 0) && 11106 rack->in_probe_rtt && 11107 (rack->r_ctl.rc_time_probertt_starts == 0)) { 11108 /* 11109 * If we are going for target, lets recheck before 11110 * we output. 11111 */ 11112 rack_check_probe_rtt(rack, us_cts); 11113 } 11114 if (rack->set_pacing_done_a_iw == 0) { 11115 /* How much has been acked? */ 11116 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 11117 /* We have enough to set in the pacing segment size */ 11118 rack->set_pacing_done_a_iw = 1; 11119 rack_set_pace_segments(tp, rack, __LINE__); 11120 } 11121 } 11122 tcp_rack_xmit_timer_commit(rack, tp); 11123 if (nxt_pkt == 0) { 11124 if (rack->r_wanted_output != 0) { 11125 do_output_now: 11126 did_out = 1; 11127 (void)tp->t_fb->tfb_tcp_output(tp); 11128 } 11129 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 11130 } 11131 if ((nxt_pkt == 0) && 11132 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11133 (SEQ_GT(tp->snd_max, tp->snd_una) || 11134 (tp->t_flags & TF_DELACK) || 11135 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11136 (tp->t_state <= TCPS_CLOSING)))) { 11137 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 11138 if ((tp->snd_max == tp->snd_una) && 11139 ((tp->t_flags & TF_DELACK) == 0) && 11140 (rack->rc_inp->inp_in_hpts) && 11141 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11142 /* keep alive not needed if we are hptsi output yet */ 11143 ; 11144 } else { 11145 int late = 0; 11146 if (rack->rc_inp->inp_in_hpts) { 11147 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 11148 us_cts = tcp_get_usecs(NULL); 11149 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11150 rack->r_early = 1; 11151 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 11152 } else 11153 late = 1; 11154 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11155 } 11156 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11157 } 11158 if (late && (did_out == 0)) { 11159 /* 11160 * We are late in the sending 11161 * and we did not call the output 11162 * (this probably should not happen). 11163 */ 11164 goto do_output_now; 11165 } 11166 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 11167 } 11168 way_out = 1; 11169 } else if (nxt_pkt == 0) { 11170 /* Do we have the correct timer running? 
*/ 11171 rack_timer_audit(tp, rack, &so->so_snd); 11172 way_out = 2; 11173 } 11174 done_with_input: 11175 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 11176 if (did_out) 11177 rack->r_wanted_output = 0; 11178 #ifdef INVARIANTS 11179 if (tp->t_inpcb == NULL) { 11180 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11181 did_out, 11182 retval, tp, prev_state); 11183 } 11184 #endif 11185 } 11186 return (retval); 11187 } 11188 11189 void 11190 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11191 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11192 { 11193 struct timeval tv; 11194 11195 /* First lets see if we have old packets */ 11196 if (tp->t_in_pkt) { 11197 if (ctf_do_queued_segments(so, tp, 1)) { 11198 m_freem(m); 11199 return; 11200 } 11201 } 11202 if (m->m_flags & M_TSTMP_LRO) { 11203 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 11204 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 11205 } else { 11206 /* Should not be should we kassert instead? */ 11207 tcp_get_usecs(&tv); 11208 } 11209 if(rack_do_segment_nounlock(m, th, so, tp, 11210 drop_hdrlen, tlen, iptos, 0, &tv) == 0) 11211 INP_WUNLOCK(tp->t_inpcb); 11212 } 11213 11214 struct rack_sendmap * 11215 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 11216 { 11217 struct rack_sendmap *rsm = NULL; 11218 int32_t idx; 11219 uint32_t srtt = 0, thresh = 0, ts_low = 0; 11220 11221 /* Return the next guy to be re-transmitted */ 11222 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 11223 return (NULL); 11224 } 11225 if (tp->t_flags & TF_SENTFIN) { 11226 /* retran the end FIN? */ 11227 return (NULL); 11228 } 11229 /* ok lets look at this one */ 11230 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11231 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 11232 goto check_it; 11233 } 11234 rsm = rack_find_lowest_rsm(rack); 11235 if (rsm == NULL) { 11236 return (NULL); 11237 } 11238 check_it: 11239 if (rsm->r_flags & RACK_ACKED) { 11240 return (NULL); 11241 } 11242 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 11243 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 11244 /* Its not yet ready */ 11245 return (NULL); 11246 } 11247 srtt = rack_grab_rtt(tp, rack); 11248 idx = rsm->r_rtr_cnt - 1; 11249 ts_low = rsm->r_tim_lastsent[idx]; 11250 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 11251 if ((tsused == ts_low) || 11252 (TSTMP_LT(tsused, ts_low))) { 11253 /* No time since sending */ 11254 return (NULL); 11255 } 11256 if ((tsused - ts_low) < thresh) { 11257 /* It has not been long enough yet */ 11258 return (NULL); 11259 } 11260 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11261 ((rsm->r_flags & RACK_SACK_PASSED) && 11262 (rack->sack_attack_disable == 0))) { 11263 /* 11264 * We have passed the dup-ack threshold <or> 11265 * a SACK has indicated this is missing. 11266 * Note that if you are a declared attacker 11267 * it is only the dup-ack threshold that 11268 * will cause retransmits. 
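 * Either way the time gate above has already been satisfied: the
 * candidate must have aged at least 'thresh' since it was last
 * sent, which is on the order of one RACK rtt plus a small
 * reordering allowance.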
11269 */ 11270 /* log retransmit reason */ 11271 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 11272 return (rsm); 11273 } 11274 return (NULL); 11275 } 11276 11277 static void 11278 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 11279 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 11280 int line, struct rack_sendmap *rsm) 11281 { 11282 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11283 union tcp_log_stackspecific log; 11284 struct timeval tv; 11285 11286 memset(&log, 0, sizeof(log)); 11287 log.u_bbr.flex1 = slot; 11288 log.u_bbr.flex2 = len; 11289 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 11290 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 11291 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 11292 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 11293 log.u_bbr.use_lt_bw = rack->app_limited_needs_set; 11294 log.u_bbr.use_lt_bw <<= 1; 11295 log.u_bbr.use_lt_bw = rack->rc_gp_filled; 11296 log.u_bbr.use_lt_bw <<= 1; 11297 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 11298 log.u_bbr.use_lt_bw <<= 1; 11299 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 11300 log.u_bbr.pkt_epoch = line; 11301 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 11302 log.u_bbr.bw_inuse = bw_est; 11303 log.u_bbr.delRate = bw; 11304 if (rack->r_ctl.gp_bw == 0) 11305 log.u_bbr.cur_del_rate = 0; 11306 else 11307 log.u_bbr.cur_del_rate = rack_get_bw(rack); 11308 log.u_bbr.rttProp = len_time; 11309 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 11310 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 11311 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 11312 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 11313 /* We are in slow start */ 11314 log.u_bbr.flex7 = 1; 11315 } else { 11316 /* we are on congestion avoidance */ 11317 log.u_bbr.flex7 = 0; 11318 } 11319 log.u_bbr.flex8 = method; 11320 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11321 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11322 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 11323 log.u_bbr.cwnd_gain <<= 1; 11324 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 11325 log.u_bbr.cwnd_gain <<= 1; 11326 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 11327 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11328 &rack->rc_inp->inp_socket->so_rcv, 11329 &rack->rc_inp->inp_socket->so_snd, 11330 BBR_LOG_HPTSI_CALC, 0, 11331 0, &log, false, &tv); 11332 } 11333 } 11334 11335 static uint32_t 11336 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 11337 { 11338 uint32_t new_tso, user_max; 11339 11340 user_max = rack->rc_user_set_max_segs * mss; 11341 if (rack->rc_force_max_seg) { 11342 return (user_max); 11343 } 11344 if (rack->use_fixed_rate && 11345 ((rack->r_ctl.crte == NULL) || 11346 (bw != rack->r_ctl.crte->rate))) { 11347 /* Use the user mss since we are not exactly matched */ 11348 return (user_max); 11349 } 11350 new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 11351 if (new_tso > user_max) 11352 new_tso = user_max; 11353 return(new_tso); 11354 } 11355 11356 static void 11357 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, 11358 uint64_t rate, uint64_t hw_rate, int line, 11359 int error) 11360 { 11361 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11362 union tcp_log_stackspecific log; 11363 struct timeval tv; 11364 11365 memset(&log, 0, sizeof(log)); 11366 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 11367 log.u_bbr.flex2 = (hw_rate & 
0x00000000ffffffff); 11368 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 11369 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 11370 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11371 log.u_bbr.bw_inuse = rate; 11372 log.u_bbr.flex5 = line; 11373 log.u_bbr.flex6 = error; 11374 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 11375 log.u_bbr.flex8 = rack->use_fixed_rate; 11376 log.u_bbr.flex8 <<= 1; 11377 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 11378 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 11379 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11380 &rack->rc_inp->inp_socket->so_rcv, 11381 &rack->rc_inp->inp_socket->so_snd, 11382 BBR_LOG_HDWR_PACE, 0, 11383 0, &log, false, &tv); 11384 } 11385 } 11386 11387 static int32_t 11388 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) 11389 { 11390 uint64_t lentim, fill_bw; 11391 11392 /* Lets first see if we are full, if so continue with normal rate */ 11393 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 11394 return (slot); 11395 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 11396 return (slot); 11397 if (rack->r_ctl.rc_last_us_rtt == 0) 11398 return (slot); 11399 if (rack->rc_pace_fill_if_rttin_range && 11400 (rack->r_ctl.rc_last_us_rtt >= 11401 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 11402 /* The rtt is huge, N * smallest, lets not fill */ 11403 return (slot); 11404 } 11405 /* 11406 * first lets calculate the b/w based on the last us-rtt 11407 * and the sndwnd. 11408 */ 11409 fill_bw = rack->r_ctl.cwnd_to_use; 11410 /* Take the rwnd if its smaller */ 11411 if (fill_bw > rack->rc_tp->snd_wnd) 11412 fill_bw = rack->rc_tp->snd_wnd; 11413 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 11414 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 11415 /* We are below the min b/w */ 11416 if (fill_bw < RACK_MIN_BW) 11417 return (slot); 11418 /* 11419 * Ok fill_bw holds our mythical b/w to fill the cwnd 11420 * in a rtt, what does that time wise equate too? 11421 */ 11422 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 11423 lentim /= fill_bw; 11424 if (lentim < slot) { 11425 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 11426 0, lentim, 12, __LINE__, NULL); 11427 return ((int32_t)lentim); 11428 } else 11429 return (slot); 11430 } 11431 11432 static int32_t 11433 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 11434 { 11435 struct rack_sendmap *lrsm; 11436 int32_t slot = 0; 11437 int err; 11438 11439 if (rack->rc_always_pace == 0) { 11440 /* 11441 * We use the most optimistic possible cwnd/srtt for 11442 * sending calculations. This will make our 11443 * calculation anticipate getting more through 11444 * quicker then possible. But thats ok we don't want 11445 * the peer to have a gap in data sending. 11446 */ 11447 uint32_t srtt, cwnd, tr_perms = 0; 11448 int32_t reduce = 0; 11449 11450 old_method: 11451 /* 11452 * We keep no precise pacing with the old method 11453 * instead we use the pacer to mitigate bursts. 
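 * For illustration: with a 64000 byte cwnd and a 40ms minimum rtt,
 * tr_perms comes out to 1600 bytes/ms, so a 16000 byte send gets a
 * 10ms slot; a slot-reduction divisor of 4 (for example) trims that
 * to 8ms, i.e. 8000 usec on the hpts wheel.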
11454 */ 11455 rack->r_ctl.rc_agg_delayed = 0; 11456 rack->r_early = 0; 11457 rack->r_late = 0; 11458 rack->r_ctl.rc_agg_early = 0; 11459 if (rack->r_ctl.rc_rack_min_rtt) 11460 srtt = rack->r_ctl.rc_rack_min_rtt; 11461 else 11462 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 11463 if (rack->r_ctl.rc_rack_largest_cwnd) 11464 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 11465 else 11466 cwnd = rack->r_ctl.cwnd_to_use; 11467 tr_perms = cwnd / srtt; 11468 if (tr_perms == 0) { 11469 tr_perms = ctf_fixed_maxseg(tp); 11470 } 11471 /* 11472 * Calculate how long this will take to drain, if 11473 * the calculation comes out to zero, thats ok we 11474 * will use send_a_lot to possibly spin around for 11475 * more increasing tot_len_this_send to the point 11476 * that its going to require a pace, or we hit the 11477 * cwnd. Which in that case we are just waiting for 11478 * a ACK. 11479 */ 11480 slot = len / tr_perms; 11481 /* Now do we reduce the time so we don't run dry? */ 11482 if (slot && rack_slot_reduction) { 11483 reduce = (slot / rack_slot_reduction); 11484 if (reduce < slot) { 11485 slot -= reduce; 11486 } else 11487 slot = 0; 11488 } 11489 slot *= HPTS_USEC_IN_MSEC; 11490 if (rsm == NULL) { 11491 /* 11492 * We always consider ourselves app limited with old style 11493 * that are not retransmits. This could be the initial 11494 * measurement, but thats ok its all setup and specially 11495 * handled. If another send leaks out, then that too will 11496 * be mark app-limited. 11497 */ 11498 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11499 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { 11500 rack->r_ctl.rc_first_appl = lrsm; 11501 lrsm->r_flags |= RACK_APP_LIMITED; 11502 rack->r_ctl.rc_app_limited_cnt++; 11503 } 11504 } 11505 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); 11506 } else { 11507 uint64_t bw_est, res, lentim, rate_wanted; 11508 uint32_t orig_val, srtt, segs, oh; 11509 11510 if ((rack->r_rr_config == 1) && rsm) { 11511 return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); 11512 } 11513 if (rack->use_fixed_rate) { 11514 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 11515 } else if ((rack->r_ctl.init_rate == 0) && 11516 #ifdef NETFLIX_PEAKRATE 11517 (rack->rc_tp->t_maxpeakrate == 0) && 11518 #endif 11519 (rack->r_ctl.gp_bw == 0)) { 11520 /* no way to yet do an estimate */ 11521 bw_est = rate_wanted = 0; 11522 } else { 11523 bw_est = rack_get_bw(rack); 11524 rate_wanted = rack_get_output_bw(rack, bw_est, rsm); 11525 } 11526 if ((bw_est == 0) || (rate_wanted == 0)) { 11527 /* 11528 * No way yet to make a b/w estimate or 11529 * our raise is set incorrectly. 11530 */ 11531 goto old_method; 11532 } 11533 /* We need to account for all the overheads */ 11534 segs = (len + segsiz - 1) / segsiz; 11535 /* 11536 * We need the diff between 1514 bytes (e-mtu with e-hdr) 11537 * and how much data we put in each packet. Yes this 11538 * means we may be off if we are larger than 1500 bytes 11539 * or smaller. But this just makes us more conservative. 
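 * For example, with a 1448 byte segsiz the per-packet overhead is
 * taken as 1514 - 1448 = 66 bytes, so a 14480 byte send (10
 * segments) is costed as 14480 + 660 bytes; at a wanted rate of
 * 1.25MB/s (10Mbps) that works out to roughly 12112 usec of pacing
 * time below.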
11540 */ 11541 if (ETHERNET_SEGMENT_SIZE > segsiz) 11542 oh = ETHERNET_SEGMENT_SIZE - segsiz; 11543 else 11544 oh = 0; 11545 segs *= oh; 11546 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 11547 res = lentim / rate_wanted; 11548 slot = (uint32_t)res; 11549 orig_val = rack->r_ctl.rc_pace_max_segs; 11550 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11551 /* Did we change the TSO size, if so log it */ 11552 if (rack->r_ctl.rc_pace_max_segs != orig_val) 11553 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 11554 if ((rack->rc_pace_to_cwnd) && 11555 (rack->in_probe_rtt == 0) && 11556 (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 11557 /* 11558 * We want to pace at our rate *or* faster to 11559 * fill the cwnd to the max if its not full. 11560 */ 11561 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); 11562 } 11563 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 11564 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 11565 if ((rack->rack_hdw_pace_ena) && 11566 (rack->rack_hdrw_pacing == 0) && 11567 (rack->rack_attempt_hdwr_pace == 0)) { 11568 /* 11569 * Lets attempt to turn on hardware pacing 11570 * if we can. 11571 */ 11572 rack->rack_attempt_hdwr_pace = 1; 11573 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 11574 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11575 rate_wanted, 11576 RS_PACING_GEQ, 11577 &err); 11578 if (rack->r_ctl.crte) { 11579 rack->rack_hdrw_pacing = 1; 11580 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz, 11581 0, rack->r_ctl.crte, 11582 NULL); 11583 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11584 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11585 err); 11586 } 11587 } else if (rack->rack_hdrw_pacing && 11588 (rack->r_ctl.crte->rate != rate_wanted)) { 11589 /* Do we need to adjust our rate? */ 11590 const struct tcp_hwrate_limit_table *nrte; 11591 11592 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 11593 rack->rc_tp, 11594 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11595 rate_wanted, 11596 RS_PACING_GEQ, 11597 &err); 11598 if (nrte == NULL) { 11599 /* Lost the rate */ 11600 rack->rack_hdrw_pacing = 0; 11601 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11602 } else if (nrte != rack->r_ctl.crte) { 11603 rack->r_ctl.crte = nrte; 11604 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, 11605 segsiz, 0, 11606 rack->r_ctl.crte, 11607 NULL); 11608 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11609 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11610 err); 11611 } 11612 11613 } 11614 } 11615 if (rack_limit_time_with_srtt && 11616 (rack->use_fixed_rate == 0) && 11617 #ifdef NETFLIX_PEAKRATE 11618 (rack->rc_tp->t_maxpeakrate == 0) && 11619 #endif 11620 (rack->rack_hdrw_pacing == 0)) { 11621 /* 11622 * Sanity check, we do not allow the pacing delay 11623 * to be longer than the SRTT of the path. If it is 11624 * a slow path, then adding a packet should increase 11625 * the RTT and compensate for this i.e. the srtt will 11626 * be greater so the allowed pacing time will be greater. 11627 * 11628 * Note this restriction is not for where a peak rate 11629 * is set, we are doing fixed pacing or hardware pacing. 
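 * e.g. if the rate calculation produced a 50000 usec slot but the
 * smoothed rtt is only 30000 usec, the slot is clamped to 30000
 * usec (and the clamp is logged with method 99 below).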
11630 */ 11631 if (rack->rc_tp->t_srtt) 11632 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 11633 else 11634 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 11635 if (srtt < slot) { 11636 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 11637 slot = srtt; 11638 } 11639 } 11640 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 11641 } 11642 if (slot) 11643 counter_u64_add(rack_calc_nonzero, 1); 11644 else 11645 counter_u64_add(rack_calc_zero, 1); 11646 return (slot); 11647 } 11648 11649 static void 11650 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 11651 tcp_seq startseq, uint32_t sb_offset) 11652 { 11653 struct rack_sendmap *my_rsm = NULL; 11654 struct rack_sendmap fe; 11655 11656 if (tp->t_state < TCPS_ESTABLISHED) { 11657 /* 11658 * We don't start any measurements if we are 11659 * not at least established. 11660 */ 11661 return; 11662 } 11663 tp->t_flags |= TF_GPUTINPROG; 11664 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 11665 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 11666 tp->gput_seq = startseq; 11667 rack->app_limited_needs_set = 0; 11668 if (rack->in_probe_rtt) 11669 rack->measure_saw_probe_rtt = 1; 11670 else if ((rack->measure_saw_probe_rtt) && 11671 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 11672 rack->measure_saw_probe_rtt = 0; 11673 if (rack->rc_gp_filled) 11674 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11675 else { 11676 /* Special case initial measurement */ 11677 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); 11678 } 11679 /* 11680 * We take a guess out into the future, 11681 * if we have no measurement and no 11682 * initial rate, we measure the first 11683 * initial-windows worth of data to 11684 * speed up getting some GP measurement and 11685 * thus start pacing. 11686 */ 11687 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 11688 rack->app_limited_needs_set = 1; 11689 tp->gput_ack = startseq + max(rc_init_window(rack), 11690 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 11691 rack_log_pacing_delay_calc(rack, 11692 tp->gput_seq, 11693 tp->gput_ack, 11694 0, 11695 tp->gput_ts, 11696 rack->r_ctl.rc_app_limited_cnt, 11697 9, 11698 __LINE__, NULL); 11699 return; 11700 } 11701 if (sb_offset) { 11702 /* 11703 * We are out somewhere in the sb 11704 * can we use the already outstanding data? 11705 */ 11706 11707 if (rack->r_ctl.rc_app_limited_cnt == 0) { 11708 /* 11709 * Yes first one is good and in this case 11710 * the tp->gput_ts is correctly set based on 11711 * the last ack that arrived (no need to 11712 * set things up when an ack comes in). 11713 */ 11714 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11715 if ((my_rsm == NULL) || 11716 (my_rsm->r_rtr_cnt != 1)) { 11717 /* retransmission? */ 11718 goto use_latest; 11719 } 11720 } else { 11721 if (rack->r_ctl.rc_first_appl == NULL) { 11722 /* 11723 * If rc_first_appl is NULL 11724 * then the cnt should be 0. 11725 * This is probably an error, maybe 11726 * a KASSERT would be approprate. 11727 */ 11728 goto use_latest; 11729 } 11730 /* 11731 * If we have a marker pointer to the last one that is 11732 * app limited we can use that, but we need to set 11733 * things up so that when it gets ack'ed we record 11734 * the ack time (if its not already acked). 11735 */ 11736 rack->app_limited_needs_set = 1; 11737 /* 11738 * We want to get to the rsm that is either 11739 * next with space i.e. 
over 1 MSS or the one 11740 * after that (after the app-limited). 11741 */ 11742 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11743 rack->r_ctl.rc_first_appl); 11744 if (my_rsm) { 11745 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 11746 /* Have to use the next one */ 11747 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11748 my_rsm); 11749 else { 11750 /* Use after the first MSS of it is acked */ 11751 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 11752 goto start_set; 11753 } 11754 } 11755 if ((my_rsm == NULL) || 11756 (my_rsm->r_rtr_cnt != 1)) { 11757 /* 11758 * Either its a retransmit or 11759 * the last is the app-limited one. 11760 */ 11761 goto use_latest; 11762 } 11763 } 11764 tp->gput_seq = my_rsm->r_start; 11765 start_set: 11766 if (my_rsm->r_flags & RACK_ACKED) { 11767 /* 11768 * This one has been acked use the arrival ack time 11769 */ 11770 tp->gput_ts = my_rsm->r_ack_arrival; 11771 rack->app_limited_needs_set = 0; 11772 } 11773 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11774 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 11775 rack_log_pacing_delay_calc(rack, 11776 tp->gput_seq, 11777 tp->gput_ack, 11778 (uint64_t)my_rsm, 11779 tp->gput_ts, 11780 rack->r_ctl.rc_app_limited_cnt, 11781 9, 11782 __LINE__, NULL); 11783 return; 11784 } 11785 11786 use_latest: 11787 /* 11788 * We don't know how long we may have been 11789 * idle or if this is the first-send. Lets 11790 * setup the flag so we will trim off 11791 * the first ack'd data so we get a true 11792 * measurement. 11793 */ 11794 rack->app_limited_needs_set = 1; 11795 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 11796 /* Find this guy so we can pull the send time */ 11797 fe.r_start = startseq; 11798 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 11799 if (my_rsm) { 11800 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11801 if (my_rsm->r_flags & RACK_ACKED) { 11802 /* 11803 * Unlikely since its probably what was 11804 * just transmitted (but I am paranoid). 11805 */ 11806 tp->gput_ts = my_rsm->r_ack_arrival; 11807 rack->app_limited_needs_set = 0; 11808 } 11809 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 11810 /* This also is unlikely */ 11811 tp->gput_seq = my_rsm->r_start; 11812 } 11813 } else { 11814 /* 11815 * TSNH unless we have some send-map limit, 11816 * and even at that it should not be hitting 11817 * that limit (we should have stopped sending). 11818 */ 11819 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 11820 } 11821 rack_log_pacing_delay_calc(rack, 11822 tp->gput_seq, 11823 tp->gput_ack, 11824 (uint64_t)my_rsm, 11825 tp->gput_ts, 11826 rack->r_ctl.rc_app_limited_cnt, 11827 9, __LINE__, NULL); 11828 } 11829 11830 static inline uint32_t 11831 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 11832 uint32_t avail, int32_t sb_offset) 11833 { 11834 uint32_t len; 11835 uint32_t sendwin; 11836 11837 if (tp->snd_wnd > cwnd_to_use) 11838 sendwin = cwnd_to_use; 11839 else 11840 sendwin = tp->snd_wnd; 11841 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11842 /* We never want to go over our peers rcv-window */ 11843 len = 0; 11844 } else { 11845 uint32_t flight; 11846 11847 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 11848 if (flight >= sendwin) { 11849 /* 11850 * We have in flight what we are allowed by cwnd (if 11851 * it was rwnd blocking it would have hit above out 11852 * >= tp->snd_wnd). 
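 * For illustration, with snd_wnd 40000 and cwnd_to_use 50000 the
 * sendwin is 40000; if 40000 bytes are already in flight we return
 * 0 here, while with 30000 in flight the code below offers
 * len = 10000, further clipped by the peer's rwnd and by how much
 * data actually remains in the socket buffer.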
11853 */ 11854 return (0); 11855 } 11856 len = sendwin - flight; 11857 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11858 /* We would send too much (beyond the rwnd) */ 11859 len = tp->snd_wnd - ctf_outstanding(tp); 11860 } 11861 if ((len + sb_offset) > avail) { 11862 /* 11863 * We don't have that much in the SB, how much is 11864 * there? 11865 */ 11866 len = avail - sb_offset; 11867 } 11868 } 11869 return (len); 11870 } 11871 11872 static int 11873 rack_output(struct tcpcb *tp) 11874 { 11875 struct socket *so; 11876 uint32_t recwin; 11877 uint32_t sb_offset; 11878 int32_t len, flags, error = 0; 11879 struct mbuf *m; 11880 struct mbuf *mb; 11881 uint32_t if_hw_tsomaxsegcount = 0; 11882 uint32_t if_hw_tsomaxsegsize; 11883 int32_t segsiz, minseg; 11884 long tot_len_this_send = 0; 11885 struct ip *ip = NULL; 11886 #ifdef TCPDEBUG 11887 struct ipovly *ipov = NULL; 11888 #endif 11889 struct udphdr *udp = NULL; 11890 struct tcp_rack *rack; 11891 struct tcphdr *th; 11892 uint8_t pass = 0; 11893 uint8_t mark = 0; 11894 uint8_t wanted_cookie = 0; 11895 u_char opt[TCP_MAXOLEN]; 11896 unsigned ipoptlen, optlen, hdrlen, ulen=0; 11897 uint32_t rack_seq; 11898 11899 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 11900 unsigned ipsec_optlen = 0; 11901 11902 #endif 11903 int32_t idle, sendalot; 11904 int32_t sub_from_prr = 0; 11905 volatile int32_t sack_rxmit; 11906 struct rack_sendmap *rsm = NULL; 11907 int32_t tso, mtu; 11908 struct tcpopt to; 11909 int32_t slot = 0; 11910 int32_t sup_rack = 0; 11911 uint32_t cts, us_cts, delayed, early; 11912 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 11913 uint32_t cwnd_to_use; 11914 int32_t do_a_prefetch; 11915 int32_t prefetch_rsm = 0; 11916 int32_t orig_len; 11917 struct timeval tv; 11918 int32_t prefetch_so_done = 0; 11919 struct tcp_log_buffer *lgb = NULL; 11920 struct inpcb *inp; 11921 struct sockbuf *sb; 11922 #ifdef INET6 11923 struct ip6_hdr *ip6 = NULL; 11924 int32_t isipv6; 11925 #endif 11926 uint8_t filled_all = 0; 11927 bool hw_tls = false; 11928 11929 /* setup and take the cache hits here */ 11930 rack = (struct tcp_rack *)tp->t_fb_ptr; 11931 inp = rack->rc_inp; 11932 so = inp->inp_socket; 11933 sb = &so->so_snd; 11934 kern_prefetch(sb, &do_a_prefetch); 11935 do_a_prefetch = 1; 11936 hpts_calling = inp->inp_hpts_calls; 11937 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 11938 11939 NET_EPOCH_ASSERT(); 11940 INP_WLOCK_ASSERT(inp); 11941 #ifdef TCP_OFFLOAD 11942 if (tp->t_flags & TF_TOE) 11943 return (tcp_offload_output(tp)); 11944 #endif 11945 /* 11946 * For TFO connections in SYN_RECEIVED, only allow the initial 11947 * SYN|ACK and those sent by the retransmit timer. 11948 */ 11949 if (IS_FASTOPEN(tp->t_flags) && 11950 (tp->t_state == TCPS_SYN_RECEIVED) && 11951 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 11952 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 11953 return (0); 11954 #ifdef INET6 11955 if (rack->r_state) { 11956 /* Use the cache line loaded if possible */ 11957 isipv6 = rack->r_is_v6; 11958 } else { 11959 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 11960 } 11961 #endif 11962 early = 0; 11963 us_cts = tcp_get_usecs(&tv); 11964 cts = tcp_tv_to_mssectick(&tv); 11965 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 11966 inp->inp_in_hpts) { 11967 /* 11968 * We are on the hpts for some timer but not hptsi output. 11969 * Remove from the hpts unconditionally. 11970 */ 11971 rack_timer_cancel(tp, rack, cts, __LINE__); 11972 } 11973 /* Are we pacing and late? 
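 * i.e. a pacer-driven transmit was scheduled for rc_last_output_to
 * and we are running at or past that deadline; the shortfall is
 * recorded as 'delayed' and folded into rc_agg_delayed below so the
 * pacer can account for the lost time.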
*/ 11974 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 11975 TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { 11976 /* We are delayed */ 11977 delayed = us_cts - rack->r_ctl.rc_last_output_to; 11978 } else { 11979 delayed = 0; 11980 } 11981 /* Do the timers, which may override the pacer */ 11982 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 11983 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 11984 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 11985 return (0); 11986 } 11987 } 11988 if ((rack->r_timer_override) || 11989 (delayed) || 11990 (tp->t_state < TCPS_ESTABLISHED)) { 11991 if (tp->t_inpcb->inp_in_hpts) 11992 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11993 } else if (tp->t_inpcb->inp_in_hpts) { 11994 /* 11995 * On the hpts you can't pass even if ACKNOW is on, we will 11996 * when the hpts fires. 11997 */ 11998 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 11999 return (0); 12000 } 12001 inp->inp_hpts_calls = 0; 12002 /* Finish out both pacing early and late accounting */ 12003 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12004 TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12005 early = rack->r_ctl.rc_last_output_to - us_cts; 12006 } else 12007 early = 0; 12008 if (delayed) { 12009 rack->r_ctl.rc_agg_delayed += delayed; 12010 rack->r_late = 1; 12011 } else if (early) { 12012 rack->r_ctl.rc_agg_early += early; 12013 rack->r_early = 1; 12014 } 12015 /* Now that early/late accounting is done turn off the flag */ 12016 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12017 rack->r_wanted_output = 0; 12018 rack->r_timer_override = 0; 12019 /* 12020 * For TFO connections in SYN_SENT or SYN_RECEIVED, 12021 * only allow the initial SYN or SYN|ACK and those sent 12022 * by the retransmit timer. 12023 */ 12024 if (IS_FASTOPEN(tp->t_flags) && 12025 ((tp->t_state == TCPS_SYN_RECEIVED) || 12026 (tp->t_state == TCPS_SYN_SENT)) && 12027 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12028 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12029 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12030 goto just_return_nolock; 12031 } 12032 /* 12033 * Determine length of data that should be transmitted, and flags 12034 * that will be used. If there is some data or critical controls 12035 * (SYN, RST) to send, then transmit; otherwise, investigate 12036 * further. 12037 */ 12038 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 12039 if (tp->t_idle_reduce) { 12040 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 12041 rack_cc_after_idle(rack, tp); 12042 } 12043 tp->t_flags &= ~TF_LASTIDLE; 12044 if (idle) { 12045 if (tp->t_flags & TF_MORETOCOME) { 12046 tp->t_flags |= TF_LASTIDLE; 12047 idle = 0; 12048 } 12049 } 12050 if ((tp->snd_una == tp->snd_max) && 12051 rack->r_ctl.rc_went_idle_time && 12052 TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { 12053 idle = us_cts - rack->r_ctl.rc_went_idle_time; 12054 if (idle > rack_min_probertt_hold) { 12055 /* Count as a probe rtt */ 12056 if (rack->in_probe_rtt == 0) { 12057 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12058 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 12059 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 12060 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 12061 } else { 12062 rack_exit_probertt(rack, us_cts); 12063 } 12064 } 12065 idle = 0; 12066 } 12067 again: 12068 /* 12069 * If we've recently taken a timeout, snd_max will be greater than 12070 * snd_nxt. 
There may be SACK information that allows us to avoid 12071 * resending already delivered data. Adjust snd_nxt accordingly. 12072 */ 12073 sendalot = 0; 12074 us_cts = tcp_get_usecs(&tv); 12075 cts = tcp_tv_to_mssectick(&tv); 12076 tso = 0; 12077 mtu = 0; 12078 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12079 minseg = segsiz; 12080 sb_offset = tp->snd_max - tp->snd_una; 12081 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12082 #ifdef NETFLIX_SHARED_CWND 12083 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 12084 rack->rack_enable_scwnd) { 12085 /* We are doing cwnd sharing */ 12086 if (rack->rc_gp_filled && 12087 (rack->rack_attempted_scwnd == 0) && 12088 (rack->r_ctl.rc_scw == NULL) && 12089 tp->t_lib) { 12090 /* The pcbid is in, lets make an attempt */ 12091 counter_u64_add(rack_try_scwnd, 1); 12092 rack->rack_attempted_scwnd = 1; 12093 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 12094 &rack->r_ctl.rc_scw_index, 12095 segsiz); 12096 } 12097 if (rack->r_ctl.rc_scw && 12098 (rack->rack_scwnd_is_idle == 1) && 12099 (rack->rc_in_persist == 0) && 12100 sbavail(sb)) { 12101 /* we are no longer out of data */ 12102 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12103 rack->rack_scwnd_is_idle = 0; 12104 } 12105 if (rack->r_ctl.rc_scw) { 12106 /* First lets update and get the cwnd */ 12107 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 12108 rack->r_ctl.rc_scw_index, 12109 tp->snd_cwnd, tp->snd_wnd, segsiz); 12110 } 12111 } 12112 #endif 12113 flags = tcp_outflags[tp->t_state]; 12114 while (rack->rc_free_cnt < rack_free_cache) { 12115 rsm = rack_alloc(rack); 12116 if (rsm == NULL) { 12117 if (inp->inp_hpts_calls) 12118 /* Retry in a ms */ 12119 slot = (1 * HPTS_USEC_IN_MSEC); 12120 goto just_return_nolock; 12121 } 12122 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 12123 rack->rc_free_cnt++; 12124 rsm = NULL; 12125 } 12126 if (inp->inp_hpts_calls) 12127 inp->inp_hpts_calls = 0; 12128 sack_rxmit = 0; 12129 len = 0; 12130 rsm = NULL; 12131 if (flags & TH_RST) { 12132 SOCKBUF_LOCK(sb); 12133 goto send; 12134 } 12135 if (rack->r_ctl.rc_resend) { 12136 /* Retransmit timer */ 12137 rsm = rack->r_ctl.rc_resend; 12138 rack->r_ctl.rc_resend = NULL; 12139 rsm->r_flags &= ~RACK_TLP; 12140 len = rsm->r_end - rsm->r_start; 12141 sack_rxmit = 1; 12142 sendalot = 0; 12143 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12144 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12145 __func__, __LINE__, 12146 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12147 sb_offset = rsm->r_start - tp->snd_una; 12148 if (len >= segsiz) 12149 len = segsiz; 12150 } else if ((rack->rc_in_persist == 0) && 12151 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 12152 /* We have a retransmit that takes precedence */ 12153 rsm->r_flags &= ~RACK_TLP; 12154 if ((!IN_RECOVERY(tp->t_flags)) && 12155 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 12156 /* Enter recovery if not induced by a time-out */ 12157 rack->r_ctl.rc_rsm_start = rsm->r_start; 12158 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 12159 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 12160 rack_cong_signal(tp, NULL, CC_NDUPACK); 12161 /* 12162 * When we enter recovery we need to assure we send 12163 * one packet. 
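 *
 * With PRR enabled that single packet is guaranteed by crediting one
 * segment to the PRR send count (rc_prr_sndcnt = segsiz) just below;
 * the retransmit length is later clipped to that budget.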
12164 */ 12165 if (rack->rack_no_prr == 0) { 12166 rack->r_ctl.rc_prr_sndcnt = segsiz; 12167 rack_log_to_prr(rack, 13, 0); 12168 } 12169 } 12170 #ifdef INVARIANTS 12171 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 12172 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 12173 tp, rack, rsm, rsm->r_start, tp->snd_una); 12174 } 12175 #endif 12176 len = rsm->r_end - rsm->r_start; 12177 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12178 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12179 __func__, __LINE__, 12180 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12181 sb_offset = rsm->r_start - tp->snd_una; 12182 /* Can we send it within the PRR boundary? */ 12183 if (rack->rack_no_prr == 0) { 12184 if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 12185 /* It does not fit */ 12186 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 12187 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12188 /* 12189 * prr is less than a segment, we 12190 * have more acks due in besides 12191 * what we need to resend. Lets not send 12192 * to avoid sending small pieces of 12193 * what we need to retransmit. 12194 */ 12195 len = 0; 12196 goto just_return_nolock; 12197 } 12198 len = rack->r_ctl.rc_prr_sndcnt; 12199 } 12200 } 12201 sendalot = 0; 12202 if (len >= segsiz) 12203 len = segsiz; 12204 if (len > 0) { 12205 sub_from_prr = 1; 12206 sack_rxmit = 1; 12207 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12208 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12209 min(len, segsiz)); 12210 counter_u64_add(rack_rtm_prr_retran, 1); 12211 } 12212 } else if (rack->r_ctl.rc_tlpsend) { 12213 /* Tail loss probe */ 12214 long cwin; 12215 long tlen; 12216 12217 doing_tlp = 1; 12218 /* 12219 * Check if we can do a TLP with a RACK'd packet 12220 * this can happen if we are not doing the rack 12221 * cheat and we skipped to a TLP and it 12222 * went off. 12223 */ 12224 rsm = rack->r_ctl.rc_tlpsend; 12225 rsm->r_flags |= RACK_TLP; 12226 rack->r_ctl.rc_tlpsend = NULL; 12227 sack_rxmit = 1; 12228 tlen = rsm->r_end - rsm->r_start; 12229 if (tlen > segsiz) 12230 tlen = segsiz; 12231 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12232 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12233 __func__, __LINE__, 12234 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12235 sb_offset = rsm->r_start - tp->snd_una; 12236 cwin = min(tp->snd_wnd, tlen); 12237 len = cwin; 12238 } 12239 /* 12240 * Enforce a connection sendmap count limit if set 12241 * as long as we are not retransmiting. 12242 */ 12243 if ((rsm == NULL) && 12244 (rack->do_detection == 0) && 12245 (V_tcp_map_entries_limit > 0) && 12246 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12247 counter_u64_add(rack_to_alloc_limited, 1); 12248 if (!rack->alloc_limit_reported) { 12249 rack->alloc_limit_reported = 1; 12250 counter_u64_add(rack_alloc_limited_conns, 1); 12251 } 12252 goto just_return_nolock; 12253 } 12254 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 12255 /* we are retransmitting the fin */ 12256 len--; 12257 if (len) { 12258 /* 12259 * When retransmitting data do *not* include the 12260 * FIN. This could happen from a TLP probe. 12261 */ 12262 flags &= ~TH_FIN; 12263 } 12264 } 12265 #ifdef INVARIANTS 12266 /* For debugging */ 12267 rack->r_ctl.rc_rsm_at_retran = rsm; 12268 #endif 12269 /* 12270 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12271 * state flags. 
12272 */ 12273 if (tp->t_flags & TF_NEEDFIN) 12274 flags |= TH_FIN; 12275 if (tp->t_flags & TF_NEEDSYN) 12276 flags |= TH_SYN; 12277 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12278 void *end_rsm; 12279 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 12280 if (end_rsm) 12281 kern_prefetch(end_rsm, &prefetch_rsm); 12282 prefetch_rsm = 1; 12283 } 12284 SOCKBUF_LOCK(sb); 12285 /* 12286 * If snd_nxt == snd_max and we have transmitted a FIN, the 12287 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12288 * negative length. This can also occur when TCP opens up its 12289 * congestion window while receiving additional duplicate acks after 12290 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12291 * the fast-retransmit. 12292 * 12293 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12294 * set to snd_una, the sb_offset will be 0, and the length may wind 12295 * up 0. 12296 * 12297 * If sack_rxmit is true we are retransmitting from the scoreboard 12298 * in which case len is already set. 12299 */ 12300 if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 12301 uint32_t avail; 12302 12303 avail = sbavail(sb); 12304 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 12305 sb_offset = tp->snd_nxt - tp->snd_una; 12306 else 12307 sb_offset = 0; 12308 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 12309 if (rack->r_ctl.rc_tlp_new_data) { 12310 /* TLP is forcing out new data */ 12311 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 12312 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 12313 } 12314 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 12315 len = tp->snd_wnd; 12316 else 12317 len = rack->r_ctl.rc_tlp_new_data; 12318 rack->r_ctl.rc_tlp_new_data = 0; 12319 new_data_tlp = doing_tlp = 1; 12320 } else 12321 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 12322 if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { 12323 /* 12324 * For prr=off, we need to send only 1 MSS 12325 * at a time. We do this because another sack could 12326 * be arriving that causes us to send retransmits and 12327 * we don't want to be on a long pace due to a larger send 12328 * that keeps us from sending out the retransmit. 12329 */ 12330 len = segsiz; 12331 } 12332 } else { 12333 uint32_t outstanding; 12334 12335 /* 12336 * We are inside of a SACK recovery episode and are 12337 * sending new data, having retransmitted all the 12338 * data possible so far in the scoreboard. 12339 */ 12340 outstanding = tp->snd_max - tp->snd_una; 12341 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 12342 if (tp->snd_wnd > outstanding) { 12343 len = tp->snd_wnd - outstanding; 12344 /* Check to see if we have the data */ 12345 if ((sb_offset + len) > avail) { 12346 /* It does not all fit */ 12347 if (avail > sb_offset) 12348 len = avail - sb_offset; 12349 else 12350 len = 0; 12351 } 12352 } else 12353 len = 0; 12354 } else if (avail > sb_offset) 12355 len = avail - sb_offset; 12356 else 12357 len = 0; 12358 if (len > 0) { 12359 if (len > rack->r_ctl.rc_prr_sndcnt) 12360 len = rack->r_ctl.rc_prr_sndcnt; 12361 if (len > 0) { 12362 sub_from_prr = 1; 12363 counter_u64_add(rack_rtm_prr_newdata, 1); 12364 } 12365 } 12366 if (len > segsiz) { 12367 /* 12368 * We should never send more than a MSS when 12369 * retransmitting or sending new data in prr 12370 * mode unless the override flag is on. 
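 * (The override referred to here is the rc_prr_sendalot knob tested
 * right below.)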
Most 12371 * likely the PRR algorithm is not going to 12372 * let us send a lot as well :-) 12373 */ 12374 if (rack->r_ctl.rc_prr_sendalot == 0) 12375 len = segsiz; 12376 } else if (len < segsiz) { 12377 /* 12378 * Do we send any? The idea here is if the 12379 * send empty's the socket buffer we want to 12380 * do it. However if not then lets just wait 12381 * for our prr_sndcnt to get bigger. 12382 */ 12383 long leftinsb; 12384 12385 leftinsb = sbavail(sb) - sb_offset; 12386 if (leftinsb > len) { 12387 /* This send does not empty the sb */ 12388 len = 0; 12389 } 12390 } 12391 } 12392 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 12393 /* 12394 * If you have not established 12395 * and are not doing FAST OPEN 12396 * no data please. 12397 */ 12398 if ((sack_rxmit == 0) && 12399 (!IS_FASTOPEN(tp->t_flags))){ 12400 len = 0; 12401 sb_offset = 0; 12402 } 12403 } 12404 if (prefetch_so_done == 0) { 12405 kern_prefetch(so, &prefetch_so_done); 12406 prefetch_so_done = 1; 12407 } 12408 /* 12409 * Lop off SYN bit if it has already been sent. However, if this is 12410 * SYN-SENT state and if segment contains data and if we don't know 12411 * that foreign host supports TAO, suppress sending segment. 12412 */ 12413 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 12414 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 12415 /* 12416 * When sending additional segments following a TFO SYN|ACK, 12417 * do not include the SYN bit. 12418 */ 12419 if (IS_FASTOPEN(tp->t_flags) && 12420 (tp->t_state == TCPS_SYN_RECEIVED)) 12421 flags &= ~TH_SYN; 12422 } 12423 /* 12424 * Be careful not to send data and/or FIN on SYN segments. This 12425 * measure is needed to prevent interoperability problems with not 12426 * fully conformant TCP implementations. 12427 */ 12428 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12429 len = 0; 12430 flags &= ~TH_FIN; 12431 } 12432 /* 12433 * On TFO sockets, ensure no data is sent in the following cases: 12434 * 12435 * - When retransmitting SYN|ACK on a passively-created socket 12436 * 12437 * - When retransmitting SYN on an actively created socket 12438 * 12439 * - When sending a zero-length cookie (cookie request) on an 12440 * actively created socket 12441 * 12442 * - When the socket is in the CLOSED state (RST is being sent) 12443 */ 12444 if (IS_FASTOPEN(tp->t_flags) && 12445 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12446 ((tp->t_state == TCPS_SYN_SENT) && 12447 (tp->t_tfo_client_cookie_len == 0)) || 12448 (flags & TH_RST))) { 12449 sack_rxmit = 0; 12450 len = 0; 12451 } 12452 /* Without fast-open there should never be data sent on a SYN */ 12453 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 12454 tp->snd_nxt = tp->iss; 12455 len = 0; 12456 } 12457 orig_len = len; 12458 if (len <= 0) { 12459 /* 12460 * If FIN has been sent but not acked, but we haven't been 12461 * called to retransmit, len will be < 0. Otherwise, window 12462 * shrank after we sent into it. If window shrank to 0, 12463 * cancel pending retransmit, pull snd_nxt back to (closed) 12464 * window, and set the persist timer if it isn't already 12465 * going. If the window didn't close completely, just wait 12466 * for an ACK. 12467 * 12468 * We also do a general check here to ensure that we will 12469 * set the persist timer when we have data to send, but a 12470 * 0-byte window. This makes sure the persist timer is set 12471 * even if the packet hits one of the "goto send" lines 12472 * below. 
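 *
 * In short: a zero send window with nothing outstanding and data
 * still queued pulls snd_nxt back to snd_una and arms the persist
 * timer via rack_enter_persist() below.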
12473 */ 12474 len = 0; 12475 if ((tp->snd_wnd == 0) && 12476 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12477 (tp->snd_una == tp->snd_max) && 12478 (sb_offset < (int)sbavail(sb))) { 12479 tp->snd_nxt = tp->snd_una; 12480 rack_enter_persist(tp, rack, cts); 12481 } 12482 } else if ((rsm == NULL) && 12483 ((doing_tlp == 0) || (new_data_tlp == 1)) && 12484 (len < rack->r_ctl.rc_pace_max_segs)) { 12485 /* 12486 * We are not sending a maximum sized segment for 12487 * some reason. Should we not send anything (think 12488 * sws or persists)? 12489 */ 12490 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && 12491 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12492 (len < minseg) && 12493 (len < (int)(sbavail(sb) - sb_offset))) { 12494 /* 12495 * Here the rwnd is less than 12496 * the minimum pacing size, this is not a retransmit, 12497 * we are established and 12498 * the send is not the last in the socket buffer 12499 * we send nothing, and we may enter persists 12500 * if nothing is outstanding. 12501 */ 12502 len = 0; 12503 if (tp->snd_max == tp->snd_una) { 12504 /* 12505 * Nothing out we can 12506 * go into persists. 12507 */ 12508 rack_enter_persist(tp, rack, cts); 12509 tp->snd_nxt = tp->snd_una; 12510 } 12511 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 12512 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12513 (len < (int)(sbavail(sb) - sb_offset)) && 12514 (len < minseg)) { 12515 /* 12516 * Here we are not retransmitting, and 12517 * the cwnd is not so small that we could 12518 * not send at least a min size (rxt timer 12519 * not having gone off), We have 2 segments or 12520 * more already in flight, its not the tail end 12521 * of the socket buffer and the cwnd is blocking 12522 * us from sending out a minimum pacing segment size. 12523 * Lets not send anything. 12524 */ 12525 len = 0; 12526 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 12527 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 12528 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12529 (len < (int)(sbavail(sb) - sb_offset)) && 12530 (TCPS_HAVEESTABLISHED(tp->t_state))) { 12531 /* 12532 * Here we have a send window but we have 12533 * filled it up and we can't send another pacing segment. 12534 * We also have in flight more than 2 segments 12535 * and we are not completing the sb i.e. we allow 12536 * the last bytes of the sb to go out even if 12537 * its not a full pacing segment. 12538 */ 12539 len = 0; 12540 } 12541 } 12542 /* len will be >= 0 after this point. */ 12543 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 12544 tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); 12545 /* 12546 * Decide if we can use TCP Segmentation Offloading (if supported by 12547 * hardware). 12548 * 12549 * TSO may only be used if we are in a pure bulk sending state. The 12550 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 12551 * options prevent using TSO. With TSO the TCP header is the same 12552 * (except for the sequence number) for all generated packets. This 12553 * makes it impossible to transmit any options which vary per 12554 * generated segment or packet. 12555 * 12556 * IPv4 handling has a clear separation of ip options and ip header 12557 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 12558 * the right thing below to provide length of just ip options and thus 12559 * checking for ipoptlen is enough to decide if ip options are present. 
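 *
 * Concretely, the check below only sets tso when TF_TSO and
 * V_tcp_do_tso are on, len exceeds one segment, there is no UDP
 * tunneling port, no TCP-MD5 signature, no SACK blocks pending or
 * being retransmitted, and ipoptlen (which, with IPsec compiled in,
 * already includes ipsec_optlen) is zero.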
12560 */ 12561 12562 #ifdef INET6 12563 if (isipv6) 12564 ipoptlen = ip6_optlen(tp->t_inpcb); 12565 else 12566 #endif 12567 if (tp->t_inpcb->inp_options) 12568 ipoptlen = tp->t_inpcb->inp_options->m_len - 12569 offsetof(struct ipoption, ipopt_list); 12570 else 12571 ipoptlen = 0; 12572 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12573 /* 12574 * Pre-calculate here as we save another lookup into the darknesses 12575 * of IPsec that way and can actually decide if TSO is ok. 12576 */ 12577 #ifdef INET6 12578 if (isipv6 && IPSEC_ENABLED(ipv6)) 12579 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 12580 #ifdef INET 12581 else 12582 #endif 12583 #endif /* INET6 */ 12584 #ifdef INET 12585 if (IPSEC_ENABLED(ipv4)) 12586 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 12587 #endif /* INET */ 12588 #endif 12589 12590 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12591 ipoptlen += ipsec_optlen; 12592 #endif 12593 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 12594 (tp->t_port == 0) && 12595 ((tp->t_flags & TF_SIGNATURE) == 0) && 12596 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 12597 ipoptlen == 0) 12598 tso = 1; 12599 { 12600 uint32_t outstanding; 12601 12602 outstanding = tp->snd_max - tp->snd_una; 12603 if (tp->t_flags & TF_SENTFIN) { 12604 /* 12605 * If we sent a fin, snd_max is 1 higher than 12606 * snd_una 12607 */ 12608 outstanding--; 12609 } 12610 if (sack_rxmit) { 12611 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 12612 flags &= ~TH_FIN; 12613 } else { 12614 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 12615 sbused(sb))) 12616 flags &= ~TH_FIN; 12617 } 12618 } 12619 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 12620 (long)TCP_MAXWIN << tp->rcv_scale); 12621 12622 /* 12623 * Sender silly window avoidance. We transmit under the following 12624 * conditions when len is non-zero: 12625 * 12626 * - We have a full segment (or more with TSO) - This is the last 12627 * buffer in a write()/send() and we are either idle or running 12628 * NODELAY - we've timed out (e.g. persist timer) - we have more 12629 * then 1/2 the maximum send window's worth of data (receiver may be 12630 * limited the window size) - we need to retransmit 12631 */ 12632 if (len) { 12633 if (len >= segsiz) { 12634 goto send; 12635 } 12636 /* 12637 * NOTE! on localhost connections an 'ack' from the remote 12638 * end may occur synchronously with the output and cause us 12639 * to flush a buffer queued with moretocome. XXX 12640 * 12641 */ 12642 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 12643 (idle || (tp->t_flags & TF_NODELAY)) && 12644 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12645 (tp->t_flags & TF_NOPUSH) == 0) { 12646 pass = 2; 12647 goto send; 12648 } 12649 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 12650 pass = 22; 12651 goto send; 12652 } 12653 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 12654 pass = 4; 12655 goto send; 12656 } 12657 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 12658 pass = 5; 12659 goto send; 12660 } 12661 if (sack_rxmit) { 12662 pass = 6; 12663 goto send; 12664 } 12665 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 12666 (ctf_outstanding(tp) < (segsiz * 2))) { 12667 /* 12668 * We have less than two MSS outstanding (delayed ack) 12669 * and our rwnd will not let us send a full sized 12670 * MSS. Lets go ahead and let this small segment 12671 * out because we want to try to have at least two 12672 * packets inflight to not be caught by delayed ack. 
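 * e.g. with segsiz = 1448, 1000 bytes outstanding and a peer window
 * of 2000 only 1000 bytes of room remain; rather than hold the
 * sub-MSS send back we let it go, so two segments are in flight and
 * the peer's delayed-ACK timer is not what ends up pacing us.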
12673              */
12674             pass = 12;
12675             goto send;
12676         }
12677     }
12678     /*
12679      * Sending of standalone window updates.
12680      *
12681      * Window updates are important when we close our window due to a
12682      * full socket buffer and are opening it again after the application
12683      * reads data from it. Once the window has opened again and the
12684      * remote end starts to send again the ACK clock takes over and
12685      * provides the most current window information.
12686      *
12687      * We must avoid the silly window syndrome whereby every read from
12688      * the receive buffer, no matter how small, causes a window update
12689      * to be sent. We also should avoid sending a flurry of window
12690      * updates when the socket buffer had queued a lot of data and the
12691      * application is doing small reads.
12692      *
12693      * Prevent a flurry of pointless window updates by only sending an
12694      * update when we can increase the advertised window by more than
12695      * 1/4th of the socket buffer capacity. When the buffer is getting
12696      * full or is very small be more aggressive and send an update
12697      * whenever we can increase by two MSS-sized segments. In all other
12698      * situations the ACKs to new incoming data will carry further
12699      * window increases.
12700      *
12701      * Don't send an independent window update if a delayed ACK is
12702      * pending (it will get piggy-backed on it) or the remote side
12703      * already has done a half-close and won't send more data. Skip
12704      * this if the connection is in T/TCP half-open state.
12705      */
12706     if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
12707         !(tp->t_flags & TF_DELACK) &&
12708         !TCPS_HAVERCVDFIN(tp->t_state)) {
12709         /*
12710          * "adv" is the amount we could increase the window, taking
12711          * into account that we are limited by TCP_MAXWIN <<
12712          * tp->rcv_scale.
12713          */
12714         int32_t adv;
12715         int oldwin;
12716 
12717         adv = recwin;
12718         if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
12719             oldwin = (tp->rcv_adv - tp->rcv_nxt);
12720             if (adv > oldwin)
12721                 adv -= oldwin;
12722             else {
12723                 /* We can't increase the window */
12724                 adv = 0;
12725             }
12726         } else
12727             oldwin = 0;
12728 
12729         /*
12730          * If the new window size ends up being the same as or less
12731          * than the old size when it is scaled, then don't force
12732          * a window update.
12733          */
12734         if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
12735             goto dontupdate;
12736 
12737         if (adv >= (int32_t)(2 * segsiz) &&
12738             (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
12739             recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
12740             so->so_rcv.sb_hiwat <= 8 * segsiz)) {
12741             pass = 7;
12742             goto send;
12743         }
12744         if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
12745             pass = 23;
12746             goto send;
12747         }
12748     }
12749 dontupdate:
12750 
12751     /*
12752      * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
12753      * is also a catch-all for the retransmit timer timeout case.
12754      */
12755     if (tp->t_flags & TF_ACKNOW) {
12756         pass = 8;
12757         goto send;
12758     }
12759     if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
12760         pass = 9;
12761         goto send;
12762     }
12763     /*
12764      * If our state indicates that FIN should be sent and we have not
12765      * yet done so, then we need to send.
12766      */
12767     if ((flags & TH_FIN) &&
12768         (tp->snd_nxt == tp->snd_una)) {
12769         pass = 11;
12770         goto send;
12771     }
12772     /*
12773      * No reason to send a segment, just return.
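 *
 * The just_return path below classifies why nothing is being sent
 * (rwnd, cwnd, PRR, persists or application limited), may close the
 * current goodput measurement window, and then starts the hpts timer
 * so output resumes when the pacer or a timer fires.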
12774 */ 12775 just_return: 12776 SOCKBUF_UNLOCK(sb); 12777 just_return_nolock: 12778 { 12779 int app_limited = CTF_JR_SENT_DATA; 12780 12781 if (tot_len_this_send > 0) { 12782 /* Make sure snd_nxt is up to max */ 12783 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 12784 tp->snd_nxt = tp->snd_max; 12785 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 12786 } else { 12787 int end_window = 0; 12788 uint32_t seq = tp->gput_ack; 12789 12790 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12791 if (rsm) { 12792 /* 12793 * Mark the last sent that we just-returned (hinting 12794 * that delayed ack may play a role in any rtt measurement). 12795 */ 12796 rsm->r_just_ret = 1; 12797 } 12798 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12799 rack->r_ctl.rc_agg_delayed = 0; 12800 rack->r_early = 0; 12801 rack->r_late = 0; 12802 rack->r_ctl.rc_agg_early = 0; 12803 if ((ctf_outstanding(tp) + 12804 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 12805 minseg)) >= tp->snd_wnd) { 12806 /* We are limited by the rwnd */ 12807 app_limited = CTF_JR_RWND_LIMITED; 12808 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12809 /* We are limited by whats available -- app limited */ 12810 app_limited = CTF_JR_APP_LIMITED; 12811 } else if ((idle == 0) && 12812 ((tp->t_flags & TF_NODELAY) == 0) && 12813 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12814 (len < segsiz)) { 12815 /* 12816 * No delay is not on and the 12817 * user is sending less than 1MSS. This 12818 * brings out SWS avoidance so we 12819 * don't send. Another app-limited case. 12820 */ 12821 app_limited = CTF_JR_APP_LIMITED; 12822 } else if (tp->t_flags & TF_NOPUSH) { 12823 /* 12824 * The user has requested no push of 12825 * the last segment and we are 12826 * at the last segment. Another app 12827 * limited case. 12828 */ 12829 app_limited = CTF_JR_APP_LIMITED; 12830 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 12831 /* Its the cwnd */ 12832 app_limited = CTF_JR_CWND_LIMITED; 12833 } else if (rack->rc_in_persist == 1) { 12834 /* We are in persists */ 12835 app_limited = CTF_JR_PERSISTS; 12836 } else if (IN_RECOVERY(tp->t_flags) && 12837 (rack->rack_no_prr == 0) && 12838 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12839 app_limited = CTF_JR_PRR; 12840 } else { 12841 /* Now why here are we not sending? */ 12842 #ifdef NOW 12843 #ifdef INVARIANTS 12844 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 12845 #endif 12846 #endif 12847 app_limited = CTF_JR_ASSESSING; 12848 } 12849 /* 12850 * App limited in some fashion, for our pacing GP 12851 * measurements we don't want any gap (even cwnd). 12852 * Close down the measurement window. 12853 */ 12854 if (rack_cwnd_block_ends_measure && 12855 ((app_limited == CTF_JR_CWND_LIMITED) || 12856 (app_limited == CTF_JR_PRR))) { 12857 /* 12858 * The reason we are not sending is 12859 * the cwnd (or prr). We have been configured 12860 * to end the measurement window in 12861 * this case. 12862 */ 12863 end_window = 1; 12864 } else if (app_limited == CTF_JR_PERSISTS) { 12865 /* 12866 * We never end the measurement window 12867 * in persists, though in theory we 12868 * should be only entering after everything 12869 * is acknowledged (so we will probably 12870 * never come here). 12871 */ 12872 end_window = 0; 12873 } else if (rack_rwnd_block_ends_measure && 12874 (app_limited == CTF_JR_RWND_LIMITED)) { 12875 /* 12876 * We are rwnd limited and have been 12877 * configured to end the measurement 12878 * window in this case. 
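 * Ending the window simply means gput_ack is clipped to snd_max a
 * little further down, so the in-progress goodput measurement
 * completes with the data that is already out rather than waiting
 * on bytes we are not going to send.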
12879 */ 12880 end_window = 1; 12881 } else if (app_limited == CTF_JR_APP_LIMITED) { 12882 /* 12883 * A true application limited period, we have 12884 * ran out of data. 12885 */ 12886 end_window = 1; 12887 } else if (app_limited == CTF_JR_ASSESSING) { 12888 /* 12889 * In the assessing case we hit the end of 12890 * the if/else and had no known reason 12891 * This will panic us under invariants.. 12892 * 12893 * If we get this out in logs we need to 12894 * investagate which reason we missed. 12895 */ 12896 end_window = 1; 12897 } 12898 if (end_window) { 12899 uint8_t log = 0; 12900 12901 if ((tp->t_flags & TF_GPUTINPROG) && 12902 SEQ_GT(tp->gput_ack, tp->snd_max)) { 12903 /* Mark the last packet has app limited */ 12904 tp->gput_ack = tp->snd_max; 12905 log = 1; 12906 } 12907 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12908 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 12909 if (rack->r_ctl.rc_app_limited_cnt == 0) 12910 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 12911 else { 12912 /* 12913 * Go out to the end app limited and mark 12914 * this new one as next and move the end_appl up 12915 * to this guy. 12916 */ 12917 if (rack->r_ctl.rc_end_appl) 12918 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 12919 rack->r_ctl.rc_end_appl = rsm; 12920 } 12921 rsm->r_flags |= RACK_APP_LIMITED; 12922 rack->r_ctl.rc_app_limited_cnt++; 12923 } 12924 if (log) 12925 rack_log_pacing_delay_calc(rack, 12926 rack->r_ctl.rc_app_limited_cnt, seq, 12927 tp->gput_ack, 0, 0, 4, __LINE__, NULL); 12928 } 12929 } 12930 if (slot) { 12931 /* set the rack tcb into the slot N */ 12932 counter_u64_add(rack_paced_segments, 1); 12933 } else if (tot_len_this_send) { 12934 counter_u64_add(rack_unpaced_segments, 1); 12935 } 12936 /* Check if we need to go into persists or not */ 12937 if ((rack->rc_in_persist == 0) && 12938 (tp->snd_max == tp->snd_una) && 12939 TCPS_HAVEESTABLISHED(tp->t_state) && 12940 sbavail(sb) && 12941 (sbavail(sb) > tp->snd_wnd) && 12942 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 12943 /* Yes lets make sure to move to persist before timer-start */ 12944 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12945 } 12946 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 12947 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 12948 } 12949 #ifdef NETFLIX_SHARED_CWND 12950 if ((sbavail(sb) == 0) && 12951 rack->r_ctl.rc_scw) { 12952 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12953 rack->rack_scwnd_is_idle = 1; 12954 } 12955 #endif 12956 return (0); 12957 12958 send: 12959 if ((flags & TH_FIN) && 12960 sbavail(sb)) { 12961 /* 12962 * We do not transmit a FIN 12963 * with data outstanding. We 12964 * need to make it so all data 12965 * is acked first. 12966 */ 12967 flags &= ~TH_FIN; 12968 } 12969 /* Enforce stack imposed max seg size if we have one */ 12970 if (rack->r_ctl.rc_pace_max_segs && 12971 (len > rack->r_ctl.rc_pace_max_segs)) { 12972 mark = 1; 12973 len = rack->r_ctl.rc_pace_max_segs; 12974 } 12975 SOCKBUF_LOCK_ASSERT(sb); 12976 if (len > 0) { 12977 if (len >= segsiz) 12978 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 12979 else 12980 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 12981 } 12982 /* 12983 * Before ESTABLISHED, force sending of initial options unless TCP 12984 * set not to do any options. NOTE: we assume that the IP/TCP header 12985 * plus TCP options always fit in a single mbuf, leaving room for a 12986 * maximum link header, i.e. 
max_linkhdr + sizeof (struct tcpiphdr) 12987 * + optlen <= MCLBYTES 12988 */ 12989 optlen = 0; 12990 #ifdef INET6 12991 if (isipv6) 12992 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12993 else 12994 #endif 12995 hdrlen = sizeof(struct tcpiphdr); 12996 12997 /* 12998 * Compute options for segment. We only have to care about SYN and 12999 * established connection segments. Options for SYN-ACK segments 13000 * are handled in TCP syncache. 13001 */ 13002 to.to_flags = 0; 13003 if ((tp->t_flags & TF_NOOPT) == 0) { 13004 /* Maximum segment size. */ 13005 if (flags & TH_SYN) { 13006 tp->snd_nxt = tp->iss; 13007 to.to_mss = tcp_mssopt(&inp->inp_inc); 13008 #ifdef NETFLIX_TCPOUDP 13009 if (tp->t_port) 13010 to.to_mss -= V_tcp_udp_tunneling_overhead; 13011 #endif 13012 to.to_flags |= TOF_MSS; 13013 13014 /* 13015 * On SYN or SYN|ACK transmits on TFO connections, 13016 * only include the TFO option if it is not a 13017 * retransmit, as the presence of the TFO option may 13018 * have caused the original SYN or SYN|ACK to have 13019 * been dropped by a middlebox. 13020 */ 13021 if (IS_FASTOPEN(tp->t_flags) && 13022 (tp->t_rxtshift == 0)) { 13023 if (tp->t_state == TCPS_SYN_RECEIVED) { 13024 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 13025 to.to_tfo_cookie = 13026 (u_int8_t *)&tp->t_tfo_cookie.server; 13027 to.to_flags |= TOF_FASTOPEN; 13028 wanted_cookie = 1; 13029 } else if (tp->t_state == TCPS_SYN_SENT) { 13030 to.to_tfo_len = 13031 tp->t_tfo_client_cookie_len; 13032 to.to_tfo_cookie = 13033 tp->t_tfo_cookie.client; 13034 to.to_flags |= TOF_FASTOPEN; 13035 wanted_cookie = 1; 13036 /* 13037 * If we wind up having more data to 13038 * send with the SYN than can fit in 13039 * one segment, don't send any more 13040 * until the SYN|ACK comes back from 13041 * the other end. 13042 */ 13043 sendalot = 0; 13044 } 13045 } 13046 } 13047 /* Window scaling. */ 13048 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 13049 to.to_wscale = tp->request_r_scale; 13050 to.to_flags |= TOF_SCALE; 13051 } 13052 /* Timestamps. */ 13053 if ((tp->t_flags & TF_RCVD_TSTMP) || 13054 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 13055 to.to_tsval = cts + tp->ts_offset; 13056 to.to_tsecr = tp->ts_recent; 13057 to.to_flags |= TOF_TS; 13058 } 13059 /* Set receive buffer autosizing timestamp. */ 13060 if (tp->rfbuf_ts == 0 && 13061 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 13062 tp->rfbuf_ts = tcp_ts_getticks(); 13063 /* Selective ACK's. */ 13064 if (flags & TH_SYN) 13065 to.to_flags |= TOF_SACKPERM; 13066 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 13067 tp->rcv_numsacks > 0) { 13068 to.to_flags |= TOF_SACK; 13069 to.to_nsacks = tp->rcv_numsacks; 13070 to.to_sacks = (u_char *)tp->sackblks; 13071 } 13072 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13073 /* TCP-MD5 (RFC2385). */ 13074 if (tp->t_flags & TF_SIGNATURE) 13075 to.to_flags |= TOF_SIGNATURE; 13076 #endif /* TCP_SIGNATURE */ 13077 13078 /* Processing the options. */ 13079 hdrlen += optlen = tcp_addoptions(&to, opt); 13080 /* 13081 * If we wanted a TFO option to be added, but it was unable 13082 * to fit, ensure no data is sent. 13083 */ 13084 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 13085 !(to.to_flags & TOF_FASTOPEN)) 13086 len = 0; 13087 } 13088 #ifdef NETFLIX_TCPOUDP 13089 if (tp->t_port) { 13090 if (V_tcp_udp_tunneling_port == 0) { 13091 /* The port was removed?? 
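 * i.e. V_tcp_udp_tunneling_port was cleared while this connection
 * still has t_port set; give up on the send with EHOSTUNREACH
 * rather than build a packet we can no longer encapsulate.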
*/ 13092 SOCKBUF_UNLOCK(&so->so_snd); 13093 return (EHOSTUNREACH); 13094 } 13095 hdrlen += sizeof(struct udphdr); 13096 } 13097 #endif 13098 #ifdef INET6 13099 if (isipv6) 13100 ipoptlen = ip6_optlen(tp->t_inpcb); 13101 else 13102 #endif 13103 if (tp->t_inpcb->inp_options) 13104 ipoptlen = tp->t_inpcb->inp_options->m_len - 13105 offsetof(struct ipoption, ipopt_list); 13106 else 13107 ipoptlen = 0; 13108 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13109 ipoptlen += ipsec_optlen; 13110 #endif 13111 13112 /* 13113 * Adjust data length if insertion of options will bump the packet 13114 * length beyond the t_maxseg length. Clear the FIN bit because we 13115 * cut off the tail of the segment. 13116 */ 13117 if (len + optlen + ipoptlen > tp->t_maxseg) { 13118 if (tso) { 13119 uint32_t if_hw_tsomax; 13120 uint32_t moff; 13121 int32_t max_len; 13122 13123 /* extract TSO information */ 13124 if_hw_tsomax = tp->t_tsomax; 13125 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13126 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13127 KASSERT(ipoptlen == 0, 13128 ("%s: TSO can't do IP options", __func__)); 13129 13130 /* 13131 * Check if we should limit by maximum payload 13132 * length: 13133 */ 13134 if (if_hw_tsomax != 0) { 13135 /* compute maximum TSO length */ 13136 max_len = (if_hw_tsomax - hdrlen - 13137 max_linkhdr); 13138 if (max_len <= 0) { 13139 len = 0; 13140 } else if (len > max_len) { 13141 sendalot = 1; 13142 len = max_len; 13143 mark = 2; 13144 } 13145 } 13146 /* 13147 * Prevent the last segment from being fractional 13148 * unless the send sockbuf can be emptied: 13149 */ 13150 max_len = (tp->t_maxseg - optlen); 13151 if ((sb_offset + len) < sbavail(sb)) { 13152 moff = len % (u_int)max_len; 13153 if (moff != 0) { 13154 mark = 3; 13155 len -= moff; 13156 } 13157 } 13158 /* 13159 * In case there are too many small fragments don't 13160 * use TSO: 13161 */ 13162 if (len <= segsiz) { 13163 mark = 4; 13164 tso = 0; 13165 } 13166 /* 13167 * Send the FIN in a separate segment after the bulk 13168 * sending is done. We don't trust the TSO 13169 * implementations to clear the FIN flag on all but 13170 * the last segment. 13171 */ 13172 if (tp->t_flags & TF_NEEDFIN) { 13173 sendalot = 4; 13174 } 13175 } else { 13176 mark = 5; 13177 if (optlen + ipoptlen >= tp->t_maxseg) { 13178 /* 13179 * Since we don't have enough space to put 13180 * the IP header chain and the TCP header in 13181 * one packet as required by RFC 7112, don't 13182 * send it. Also ensure that at least one 13183 * byte of the payload can be put into the 13184 * TCP segment. 13185 */ 13186 SOCKBUF_UNLOCK(&so->so_snd); 13187 error = EMSGSIZE; 13188 sack_rxmit = 0; 13189 goto out; 13190 } 13191 len = tp->t_maxseg - optlen - ipoptlen; 13192 sendalot = 5; 13193 } 13194 } else { 13195 tso = 0; 13196 mark = 6; 13197 } 13198 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13199 ("%s: len > IP_MAXPACKET", __func__)); 13200 #ifdef DIAGNOSTIC 13201 #ifdef INET6 13202 if (max_linkhdr + hdrlen > MCLBYTES) 13203 #else 13204 if (max_linkhdr + hdrlen > MHLEN) 13205 #endif 13206 panic("tcphdr too big"); 13207 #endif 13208 13209 /* 13210 * This KASSERT is here to catch edge cases at a well defined place. 13211 * Before, those had triggered (random) panic conditions further 13212 * down. 13213 */ 13214 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13215 if ((len == 0) && 13216 (flags & TH_FIN) && 13217 (sbused(sb))) { 13218 /* 13219 * We have outstanding data, don't send a fin by itself!. 
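 * A data-less FIN is deferred while sbused(sb) is non-zero; it will
 * go out once it can ride along with data or the buffer has emptied.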
13220 */ 13221 goto just_return; 13222 } 13223 /* 13224 * Grab a header mbuf, attaching a copy of data to be transmitted, 13225 * and initialize the header from the template for sends on this 13226 * connection. 13227 */ 13228 if (len) { 13229 uint32_t max_val; 13230 uint32_t moff; 13231 13232 if (rack->r_ctl.rc_pace_max_segs) 13233 max_val = rack->r_ctl.rc_pace_max_segs; 13234 else if (rack->rc_user_set_max_segs) 13235 max_val = rack->rc_user_set_max_segs * segsiz; 13236 else 13237 max_val = len; 13238 /* 13239 * We allow a limit on sending with hptsi. 13240 */ 13241 if (len > max_val) { 13242 mark = 7; 13243 len = max_val; 13244 } 13245 #ifdef INET6 13246 if (MHLEN < hdrlen + max_linkhdr) 13247 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13248 else 13249 #endif 13250 m = m_gethdr(M_NOWAIT, MT_DATA); 13251 13252 if (m == NULL) { 13253 SOCKBUF_UNLOCK(sb); 13254 error = ENOBUFS; 13255 sack_rxmit = 0; 13256 goto out; 13257 } 13258 m->m_data += max_linkhdr; 13259 m->m_len = hdrlen; 13260 13261 /* 13262 * Start the m_copy functions from the closest mbuf to the 13263 * sb_offset in the socket buffer chain. 13264 */ 13265 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13266 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13267 m_copydata(mb, moff, (int)len, 13268 mtod(m, caddr_t)+hdrlen); 13269 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13270 sbsndptr_adv(sb, mb, len); 13271 m->m_len += len; 13272 } else { 13273 struct sockbuf *msb; 13274 13275 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13276 msb = NULL; 13277 else 13278 msb = sb; 13279 m->m_next = tcp_m_copym( 13280 mb, moff, &len, 13281 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 13282 ((rsm == NULL) ? hw_tls : 0) 13283 #ifdef NETFLIX_COPY_ARGS 13284 , &filled_all 13285 #endif 13286 ); 13287 if (len <= (tp->t_maxseg - optlen)) { 13288 /* 13289 * Must have ran out of mbufs for the copy 13290 * shorten it to no longer need tso. Lets 13291 * not put on sendalot since we are low on 13292 * mbufs. 13293 */ 13294 tso = 0; 13295 } 13296 if (m->m_next == NULL) { 13297 SOCKBUF_UNLOCK(sb); 13298 (void)m_free(m); 13299 error = ENOBUFS; 13300 sack_rxmit = 0; 13301 goto out; 13302 } 13303 } 13304 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 13305 if (rsm && (rsm->r_flags & RACK_TLP)) { 13306 /* 13307 * TLP should not count in retran count, but 13308 * in its own bin 13309 */ 13310 counter_u64_add(rack_tlp_retran, 1); 13311 counter_u64_add(rack_tlp_retran_bytes, len); 13312 } else { 13313 tp->t_sndrexmitpack++; 13314 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 13315 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 13316 } 13317 #ifdef STATS 13318 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 13319 len); 13320 #endif 13321 } else { 13322 KMOD_TCPSTAT_INC(tcps_sndpack); 13323 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 13324 #ifdef STATS 13325 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 13326 len); 13327 #endif 13328 } 13329 /* 13330 * If we're sending everything we've got, set PUSH. (This 13331 * will keep happy those implementations which only give 13332 * data to the user when a buffer fills or a PUSH comes in.) 
13333 */ 13334 if (sb_offset + len == sbused(sb) && 13335 sbused(sb) && 13336 !(flags & TH_SYN)) 13337 flags |= TH_PUSH; 13338 13339 SOCKBUF_UNLOCK(sb); 13340 } else { 13341 SOCKBUF_UNLOCK(sb); 13342 if (tp->t_flags & TF_ACKNOW) 13343 KMOD_TCPSTAT_INC(tcps_sndacks); 13344 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13345 KMOD_TCPSTAT_INC(tcps_sndctrl); 13346 else 13347 KMOD_TCPSTAT_INC(tcps_sndwinup); 13348 13349 m = m_gethdr(M_NOWAIT, MT_DATA); 13350 if (m == NULL) { 13351 error = ENOBUFS; 13352 sack_rxmit = 0; 13353 goto out; 13354 } 13355 #ifdef INET6 13356 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13357 MHLEN >= hdrlen) { 13358 M_ALIGN(m, hdrlen); 13359 } else 13360 #endif 13361 m->m_data += max_linkhdr; 13362 m->m_len = hdrlen; 13363 } 13364 SOCKBUF_UNLOCK_ASSERT(sb); 13365 m->m_pkthdr.rcvif = (struct ifnet *)0; 13366 #ifdef MAC 13367 mac_inpcb_create_mbuf(inp, m); 13368 #endif 13369 #ifdef INET6 13370 if (isipv6) { 13371 ip6 = mtod(m, struct ip6_hdr *); 13372 #ifdef NETFLIX_TCPOUDP 13373 if (tp->t_port) { 13374 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 13375 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13376 udp->uh_dport = tp->t_port; 13377 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13378 udp->uh_ulen = htons(ulen); 13379 th = (struct tcphdr *)(udp + 1); 13380 } else 13381 #endif 13382 th = (struct tcphdr *)(ip6 + 1); 13383 tcpip_fillheaders(inp, 13384 #ifdef NETFLIX_TCPOUDP 13385 tp->t_port, 13386 #endif 13387 ip6, th); 13388 } else 13389 #endif /* INET6 */ 13390 { 13391 ip = mtod(m, struct ip *); 13392 #ifdef TCPDEBUG 13393 ipov = (struct ipovly *)ip; 13394 #endif 13395 #ifdef NETFLIX_TCPOUDP 13396 if (tp->t_port) { 13397 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 13398 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13399 udp->uh_dport = tp->t_port; 13400 ulen = hdrlen + len - sizeof(struct ip); 13401 udp->uh_ulen = htons(ulen); 13402 th = (struct tcphdr *)(udp + 1); 13403 } else 13404 #endif 13405 th = (struct tcphdr *)(ip + 1); 13406 tcpip_fillheaders(inp, 13407 #ifdef NETFLIX_TCPOUDP 13408 tp->t_port, 13409 #endif 13410 ip, th); 13411 } 13412 /* 13413 * Fill in fields, remembering maximum advertised window for use in 13414 * delaying messages about window sizes. If resending a FIN, be sure 13415 * not to use a new sequence number. 13416 */ 13417 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 13418 tp->snd_nxt == tp->snd_max) 13419 tp->snd_nxt--; 13420 /* 13421 * If we are starting a connection, send ECN setup SYN packet. If we 13422 * are on a retransmit, we may resend those bits a number of times 13423 * as per RFC 3168. 13424 */ 13425 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 13426 if (tp->t_rxtshift >= 1) { 13427 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 13428 flags |= TH_ECE | TH_CWR; 13429 } else 13430 flags |= TH_ECE | TH_CWR; 13431 } 13432 /* Handle parallel SYN for ECN */ 13433 if ((tp->t_state == TCPS_SYN_RECEIVED) && 13434 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 13435 flags |= TH_ECE; 13436 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13437 } 13438 if (tp->t_state == TCPS_ESTABLISHED && 13439 (tp->t_flags2 & TF2_ECN_PERMIT)) { 13440 /* 13441 * If the peer has ECN, mark data packets with ECN capable 13442 * transmission (ECT). Ignore pure ack packets, 13443 * retransmissions. 
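 * Only new data qualifies: len must be non-zero, snd_nxt must not be
 * behind snd_max and this must not be a SACK retransmit. A pending
 * CWR echo (TF2_ECN_SND_CWR) is likewise only carried on such
 * new-data segments, while ECE is echoed whenever TF2_ECN_SND_ECE
 * is set.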
13444 */ 13445 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 13446 (sack_rxmit == 0)) { 13447 #ifdef INET6 13448 if (isipv6) 13449 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 13450 else 13451 #endif 13452 ip->ip_tos |= IPTOS_ECN_ECT0; 13453 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13454 /* 13455 * Reply with proper ECN notifications. 13456 * Only set CWR on new data segments. 13457 */ 13458 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 13459 flags |= TH_CWR; 13460 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 13461 } 13462 } 13463 if (tp->t_flags2 & TF2_ECN_SND_ECE) 13464 flags |= TH_ECE; 13465 } 13466 /* 13467 * If we are doing retransmissions, then snd_nxt will not reflect 13468 * the first unsent octet. For ACK only packets, we do not want the 13469 * sequence number of the retransmitted packet, we want the sequence 13470 * number of the next unsent octet. So, if there is no data (and no 13471 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13472 * ti_seq. But if we are in persist state, snd_max might reflect 13473 * one byte beyond the right edge of the window, so use snd_nxt in 13474 * that case, since we know we aren't doing a retransmission. 13475 * (retransmit and persist are mutually exclusive...) 13476 */ 13477 if (sack_rxmit == 0) { 13478 if (len || (flags & (TH_SYN | TH_FIN)) || 13479 rack->rc_in_persist) { 13480 th->th_seq = htonl(tp->snd_nxt); 13481 rack_seq = tp->snd_nxt; 13482 } else if (flags & TH_RST) { 13483 /* 13484 * For a Reset send the last cum ack in sequence 13485 * (this like any other choice may still generate a 13486 * challenge ack, if a ack-update packet is in 13487 * flight). 13488 */ 13489 th->th_seq = htonl(tp->snd_una); 13490 rack_seq = tp->snd_una; 13491 } else { 13492 th->th_seq = htonl(tp->snd_max); 13493 rack_seq = tp->snd_max; 13494 } 13495 } else { 13496 th->th_seq = htonl(rsm->r_start); 13497 rack_seq = rsm->r_start; 13498 } 13499 th->th_ack = htonl(tp->rcv_nxt); 13500 if (optlen) { 13501 bcopy(opt, th + 1, optlen); 13502 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13503 } 13504 th->th_flags = flags; 13505 /* 13506 * Calculate receive window. Don't shrink window, but avoid silly 13507 * window syndrome. 13508 * If a RST segment is sent, advertise a window of zero. 13509 */ 13510 if (flags & TH_RST) { 13511 recwin = 0; 13512 } else { 13513 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 13514 recwin < (long)segsiz) 13515 recwin = 0; 13516 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13517 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 13518 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 13519 } 13520 13521 /* 13522 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13523 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13524 * handled in syncache. 13525 */ 13526 if (flags & TH_SYN) 13527 th->th_win = htons((u_short) 13528 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13529 else { 13530 /* Avoid shrinking window with window scaling. */ 13531 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13532 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13533 } 13534 /* 13535 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13536 * window. This may cause the remote transmitter to stall. This 13537 * flag tells soreceive() to disable delayed acknowledgements when 13538 * draining the buffer. This can occur if the receiver is 13539 * attempting to read more data than can be buffered prior to 13540 * transmitting on the connection. 
13541 */ 13542 if (th->th_win == 0) { 13543 tp->t_sndzerowin++; 13544 tp->t_flags |= TF_RXWIN0SENT; 13545 } else 13546 tp->t_flags &= ~TF_RXWIN0SENT; 13547 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 13548 13549 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13550 if (to.to_flags & TOF_SIGNATURE) { 13551 /* 13552 * Calculate MD5 signature and put it into the place 13553 * determined before. 13554 * NOTE: since TCP options buffer doesn't point into 13555 * mbuf's data, calculate offset and use it. 13556 */ 13557 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13558 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13559 /* 13560 * Do not send segment if the calculation of MD5 13561 * digest has failed. 13562 */ 13563 goto out; 13564 } 13565 } 13566 #endif 13567 13568 /* 13569 * Put TCP length in extended header, and then checksum extended 13570 * header and data. 13571 */ 13572 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13573 #ifdef INET6 13574 if (isipv6) { 13575 /* 13576 * ip6_plen is not need to be filled now, and will be filled 13577 * in ip6_output. 13578 */ 13579 if (tp->t_port) { 13580 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13581 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13582 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13583 th->th_sum = htons(0); 13584 UDPSTAT_INC(udps_opackets); 13585 } else { 13586 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13587 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13588 th->th_sum = in6_cksum_pseudo(ip6, 13589 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 13590 0); 13591 } 13592 } 13593 #endif 13594 #if defined(INET6) && defined(INET) 13595 else 13596 #endif 13597 #ifdef INET 13598 { 13599 if (tp->t_port) { 13600 m->m_pkthdr.csum_flags = CSUM_UDP; 13601 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13602 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13603 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13604 th->th_sum = htons(0); 13605 UDPSTAT_INC(udps_opackets); 13606 } else { 13607 m->m_pkthdr.csum_flags = CSUM_TCP; 13608 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13609 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13610 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13611 IPPROTO_TCP + len + optlen)); 13612 } 13613 /* IP version must be set here for ipv4/ipv6 checking later */ 13614 KASSERT(ip->ip_v == IPVERSION, 13615 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13616 } 13617 #endif 13618 /* 13619 * Enable TSO and specify the size of the segments. The TCP pseudo 13620 * header checksum is always provided. XXX: Fixme: This is currently 13621 * not the case for IPv6. 13622 */ 13623 if (tso) { 13624 KASSERT(len > tp->t_maxseg - optlen, 13625 ("%s: len <= tso_segsz", __func__)); 13626 m->m_pkthdr.csum_flags |= CSUM_TSO; 13627 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 13628 } 13629 KASSERT(len + hdrlen == m_length(m, NULL), 13630 ("%s: mbuf chain different than expected: %d + %u != %u", 13631 __func__, len, hdrlen, m_length(m, NULL))); 13632 13633 #ifdef TCP_HHOOK 13634 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 13635 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13636 #endif 13637 #ifdef TCPDEBUG 13638 /* 13639 * Trace. 
13640 */ 13641 if (so->so_options & SO_DEBUG) { 13642 u_short save = 0; 13643 13644 #ifdef INET6 13645 if (!isipv6) 13646 #endif 13647 { 13648 save = ipov->ih_len; 13649 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13650 * (th->th_off << 2) */ ); 13651 } 13652 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13653 #ifdef INET6 13654 if (!isipv6) 13655 #endif 13656 ipov->ih_len = save; 13657 } 13658 #endif /* TCPDEBUG */ 13659 13660 /* We're getting ready to send; log now. */ 13661 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13662 union tcp_log_stackspecific log; 13663 struct timeval tv; 13664 13665 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13666 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13667 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13668 if (rack->rack_no_prr) 13669 log.u_bbr.flex1 = 0; 13670 else 13671 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13672 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 13673 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 13674 log.u_bbr.flex4 = orig_len; 13675 if (filled_all) 13676 log.u_bbr.flex5 = 0x80000000; 13677 else 13678 log.u_bbr.flex5 = 0; 13679 /* Save off the early/late values */ 13680 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 13681 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 13682 log.u_bbr.bw_inuse = rack_get_bw(rack); 13683 if (rsm || sack_rxmit) { 13684 if (doing_tlp) 13685 log.u_bbr.flex8 = 2; 13686 else 13687 log.u_bbr.flex8 = 1; 13688 } else { 13689 log.u_bbr.flex8 = 0; 13690 } 13691 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 13692 log.u_bbr.flex7 = mark; 13693 log.u_bbr.pkts_out = tp->t_maxseg; 13694 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13695 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13696 log.u_bbr.lt_epoch = cwnd_to_use; 13697 log.u_bbr.delivered = sendalot; 13698 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13699 len, &log, false, NULL, NULL, 0, &tv); 13700 } else 13701 lgb = NULL; 13702 13703 /* 13704 * Fill in IP length and desired time to live and send to IP level. 13705 * There should be a better way to handle ttl and tos; we could keep 13706 * them in the template, but need a way to checksum without them. 13707 */ 13708 /* 13709 * m->m_pkthdr.len should have been set before cksum calcuration, 13710 * because in6_cksum() need it. 13711 */ 13712 #ifdef INET6 13713 if (isipv6) { 13714 /* 13715 * we separately set hoplimit for every segment, since the 13716 * user might want to change the value via setsockopt. Also, 13717 * desired default hop limit might be changed via Neighbor 13718 * Discovery. 13719 */ 13720 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13721 13722 /* 13723 * Set the packet size here for the benefit of DTrace 13724 * probes. ip6_output() will set it properly; it's supposed 13725 * to include the option header lengths as well. 13726 */ 13727 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13728 13729 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 13730 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13731 else 13732 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13733 13734 if (tp->t_state == TCPS_SYN_SENT) 13735 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13736 13737 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13738 /* TODO: IPv6 IP6TOS_ECT bit on */ 13739 error = ip6_output(m, inp->in6p_outputopts, 13740 &inp->inp_route6, 13741 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13742 NULL, NULL, inp); 13743 13744 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13745 mtu = inp->inp_route6.ro_nh->nh_mtu; 13746 } 13747 #endif /* INET6 */ 13748 #if defined(INET) && defined(INET6) 13749 else 13750 #endif 13751 #ifdef INET 13752 { 13753 ip->ip_len = htons(m->m_pkthdr.len); 13754 #ifdef INET6 13755 if (inp->inp_vflag & INP_IPV6PROTO) 13756 ip->ip_ttl = in6_selecthlim(inp, NULL); 13757 #endif /* INET6 */ 13758 /* 13759 * If we do path MTU discovery, then we set DF on every 13760 * packet. This might not be the best thing to do according 13761 * to RFC3390 Section 2. However the tcp hostcache migitates 13762 * the problem so it affects only the first tcp connection 13763 * with a host. 13764 * 13765 * NB: Don't set DF on small MTU/MSS to have a safe 13766 * fallback. 13767 */ 13768 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13769 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13770 if (tp->t_port == 0 || len < V_tcp_minmss) { 13771 ip->ip_off |= htons(IP_DF); 13772 } 13773 } else { 13774 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13775 } 13776 13777 if (tp->t_state == TCPS_SYN_SENT) 13778 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13779 13780 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13781 13782 error = ip_output(m, inp->inp_options, &inp->inp_route, 13783 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13784 inp); 13785 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13786 mtu = inp->inp_route.ro_nh->nh_mtu; 13787 } 13788 #endif /* INET */ 13789 13790 out: 13791 if (lgb) { 13792 lgb->tlb_errno = error; 13793 lgb = NULL; 13794 } 13795 /* 13796 * In transmit state, time the transmission and arrange for the 13797 * retransmit. In persist state, just set snd_max. 13798 */ 13799 if (error == 0) { 13800 rack->forced_ack = 0; /* If we send something zap the FA flag */ 13801 if (rsm && (doing_tlp == 0)) { 13802 /* Set we retransmitted */ 13803 rack->rc_gp_saw_rec = 1; 13804 } else { 13805 if (cwnd_to_use > tp->snd_ssthresh) { 13806 /* Set we sent in CA */ 13807 rack->rc_gp_saw_ca = 1; 13808 } else { 13809 /* Set we sent in SS */ 13810 rack->rc_gp_saw_ss = 1; 13811 } 13812 } 13813 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13814 (tp->t_flags & TF_SACK_PERMIT) && 13815 tp->rcv_numsacks > 0) 13816 tcp_clean_dsack_blocks(tp); 13817 tot_len_this_send += len; 13818 if (len == 0) 13819 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 13820 else if (len == 1) { 13821 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 13822 } else if (len > 1) { 13823 int idx; 13824 13825 idx = (len / segsiz) + 3; 13826 if (idx >= TCP_MSS_ACCT_ATIMER) 13827 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 13828 else 13829 counter_u64_add(rack_out_size[idx], 1); 13830 } 13831 } 13832 if (rack->rack_no_prr == 0) { 13833 if (sub_from_prr && (error == 0)) { 13834 if (rack->r_ctl.rc_prr_sndcnt >= len) 13835 rack->r_ctl.rc_prr_sndcnt -= len; 13836 else 13837 rack->r_ctl.rc_prr_sndcnt = 0; 13838 } 13839 } 13840 sub_from_prr = 0; 13841 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 13842 pass, rsm, us_cts); 13843 if ((error == 0) && 13844 (len > 0) && 13845 (tp->snd_una == tp->snd_max)) 13846 rack->r_ctl.rc_tlp_rxt_last_time = cts; 13847 /* Now are we in persists? 
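* If not, snd_nxt is advanced over what was just sent (unless it was a
* SACK retransmission) and, when it moves past snd_max, the RTT and
* goodput measurements are armed; in persist mode only snd_max may move,
* since the peer is advertising a zero window.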
*/ 13848 if (rack->rc_in_persist == 0) { 13849 tcp_seq startseq = tp->snd_nxt; 13850 13851 /* Track our lost count */ 13852 if (rsm && (doing_tlp == 0)) 13853 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 13854 /* 13855 * Advance snd_nxt over sequence space of this segment. 13856 */ 13857 if (error) 13858 /* We don't log or do anything with errors */ 13859 goto nomore; 13860 if (doing_tlp == 0) { 13861 if (rsm == NULL) { 13862 /* 13863 * Not a retransmission of some 13864 * sort, new data is going out so 13865 * clear our TLP count and flag. 13866 */ 13867 rack->rc_tlp_in_progress = 0; 13868 rack->r_ctl.rc_tlp_cnt_out = 0; 13869 } 13870 } else { 13871 /* 13872 * We have just sent a TLP, mark that it is true 13873 * and make sure our in progress is set so we 13874 * continue to check the count. 13875 */ 13876 rack->rc_tlp_in_progress = 1; 13877 rack->r_ctl.rc_tlp_cnt_out++; 13878 } 13879 if (flags & (TH_SYN | TH_FIN)) { 13880 if (flags & TH_SYN) 13881 tp->snd_nxt++; 13882 if (flags & TH_FIN) { 13883 tp->snd_nxt++; 13884 tp->t_flags |= TF_SENTFIN; 13885 } 13886 } 13887 /* In the ENOBUFS case we do *not* update snd_max */ 13888 if (sack_rxmit) 13889 goto nomore; 13890 13891 tp->snd_nxt += len; 13892 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 13893 if (tp->snd_una == tp->snd_max) { 13894 /* 13895 * Update the time we just added data since 13896 * none was outstanding. 13897 */ 13898 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13899 tp->t_acktime = ticks; 13900 } 13901 tp->snd_max = tp->snd_nxt; 13902 /* 13903 * Time this transmission if not a retransmission and 13904 * not currently timing anything. 13905 * This is only relevant in case of switching back to 13906 * the base stack. 13907 */ 13908 if (tp->t_rtttime == 0) { 13909 tp->t_rtttime = ticks; 13910 tp->t_rtseq = startseq; 13911 KMOD_TCPSTAT_INC(tcps_segstimed); 13912 } 13913 if (len && 13914 ((tp->t_flags & TF_GPUTINPROG) == 0)) 13915 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 13916 } 13917 } else { 13918 /* 13919 * Persist case, update snd_max but since we are in persist 13920 * mode (no window) we do not update snd_nxt. 13921 */ 13922 int32_t xlen = len; 13923 13924 if (error) 13925 goto nomore; 13926 13927 if (flags & TH_SYN) 13928 ++xlen; 13929 if (flags & TH_FIN) { 13930 ++xlen; 13931 tp->t_flags |= TF_SENTFIN; 13932 } 13933 /* In the ENOBUFS case we do *not* update snd_max */ 13934 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 13935 if (tp->snd_una == tp->snd_max) { 13936 /* 13937 * Update the time we just added data since 13938 * none was outstanding. 13939 */ 13940 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13941 tp->t_acktime = ticks; 13942 } 13943 tp->snd_max = tp->snd_nxt + len; 13944 } 13945 } 13946 nomore: 13947 if (error) { 13948 rack->r_ctl.rc_agg_delayed = 0; 13949 rack->r_early = 0; 13950 rack->r_late = 0; 13951 rack->r_ctl.rc_agg_early = 0; 13952 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 13953 /* 13954 * Failures do not advance the seq counter above. For the 13955 * case of ENOBUFS we will fall out and retry in 1ms with 13956 * the hpts. Everything else will just have to retransmit 13957 * with the timer. 13958 * 13959 * In any case, we do not want to loop around for another 13960 * send without a good reason. 
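* EPERM is handed back to the caller, ENOBUFS merely arms the hpts
* timer so the send is retried shortly, EMSGSIZE re-learns the path
* MTU (and may loop once more with a smaller MSS), and the remaining
* errors fall back to the retransmit/hpts timer.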
13961 */
13962 sendalot = 0;
13963 switch (error) {
13964 case EPERM:
13965 tp->t_softerror = error;
13966 return (error);
13967 case ENOBUFS:
13968 if (slot == 0) {
13969 /*
13970 * Pace us right away to retry in a
13971 * short while
13972 */
13973 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
13974 if (rack->rc_enobuf < 126)
13975 rack->rc_enobuf++;
13976 if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) {
13977 slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC;
13978 }
13979 if (slot < (10 * HPTS_USEC_IN_MSEC))
13980 slot = 10 * HPTS_USEC_IN_MSEC;
13981 }
13982 counter_u64_add(rack_saw_enobuf, 1);
13983 error = 0;
13984 goto enobufs;
13985 case EMSGSIZE:
13986 /*
13987 * For some reason the interface we used initially
13988 * to send segments changed to another or lowered
13989 * its MTU. If TSO was active we either got an
13990 * interface without TSO capabilities or TSO was
13991 * turned off. If we obtained mtu from ip_output()
13992 * then update it and try again.
13993 */
13994 if (tso)
13995 tp->t_flags &= ~TF_TSO;
13996 if (mtu != 0) {
13997 tcp_mss_update(tp, -1, mtu, NULL, NULL);
13998 goto again;
13999 }
14000 slot = 10 * HPTS_USEC_IN_MSEC;
14001 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
14002 return (error);
14003 case ENETUNREACH:
14004 counter_u64_add(rack_saw_enetunreach, 1);
14005 case EHOSTDOWN:
14006 case EHOSTUNREACH:
14007 case ENETDOWN:
14008 if (TCPS_HAVERCVDSYN(tp->t_state)) {
14009 tp->t_softerror = error;
14010 }
14011 /* FALLTHROUGH */
14012 default:
14013 slot = 10 * HPTS_USEC_IN_MSEC;
14014 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
14015 return (error);
14016 }
14017 } else {
14018 rack->rc_enobuf = 0;
14019 }
14020 KMOD_TCPSTAT_INC(tcps_sndtotal);
14021
14022 /*
14023 * Data sent (as far as we can tell). If this advertises a larger
14024 * window than any other segment, then remember the size of the
14025 * advertised window. Any pending ACK has now been sent.
14026 */
14027 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
14028 tp->rcv_adv = tp->rcv_nxt + recwin;
14029 tp->last_ack_sent = tp->rcv_nxt;
14030 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
14031 enobufs:
14032 /* Assure that when we leave, snd_nxt points to the top */
14033 if (SEQ_GT(tp->snd_max, tp->snd_nxt))
14034 tp->snd_nxt = tp->snd_max;
14035 if (sendalot) {
14036 /* Do we need to turn off sendalot? */
14037 if (rack->r_ctl.rc_pace_max_segs &&
14038 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
14039 /* We hit our max. */
14040 sendalot = 0;
14041 } else if ((rack->rc_user_set_max_segs) &&
14042 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
14043 /* We hit the user-defined max */
14044 sendalot = 0;
14045 }
14046 }
14047 if ((error == 0) && (flags & TH_FIN))
14048 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
14049 if (flags & TH_RST) {
14050 /*
14051 * We don't send again after sending a RST.
14052 */
14053 slot = 0;
14054 sendalot = 0;
14055 if (error == 0)
14056 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
14057 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
14058 /*
14059 * Get our pacing rate; if an error
14060 * occurred in sending (ENOBUFS) we would
14061 * hit the else if with slot preset. Other
14062 * errors return.
14063 */
14064 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
14065 }
14066 if (rsm &&
14067 rack->use_rack_rr) {
14068 /* It's a retransmit and we use the rack cheat?
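* If so, and no pacing slot was chosen (or pacing is disabled, or
* r_rr_config == 1 overrides us), the retransmission is spaced by the
* fixed rc_min_to value below.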
*/ 14069 if ((slot == 0) || 14070 (rack->rc_always_pace == 0) || 14071 (rack->r_rr_config == 1)) { 14072 /* 14073 * We have no pacing set or we 14074 * are using old-style rack or 14075 * we are overriden to use the old 1ms pacing. 14076 */ 14077 slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; 14078 } 14079 } 14080 if (slot) { 14081 /* set the rack tcb into the slot N */ 14082 counter_u64_add(rack_paced_segments, 1); 14083 } else if (sendalot) { 14084 if (len) 14085 counter_u64_add(rack_unpaced_segments, 1); 14086 sack_rxmit = 0; 14087 goto again; 14088 } else if (len) { 14089 counter_u64_add(rack_unpaced_segments, 1); 14090 } 14091 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 14092 return (error); 14093 } 14094 14095 static void 14096 rack_update_seg(struct tcp_rack *rack) 14097 { 14098 uint32_t orig_val; 14099 14100 orig_val = rack->r_ctl.rc_pace_max_segs; 14101 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 14102 if (orig_val != rack->r_ctl.rc_pace_max_segs) 14103 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 14104 } 14105 14106 /* 14107 * rack_ctloutput() must drop the inpcb lock before performing copyin on 14108 * socket option arguments. When it re-acquires the lock after the copy, it 14109 * has to revalidate that the connection is still valid for the socket 14110 * option. 14111 */ 14112 static int 14113 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 14114 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14115 { 14116 struct epoch_tracker et; 14117 uint64_t val; 14118 int32_t error = 0, optval; 14119 uint16_t ca, ss; 14120 14121 14122 switch (sopt->sopt_name) { 14123 case TCP_RACK_PROP_RATE: /* URL:prop_rate */ 14124 case TCP_RACK_PROP : /* URL:prop */ 14125 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 14126 case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ 14127 case TCP_RACK_PACE_REDUCE: /* Not used */ 14128 /* Pacing related ones */ 14129 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 14130 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 14131 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 14132 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 14133 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 14134 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 14135 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 14136 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 14137 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 14138 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 14139 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 14140 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 14141 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 14142 /* End pacing related */ 14143 case TCP_DELACK: 14144 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 14145 case TCP_RACK_MIN_TO: /* URL:min_to */ 14146 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 14147 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 14148 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 14149 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 14150 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 14151 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 14152 case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ 14153 case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ 14154 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 14155 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 14156 case TCP_RACK_DO_DETECTION: /* URL:detect */ 14157 case TCP_NO_PRR: /* URL:noprr */ 14158 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 14159 case TCP_DATA_AFTER_CLOSE: 14160 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr 
*/ 14161 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 14162 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 14163 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 14164 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 14165 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 14166 case TCP_RACK_PROFILE: /* URL:profile */ 14167 break; 14168 default: 14169 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14170 break; 14171 } 14172 INP_WUNLOCK(inp); 14173 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14174 if (error) 14175 return (error); 14176 INP_WLOCK(inp); 14177 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 14178 INP_WUNLOCK(inp); 14179 return (ECONNRESET); 14180 } 14181 tp = intotcpcb(inp); 14182 rack = (struct tcp_rack *)tp->t_fb_ptr; 14183 switch (sopt->sopt_name) { 14184 case TCP_RACK_PROFILE: 14185 RACK_OPTS_INC(tcp_profile); 14186 if (optval == 1) { 14187 /* pace_always=1 */ 14188 rack->rc_always_pace = 1; 14189 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14190 /* scwnd=1 */ 14191 rack->rack_enable_scwnd = 1; 14192 /* dynamic=100 */ 14193 rack->rc_gp_dyn_mul = 1; 14194 rack->r_ctl.rack_per_of_gp_ca = 100; 14195 /* rrr_conf=3 */ 14196 rack->r_rr_config = 3; 14197 /* npush=2 */ 14198 rack->r_ctl.rc_no_push_at_mrtt = 2; 14199 /* fillcw=1 */ 14200 rack->rc_pace_to_cwnd = 1; 14201 rack->rc_pace_fill_if_rttin_range = 0; 14202 rack->rtt_limit_mul = 0; 14203 /* noprr=1 */ 14204 rack->rack_no_prr = 1; 14205 /* lscwnd=1 */ 14206 rack->r_limit_scw = 1; 14207 } else if (optval == 2) { 14208 /* pace_always=1 */ 14209 rack->rc_always_pace = 1; 14210 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14211 /* scwnd=1 */ 14212 rack->rack_enable_scwnd = 1; 14213 /* dynamic=100 */ 14214 rack->rc_gp_dyn_mul = 1; 14215 rack->r_ctl.rack_per_of_gp_ca = 100; 14216 /* rrr_conf=3 */ 14217 rack->r_rr_config = 3; 14218 /* npush=2 */ 14219 rack->r_ctl.rc_no_push_at_mrtt = 2; 14220 /* fillcw=1 */ 14221 rack->rc_pace_to_cwnd = 1; 14222 rack->rc_pace_fill_if_rttin_range = 0; 14223 rack->rtt_limit_mul = 0; 14224 /* noprr=1 */ 14225 rack->rack_no_prr = 1; 14226 /* lscwnd=0 */ 14227 rack->r_limit_scw = 0; 14228 } 14229 break; 14230 case TCP_SHARED_CWND_TIME_LIMIT: 14231 RACK_OPTS_INC(tcp_lscwnd); 14232 if (optval) 14233 rack->r_limit_scw = 1; 14234 else 14235 rack->r_limit_scw = 0; 14236 break; 14237 case TCP_RACK_PACE_TO_FILL: 14238 RACK_OPTS_INC(tcp_fillcw); 14239 if (optval == 0) 14240 rack->rc_pace_to_cwnd = 0; 14241 else 14242 rack->rc_pace_to_cwnd = 1; 14243 if ((optval >= rack_gp_rtt_maxmul) && 14244 rack_gp_rtt_maxmul && 14245 (optval < 0xf)) { 14246 rack->rc_pace_fill_if_rttin_range = 1; 14247 rack->rtt_limit_mul = optval; 14248 } else { 14249 rack->rc_pace_fill_if_rttin_range = 0; 14250 rack->rtt_limit_mul = 0; 14251 } 14252 break; 14253 case TCP_RACK_NO_PUSH_AT_MAX: 14254 RACK_OPTS_INC(tcp_npush); 14255 if (optval == 0) 14256 rack->r_ctl.rc_no_push_at_mrtt = 0; 14257 else if (optval < 0xff) 14258 rack->r_ctl.rc_no_push_at_mrtt = optval; 14259 else 14260 error = EINVAL; 14261 break; 14262 case TCP_SHARED_CWND_ENABLE: 14263 RACK_OPTS_INC(tcp_rack_scwnd); 14264 if (optval == 0) 14265 rack->rack_enable_scwnd = 0; 14266 else 14267 rack->rack_enable_scwnd = 1; 14268 break; 14269 case TCP_RACK_MBUF_QUEUE: 14270 /* Now do we use the LRO mbuf-queue feature */ 14271 RACK_OPTS_INC(tcp_rack_mbufq); 14272 if (optval) 14273 rack->r_mbuf_queue = 1; 14274 else 14275 rack->r_mbuf_queue = 0; 14276 if (rack->r_mbuf_queue || rack->rc_always_pace) 14277 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14278 else 
14279 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14280 break; 14281 case TCP_RACK_NONRXT_CFG_RATE: 14282 RACK_OPTS_INC(tcp_rack_cfg_rate); 14283 if (optval == 0) 14284 rack->rack_rec_nonrxt_use_cr = 0; 14285 else 14286 rack->rack_rec_nonrxt_use_cr = 1; 14287 break; 14288 case TCP_NO_PRR: 14289 RACK_OPTS_INC(tcp_rack_noprr); 14290 if (optval == 0) 14291 rack->rack_no_prr = 0; 14292 else 14293 rack->rack_no_prr = 1; 14294 break; 14295 case TCP_TIMELY_DYN_ADJ: 14296 RACK_OPTS_INC(tcp_timely_dyn); 14297 if (optval == 0) 14298 rack->rc_gp_dyn_mul = 0; 14299 else { 14300 rack->rc_gp_dyn_mul = 1; 14301 if (optval >= 100) { 14302 /* 14303 * If the user sets something 100 or more 14304 * its the gp_ca value. 14305 */ 14306 rack->r_ctl.rack_per_of_gp_ca = optval; 14307 } 14308 } 14309 break; 14310 case TCP_RACK_DO_DETECTION: 14311 RACK_OPTS_INC(tcp_rack_do_detection); 14312 if (optval == 0) 14313 rack->do_detection = 0; 14314 else 14315 rack->do_detection = 1; 14316 break; 14317 case TCP_RACK_PROP_RATE: 14318 if ((optval <= 0) || (optval >= 100)) { 14319 error = EINVAL; 14320 break; 14321 } 14322 RACK_OPTS_INC(tcp_rack_prop_rate); 14323 rack->r_ctl.rc_prop_rate = optval; 14324 break; 14325 case TCP_RACK_TLP_USE: 14326 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 14327 error = EINVAL; 14328 break; 14329 } 14330 RACK_OPTS_INC(tcp_tlp_use); 14331 rack->rack_tlp_threshold_use = optval; 14332 break; 14333 case TCP_RACK_PROP: 14334 /* RACK proportional rate reduction (bool) */ 14335 RACK_OPTS_INC(tcp_rack_prop); 14336 rack->r_ctl.rc_prop_reduce = optval; 14337 break; 14338 case TCP_RACK_TLP_REDUCE: 14339 /* RACK TLP cwnd reduction (bool) */ 14340 RACK_OPTS_INC(tcp_rack_tlp_reduce); 14341 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 14342 break; 14343 case TCP_RACK_EARLY_RECOV: 14344 /* Should recovery happen early (bool) */ 14345 RACK_OPTS_INC(tcp_rack_early_recov); 14346 rack->r_ctl.rc_early_recovery = optval; 14347 break; 14348 14349 /* Pacing related ones */ 14350 case TCP_RACK_PACE_ALWAYS: 14351 /* 14352 * zero is old rack method, 1 is new 14353 * method using a pacing rate. 14354 */ 14355 RACK_OPTS_INC(tcp_rack_pace_always); 14356 if (optval > 0) 14357 rack->rc_always_pace = 1; 14358 else 14359 rack->rc_always_pace = 0; 14360 if (rack->r_mbuf_queue || rack->rc_always_pace) 14361 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14362 else 14363 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14364 /* A rate may be set irate or other, if so set seg size */ 14365 rack_update_seg(rack); 14366 break; 14367 case TCP_BBR_RACK_INIT_RATE: 14368 RACK_OPTS_INC(tcp_initial_rate); 14369 val = optval; 14370 /* Change from kbits per second to bytes per second */ 14371 val *= 1000; 14372 val /= 8; 14373 rack->r_ctl.init_rate = val; 14374 if (rack->rc_init_win != rack_default_init_window) { 14375 uint32_t win, snt; 14376 14377 /* 14378 * Options don't always get applied 14379 * in the order you think. So in order 14380 * to assure we update a cwnd we need 14381 * to check and see if we are still 14382 * where we should raise the cwnd. 
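* Concretely: the cwnd is only bumped up to the configured initial
* window when less than that window has been sent so far and the
* current cwnd is still below it.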
14383 */
14384 win = rc_init_window(rack);
14385 if (SEQ_GT(tp->snd_max, tp->iss))
14386 snt = tp->snd_max - tp->iss;
14387 else
14388 snt = 0;
14389 if ((snt < win) &&
14390 (tp->snd_cwnd < win))
14391 tp->snd_cwnd = win;
14392 }
14393 if (rack->rc_always_pace)
14394 rack_update_seg(rack);
14395 break;
14396 case TCP_BBR_IWINTSO:
14397 RACK_OPTS_INC(tcp_initial_win);
14398 if (optval && (optval <= 0xff)) {
14399 uint32_t win, snt;
14400
14401 rack->rc_init_win = optval;
14402 win = rc_init_window(rack);
14403 if (SEQ_GT(tp->snd_max, tp->iss))
14404 snt = tp->snd_max - tp->iss;
14405 else
14406 snt = 0;
14407 if ((snt < win) &&
14408 (tp->t_srtt |
14409 #ifdef NETFLIX_PEAKRATE
14410 tp->t_maxpeakrate |
14411 #endif
14412 rack->r_ctl.init_rate)) {
14413 /*
14414 * We are not past the initial window
14415 * and we have some basis for pacing,
14416 * so we need to possibly adjust up
14417 * the cwnd. Note that even if we don't set
14418 * the cwnd, it's still OK to raise the rc_init_win,
14419 * which can be used coming out of idle when we
14420 * would have a rate.
14421 */
14422 if (tp->snd_cwnd < win)
14423 tp->snd_cwnd = win;
14424 }
14425 if (rack->rc_always_pace)
14426 rack_update_seg(rack);
14427 } else
14428 error = EINVAL;
14429 break;
14430 case TCP_RACK_FORCE_MSEG:
14431 RACK_OPTS_INC(tcp_rack_force_max_seg);
14432 if (optval)
14433 rack->rc_force_max_seg = 1;
14434 else
14435 rack->rc_force_max_seg = 0;
14436 break;
14437 case TCP_RACK_PACE_MAX_SEG:
14438 /* Max segment size in a pace, in bytes */
14439 RACK_OPTS_INC(tcp_rack_max_seg);
14440 rack->rc_user_set_max_segs = optval;
14441 rack_set_pace_segments(tp, rack, __LINE__);
14442 break;
14443 case TCP_RACK_PACE_RATE_REC:
14444 /* Set the fixed pacing rate in Bytes per second for recovery */
14445 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
14446 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
14447 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
14448 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
14449 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
14450 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
14451 rack->use_fixed_rate = 1;
14452 rack_log_pacing_delay_calc(rack,
14453 rack->r_ctl.rc_fixed_pacing_rate_ss,
14454 rack->r_ctl.rc_fixed_pacing_rate_ca,
14455 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
14456 __LINE__, NULL);
14457 break;
14458
14459 case TCP_RACK_PACE_RATE_SS:
14460 /* Set the fixed pacing rate in Bytes per second for slow start */
14461 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
14462 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
14463 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
14464 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
14465 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
14466 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
14467 rack->use_fixed_rate = 1;
14468 rack_log_pacing_delay_calc(rack,
14469 rack->r_ctl.rc_fixed_pacing_rate_ss,
14470 rack->r_ctl.rc_fixed_pacing_rate_ca,
14471 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
14472 __LINE__, NULL);
14473 break;
14474
14475 case TCP_RACK_PACE_RATE_CA:
14476 /* Set the fixed pacing rate in Bytes per second for congestion avoidance */
14477 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
14478 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
14479 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
14480 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
14481 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
14482 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
14483 rack->use_fixed_rate = 1;
14484 rack_log_pacing_delay_calc(rack,
14485 rack->r_ctl.rc_fixed_pacing_rate_ss,
14486 rack->r_ctl.rc_fixed_pacing_rate_ca,
14487
rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14488 __LINE__, NULL); 14489 break; 14490 case TCP_RACK_GP_INCREASE_REC: 14491 RACK_OPTS_INC(tcp_gp_inc_rec); 14492 rack->r_ctl.rack_per_of_gp_rec = optval; 14493 rack_log_pacing_delay_calc(rack, 14494 rack->r_ctl.rack_per_of_gp_ss, 14495 rack->r_ctl.rack_per_of_gp_ca, 14496 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14497 __LINE__, NULL); 14498 break; 14499 case TCP_RACK_GP_INCREASE_CA: 14500 RACK_OPTS_INC(tcp_gp_inc_ca); 14501 ca = optval; 14502 if (ca < 100) { 14503 /* 14504 * We don't allow any reduction 14505 * over the GP b/w. 14506 */ 14507 error = EINVAL; 14508 break; 14509 } 14510 rack->r_ctl.rack_per_of_gp_ca = ca; 14511 rack_log_pacing_delay_calc(rack, 14512 rack->r_ctl.rack_per_of_gp_ss, 14513 rack->r_ctl.rack_per_of_gp_ca, 14514 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14515 __LINE__, NULL); 14516 break; 14517 case TCP_RACK_GP_INCREASE_SS: 14518 RACK_OPTS_INC(tcp_gp_inc_ss); 14519 ss = optval; 14520 if (ss < 100) { 14521 /* 14522 * We don't allow any reduction 14523 * over the GP b/w. 14524 */ 14525 error = EINVAL; 14526 break; 14527 } 14528 rack->r_ctl.rack_per_of_gp_ss = ss; 14529 rack_log_pacing_delay_calc(rack, 14530 rack->r_ctl.rack_per_of_gp_ss, 14531 rack->r_ctl.rack_per_of_gp_ca, 14532 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14533 __LINE__, NULL); 14534 break; 14535 case TCP_RACK_RR_CONF: 14536 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 14537 if (optval && optval <= 3) 14538 rack->r_rr_config = optval; 14539 else 14540 rack->r_rr_config = 0; 14541 break; 14542 case TCP_BBR_HDWR_PACE: 14543 RACK_OPTS_INC(tcp_hdwr_pacing); 14544 if (optval){ 14545 if (rack->rack_hdrw_pacing == 0) { 14546 rack->rack_hdw_pace_ena = 1; 14547 rack->rack_attempt_hdwr_pace = 0; 14548 } else 14549 error = EALREADY; 14550 } else { 14551 rack->rack_hdw_pace_ena = 0; 14552 #ifdef RATELIMIT 14553 if (rack->rack_hdrw_pacing) { 14554 rack->rack_hdrw_pacing = 0; 14555 in_pcbdetach_txrtlmt(rack->rc_inp); 14556 } 14557 #endif 14558 } 14559 break; 14560 /* End Pacing related ones */ 14561 case TCP_RACK_PRR_SENDALOT: 14562 /* Allow PRR to send more than one seg */ 14563 RACK_OPTS_INC(tcp_rack_prr_sendalot); 14564 rack->r_ctl.rc_prr_sendalot = optval; 14565 break; 14566 case TCP_RACK_MIN_TO: 14567 /* Minimum time between rack t-o's in ms */ 14568 RACK_OPTS_INC(tcp_rack_min_to); 14569 rack->r_ctl.rc_min_to = optval; 14570 break; 14571 case TCP_RACK_EARLY_SEG: 14572 /* If early recovery max segments */ 14573 RACK_OPTS_INC(tcp_rack_early_seg); 14574 rack->r_ctl.rc_early_recovery_segs = optval; 14575 break; 14576 case TCP_RACK_REORD_THRESH: 14577 /* RACK reorder threshold (shift amount) */ 14578 RACK_OPTS_INC(tcp_rack_reord_thresh); 14579 if ((optval > 0) && (optval < 31)) 14580 rack->r_ctl.rc_reorder_shift = optval; 14581 else 14582 error = EINVAL; 14583 break; 14584 case TCP_RACK_REORD_FADE: 14585 /* Does reordering fade after ms time */ 14586 RACK_OPTS_INC(tcp_rack_reord_fade); 14587 rack->r_ctl.rc_reorder_fade = optval; 14588 break; 14589 case TCP_RACK_TLP_THRESH: 14590 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14591 RACK_OPTS_INC(tcp_rack_tlp_thresh); 14592 if (optval) 14593 rack->r_ctl.rc_tlp_threshold = optval; 14594 else 14595 error = EINVAL; 14596 break; 14597 case TCP_BBR_USE_RACK_RR: 14598 RACK_OPTS_INC(tcp_rack_rr); 14599 if (optval) 14600 rack->use_rack_rr = 1; 14601 else 14602 rack->use_rack_rr = 0; 14603 break; 14604 case TCP_RACK_PKT_DELAY: 14605 /* RACK added ms i.e. 
rack-rtt + reord + N */
14606 RACK_OPTS_INC(tcp_rack_pkt_delay);
14607 rack->r_ctl.rc_pkt_delay = optval;
14608 break;
14609 case TCP_RACK_TLP_INC_VAR:
14610 /* Does TLP include rtt variance in t-o */
14611 error = EINVAL;
14612 break;
14613 case TCP_RACK_IDLE_REDUCE_HIGH:
14614 error = EINVAL;
14615 break;
14616 case TCP_DELACK:
14617 if (optval == 0)
14618 tp->t_delayed_ack = 0;
14619 else
14620 tp->t_delayed_ack = 1;
14621 if (tp->t_flags & TF_DELACK) {
14622 tp->t_flags &= ~TF_DELACK;
14623 tp->t_flags |= TF_ACKNOW;
14624 NET_EPOCH_ENTER(et);
14625 rack_output(tp);
14626 NET_EPOCH_EXIT(et);
14627 }
14628 break;
14629
14630 case TCP_BBR_RACK_RTT_USE:
14631 if ((optval != USE_RTT_HIGH) &&
14632 (optval != USE_RTT_LOW) &&
14633 (optval != USE_RTT_AVG))
14634 error = EINVAL;
14635 else
14636 rack->r_ctl.rc_rate_sample_method = optval;
14637 break;
14638 case TCP_DATA_AFTER_CLOSE:
14639 if (optval)
14640 rack->rc_allow_data_af_clo = 1;
14641 else
14642 rack->rc_allow_data_af_clo = 0;
14643 break;
14644 case TCP_RACK_PACE_REDUCE:
14645 /* sysctl only now */
14646 error = EINVAL;
14647 break;
14648 default:
14649 return (tcp_default_ctloutput(so, sopt, inp, tp));
14650 break;
14651 }
14652 #ifdef NETFLIX_STATS
14653 tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
14654 #endif
14655 INP_WUNLOCK(inp);
14656 return (error);
14657 }
14658
14659 static int
14660 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
14661 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
14662 {
14663 int32_t error, optval;
14664 uint64_t val;
14665 /*
14666 * Because all our options are either boolean or an int, we can just
14667 * pull everything into optval and then unlock and copy. If we ever
14668 * add an option that is not an int, then this will have quite an
14669 * impact on this routine.
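* (Each case below only loads optval; the INP_WUNLOCK()/sooptcopyout()
* pair at the end of the routine then performs the copy out once the
* inpcb lock has been dropped.)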
14670 */ 14671 error = 0; 14672 switch (sopt->sopt_name) { 14673 case TCP_RACK_PROFILE: 14674 /* You cannot retrieve a profile, its write only */ 14675 error = EINVAL; 14676 break; 14677 case TCP_RACK_PACE_TO_FILL: 14678 optval = rack->rc_pace_to_cwnd; 14679 break; 14680 case TCP_RACK_NO_PUSH_AT_MAX: 14681 optval = rack->r_ctl.rc_no_push_at_mrtt; 14682 break; 14683 case TCP_SHARED_CWND_ENABLE: 14684 optval = rack->rack_enable_scwnd; 14685 break; 14686 case TCP_RACK_NONRXT_CFG_RATE: 14687 optval = rack->rack_rec_nonrxt_use_cr; 14688 break; 14689 case TCP_NO_PRR: 14690 optval = rack->rack_no_prr; 14691 break; 14692 case TCP_RACK_DO_DETECTION: 14693 optval = rack->do_detection; 14694 break; 14695 case TCP_RACK_MBUF_QUEUE: 14696 /* Now do we use the LRO mbuf-queue feature */ 14697 optval = rack->r_mbuf_queue; 14698 break; 14699 case TCP_TIMELY_DYN_ADJ: 14700 optval = rack->rc_gp_dyn_mul; 14701 break; 14702 case TCP_BBR_IWINTSO: 14703 optval = rack->rc_init_win; 14704 break; 14705 case TCP_RACK_PROP_RATE: 14706 optval = rack->r_ctl.rc_prop_rate; 14707 break; 14708 case TCP_RACK_PROP: 14709 /* RACK proportional rate reduction (bool) */ 14710 optval = rack->r_ctl.rc_prop_reduce; 14711 break; 14712 case TCP_RACK_TLP_REDUCE: 14713 /* RACK TLP cwnd reduction (bool) */ 14714 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 14715 break; 14716 case TCP_RACK_EARLY_RECOV: 14717 /* Should recovery happen early (bool) */ 14718 optval = rack->r_ctl.rc_early_recovery; 14719 break; 14720 case TCP_RACK_PACE_REDUCE: 14721 /* RACK Hptsi reduction factor (divisor) */ 14722 error = EINVAL; 14723 break; 14724 case TCP_BBR_RACK_INIT_RATE: 14725 val = rack->r_ctl.init_rate; 14726 /* convert to kbits per sec */ 14727 val *= 8; 14728 val /= 1000; 14729 optval = (uint32_t)val; 14730 break; 14731 case TCP_RACK_FORCE_MSEG: 14732 optval = rack->rc_force_max_seg; 14733 break; 14734 case TCP_RACK_PACE_MAX_SEG: 14735 /* Max segments in a pace */ 14736 optval = rack->rc_user_set_max_segs; 14737 break; 14738 case TCP_RACK_PACE_ALWAYS: 14739 /* Use the always pace method */ 14740 optval = rack->rc_always_pace; 14741 break; 14742 case TCP_RACK_PRR_SENDALOT: 14743 /* Allow PRR to send more than one seg */ 14744 optval = rack->r_ctl.rc_prr_sendalot; 14745 break; 14746 case TCP_RACK_MIN_TO: 14747 /* Minimum time between rack t-o's in ms */ 14748 optval = rack->r_ctl.rc_min_to; 14749 break; 14750 case TCP_RACK_EARLY_SEG: 14751 /* If early recovery max segments */ 14752 optval = rack->r_ctl.rc_early_recovery_segs; 14753 break; 14754 case TCP_RACK_REORD_THRESH: 14755 /* RACK reorder threshold (shift amount) */ 14756 optval = rack->r_ctl.rc_reorder_shift; 14757 break; 14758 case TCP_RACK_REORD_FADE: 14759 /* Does reordering fade after ms time */ 14760 optval = rack->r_ctl.rc_reorder_fade; 14761 break; 14762 case TCP_BBR_USE_RACK_RR: 14763 /* Do we use the rack cheat for rxt */ 14764 optval = rack->use_rack_rr; 14765 break; 14766 case TCP_RACK_RR_CONF: 14767 optval = rack->r_rr_config; 14768 break; 14769 case TCP_BBR_HDWR_PACE: 14770 optval = rack->rack_hdw_pace_ena; 14771 break; 14772 case TCP_RACK_TLP_THRESH: 14773 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14774 optval = rack->r_ctl.rc_tlp_threshold; 14775 break; 14776 case TCP_RACK_PKT_DELAY: 14777 /* RACK added ms i.e. 
rack-rtt + reord + N */
14778 optval = rack->r_ctl.rc_pkt_delay;
14779 break;
14780 case TCP_RACK_TLP_USE:
14781 optval = rack->rack_tlp_threshold_use;
14782 break;
14783 case TCP_RACK_TLP_INC_VAR:
14784 /* Does TLP include rtt variance in t-o */
14785 error = EINVAL;
14786 break;
14787 case TCP_RACK_IDLE_REDUCE_HIGH:
14788 error = EINVAL;
14789 break;
14790 case TCP_RACK_PACE_RATE_CA:
14791 optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
14792 break;
14793 case TCP_RACK_PACE_RATE_SS:
14794 optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
14795 break;
14796 case TCP_RACK_PACE_RATE_REC:
14797 optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
14798 break;
14799 case TCP_RACK_GP_INCREASE_SS:
14800 optval = rack->r_ctl.rack_per_of_gp_ss;
14801 break;
14802 case TCP_RACK_GP_INCREASE_CA:
14803 optval = rack->r_ctl.rack_per_of_gp_ca;
14804 break;
14805 case TCP_BBR_RACK_RTT_USE:
14806 optval = rack->r_ctl.rc_rate_sample_method;
14807 break;
14808 case TCP_DELACK:
14809 optval = tp->t_delayed_ack;
14810 break;
14811 case TCP_DATA_AFTER_CLOSE:
14812 optval = rack->rc_allow_data_af_clo;
14813 break;
14814 case TCP_SHARED_CWND_TIME_LIMIT:
14815 optval = rack->r_limit_scw;
14816 break;
14817 default:
14818 return (tcp_default_ctloutput(so, sopt, inp, tp));
14819 break;
14820 }
14821 INP_WUNLOCK(inp);
14822 if (error == 0) {
14823 error = sooptcopyout(sopt, &optval, sizeof optval);
14824 }
14825 return (error);
14826 }
14827
14828 static int
14829 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
14830 {
14831 int32_t error = EINVAL;
14832 struct tcp_rack *rack;
14833
14834 rack = (struct tcp_rack *)tp->t_fb_ptr;
14835 if (rack == NULL) {
14836 /* Huh? */
14837 goto out;
14838 }
14839 if (sopt->sopt_dir == SOPT_SET) {
14840 return (rack_set_sockopt(so, sopt, inp, tp, rack));
14841 } else if (sopt->sopt_dir == SOPT_GET) {
14842 return (rack_get_sockopt(so, sopt, inp, tp, rack));
14843 }
14844 out:
14845 INP_WUNLOCK(inp);
14846 return (error);
14847 }
14848
14849 static int
14850 rack_pru_options(struct tcpcb *tp, int flags)
14851 {
14852 if (flags & PRUS_OOB)
14853 return (EOPNOTSUPP);
14854 return (0);
14855 }
14856
14857 static struct tcp_function_block __tcp_rack = {
14858 .tfb_tcp_block_name = __XSTRING(STACKNAME),
14859 .tfb_tcp_output = rack_output,
14860 .tfb_do_queued_segments = ctf_do_queued_segments,
14861 .tfb_do_segment_nounlock = rack_do_segment_nounlock,
14862 .tfb_tcp_do_segment = rack_do_segment,
14863 .tfb_tcp_ctloutput = rack_ctloutput,
14864 .tfb_tcp_fb_init = rack_init,
14865 .tfb_tcp_fb_fini = rack_fini,
14866 .tfb_tcp_timer_stop_all = rack_stopall,
14867 .tfb_tcp_timer_activate = rack_timer_activate,
14868 .tfb_tcp_timer_active = rack_timer_active,
14869 .tfb_tcp_timer_stop = rack_timer_stop,
14870 .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
14871 .tfb_tcp_handoff_ok = rack_handoff_ok,
14872 .tfb_pru_options = rack_pru_options,
14873 };
14874
14875 static const char *rack_stack_names[] = {
14876 __XSTRING(STACKNAME),
14877 #ifdef STACKALIAS
14878 __XSTRING(STACKALIAS),
14879 #endif
14880 };
14881
14882 static int
14883 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
14884 {
14885 memset(mem, 0, size);
14886 return (0);
14887 }
14888
14889 static void
14890 rack_dtor(void *mem, int32_t size, void *arg)
14891 {
14892
14893 }
14894
14895 static bool rack_mod_inited = false;
14896
14897 static int
14898 tcp_addrack(module_t mod, int32_t type, void *data)
14899 {
14900 int32_t err = 0;
14901 int num_stacks;
14902
14903 switch (type) {
14904 case
MOD_LOAD: 14905 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 14906 sizeof(struct rack_sendmap), 14907 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 14908 14909 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 14910 sizeof(struct tcp_rack), 14911 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 14912 14913 sysctl_ctx_init(&rack_sysctl_ctx); 14914 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 14915 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 14916 OID_AUTO, 14917 #ifdef STACKALIAS 14918 __XSTRING(STACKALIAS), 14919 #else 14920 __XSTRING(STACKNAME), 14921 #endif 14922 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 14923 ""); 14924 if (rack_sysctl_root == NULL) { 14925 printf("Failed to add sysctl node\n"); 14926 err = EFAULT; 14927 goto free_uma; 14928 } 14929 rack_init_sysctls(); 14930 num_stacks = nitems(rack_stack_names); 14931 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 14932 rack_stack_names, &num_stacks); 14933 if (err) { 14934 printf("Failed to register %s stack name for " 14935 "%s module\n", rack_stack_names[num_stacks], 14936 __XSTRING(MODNAME)); 14937 sysctl_ctx_free(&rack_sysctl_ctx); 14938 free_uma: 14939 uma_zdestroy(rack_zone); 14940 uma_zdestroy(rack_pcb_zone); 14941 rack_counter_destroy(); 14942 printf("Failed to register rack module -- err:%d\n", err); 14943 return (err); 14944 } 14945 tcp_lro_reg_mbufq(); 14946 rack_mod_inited = true; 14947 break; 14948 case MOD_QUIESCE: 14949 err = deregister_tcp_functions(&__tcp_rack, true, false); 14950 break; 14951 case MOD_UNLOAD: 14952 err = deregister_tcp_functions(&__tcp_rack, false, true); 14953 if (err == EBUSY) 14954 break; 14955 if (rack_mod_inited) { 14956 uma_zdestroy(rack_zone); 14957 uma_zdestroy(rack_pcb_zone); 14958 sysctl_ctx_free(&rack_sysctl_ctx); 14959 rack_counter_destroy(); 14960 rack_mod_inited = false; 14961 } 14962 tcp_lro_dereg_mbufq(); 14963 err = 0; 14964 break; 14965 default: 14966 return (EOPNOTSUPP); 14967 } 14968 return (err); 14969 } 14970 14971 static moduledata_t tcp_rack = { 14972 .name = __XSTRING(MODNAME), 14973 .evhand = tcp_addrack, 14974 .priv = 0 14975 }; 14976 14977 MODULE_VERSION(MODNAME, 1); 14978 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 14979 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 14980
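/*
 * Usage sketch (illustrative only; not compiled as part of this module):
 * once the module is loaded, an application can normally opt a socket into
 * this stack with the TCP_FUNCTION_BLK socket option before connecting.
 * The structure layout comes from <netinet/tcp.h> and the stack name is the
 * one registered above (normally "rack"); both are assumptions that may
 * differ between FreeBSD releases, so treat this as a sketch rather than a
 * guaranteed interface.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	static void
 *	use_rack_stack(int s)
 *	{
 *		struct tcp_function_set tfs;
 *
 *		memset(&tfs, 0, sizeof(tfs));
 *		strlcpy(tfs.function_set_name, "rack",
 *		    sizeof(tfs.function_set_name));
 *		if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &tfs, sizeof(tfs)) == -1)
 *			warn("could not switch socket to the rack stack");
 *	}
 *
 * The stack can also be made the system-wide default through the
 * net.inet.tcp.functions_default sysctl.
 */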