/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif	/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif	/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 * the congestion window so that the ack clock can
 * be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 * will stop us using the number of dup acks and instead
 * use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 * of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
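/*
 * Illustrative sketch only (the shape and locals below are placeholders,
 * not the actual code later in this file): the decomposition described
 * above amounts to keeping one handler per TCP state and calling it from
 * the segment-input path once SACK support and the RACK state have been
 * verified, roughly:
 *
 *	switch (tp->t_state) {
 *	case TCPS_ESTABLISHED:
 *		handler = rack_do_established;
 *		break;
 *	case TCPS_FIN_WAIT_1:
 *		handler = rack_do_fin_wait_1;
 *		break;
 *	...
 *	}
 *	retval = (*handler)(m, th, so, tp, &to, drop_hdrlen, tlen,
 *	    tiwin, thflags, nxt_pkt, iptos);
 */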
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 0;
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250;	/* 250ms */
static int32_t rack_persist_max = 2000;	/* 2 seconds */
static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_hw_pace_adjust = 0;
/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds (30 ms * (2^12 - 1))
 * before a connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000;	/* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;	/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250% slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200% congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200% of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;	/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in us */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 200000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250;	/* Must move at least 250 useconds to count as a lowering */
static int32_t rack_pace_one_seg = 0;	/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;	/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;	/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;	/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;	/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;	/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;	/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t *ofia, int32_t thflags, int32_t *ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

int32_t rack_clear_counter = 0;


static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
    uint32_t stat;
    int32_t error;

    error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
    if (error || req->newptr == NULL)
        return error;

    error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
    if (error)
        return (error);
    if (stat == 1) {
#ifdef INVARIANTS
        printf("Clearing RACK counters\n");
#endif
        counter_u64_zero(rack_badfr);
        counter_u64_zero(rack_badfr_bytes);
        counter_u64_zero(rack_rtm_prr_retran);
        counter_u64_zero(rack_rtm_prr_newdata);
        counter_u64_zero(rack_timestamp_mismatch);
        counter_u64_zero(rack_reorder_seen);
        counter_u64_zero(rack_tlp_tot);
        counter_u64_zero(rack_tlp_newdata);
        counter_u64_zero(rack_tlp_retran);
        counter_u64_zero(rack_tlp_retran_bytes);
        counter_u64_zero(rack_tlp_retran_fail);
        counter_u64_zero(rack_to_tot);
        counter_u64_zero(rack_to_arm_rack);
        counter_u64_zero(rack_to_arm_tlp);
        counter_u64_zero(rack_paced_segments);
        counter_u64_zero(rack_calc_zero);
        counter_u64_zero(rack_calc_nonzero);
        counter_u64_zero(rack_unpaced_segments);
        counter_u64_zero(rack_saw_enobuf);
        counter_u64_zero(rack_saw_enetunreach);
        counter_u64_zero(rack_per_timer_hole);
        counter_u64_zero(rack_to_alloc_hard);
        counter_u64_zero(rack_to_alloc_emerg);
        counter_u64_zero(rack_sack_proc_all);
        counter_u64_zero(rack_sack_proc_short);
        counter_u64_zero(rack_sack_proc_restart);
        counter_u64_zero(rack_to_alloc);
        counter_u64_zero(rack_to_alloc_limited);
        counter_u64_zero(rack_alloc_limited_conns);
        counter_u64_zero(rack_split_limited);
        counter_u64_zero(rack_find_high);
        counter_u64_zero(rack_sack_attacks_detected);
        counter_u64_zero(rack_sack_attacks_reversed);
        counter_u64_zero(rack_sack_used_next_merge);
        counter_u64_zero(rack_sack_used_prev_merge);
        counter_u64_zero(rack_sack_splits);
        counter_u64_zero(rack_sack_skipped_acked);
        counter_u64_zero(rack_ack_total);
        counter_u64_zero(rack_express_sack);
        counter_u64_zero(rack_sack_total);
        counter_u64_zero(rack_move_none);
        counter_u64_zero(rack_move_some);
        counter_u64_zero(rack_used_tlpmethod);
        counter_u64_zero(rack_used_tlpmethod2);
        counter_u64_zero(rack_enter_tlp_calc);
        counter_u64_zero(rack_progress_drops);
        counter_u64_zero(rack_tlp_does_nada);
        counter_u64_zero(rack_try_scwnd);
        counter_u64_zero(rack_collapsed_win);
    }
    rack_clear_counter = 0;
    return (0);
}
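/*
 * Usage note (illustrative, not taken from this file): the "clear" proc
 * handler above is wired up at the bottom of rack_init_sysctls() below.
 * Writing a 1 through that OID zeroes every RACK counter, e.g. from
 * userland (assuming the stack registers its sysctl root as "rack"
 * under net.inet.tcp):
 *
 *	# sysctl net.inet.tcp.rack.clear=1
 */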


static void
rack_init_sysctls(void)
{
    struct sysctl_oid *rack_counters;
    struct sysctl_oid *rack_attack;
    struct sysctl_oid *rack_pacing;
    struct sysctl_oid *rack_timely;
    struct sysctl_oid *rack_timers;
    struct sysctl_oid *rack_tlp;
    struct sysctl_oid *rack_misc;
    struct sysctl_oid *rack_measure;
    struct sysctl_oid *rack_probertt;

    rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "sack_attack",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Sack Attack Counters and Controls");
    rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "stats",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Counters");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rate_sample_method", CTLFLAG_RW,
        &rack_rate_sample_method, USE_RTT_LOW,
        "What method should we use for rate sampling 0=high, 1=low");
    /* Probe rtt related controls */
    rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "probertt",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "ProbeRTT related Controls");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
        &rack_atexit_prtt_hbp, 130,
        "What percentage above goodput do we clamp CA/SS to at exit on a high-BDP path (110%)");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
        &rack_atexit_prtt, 130,
        "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path (100%)");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_mul", CTLFLAG_RW,
        &rack_per_of_gp_probertt, 60,
        "What percentage of goodput do we pace at in probertt");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
        &rack_per_of_gp_probertt_reduce, 10,
        "What percentage of goodput do we reduce every gp_srtt");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "gp_per_low", CTLFLAG_RW,
        &rack_per_of_gp_lowthresh, 40,
        "What percentage of goodput do we allow the multiplier to fall to");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "time_between", CTLFLAG_RW,
        &rack_time_between_probertt, 9600000,
        "How many useconds must pass after the lowest rtt has fallen before we enter probertt");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "safety", CTLFLAG_RW,
        &rack_probe_rtt_safety_val, 2000000,
        "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "sets_cwnd", CTLFLAG_RW,
        &rack_probe_rtt_sets_cwnd, 0,
        "Do we set the cwnd too (if always_lower is on)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
        &rack_max_drain_wait, 2,
        "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
        &rack_must_drain, 1,
        "We must drain this many gp_srtt's waiting for flight to reach goal");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
        &rack_probertt_use_min_rtt_entry, 1,
        "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
        &rack_probertt_use_min_rtt_exit, 0,
        "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "length_div", CTLFLAG_RW,
        &rack_probertt_gpsrtt_cnt_div, 0,
        "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "length_mul", CTLFLAG_RW,
        &rack_probertt_gpsrtt_cnt_mul, 0,
        "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
        &rack_min_probertt_hold, 200000,
        "What is the minimum time we hold probertt at target");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "filter_life", CTLFLAG_RW,
        &rack_probertt_filter_life, 10000000,
        "What is the time for the filter's life in useconds");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "lower_within", CTLFLAG_RW,
        &rack_probertt_lower_within, 10,
        "If the rtt goes lower within this percentage of the time, go into probe-rtt");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "must_move", CTLFLAG_RW,
        &rack_min_rtt_movement, 250,
        "How much is the minimum movement in rtt to count as a drop for probertt purposes");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
        &rack_probertt_clear_is, 1,
        "Do we clear I/S counts on exiting probe-rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
        &rack_max_drain_hbp, 1,
        "How many extra drain gpsrtt's do we get in highly buffered paths");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_probertt),
        OID_AUTO, "hbp_threshold", CTLFLAG_RW,
        &rack_hbp_thresh, 3,
        "We are highly buffered if max_rtt_seen / min_rtt_seen > this-threshold");
    /* Pacing related sysctls */
    rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "pacing",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Pacing related Controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "max_pace_over", CTLFLAG_RW,
        &rack_max_per_above, 30,
        "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "pace_to_one", CTLFLAG_RW,
        &rack_pace_one_seg, 0,
        "Do we allow low b/w pacing of 1MSS instead of two");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
        &rack_limit_time_with_srtt, 0,
        "Do we limit pacing time based on srtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "init_win", CTLFLAG_RW,
        &rack_default_init_window, 0,
        "Do we have a rack initial window, 0 = system default");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW,
        &rack_hw_pace_adjust, 0,
        "What percentage do we raise the MSS by (11 = 1.1%)");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_ss", CTLFLAG_RW,
        &rack_per_of_gp_ss, 250,
        "If non zero, what percentage of goodput to pace at in slow start");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_ca", CTLFLAG_RW,
        &rack_per_of_gp_ca, 150,
        "If non zero, what percentage of goodput to pace at in congestion avoidance");
    SYSCTL_ADD_U16(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "gp_per_rec", CTLFLAG_RW,
        &rack_per_of_gp_rec, 200,
        "If non zero, what percentage of goodput to pace at in recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "pace_max_seg", CTLFLAG_RW,
        &rack_hptsi_segments, 40,
        "What size is the max for TSO segments in pacing and burst mitigation");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_pacing),
        OID_AUTO, "burst_reduces", CTLFLAG_RW,
        &rack_slot_reduction, 4,
        "When doing only burst mitigation what is the reduce divisor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "use_pacing", CTLFLAG_RW,
        &rack_pace_every_seg, 0,
        "If set we use pacing, if clear we use only the original burst mitigation");

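    /*
     * Illustrative note (a sketch of how the gp_per_* knobs above are
     * meant to be read, not a copy of the code): the values are
     * percentages applied to the goodput-derived b/w estimate when a
     * pacing rate is chosen, roughly
     *
     *	pace_bw = (gp_bw * rack_per_of_gp_ca) / 100;
     *
     * with gp_per_ss used while below ssthresh, gp_per_rec while in
     * recovery, and gp_per_ca otherwise.  The actual selection and
     * bounding logic lives in the pacing-rate calculation functions
     * later in this file.
     */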
    rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "timely",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Timely RTT Controls");
    /* Timely based GP dynamics */
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upper", CTLFLAG_RW,
        &rack_gp_per_bw_mul_up, 2,
        "Rack timely upper range for equal b/w (in percentage)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "lower", CTLFLAG_RW,
        &rack_gp_per_bw_mul_down, 4,
        "Rack timely lower range for equal b/w (in percentage)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
        &rack_gp_rtt_maxmul, 3,
        "Rack timely multiplier of lowest rtt for rtt_max");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_min_div", CTLFLAG_RW,
        &rack_gp_rtt_mindiv, 4,
        "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
        &rack_gp_rtt_minmul, 1,
        "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "decrease", CTLFLAG_RW,
        &rack_gp_decrease_per, 20,
        "Rack timely decrease percentage of our GP multiplication factor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "increase", CTLFLAG_RW,
        &rack_gp_increase_per, 2,
        "Rack timely increase percentage of our GP multiplication factor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "lowerbound", CTLFLAG_RW,
        &rack_per_lower_bound, 50,
        "Rack timely lowest percentage we allow GP multiplier to fall to");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upperboundss", CTLFLAG_RW,
        &rack_per_upper_bound_ss, 0,
        "Rack timely highest percentage we allow the GP multiplier in SS to raise to (0 is no upperbound)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "upperboundca", CTLFLAG_RW,
        &rack_per_upper_bound_ca, 0,
        "Rack timely highest percentage we allow the GP multiplier in CA to raise to (0 is no upperbound)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "dynamicgp", CTLFLAG_RW,
        &rack_do_dyn_mul, 0,
        "Rack timely do we enable dynamic timely goodput by default");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "no_rec_red", CTLFLAG_RW,
        &rack_gp_no_rec_chg, 1,
        "Rack timely do we prohibit the recovery multiplier from being lowered");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
        &rack_timely_dec_clear, 6,
        "Rack timely what threshold do we count to before another boost during b/w descent");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "max_push_rise", CTLFLAG_RW,
        &rack_timely_max_push_rise, 3,
        "Rack timely how many times do we push up with b/w increase");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "max_push_drop", CTLFLAG_RW,
        &rack_timely_max_push_drop, 3,
        "Rack timely how many times do we push back on b/w descent");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timely),
        OID_AUTO, "min_segs", CTLFLAG_RW,
        &rack_timely_min_segs, 4,
        "Rack timely when setting the cwnd what is the min num segments");
"noback_max", CTLFLAG_RW, 874 &rack_use_max_for_nobackoff, 0, 875 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 876 SYSCTL_ADD_S32(&rack_sysctl_ctx, 877 SYSCTL_CHILDREN(rack_timely), 878 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 879 &rack_timely_int_timely_only, 0, 880 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 881 SYSCTL_ADD_S32(&rack_sysctl_ctx, 882 SYSCTL_CHILDREN(rack_timely), 883 OID_AUTO, "nonstop", CTLFLAG_RW, 884 &rack_timely_no_stopping, 0, 885 "Rack timely don't stop increase"); 886 SYSCTL_ADD_S32(&rack_sysctl_ctx, 887 SYSCTL_CHILDREN(rack_timely), 888 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 889 &rack_down_raise_thresh, 100, 890 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 891 SYSCTL_ADD_S32(&rack_sysctl_ctx, 892 SYSCTL_CHILDREN(rack_timely), 893 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 894 &rack_req_segs, 1, 895 "Bottom dragging if not these many segments outstanding and room"); 896 897 /* TLP and Rack related parameters */ 898 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 899 SYSCTL_CHILDREN(rack_sysctl_root), 900 OID_AUTO, 901 "tlp", 902 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 903 "TLP and Rack related Controls"); 904 SYSCTL_ADD_S32(&rack_sysctl_ctx, 905 SYSCTL_CHILDREN(rack_tlp), 906 OID_AUTO, "use_rrr", CTLFLAG_RW, 907 &use_rack_rr, 1, 908 "Do we use Rack Rapid Recovery"); 909 SYSCTL_ADD_S32(&rack_sysctl_ctx, 910 SYSCTL_CHILDREN(rack_tlp), 911 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 912 &rack_non_rxt_use_cr, 0, 913 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 914 SYSCTL_ADD_S32(&rack_sysctl_ctx, 915 SYSCTL_CHILDREN(rack_tlp), 916 OID_AUTO, "tlpmethod", CTLFLAG_RW, 917 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 918 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 919 SYSCTL_ADD_S32(&rack_sysctl_ctx, 920 SYSCTL_CHILDREN(rack_tlp), 921 OID_AUTO, "limit", CTLFLAG_RW, 922 &rack_tlp_limit, 2, 923 "How many TLP's can be sent without sending new data"); 924 SYSCTL_ADD_S32(&rack_sysctl_ctx, 925 SYSCTL_CHILDREN(rack_tlp), 926 OID_AUTO, "use_greater", CTLFLAG_RW, 927 &rack_tlp_use_greater, 1, 928 "Should we use the rack_rtt time if its greater than srtt"); 929 SYSCTL_ADD_S32(&rack_sysctl_ctx, 930 SYSCTL_CHILDREN(rack_tlp), 931 OID_AUTO, "tlpminto", CTLFLAG_RW, 932 &rack_tlp_min, 10, 933 "TLP minimum timeout per the specification (10ms)"); 934 SYSCTL_ADD_S32(&rack_sysctl_ctx, 935 SYSCTL_CHILDREN(rack_tlp), 936 OID_AUTO, "send_oldest", CTLFLAG_RW, 937 &rack_always_send_oldest, 0, 938 "Should we always send the oldest TLP and RACK-TLP"); 939 SYSCTL_ADD_S32(&rack_sysctl_ctx, 940 SYSCTL_CHILDREN(rack_tlp), 941 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 942 &rack_limited_retran, 0, 943 "How many times can a rack timeout drive out sends"); 944 SYSCTL_ADD_S32(&rack_sysctl_ctx, 945 SYSCTL_CHILDREN(rack_tlp), 946 OID_AUTO, "tlp_retry", CTLFLAG_RW, 947 &rack_tlp_max_resend, 2, 948 "How many times does TLP retry a single segment or multiple with no ACK"); 949 SYSCTL_ADD_S32(&rack_sysctl_ctx, 950 SYSCTL_CHILDREN(rack_tlp), 951 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 952 &rack_lower_cwnd_at_tlp, 0, 953 "When a TLP completes a retran should we enter recovery"); 954 SYSCTL_ADD_S32(&rack_sysctl_ctx, 955 SYSCTL_CHILDREN(rack_tlp), 956 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 957 &rack_reorder_thresh, 2, 958 "What factor for rack will be added when seeing reordering (shift right)"); 959 SYSCTL_ADD_S32(&rack_sysctl_ctx, 960 
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
        &rack_tlp_thresh, 1,
        "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "reorder_fade", CTLFLAG_RW,
        &rack_reorder_fade, 0,
        "Does reorder detection fade, if so how many ms (0 means never)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_tlp),
        OID_AUTO, "pktdelay", CTLFLAG_RW,
        &rack_pkt_delay, 1,
        "Extra RACK time (in ms) besides reordering thresh");

    /* Timer related controls */
    rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "timers",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Timer related controls");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "persmin", CTLFLAG_RW,
        &rack_persist_min, 250,
        "What is the minimum time in milliseconds between persists");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "persmax", CTLFLAG_RW,
        &rack_persist_max, 2000,
        "What is the largest delay in milliseconds between persists");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "delayed_ack", CTLFLAG_RW,
        &rack_delayed_ack_time, 200,
        "Delayed ack time (200ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "minrto", CTLFLAG_RW,
        &rack_rto_min, 0,
        "Minimum RTO in ms -- set with caution below 1000 due to TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "maxrto", CTLFLAG_RW,
        &rack_rto_max, 0,
        "Maximum RTO in ms -- should be at least as large as min_rto");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_timers),
        OID_AUTO, "minto", CTLFLAG_RW,
        &rack_min_to, 1,
        "Minimum rack timeout in milliseconds");
    /* Measure controls */
    rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "measure",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Measure related controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "wma_divisor", CTLFLAG_RW,
        &rack_wma_divisor, 8,
        "When doing b/w calculation what is the divisor for the WMA");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "end_cwnd", CTLFLAG_RW,
        &rack_cwnd_block_ends_measure, 0,
        "Does a cwnd just-return end the measurement window (app limited)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "end_rwnd", CTLFLAG_RW,
        &rack_rwnd_block_ends_measure, 0,
        "Does an rwnd just-return end the measurement window (app limited -- not persists)");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_target", CTLFLAG_RW,
        &rack_def_data_window, 20,
        "What is the minimum target window (in mss) for GP measurements");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "goal_bdp", CTLFLAG_RW,
        &rack_goal_bdp, 2,
        "What is the goal BDP to measure");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_srtts", CTLFLAG_RW,
        &rack_min_srtts, 1,
        "What is the minimum number of srtt's a GP measurement must span");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_measure),
        OID_AUTO, "min_measure_tim", CTLFLAG_RW,
        &rack_min_measure_usec, 0,
        "What is the minimum time for a measurement, if 0 this is off");
    /* Misc rack controls */
    rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "misc",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Misc related controls");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "shared_cwnd", CTLFLAG_RW,
        &rack_enable_shared_cwnd, 0,
        "Should RACK try to use the shared cwnd on connections where allowed");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
        &rack_limits_scwnd, 1,
        "Should RACK place low end time limits on the shared cwnd feature");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
        &rack_enable_mqueue_for_nonpaced, 0,
        "Should RACK use mbuf queuing for non-paced connections");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "iMac_dack", CTLFLAG_RW,
        &rack_use_imac_dack, 0,
        "Should RACK try to emulate iMac delayed ack");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "no_prr", CTLFLAG_RW,
        &rack_disable_prr, 0,
        "Should RACK not use prr and only pace (must have pacing on)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "bb_verbose", CTLFLAG_RW,
        &rack_verbose_logging, 0,
        "Should RACK black box logging be verbose");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "data_after_close", CTLFLAG_RW,
        &rack_ignore_data_after_close, 1,
        "Do we hold off sending a RST until all pending data is ack'd");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "no_sack_needed", CTLFLAG_RW,
        &rack_sack_not_required, 0,
        "Do we allow rack to run on connections not supporting SACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
        &rack_use_proportional_reduce, 0,
        "Should we proportionally reduce cwnd based on the number of losses");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "recovery_prop", CTLFLAG_RW,
        &rack_proportional_rate, 10,
        "What percent reduction per loss");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "prr_sendalot", CTLFLAG_RW,
        &rack_send_a_lot_in_prr, 1,
        "Send a lot in prr");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_misc),
        OID_AUTO, "earlyrecovery", CTLFLAG_RW,
        &rack_early_recovery, 1,
        "Do we do early recovery with rack");
    /* Sack Attacker detection stuff */
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
        &rack_highest_sack_thresh_seen, 0,
        "Highest sack to ack ratio seen");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
        &rack_highest_move_thresh_seen, 0,
        "Highest move to non-move ratio seen");
    rack_ack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "acktotal", CTLFLAG_RD,
        &rack_ack_total,
        "Total number of Ack's");
    rack_express_sack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
        &rack_express_sack,
        "Total number of express Sack's");
    rack_sack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "sacktotal", CTLFLAG_RD,
        &rack_sack_total,
        "Total number of SACKs");
    rack_move_none = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_none", CTLFLAG_RD,
        &rack_move_none,
        "Total number of SACK index reuse of positions under threshold");
    rack_move_some = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_some", CTLFLAG_RD,
        &rack_move_some,
        "Total number of SACK index reuse of positions over threshold");
    rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "attacks", CTLFLAG_RD,
        &rack_sack_attacks_detected,
        "Total number of SACK attackers that had sack disabled");
    rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "reversed", CTLFLAG_RD,
        &rack_sack_attacks_reversed,
        "Total number of SACK attackers that were later determined false positive");
    rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "nextmerge", CTLFLAG_RD,
        &rack_sack_used_next_merge,
        "Total number of times we used the next merge");
    rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "prevmerge", CTLFLAG_RD,
        &rack_sack_used_prev_merge,
        "Total number of times we used the prev merge");
    /* Counters */
    rack_badfr = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr", CTLFLAG_RD,
        &rack_badfr, "Total number of bad FRs");
    rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr_bytes", CTLFLAG_RD,
        &rack_badfr_bytes, "Total bytes of bad FRs");
    rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndret", CTLFLAG_RD,
        &rack_rtm_prr_retran,
        "Total number of prr based retransmits");
    rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndnew", CTLFLAG_RD,
        &rack_rtm_prr_newdata,
        "Total number of prr based new transmits");
    rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tsnf", CTLFLAG_RD,
        &rack_timestamp_mismatch,
        "Total number of timestamps that we could not find the reported ts");
    rack_find_high = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "findhigh", CTLFLAG_RD,
        &rack_find_high,
        "Total number of FIN causing find-high");
    rack_reorder_seen = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "reordering", CTLFLAG_RD,
        &rack_reorder_seen,
        "Total number of times we added delay due to reordering");
    rack_tlp_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_to_total", CTLFLAG_RD,
        &rack_tlp_tot,
        "Total number of tail loss probe expirations");
    rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_new", CTLFLAG_RD,
        &rack_tlp_newdata,
        "Total number of tail loss probes sending new data");
    rack_tlp_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran", CTLFLAG_RD,
        &rack_tlp_retran,
        "Total number of tail loss probes sending retransmitted data");
    rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
        &rack_tlp_retran_bytes,
        "Total bytes of tail loss probes sending retransmitted data");
    rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
        &rack_tlp_retran_fail,
        "Total number of tail loss probes sending retransmitted data that failed (wait for t3)");
    rack_to_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "rack_to_tot", CTLFLAG_RD,
        &rack_to_tot,
        "Total number of times the rack timeout expired");
    rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_rack", CTLFLAG_RD,
        &rack_to_arm_rack,
        "Total number of times the rack timer armed");
    rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_tlp", CTLFLAG_RD,
        &rack_to_arm_tlp,
        "Total number of times the tlp timer armed");
    rack_calc_zero = counter_u64_alloc(M_WAITOK);
    rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_zero", CTLFLAG_RD,
        &rack_calc_zero,
        "Total number of times pacing time worked out to zero");
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_nonzero", CTLFLAG_RD,
        &rack_calc_nonzero,
        "Total number of times pacing time worked out to non-zero");
    rack_paced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "paced", CTLFLAG_RD,
        &rack_paced_segments,
        "Total number of times a segment send caused hptsi");
    rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "unpaced", CTLFLAG_RD,
        &rack_unpaced_segments,
        "Total number of times a segment did not cause hptsi");
    rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enobufs", CTLFLAG_RD,
        &rack_saw_enobuf,
        "Total number of times a send returned ENOBUFS");
    rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
        &rack_saw_enetunreach,
        "Total number of times a send returned ENETUNREACH");
    rack_to_alloc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocs", CTLFLAG_RD,
        &rack_to_alloc,
        "Total allocations of tracking structures");
    rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allochard", CTLFLAG_RD,
        &rack_to_alloc_hard,
        "Total allocations done with sleeping the hard way");
    rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocemerg", CTLFLAG_RD,
        &rack_to_alloc_emerg,
        "Total allocations done from emergency cache");
    rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited", CTLFLAG_RD,
        &rack_to_alloc_limited,
        "Total allocations dropped due to limit");
    rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
        &rack_alloc_limited_conns,
        "Connections with allocations dropped due to limit");
    rack_split_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "split_limited", CTLFLAG_RD,
        &rack_split_limited,
        "Split allocations dropped due to limit");
    rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_long", CTLFLAG_RD,
        &rack_sack_proc_all,
        "Total times we had to walk whole list for sack processing");
    rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_restart", CTLFLAG_RD,
        &rack_sack_proc_restart,
        "Total times we had to walk whole list due to a restart");
    rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_short", CTLFLAG_RD,
        &rack_sack_proc_short,
        "Total times we took shortcut for sack processing");
    rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
        &rack_enter_tlp_calc,
        "Total times we called calc-tlp");
    rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
        &rack_used_tlpmethod,
        "Total number of times we hit TLP method 1");
    rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
&rack_used_tlpmethod2, 1382 "Total number of times we hit TLP method 2"); 1383 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1384 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1385 SYSCTL_CHILDREN(rack_attack), 1386 OID_AUTO, "skipacked", CTLFLAG_RD, 1387 &rack_sack_skipped_acked, 1388 "Total number of times we skipped previously sacked"); 1389 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1390 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1391 SYSCTL_CHILDREN(rack_attack), 1392 OID_AUTO, "ofsplit", CTLFLAG_RD, 1393 &rack_sack_splits, 1394 "Total number of times we did the old fashion tree split"); 1395 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1396 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1397 SYSCTL_CHILDREN(rack_counters), 1398 OID_AUTO, "prog_drops", CTLFLAG_RD, 1399 &rack_progress_drops, 1400 "Total number of progress drops"); 1401 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1402 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1403 SYSCTL_CHILDREN(rack_counters), 1404 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1405 &rack_input_idle_reduces, 1406 "Total number of idle reductions on input"); 1407 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1408 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1409 SYSCTL_CHILDREN(rack_counters), 1410 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1411 &rack_collapsed_win, 1412 "Total number of collapsed windows"); 1413 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1414 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1415 SYSCTL_CHILDREN(rack_counters), 1416 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1417 &rack_tlp_does_nada, 1418 "Total number of nada tlp calls"); 1419 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1420 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1421 SYSCTL_CHILDREN(rack_counters), 1422 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1423 &rack_try_scwnd, 1424 "Total number of scwnd attempts"); 1425 1426 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1427 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1428 SYSCTL_CHILDREN(rack_counters), 1429 OID_AUTO, "timer_hole", CTLFLAG_RD, 1430 &rack_per_timer_hole, 1431 "Total persists start in timer hole"); 1432 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1433 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1434 OID_AUTO, "outsize", CTLFLAG_RD, 1435 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1436 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1437 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1438 OID_AUTO, "opts", CTLFLAG_RD, 1439 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1440 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_sysctl_root), 1442 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1443 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1444 } 1445 1446 static __inline int 1447 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1448 { 1449 if (SEQ_GEQ(b->r_start, a->r_start) && 1450 SEQ_LT(b->r_start, a->r_end)) { 1451 /* 1452 * The entry b is within the 1453 * block a. i.e.: 1454 * a -- |-------------| 1455 * b -- |----| 1456 * <or> 1457 * b -- |------| 1458 * <or> 1459 * b -- |-----------| 1460 */ 1461 return (0); 1462 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1463 /* 1464 * b falls as either the next 1465 * sequence block after a so a 1466 * is said to be smaller than b. 
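	 * (For example, if a covers the sequence range [1000, 2000),
	 * any b whose r_start is 2000 or beyond sorts after a.)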
1467 * i.e: 1468 * a -- |------| 1469 * b -- |--------| 1470 * or 1471 * b -- |-----| 1472 */ 1473 return (1); 1474 } 1475 /* 1476 * Whats left is where a is 1477 * larger than b. i.e: 1478 * a -- |-------| 1479 * b -- |---| 1480 * or even possibly 1481 * b -- |--------------| 1482 */ 1483 return (-1); 1484 } 1485 1486 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1487 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1488 1489 static uint32_t 1490 rc_init_window(struct tcp_rack *rack) 1491 { 1492 uint32_t win; 1493 1494 if (rack->rc_init_win == 0) { 1495 /* 1496 * Nothing set by the user, use the system stack 1497 * default. 1498 */ 1499 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1500 } 1501 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1502 return(win); 1503 } 1504 1505 static uint64_t 1506 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1507 { 1508 if (IN_RECOVERY(rack->rc_tp->t_flags)) 1509 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1510 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1511 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1512 else 1513 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1514 } 1515 1516 static uint64_t 1517 rack_get_bw(struct tcp_rack *rack) 1518 { 1519 if (rack->use_fixed_rate) { 1520 /* Return the fixed pacing rate */ 1521 return (rack_get_fixed_pacing_bw(rack)); 1522 } 1523 if (rack->r_ctl.gp_bw == 0) { 1524 /* 1525 * We have yet no b/w measurement, 1526 * if we have a user set initial bw 1527 * return it. If we don't have that and 1528 * we have an srtt, use the tcp IW (10) to 1529 * calculate a fictional b/w over the SRTT 1530 * which is more or less a guess. Note 1531 * we don't use our IW from rack on purpose 1532 * so if we have like IW=30, we are not 1533 * calculating a "huge" b/w. 1534 */ 1535 uint64_t bw, srtt; 1536 if (rack->r_ctl.init_rate) 1537 return (rack->r_ctl.init_rate); 1538 1539 /* Has the user set a max peak rate? */ 1540 #ifdef NETFLIX_PEAKRATE 1541 if (rack->rc_tp->t_maxpeakrate) 1542 return (rack->rc_tp->t_maxpeakrate); 1543 #endif 1544 /* Ok lets come up with the IW guess, if we have a srtt */ 1545 if (rack->rc_tp->t_srtt == 0) { 1546 /* 1547 * Go with old pacing method 1548 * i.e. burst mitigation only. 1549 */ 1550 return (0); 1551 } 1552 /* Ok lets get the initial TCP win (not racks) */ 1553 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1554 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 1555 bw *= (uint64_t)USECS_IN_SECOND; 1556 bw /= srtt; 1557 return (bw); 1558 } else { 1559 uint64_t bw; 1560 1561 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { 1562 /* Averaging is done, we can return the value */ 1563 bw = rack->r_ctl.gp_bw; 1564 } else { 1565 /* Still doing initial average must calculate */ 1566 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; 1567 } 1568 #ifdef NETFLIX_PEAKRATE 1569 if ((rack->rc_tp->t_maxpeakrate) && 1570 (bw > rack->rc_tp->t_maxpeakrate)) { 1571 /* The user has set a peak rate to pace at 1572 * don't allow us to pace faster than that. 
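	 * (bw at this point is the goodput-based estimate in bytes per
	 * second; the cap keeps the pacer at or below the externally
	 * configured t_maxpeakrate ceiling.)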
1573 */ 1574 return (rack->rc_tp->t_maxpeakrate); 1575 } 1576 #endif 1577 return (bw); 1578 } 1579 } 1580 1581 static uint16_t 1582 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1583 { 1584 if (rack->use_fixed_rate) { 1585 return (100); 1586 } else if (rack->in_probe_rtt && (rsm == NULL)) 1587 return(rack->r_ctl.rack_per_of_gp_probertt); 1588 else if ((IN_RECOVERY(rack->rc_tp->t_flags) && 1589 rack->r_ctl.rack_per_of_gp_rec)) { 1590 if (rsm) { 1591 /* a retransmission always use the recovery rate */ 1592 return(rack->r_ctl.rack_per_of_gp_rec); 1593 } else if (rack->rack_rec_nonrxt_use_cr) { 1594 /* Directed to use the configured rate */ 1595 goto configured_rate; 1596 } else if (rack->rack_no_prr && 1597 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1598 /* No PRR, lets just use the b/w estimate only */ 1599 return(100); 1600 } else { 1601 /* 1602 * Here we may have a non-retransmit but we 1603 * have no overrides, so just use the recovery 1604 * rate (prr is in effect). 1605 */ 1606 return(rack->r_ctl.rack_per_of_gp_rec); 1607 } 1608 } 1609 configured_rate: 1610 /* For the configured rate we look at our cwnd vs the ssthresh */ 1611 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1612 return (rack->r_ctl.rack_per_of_gp_ss); 1613 else 1614 return(rack->r_ctl.rack_per_of_gp_ca); 1615 } 1616 1617 static uint64_t 1618 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) 1619 { 1620 /* 1621 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 1622 */ 1623 uint64_t bw_est; 1624 uint64_t gain; 1625 1626 gain = (uint64_t)rack_get_output_gain(rack, rsm); 1627 bw_est = bw * gain; 1628 bw_est /= (uint64_t)100; 1629 /* Never fall below the minimum (def 64kbps) */ 1630 if (bw_est < RACK_MIN_BW) 1631 bw_est = RACK_MIN_BW; 1632 return (bw_est); 1633 } 1634 1635 static void 1636 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 1637 { 1638 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1639 union tcp_log_stackspecific log; 1640 struct timeval tv; 1641 1642 if ((mod != 1) && (rack_verbose_logging == 0)) { 1643 /* 1644 * We get 3 values currently for mod 1645 * 1 - We are retransmitting and this tells the reason. 1646 * 2 - We are clearing a dup-ack count. 1647 * 3 - We are incrementing a dup-ack count. 1648 * 1649 * The clear/increment are only logged 1650 * if you have BBverbose on. 
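	 * ("BBverbose" means the rack_verbose_logging sysctl is
	 * non-zero; with it off, only the retransmit-reason case
	 * (mod == 1) makes it into the log, per the surrounding
	 * check.)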
1651 */ 1652 return; 1653 } 1654 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1655 log.u_bbr.flex1 = tsused; 1656 log.u_bbr.flex2 = thresh; 1657 log.u_bbr.flex3 = rsm->r_flags; 1658 log.u_bbr.flex4 = rsm->r_dupack; 1659 log.u_bbr.flex5 = rsm->r_start; 1660 log.u_bbr.flex6 = rsm->r_end; 1661 log.u_bbr.flex8 = mod; 1662 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1663 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1664 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1665 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1666 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1667 &rack->rc_inp->inp_socket->so_rcv, 1668 &rack->rc_inp->inp_socket->so_snd, 1669 BBR_LOG_SETTINGS_CHG, 0, 1670 0, &log, false, &tv); 1671 } 1672 } 1673 1674 1675 1676 static void 1677 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 1678 { 1679 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1680 union tcp_log_stackspecific log; 1681 struct timeval tv; 1682 1683 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1684 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 1685 log.u_bbr.flex2 = to * 1000; 1686 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1687 log.u_bbr.flex4 = slot; 1688 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 1689 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1690 log.u_bbr.flex7 = rack->rc_in_persist; 1691 log.u_bbr.flex8 = which; 1692 if (rack->rack_no_prr) 1693 log.u_bbr.pkts_out = 0; 1694 else 1695 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1696 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1697 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1698 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1699 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1700 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1701 &rack->rc_inp->inp_socket->so_rcv, 1702 &rack->rc_inp->inp_socket->so_snd, 1703 BBR_LOG_TIMERSTAR, 0, 1704 0, &log, false, &tv); 1705 } 1706 } 1707 1708 static void 1709 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 1710 { 1711 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1712 union tcp_log_stackspecific log; 1713 struct timeval tv; 1714 1715 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1716 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1717 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1718 log.u_bbr.flex8 = to_num; 1719 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 1720 log.u_bbr.flex2 = rack->rc_rack_rtt; 1721 if (rsm == NULL) 1722 log.u_bbr.flex3 = 0; 1723 else 1724 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 1725 if (rack->rack_no_prr) 1726 log.u_bbr.flex5 = 0; 1727 else 1728 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1729 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1730 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1731 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1732 &rack->rc_inp->inp_socket->so_rcv, 1733 &rack->rc_inp->inp_socket->so_snd, 1734 BBR_LOG_RTO, 0, 1735 0, &log, false, &tv); 1736 } 1737 } 1738 1739 static void 1740 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 1741 struct rack_sendmap *rsm, int conf) 1742 { 1743 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1744 union tcp_log_stackspecific log; 1745 struct timeval tv; 1746 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1747 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1748 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1749 log.u_bbr.flex1 = t; 1750 log.u_bbr.flex2 = len; 1751 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; 1752 log.u_bbr.flex4 = 
rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; 1753 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; 1754 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 1755 log.u_bbr.flex7 = conf; 1756 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; 1757 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 1758 if (rack->rack_no_prr) 1759 log.u_bbr.pkts_out = 0; 1760 else 1761 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1762 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1763 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; 1764 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 1765 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1766 if (rsm) { 1767 log.u_bbr.pkt_epoch = rsm->r_start; 1768 log.u_bbr.lost = rsm->r_end; 1769 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 1770 } else { 1771 1772 /* Its a SYN */ 1773 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 1774 log.u_bbr.lost = 0; 1775 log.u_bbr.cwnd_gain = 0; 1776 } 1777 /* Write out general bits of interest rrs here */ 1778 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 1779 log.u_bbr.use_lt_bw <<= 1; 1780 log.u_bbr.use_lt_bw |= rack->forced_ack; 1781 log.u_bbr.use_lt_bw <<= 1; 1782 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 1783 log.u_bbr.use_lt_bw <<= 1; 1784 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 1785 log.u_bbr.use_lt_bw <<= 1; 1786 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 1787 log.u_bbr.use_lt_bw <<= 1; 1788 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 1789 log.u_bbr.use_lt_bw <<= 1; 1790 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 1791 log.u_bbr.use_lt_bw <<= 1; 1792 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 1793 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 1794 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 1795 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 1796 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 1797 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 1798 TCP_LOG_EVENTP(tp, NULL, 1799 &rack->rc_inp->inp_socket->so_rcv, 1800 &rack->rc_inp->inp_socket->so_snd, 1801 BBR_LOG_BBRRTT, 0, 1802 0, &log, false, &tv); 1803 } 1804 } 1805 1806 static void 1807 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 1808 { 1809 /* 1810 * Log the rtt sample we are 1811 * applying to the srtt algorithm in 1812 * useconds. 
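	 * (The sample comes in as milliseconds and is multiplied by
	 * 1000 below before being stored in flex1.)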
1813 */ 1814 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1815 union tcp_log_stackspecific log; 1816 struct timeval tv; 1817 1818 /* Convert our ms to a microsecond */ 1819 memset(&log, 0, sizeof(log)); 1820 log.u_bbr.flex1 = rtt * 1000; 1821 log.u_bbr.flex2 = rack->r_ctl.ack_count; 1822 log.u_bbr.flex3 = rack->r_ctl.sack_count; 1823 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 1824 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 1825 log.u_bbr.flex8 = rack->sack_attack_disable; 1826 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1827 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1828 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1829 &rack->rc_inp->inp_socket->so_rcv, 1830 &rack->rc_inp->inp_socket->so_snd, 1831 TCP_LOG_RTT, 0, 1832 0, &log, false, &tv); 1833 } 1834 } 1835 1836 1837 static inline void 1838 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 1839 { 1840 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 1841 union tcp_log_stackspecific log; 1842 struct timeval tv; 1843 1844 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1845 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1846 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1847 log.u_bbr.flex1 = line; 1848 log.u_bbr.flex2 = tick; 1849 log.u_bbr.flex3 = tp->t_maxunacktime; 1850 log.u_bbr.flex4 = tp->t_acktime; 1851 log.u_bbr.flex8 = event; 1852 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1853 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1854 TCP_LOG_EVENTP(tp, NULL, 1855 &rack->rc_inp->inp_socket->so_rcv, 1856 &rack->rc_inp->inp_socket->so_snd, 1857 BBR_LOG_PROGRESS, 0, 1858 0, &log, false, &tv); 1859 } 1860 } 1861 1862 static void 1863 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 1864 { 1865 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1866 union tcp_log_stackspecific log; 1867 1868 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1869 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1870 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1871 log.u_bbr.flex1 = slot; 1872 if (rack->rack_no_prr) 1873 log.u_bbr.flex2 = 0; 1874 else 1875 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 1876 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 1877 log.u_bbr.flex8 = rack->rc_in_persist; 1878 log.u_bbr.timeStamp = cts; 1879 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1880 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1881 &rack->rc_inp->inp_socket->so_rcv, 1882 &rack->rc_inp->inp_socket->so_snd, 1883 BBR_LOG_BBRSND, 0, 1884 0, &log, false, tv); 1885 } 1886 } 1887 1888 static void 1889 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1890 { 1891 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1892 union tcp_log_stackspecific log; 1893 struct timeval tv; 1894 1895 memset(&log, 0, sizeof(log)); 1896 log.u_bbr.flex1 = did_out; 1897 log.u_bbr.flex2 = nxt_pkt; 1898 log.u_bbr.flex3 = way_out; 1899 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1900 if (rack->rack_no_prr) 1901 log.u_bbr.flex5 = 0; 1902 else 1903 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1904 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 1905 log.u_bbr.flex7 = rack->r_wanted_output; 1906 log.u_bbr.flex8 = rack->rc_in_persist; 1907 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1908 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1909 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1910 
TCP_LOG_EVENTP(rack->rc_tp, NULL, 1911 &rack->rc_inp->inp_socket->so_rcv, 1912 &rack->rc_inp->inp_socket->so_snd, 1913 BBR_LOG_DOSEG_DONE, 0, 1914 0, &log, false, &tv); 1915 } 1916 } 1917 1918 static void 1919 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) 1920 { 1921 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1922 union tcp_log_stackspecific log; 1923 struct timeval tv; 1924 uint32_t cts; 1925 1926 memset(&log, 0, sizeof(log)); 1927 cts = tcp_get_usecs(&tv); 1928 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 1929 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 1930 log.u_bbr.flex4 = len; 1931 log.u_bbr.flex5 = orig_len; 1932 log.u_bbr.flex6 = rack->r_ctl.rc_sacked; 1933 log.u_bbr.flex7 = mod; 1934 log.u_bbr.flex8 = frm; 1935 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1936 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1937 TCP_LOG_EVENTP(tp, NULL, 1938 &tp->t_inpcb->inp_socket->so_rcv, 1939 &tp->t_inpcb->inp_socket->so_snd, 1940 TCP_HDWR_TLS, 0, 1941 0, &log, false, &tv); 1942 } 1943 } 1944 1945 static void 1946 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 1947 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 1948 { 1949 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1950 union tcp_log_stackspecific log; 1951 struct timeval tv; 1952 1953 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1954 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1955 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1956 log.u_bbr.flex1 = slot; 1957 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1958 log.u_bbr.flex4 = reason; 1959 if (rack->rack_no_prr) 1960 log.u_bbr.flex5 = 0; 1961 else 1962 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1963 log.u_bbr.flex7 = hpts_calling; 1964 log.u_bbr.flex8 = rack->rc_in_persist; 1965 log.u_bbr.lt_epoch = cwnd_to_use; 1966 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1967 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1968 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1969 &rack->rc_inp->inp_socket->so_rcv, 1970 &rack->rc_inp->inp_socket->so_snd, 1971 BBR_LOG_JUSTRET, 0, 1972 tlen, &log, false, &tv); 1973 } 1974 } 1975 1976 static void 1977 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 1978 struct timeval *tv, uint32_t flags_on_entry) 1979 { 1980 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1981 union tcp_log_stackspecific log; 1982 1983 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1984 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1985 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1986 log.u_bbr.flex1 = line; 1987 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 1988 log.u_bbr.flex3 = flags_on_entry; 1989 log.u_bbr.flex4 = us_cts; 1990 if (rack->rack_no_prr) 1991 log.u_bbr.flex5 = 0; 1992 else 1993 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1994 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1995 log.u_bbr.flex7 = hpts_removed; 1996 log.u_bbr.flex8 = 1; 1997 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 1998 log.u_bbr.timeStamp = us_cts; 1999 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2000 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2001 &rack->rc_inp->inp_socket->so_rcv, 2002 &rack->rc_inp->inp_socket->so_snd, 2003 BBR_LOG_TIMERCANC, 0, 2004 0, &log, false, tv); 2005 } 2006 } 2007 2008 static void 2009 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2010 uint32_t flex1, uint32_t flex2, 2011 uint32_t flex3, uint32_t flex4, 2012 uint32_t flex5, uint32_t 
flex6, 2013 uint16_t flex7, uint8_t mod) 2014 { 2015 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2016 union tcp_log_stackspecific log; 2017 struct timeval tv; 2018 2019 if (mod == 1) { 2020 /* No you can't use 1, its for the real to cancel */ 2021 return; 2022 } 2023 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2024 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2025 log.u_bbr.flex1 = flex1; 2026 log.u_bbr.flex2 = flex2; 2027 log.u_bbr.flex3 = flex3; 2028 log.u_bbr.flex4 = flex4; 2029 log.u_bbr.flex5 = flex5; 2030 log.u_bbr.flex6 = flex6; 2031 log.u_bbr.flex7 = flex7; 2032 log.u_bbr.flex8 = mod; 2033 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2034 &rack->rc_inp->inp_socket->so_rcv, 2035 &rack->rc_inp->inp_socket->so_snd, 2036 BBR_LOG_TIMERCANC, 0, 2037 0, &log, false, &tv); 2038 } 2039 } 2040 2041 static void 2042 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2043 { 2044 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2045 union tcp_log_stackspecific log; 2046 struct timeval tv; 2047 2048 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2049 log.u_bbr.flex1 = timers; 2050 log.u_bbr.flex2 = ret; 2051 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2052 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2053 log.u_bbr.flex5 = cts; 2054 if (rack->rack_no_prr) 2055 log.u_bbr.flex6 = 0; 2056 else 2057 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2058 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2059 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2060 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2061 &rack->rc_inp->inp_socket->so_rcv, 2062 &rack->rc_inp->inp_socket->so_snd, 2063 BBR_LOG_TO_PROCESS, 0, 2064 0, &log, false, &tv); 2065 } 2066 } 2067 2068 static void 2069 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2070 { 2071 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2072 union tcp_log_stackspecific log; 2073 struct timeval tv; 2074 2075 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2076 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2077 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2078 if (rack->rack_no_prr) 2079 log.u_bbr.flex3 = 0; 2080 else 2081 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2082 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2083 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2084 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2085 log.u_bbr.flex8 = frm; 2086 log.u_bbr.pkts_out = orig_cwnd; 2087 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2088 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2089 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2090 &rack->rc_inp->inp_socket->so_rcv, 2091 &rack->rc_inp->inp_socket->so_snd, 2092 BBR_LOG_BBRUPD, 0, 2093 0, &log, false, &tv); 2094 } 2095 } 2096 2097 #ifdef NETFLIX_EXP_DETECTION 2098 static void 2099 rack_log_sad(struct tcp_rack *rack, int event) 2100 { 2101 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2102 union tcp_log_stackspecific log; 2103 struct timeval tv; 2104 2105 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2106 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2107 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2108 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2109 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2110 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2111 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2112 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2113 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2114 log.u_bbr.lt_epoch |= rack->do_detection; 2115 log.u_bbr.applimited = tcp_map_minimum; 2116 log.u_bbr.flex7 = rack->sack_attack_disable; 2117 
log.u_bbr.flex8 = event; 2118 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2119 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2120 log.u_bbr.delivered = tcp_sad_decay_val; 2121 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2122 &rack->rc_inp->inp_socket->so_rcv, 2123 &rack->rc_inp->inp_socket->so_snd, 2124 TCP_SAD_DETECTION, 0, 2125 0, &log, false, &tv); 2126 } 2127 } 2128 #endif 2129 2130 static void 2131 rack_counter_destroy(void) 2132 { 2133 counter_u64_free(rack_ack_total); 2134 counter_u64_free(rack_express_sack); 2135 counter_u64_free(rack_sack_total); 2136 counter_u64_free(rack_move_none); 2137 counter_u64_free(rack_move_some); 2138 counter_u64_free(rack_sack_attacks_detected); 2139 counter_u64_free(rack_sack_attacks_reversed); 2140 counter_u64_free(rack_sack_used_next_merge); 2141 counter_u64_free(rack_sack_used_prev_merge); 2142 counter_u64_free(rack_badfr); 2143 counter_u64_free(rack_badfr_bytes); 2144 counter_u64_free(rack_rtm_prr_retran); 2145 counter_u64_free(rack_rtm_prr_newdata); 2146 counter_u64_free(rack_timestamp_mismatch); 2147 counter_u64_free(rack_find_high); 2148 counter_u64_free(rack_reorder_seen); 2149 counter_u64_free(rack_tlp_tot); 2150 counter_u64_free(rack_tlp_newdata); 2151 counter_u64_free(rack_tlp_retran); 2152 counter_u64_free(rack_tlp_retran_bytes); 2153 counter_u64_free(rack_tlp_retran_fail); 2154 counter_u64_free(rack_to_tot); 2155 counter_u64_free(rack_to_arm_rack); 2156 counter_u64_free(rack_to_arm_tlp); 2157 counter_u64_free(rack_calc_zero); 2158 counter_u64_free(rack_calc_nonzero); 2159 counter_u64_free(rack_paced_segments); 2160 counter_u64_free(rack_unpaced_segments); 2161 counter_u64_free(rack_saw_enobuf); 2162 counter_u64_free(rack_saw_enetunreach); 2163 counter_u64_free(rack_to_alloc); 2164 counter_u64_free(rack_to_alloc_hard); 2165 counter_u64_free(rack_to_alloc_emerg); 2166 counter_u64_free(rack_to_alloc_limited); 2167 counter_u64_free(rack_alloc_limited_conns); 2168 counter_u64_free(rack_split_limited); 2169 counter_u64_free(rack_sack_proc_all); 2170 counter_u64_free(rack_sack_proc_restart); 2171 counter_u64_free(rack_sack_proc_short); 2172 counter_u64_free(rack_enter_tlp_calc); 2173 counter_u64_free(rack_used_tlpmethod); 2174 counter_u64_free(rack_used_tlpmethod2); 2175 counter_u64_free(rack_sack_skipped_acked); 2176 counter_u64_free(rack_sack_splits); 2177 counter_u64_free(rack_progress_drops); 2178 counter_u64_free(rack_input_idle_reduces); 2179 counter_u64_free(rack_collapsed_win); 2180 counter_u64_free(rack_tlp_does_nada); 2181 counter_u64_free(rack_try_scwnd); 2182 counter_u64_free(rack_per_timer_hole); 2183 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2184 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2185 } 2186 2187 static struct rack_sendmap * 2188 rack_alloc(struct tcp_rack *rack) 2189 { 2190 struct rack_sendmap *rsm; 2191 2192 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2193 if (rsm) { 2194 rack->r_ctl.rc_num_maps_alloced++; 2195 counter_u64_add(rack_to_alloc, 1); 2196 return (rsm); 2197 } 2198 if (rack->rc_free_cnt) { 2199 counter_u64_add(rack_to_alloc_emerg, 1); 2200 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2201 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2202 rack->rc_free_cnt--; 2203 return (rsm); 2204 } 2205 return (NULL); 2206 } 2207 2208 static struct rack_sendmap * 2209 rack_alloc_full_limit(struct tcp_rack *rack) 2210 { 2211 if ((V_tcp_map_entries_limit > 0) && 2212 (rack->do_detection == 0) && 2213 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2214 
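		/*
		 * The sendmap entry limit for this connection has been
		 * reached (the limit is only enforced when SACK-attack
		 * detection is off), so fail the allocation; the
		 * alloc_limit_reported flag ensures the connection is
		 * counted in alloc_limited_conns only once.
		 */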
counter_u64_add(rack_to_alloc_limited, 1); 2215 if (!rack->alloc_limit_reported) { 2216 rack->alloc_limit_reported = 1; 2217 counter_u64_add(rack_alloc_limited_conns, 1); 2218 } 2219 return (NULL); 2220 } 2221 return (rack_alloc(rack)); 2222 } 2223 2224 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2225 static struct rack_sendmap * 2226 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2227 { 2228 struct rack_sendmap *rsm; 2229 2230 if (limit_type) { 2231 /* currently there is only one limit type */ 2232 if (V_tcp_map_split_limit > 0 && 2233 (rack->do_detection == 0) && 2234 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2235 counter_u64_add(rack_split_limited, 1); 2236 if (!rack->alloc_limit_reported) { 2237 rack->alloc_limit_reported = 1; 2238 counter_u64_add(rack_alloc_limited_conns, 1); 2239 } 2240 return (NULL); 2241 } 2242 } 2243 2244 /* allocate and mark in the limit type, if set */ 2245 rsm = rack_alloc(rack); 2246 if (rsm != NULL && limit_type) { 2247 rsm->r_limit_type = limit_type; 2248 rack->r_ctl.rc_num_split_allocs++; 2249 } 2250 return (rsm); 2251 } 2252 2253 static void 2254 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2255 { 2256 if (rsm->r_flags & RACK_APP_LIMITED) { 2257 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2258 rack->r_ctl.rc_app_limited_cnt--; 2259 } 2260 } 2261 if (rsm->r_limit_type) { 2262 /* currently there is only one limit type */ 2263 rack->r_ctl.rc_num_split_allocs--; 2264 } 2265 if (rsm == rack->r_ctl.rc_first_appl) { 2266 if (rack->r_ctl.rc_app_limited_cnt == 0) 2267 rack->r_ctl.rc_first_appl = NULL; 2268 else { 2269 /* Follow the next one out */ 2270 struct rack_sendmap fe; 2271 2272 fe.r_start = rsm->r_nseq_appl; 2273 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2274 } 2275 } 2276 if (rsm == rack->r_ctl.rc_resend) 2277 rack->r_ctl.rc_resend = NULL; 2278 if (rsm == rack->r_ctl.rc_rsm_at_retran) 2279 rack->r_ctl.rc_rsm_at_retran = NULL; 2280 if (rsm == rack->r_ctl.rc_end_appl) 2281 rack->r_ctl.rc_end_appl = NULL; 2282 if (rack->r_ctl.rc_tlpsend == rsm) 2283 rack->r_ctl.rc_tlpsend = NULL; 2284 if (rack->r_ctl.rc_sacklast == rsm) 2285 rack->r_ctl.rc_sacklast = NULL; 2286 if (rack->rc_free_cnt < rack_free_cache) { 2287 memset(rsm, 0, sizeof(struct rack_sendmap)); 2288 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 2289 rsm->r_limit_type = 0; 2290 rack->rc_free_cnt++; 2291 return; 2292 } 2293 rack->r_ctl.rc_num_maps_alloced--; 2294 uma_zfree(rack_zone, rsm); 2295 } 2296 2297 static uint32_t 2298 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 2299 { 2300 uint64_t srtt, bw, len, tim; 2301 uint32_t segsiz, def_len, minl; 2302 2303 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2304 def_len = rack_def_data_window * segsiz; 2305 if (rack->rc_gp_filled == 0) { 2306 /* 2307 * We have no measurement (IW is in flight?) so 2308 * we can only guess using our data_window sysctl 2309 * value (usually 100MSS). 2310 */ 2311 return (def_len); 2312 } 2313 /* 2314 * Now we have a number of factors to consider. 2315 * 2316 * 1) We have a desired BDP which is usually 2317 * at least 2. 2318 * 2) We have a minimum number of rtt's usually 1 SRTT 2319 * but we allow it too to be more. 2320 * 3) We want to make sure a measurement last N useconds (if 2321 * we have set rack_min_measure_usec. 2322 * 2323 * We handle the first concern here by trying to create a data 2324 * window of max(rack_def_data_window, DesiredBDP). 
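	 * (Illustrative numbers, not taken from the code: a measured
	 * b/w of 1,250,000 bytes/sec and an SRTT of 40,000 usecs give
	 * a BDP of 50,000 bytes, so with rack_goal_bdp of 2 the
	 * desired window is 100,000 bytes, rounded up to a multiple
	 * of segsiz further down.)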
The 2325 * second concern we handle in not letting the measurement 2326 * window end normally until at least the required SRTT's 2327 * have gone by which is done further below in 2328 * rack_enough_for_measurement(). Finally the third concern 2329 * we also handle here by calculating how long that time 2330 * would take at the current BW and then return the 2331 * max of our first calculation and that length. Note 2332 * that if rack_min_measure_usec is 0, we don't deal 2333 * with concern 3. Also for both Concern 1 and 3 an 2334 * application limited period could end the measurement 2335 * earlier. 2336 * 2337 * So lets calculate the BDP with the "known" b/w using 2338 * the SRTT has our rtt and then multiply it by the 2339 * goal. 2340 */ 2341 bw = rack_get_bw(rack); 2342 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 2343 len = bw * srtt; 2344 len /= (uint64_t)HPTS_USEC_IN_SEC; 2345 len *= max(1, rack_goal_bdp); 2346 /* Now we need to round up to the nearest MSS */ 2347 len = roundup(len, segsiz); 2348 if (rack_min_measure_usec) { 2349 /* Now calculate our min length for this b/w */ 2350 tim = rack_min_measure_usec; 2351 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 2352 if (minl == 0) 2353 minl = 1; 2354 minl = roundup(minl, segsiz); 2355 if (len < minl) 2356 len = minl; 2357 } 2358 /* 2359 * Now if we have a very small window we want 2360 * to attempt to get the window that is 2361 * as small as possible. This happens on 2362 * low b/w connections and we don't want to 2363 * span huge numbers of rtt's between measurements. 2364 * 2365 * We basically include 2 over our "MIN window" so 2366 * that the measurement can be shortened (possibly) by 2367 * an ack'ed packet. 2368 */ 2369 if (len < def_len) 2370 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 2371 else 2372 return (max((uint32_t)len, def_len)); 2373 2374 } 2375 2376 static int 2377 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) 2378 { 2379 uint32_t tim, srtts, segsiz; 2380 2381 /* 2382 * Has enough time passed for the GP measurement to be valid? 2383 */ 2384 if ((tp->snd_max == tp->snd_una) || 2385 (th_ack == tp->snd_max)){ 2386 /* All is acked */ 2387 return (1); 2388 } 2389 if (SEQ_LT(th_ack, tp->gput_seq)) { 2390 /* Not enough bytes yet */ 2391 return (0); 2392 } 2393 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2394 if (SEQ_LT(th_ack, tp->gput_ack) && 2395 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 2396 /* Not enough bytes yet */ 2397 return (0); 2398 } 2399 if (rack->r_ctl.rc_first_appl && 2400 (rack->r_ctl.rc_first_appl->r_start == th_ack)) { 2401 /* 2402 * We are up to the app limited point 2403 * we have to measure irrespective of the time.. 2404 */ 2405 return (1); 2406 } 2407 /* Now what about time? 
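	 * Time alone validates the measurement only once at least
	 * rack_min_srtts worth of the current goodput srtt has
	 * elapsed since the measurement began (gput_ts).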
*/ 2408 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 2409 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 2410 if (tim >= srtts) { 2411 return (1); 2412 } 2413 /* Nope not even a full SRTT has passed */ 2414 return (0); 2415 } 2416 2417 2418 static void 2419 rack_log_timely(struct tcp_rack *rack, 2420 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 2421 uint64_t up_bnd, int line, uint8_t method) 2422 { 2423 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2424 union tcp_log_stackspecific log; 2425 struct timeval tv; 2426 2427 memset(&log, 0, sizeof(log)); 2428 log.u_bbr.flex1 = logged; 2429 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 2430 log.u_bbr.flex2 <<= 4; 2431 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 2432 log.u_bbr.flex2 <<= 4; 2433 log.u_bbr.flex2 |= rack->rc_gp_incr; 2434 log.u_bbr.flex2 <<= 4; 2435 log.u_bbr.flex2 |= rack->rc_gp_bwred; 2436 log.u_bbr.flex3 = rack->rc_gp_incr; 2437 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2438 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 2439 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 2440 log.u_bbr.flex7 = rack->rc_gp_bwred; 2441 log.u_bbr.flex8 = method; 2442 log.u_bbr.cur_del_rate = cur_bw; 2443 log.u_bbr.delRate = low_bnd; 2444 log.u_bbr.bw_inuse = up_bnd; 2445 log.u_bbr.rttProp = rack_get_bw(rack); 2446 log.u_bbr.pkt_epoch = line; 2447 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2448 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2449 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2450 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2451 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2452 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 2453 log.u_bbr.cwnd_gain <<= 1; 2454 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 2455 log.u_bbr.cwnd_gain <<= 1; 2456 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 2457 log.u_bbr.cwnd_gain <<= 1; 2458 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 2459 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 2460 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2461 &rack->rc_inp->inp_socket->so_rcv, 2462 &rack->rc_inp->inp_socket->so_snd, 2463 TCP_TIMELY_WORK, 0, 2464 0, &log, false, &tv); 2465 } 2466 } 2467 2468 static int 2469 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 2470 { 2471 /* 2472 * Before we increase we need to know if 2473 * the estimate just made was less than 2474 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 2475 * 2476 * If we already are pacing at a fast enough 2477 * rate to push us faster there is no sense in 2478 * increasing. 2479 * 2480 * We first calculate our actual pacing rate (ss or ca multiplier 2481 * times our cur_bw). 2482 * 2483 * Then we take the last measured rate and multiply by our 2484 * maximum pacing overage to give us a max allowable rate. 2485 * 2486 * If our act_rate is smaller than our max_allowable rate 2487 * then we should increase. Else we should hold steady. 2488 * 2489 */ 2490 uint64_t act_rate, max_allow_rate; 2491 2492 if (rack_timely_no_stopping) 2493 return (1); 2494 2495 if ((cur_bw == 0) || (last_bw_est == 0)) { 2496 /* 2497 * Initial startup case or 2498 * everything is acked case. 2499 */ 2500 rack_log_timely(rack, mult, cur_bw, 0, 0, 2501 __LINE__, 9); 2502 return (1); 2503 } 2504 if (mult <= 100) { 2505 /* 2506 * We can always pace at or slightly above our rate.
2507 */ 2508 rack_log_timely(rack, mult, cur_bw, 0, 0, 2509 __LINE__, 9); 2510 return (1); 2511 } 2512 act_rate = cur_bw * (uint64_t)mult; 2513 act_rate /= 100; 2514 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 2515 max_allow_rate /= 100; 2516 if (act_rate < max_allow_rate) { 2517 /* 2518 * Here the rate we are actually pacing at 2519 * is smaller than 10% above our last measurement. 2520 * This means we are pacing below what we would 2521 * like to try to achieve (plus some wiggle room). 2522 */ 2523 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2524 __LINE__, 9); 2525 return (1); 2526 } else { 2527 /* 2528 * Here we are already pacing at least rack_max_per_above(10%) 2529 * what we are getting back. This indicates most likely 2530 * that we are being limited (cwnd/rwnd/app) and can't 2531 * get any more b/w. There is no sense of trying to 2532 * raise up the pacing rate its not speeding us up 2533 * and we already are pacing faster than we are getting. 2534 */ 2535 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2536 __LINE__, 8); 2537 return (0); 2538 } 2539 } 2540 2541 static void 2542 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 2543 { 2544 /* 2545 * When we drag bottom, we want to assure 2546 * that no multiplier is below 1.0, if so 2547 * we want to restore it to at least that. 2548 */ 2549 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 2550 /* This is unlikely we usually do not touch recovery */ 2551 rack->r_ctl.rack_per_of_gp_rec = 100; 2552 } 2553 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 2554 rack->r_ctl.rack_per_of_gp_ca = 100; 2555 } 2556 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 2557 rack->r_ctl.rack_per_of_gp_ss = 100; 2558 } 2559 } 2560 2561 static void 2562 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 2563 { 2564 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 2565 rack->r_ctl.rack_per_of_gp_ca = 100; 2566 } 2567 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 2568 rack->r_ctl.rack_per_of_gp_ss = 100; 2569 } 2570 } 2571 2572 static void 2573 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 2574 { 2575 int32_t calc, logged, plus; 2576 2577 logged = 0; 2578 2579 if (override) { 2580 /* 2581 * override is passed when we are 2582 * loosing b/w and making one last 2583 * gasp at trying to not loose out 2584 * to a new-reno flow. 2585 */ 2586 goto extra_boost; 2587 } 2588 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 2589 if (rack->rc_gp_incr && 2590 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 2591 /* 2592 * Reset and get 5 strokes more before the boost. Note 2593 * that the count is 0 based so we have to add one. 
2594 */ 2595 extra_boost: 2596 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 2597 rack->rc_gp_timely_inc_cnt = 0; 2598 } else 2599 plus = (uint32_t)rack_gp_increase_per; 2600 /* Must be at least 1% increase for true timely increases */ 2601 if ((plus < 1) && 2602 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 2603 plus = 1; 2604 if (rack->rc_gp_saw_rec && 2605 (rack->rc_gp_no_rec_chg == 0) && 2606 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2607 rack->r_ctl.rack_per_of_gp_rec)) { 2608 /* We have been in recovery ding it too */ 2609 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 2610 if (calc > 0xffff) 2611 calc = 0xffff; 2612 logged |= 1; 2613 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 2614 if (rack_per_upper_bound_ss && 2615 (rack->rc_dragged_bottom == 0) && 2616 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 2617 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 2618 } 2619 if (rack->rc_gp_saw_ca && 2620 (rack->rc_gp_saw_ss == 0) && 2621 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2622 rack->r_ctl.rack_per_of_gp_ca)) { 2623 /* In CA */ 2624 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 2625 if (calc > 0xffff) 2626 calc = 0xffff; 2627 logged |= 2; 2628 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 2629 if (rack_per_upper_bound_ca && 2630 (rack->rc_dragged_bottom == 0) && 2631 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 2632 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 2633 } 2634 if (rack->rc_gp_saw_ss && 2635 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2636 rack->r_ctl.rack_per_of_gp_ss)) { 2637 /* In SS */ 2638 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 2639 if (calc > 0xffff) 2640 calc = 0xffff; 2641 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 2642 if (rack_per_upper_bound_ss && 2643 (rack->rc_dragged_bottom == 0) && 2644 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 2645 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 2646 logged |= 4; 2647 } 2648 if (logged && 2649 (rack->rc_gp_incr == 0)){ 2650 /* Go into increment mode */ 2651 rack->rc_gp_incr = 1; 2652 rack->rc_gp_timely_inc_cnt = 0; 2653 } 2654 if (rack->rc_gp_incr && 2655 logged && 2656 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 2657 rack->rc_gp_timely_inc_cnt++; 2658 } 2659 rack_log_timely(rack, logged, plus, 0, 0, 2660 __LINE__, 1); 2661 } 2662 2663 static uint32_t 2664 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 2665 { 2666 /* 2667 * norm_grad = rtt_diff / minrtt; 2668 * new_per = curper * (1 - B * norm_grad) 2669 * 2670 * B = rack_gp_decrease_per (default 10%) 2671 * rtt_dif = input var current rtt-diff 2672 * curper = input var current percentage 2673 * minrtt = from rack filter 2674 * 2675 */ 2676 uint64_t perf; 2677 2678 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2679 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 2680 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 2681 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 2682 (uint64_t)1000000)) / 2683 (uint64_t)1000000); 2684 if (perf > curper) { 2685 /* TSNH */ 2686 perf = curper - 1; 2687 } 2688 return ((uint32_t)perf); 2689 } 2690 2691 static uint32_t 2692 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 2693 { 2694 /* 2695 * highrttthresh 2696 * result = curper * (1 - (B * ( 1 - ------ )) 2697 * gp_srtt 2698 * 2699 * B = rack_gp_decrease_per (default 10%) 2700 * highrttthresh = filter_min * rack_gp_rtt_maxmul 2701 */ 2702 uint64_t perf; 2703 uint32_t 
highrttthresh; 2704 2705 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 2706 2707 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2708 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 2709 ((uint64_t)highrttthresh * (uint64_t)1000000) / 2710 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 2711 return (perf); 2712 } 2713 2714 2715 static void 2716 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 2717 { 2718 uint64_t logvar, logvar2, logvar3; 2719 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 2720 2721 if (rack->rc_gp_incr) { 2722 /* Turn off increment counting */ 2723 rack->rc_gp_incr = 0; 2724 rack->rc_gp_timely_inc_cnt = 0; 2725 } 2726 ss_red = ca_red = rec_red = 0; 2727 logged = 0; 2728 /* Calculate the reduction value */ 2729 if (rtt_diff < 0) { 2730 rtt_diff *= -1; 2731 } 2732 /* Must be at least 1% reduction */ 2733 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 2734 /* We have been in recovery ding it too */ 2735 if (timely_says == 2) { 2736 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 2737 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2738 if (alt < new_per) 2739 val = alt; 2740 else 2741 val = new_per; 2742 } else 2743 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2744 if (rack->r_ctl.rack_per_of_gp_rec > val) { 2745 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 2746 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 2747 } else { 2748 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2749 rec_red = 0; 2750 } 2751 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 2752 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2753 logged |= 1; 2754 } 2755 if (rack->rc_gp_saw_ss) { 2756 /* Sent in SS */ 2757 if (timely_says == 2) { 2758 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 2759 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2760 if (alt < new_per) 2761 val = alt; 2762 else 2763 val = new_per; 2764 } else 2765 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 2766 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 2767 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 2768 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 2769 } else { 2770 ss_red = new_per; 2771 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2772 logvar = new_per; 2773 logvar <<= 32; 2774 logvar |= alt; 2775 logvar2 = (uint32_t)rtt; 2776 logvar2 <<= 32; 2777 logvar2 |= (uint32_t)rtt_diff; 2778 logvar3 = rack_gp_rtt_maxmul; 2779 logvar3 <<= 32; 2780 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2781 rack_log_timely(rack, timely_says, 2782 logvar2, logvar3, 2783 logvar, __LINE__, 10); 2784 } 2785 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 2786 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2787 logged |= 4; 2788 } else if (rack->rc_gp_saw_ca) { 2789 /* Sent in CA */ 2790 if (timely_says == 2) { 2791 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 2792 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2793 if (alt < new_per) 2794 val = alt; 2795 else 2796 val = new_per; 2797 } else 2798 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 2799 if (rack->r_ctl.rack_per_of_gp_ca > val) { 2800 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 2801 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 2802 } else { 2803 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2804 ca_red = 0; 2805 logvar = new_per; 2806 logvar <<= 32; 2807 logvar |= alt; 2808 logvar2 = (uint32_t)rtt; 2809 logvar2 <<= 32; 2810 logvar2 |= (uint32_t)rtt_diff; 2811 logvar3 = rack_gp_rtt_maxmul; 2812 logvar3 <<= 32; 2813 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2814 rack_log_timely(rack, timely_says, 2815 logvar2, logvar3, 2816 logvar, __LINE__, 10); 2817 } 2818 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 2819 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2820 logged |= 2; 2821 } 2822 if (rack->rc_gp_timely_dec_cnt < 0x7) { 2823 rack->rc_gp_timely_dec_cnt++; 2824 if (rack_timely_dec_clear && 2825 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 2826 rack->rc_gp_timely_dec_cnt = 0; 2827 } 2828 logvar = ss_red; 2829 logvar <<= 32; 2830 logvar |= ca_red; 2831 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 2832 __LINE__, 2); 2833 } 2834 2835 static void 2836 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 2837 uint32_t rtt, uint32_t line, uint8_t reas) 2838 { 2839 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2840 union tcp_log_stackspecific log; 2841 struct timeval tv; 2842 2843 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2844 log.u_bbr.flex1 = line; 2845 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 2846 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 2847 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2848 log.u_bbr.flex5 = rtt; 2849 log.u_bbr.flex6 = rack->rc_highly_buffered; 2850 log.u_bbr.flex6 <<= 1; 2851 log.u_bbr.flex6 |= rack->forced_ack; 2852 log.u_bbr.flex6 <<= 1; 2853 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 2854 log.u_bbr.flex6 <<= 1; 2855 log.u_bbr.flex6 |= rack->in_probe_rtt; 2856 log.u_bbr.flex6 <<= 1; 2857 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 2858 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 2859 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 2860 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 2861 log.u_bbr.flex8 = reas; 2862 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2863 log.u_bbr.delRate = rack_get_bw(rack); 2864 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 2865 log.u_bbr.cur_del_rate <<= 32; 2866 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 2867 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 2868 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2869 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2870 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2871 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2872 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 2873 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 2874 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2875 log.u_bbr.rttProp = us_cts; 2876 log.u_bbr.rttProp <<= 32; 2877 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 2878 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2879 &rack->rc_inp->inp_socket->so_rcv, 2880 &rack->rc_inp->inp_socket->so_snd, 2881 BBR_LOG_RTT_SHRINKS, 0, 2882 0, &log, false, &rack->r_ctl.act_rcv_time); 2883 } 2884 } 2885 2886 static void 2887 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 2888 { 2889 uint64_t bwdp; 2890 2891 bwdp = rack_get_bw(rack); 2892 bwdp *= (uint64_t)rtt; 2893 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 2894 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 2895 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 2896 /* 2897 * A window protocol must be able to have 4 packets 2898 * outstanding as the floor in order to function 2899 * (especially considering delayed ack :D). 2900 */ 2901 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 2902 } 2903 } 2904 2905 static void 2906 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 2907 { 2908 /** 2909 * ProbeRTT is a bit different in rack_pacing than in 2910 * BBR. It is like BBR in that it uses the lowering of 2911 * the RTT as a signal that we saw something new and 2912 * counts from there for how long between. But it is 2913 * different in that its quite simple. It does not 2914 * play with the cwnd and wait until we get down 2915 * to N segments outstanding and hold that for 2916 * 200ms. Instead it just sets the pacing reduction 2917 * rate to a set percentage (70 by default) and hold 2918 * that for a number of recent GP Srtt's. 2919 */ 2920 uint32_t segsiz; 2921 2922 if (rack->rc_gp_dyn_mul == 0) 2923 return; 2924 2925 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 2926 /* We are idle */ 2927 return; 2928 } 2929 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 2930 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 2931 /* 2932 * Stop the goodput now, the idea here is 2933 * that future measurements with in_probe_rtt 2934 * won't register if they are not greater so 2935 * we want to get what info (if any) is available 2936 * now. 2937 */ 2938 rack_do_goodput_measurement(rack->rc_tp, rack, 2939 rack->rc_tp->snd_una, __LINE__); 2940 } 2941 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 2942 rack->r_ctl.rc_time_probertt_entered = us_cts; 2943 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 2944 rack->r_ctl.rc_pace_min_segs); 2945 rack->in_probe_rtt = 1; 2946 rack->measure_saw_probe_rtt = 1; 2947 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 2948 rack->r_ctl.rc_time_probertt_starts = 0; 2949 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 2950 if (rack_probertt_use_min_rtt_entry) 2951 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 2952 else 2953 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 2954 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 2955 __LINE__, RACK_RTTS_ENTERPROBE); 2956 } 2957 2958 static void 2959 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 2960 { 2961 struct rack_sendmap *rsm; 2962 uint32_t segsiz; 2963 2964 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 2965 rack->r_ctl.rc_pace_min_segs); 2966 rack->in_probe_rtt = 0; 2967 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 2968 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 2969 /* 2970 * Stop the goodput now, the idea here is 2971 * that future measurements with in_probe_rtt 2972 * won't register if they are not greater so 2973 * we want to get what info (if any) is available 2974 * now. 2975 */ 2976 rack_do_goodput_measurement(rack->rc_tp, rack, 2977 rack->rc_tp->snd_una, __LINE__); 2978 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 2979 /* 2980 * We don't have enough data to make a measurement. 2981 * So lets just stop and start here after exiting 2982 * probe-rtt. We probably are not interested in 2983 * the results anyway. 2984 */ 2985 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 2986 } 2987 /* 2988 * Measurements through the current snd_max are going 2989 * to be limited by the slower pacing rate. 2990 * 2991 * We need to mark these as app-limited so we 2992 * don't collapse the b/w. 
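	 * The code below either starts a new app-limited chain (both
	 * rc_first_appl and rc_end_appl point at the last sendmap
	 * entry) or appends to the existing chain by pointing the old
	 * tail's r_nseq_appl at this entry's starting sequence.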
2993 */ 2994 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 2995 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 2996 if (rack->r_ctl.rc_app_limited_cnt == 0) 2997 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 2998 else { 2999 /* 3000 * Go out to the end app limited and mark 3001 * this new one as next and move the end_appl up 3002 * to this guy. 3003 */ 3004 if (rack->r_ctl.rc_end_appl) 3005 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3006 rack->r_ctl.rc_end_appl = rsm; 3007 } 3008 rsm->r_flags |= RACK_APP_LIMITED; 3009 rack->r_ctl.rc_app_limited_cnt++; 3010 } 3011 /* 3012 * Now, we need to examine our pacing rate multipliers. 3013 * If its under 100%, we need to kick it back up to 3014 * 100%. We also don't let it be over our "max" above 3015 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3016 * Note setting clamp_atexit_prtt to 0 has the effect 3017 * of setting CA/SS to 100% always at exit (which is 3018 * the default behavior). 3019 */ 3020 if (rack_probertt_clear_is) { 3021 rack->rc_gp_incr = 0; 3022 rack->rc_gp_bwred = 0; 3023 rack->rc_gp_timely_inc_cnt = 0; 3024 rack->rc_gp_timely_dec_cnt = 0; 3025 } 3026 /* Do we do any clamping at exit? */ 3027 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3028 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3029 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3030 } 3031 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3032 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3033 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3034 } 3035 /* 3036 * Lets set rtt_diff to 0, so that we will get a "boost" 3037 * after exiting. 3038 */ 3039 rack->r_ctl.rc_rtt_diff = 0; 3040 3041 /* Clear all flags so we start fresh */ 3042 rack->rc_tp->t_bytes_acked = 0; 3043 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3044 /* 3045 * If configured to, set the cwnd and ssthresh to 3046 * our targets. 3047 */ 3048 if (rack_probe_rtt_sets_cwnd) { 3049 uint64_t ebdp; 3050 uint32_t setto; 3051 3052 /* Set ssthresh so we get into CA once we hit our target */ 3053 if (rack_probertt_use_min_rtt_exit == 1) { 3054 /* Set to min rtt */ 3055 rack_set_prtt_target(rack, segsiz, 3056 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3057 } else if (rack_probertt_use_min_rtt_exit == 2) { 3058 /* Set to current gp rtt */ 3059 rack_set_prtt_target(rack, segsiz, 3060 rack->r_ctl.rc_gp_srtt); 3061 } else if (rack_probertt_use_min_rtt_exit == 3) { 3062 /* Set to entry gp rtt */ 3063 rack_set_prtt_target(rack, segsiz, 3064 rack->r_ctl.rc_entry_gp_rtt); 3065 } else { 3066 uint64_t sum; 3067 uint32_t setval; 3068 3069 sum = rack->r_ctl.rc_entry_gp_rtt; 3070 sum *= 10; 3071 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3072 if (sum >= 20) { 3073 /* 3074 * A highly buffered path needs 3075 * cwnd space for timely to work. 3076 * Lets set things up as if 3077 * we are heading back here again. 3078 */ 3079 setval = rack->r_ctl.rc_entry_gp_rtt; 3080 } else if (sum >= 15) { 3081 /* 3082 * Lets take the smaller of the 3083 * two since we are just somewhat 3084 * buffered. 3085 */ 3086 setval = rack->r_ctl.rc_gp_srtt; 3087 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3088 setval = rack->r_ctl.rc_entry_gp_rtt; 3089 } else { 3090 /* 3091 * Here we are not highly buffered 3092 * and should pick the min we can to 3093 * keep from causing loss. 
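	 * (In this branch sum came out below 15, i.e. the gp rtt at
	 * probe-rtt entry was less than 1.5 times the current gp
	 * srtt, so the filtered min rtt is the safest target.)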
3094 */ 3095 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3096 } 3097 rack_set_prtt_target(rack, segsiz, 3098 setval); 3099 } 3100 if (rack_probe_rtt_sets_cwnd > 1) { 3101 /* There is a percentage here to boost */ 3102 ebdp = rack->r_ctl.rc_target_probertt_flight; 3103 ebdp *= rack_probe_rtt_sets_cwnd; 3104 ebdp /= 100; 3105 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3106 } else 3107 setto = rack->r_ctl.rc_target_probertt_flight; 3108 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3109 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3110 /* Enforce a min */ 3111 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3112 } 3113 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3114 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3115 } 3116 rack_log_rtt_shrinks(rack, us_cts, 3117 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3118 __LINE__, RACK_RTTS_EXITPROBE); 3119 /* Clear times last so log has all the info */ 3120 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3121 rack->r_ctl.rc_time_probertt_entered = us_cts; 3122 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3123 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3124 } 3125 3126 static void 3127 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3128 { 3129 /* Check in on probe-rtt */ 3130 if (rack->rc_gp_filled == 0) { 3131 /* We do not do p-rtt unless we have gp measurements */ 3132 return; 3133 } 3134 if (rack->in_probe_rtt) { 3135 uint64_t no_overflow; 3136 uint32_t endtime, must_stay; 3137 3138 if (rack->r_ctl.rc_went_idle_time && 3139 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3140 /* 3141 * We went idle during prtt, just exit now. 3142 */ 3143 rack_exit_probertt(rack, us_cts); 3144 } else if (rack_probe_rtt_safety_val && 3145 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3146 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3147 /* 3148 * Probe RTT safety value triggered! 3149 */ 3150 rack_log_rtt_shrinks(rack, us_cts, 3151 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3152 __LINE__, RACK_RTTS_SAFETY); 3153 rack_exit_probertt(rack, us_cts); 3154 } 3155 /* Calculate the max we will wait */ 3156 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3157 if (rack->rc_highly_buffered) 3158 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3159 /* Calculate the min we must wait */ 3160 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3161 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3162 TSTMP_LT(us_cts, endtime)) { 3163 uint32_t calc; 3164 /* Do we lower more? 
*/ 3165 no_exit: 3166 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3167 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3168 else 3169 calc = 0; 3170 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3171 if (calc) { 3172 /* Maybe */ 3173 calc *= rack_per_of_gp_probertt_reduce; 3174 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3175 /* Limit it too */ 3176 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3177 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3178 } 3179 /* We must reach target or the time set */ 3180 return; 3181 } 3182 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3183 if ((TSTMP_LT(us_cts, must_stay) && 3184 rack->rc_highly_buffered) || 3185 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3186 rack->r_ctl.rc_target_probertt_flight)) { 3187 /* We are not past the must_stay time */ 3188 goto no_exit; 3189 } 3190 rack_log_rtt_shrinks(rack, us_cts, 3191 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3192 __LINE__, RACK_RTTS_REACHTARGET); 3193 rack->r_ctl.rc_time_probertt_starts = us_cts; 3194 if (rack->r_ctl.rc_time_probertt_starts == 0) 3195 rack->r_ctl.rc_time_probertt_starts = 1; 3196 /* Restore back to our rate we want to pace at in prtt */ 3197 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3198 } 3199 /* 3200 * Setup our end time, some number of gp_srtts plus 200ms. 3201 */ 3202 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3203 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3204 if (rack_probertt_gpsrtt_cnt_div) 3205 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3206 else 3207 endtime = 0; 3208 endtime += rack_min_probertt_hold; 3209 endtime += rack->r_ctl.rc_time_probertt_starts; 3210 if (TSTMP_GEQ(us_cts, endtime)) { 3211 /* yes, exit probertt */ 3212 rack_exit_probertt(rack, us_cts); 3213 } 3214 3215 } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3216 /* Go into probertt, its been too long since we went lower */ 3217 rack_enter_probertt(rack, us_cts); 3218 } 3219 } 3220 3221 static void 3222 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3223 uint32_t rtt, int32_t rtt_diff) 3224 { 3225 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3226 uint32_t losses; 3227 3228 if ((rack->rc_gp_dyn_mul == 0) || 3229 (rack->use_fixed_rate) || 3230 (rack->in_probe_rtt) || 3231 (rack->rc_always_pace == 0)) { 3232 /* No dynamic GP multipler in play */ 3233 return; 3234 } 3235 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3236 cur_bw = rack_get_bw(rack); 3237 /* Calculate our up and down range */ 3238 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3239 up_bnd /= 100; 3240 up_bnd += rack->r_ctl.last_gp_comp_bw; 3241 3242 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3243 subfr /= 100; 3244 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3245 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3246 /* 3247 * This is the case where our RTT is above 3248 * the max target and we have been configured 3249 * to just do timely no bonus up stuff in that case. 3250 * 3251 * There are two configurations, set to 1, and we 3252 * just do timely if we are over our max. If its 3253 * set above 1 then we slam the multipliers down 3254 * to 100 and then decrement per timely. 
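 *
 * For the branches that follow, the bounds computed above frame
 * the decision.  With hypothetical numbers: last_gp_comp_bw of
 * 10,000,000 bytes/sec, an up multiplier of 2% and a down
 * multiplier of 4% (the real values are tunables set elsewhere
 * in this file) give a band of [9,600,000 .. 10,200,000].  A new
 * estimate above the band raises the multipliers, one below it
 * (with no losses) enters the reduction logic, and anything
 * inside the band defers to the timely (RTT gradient) decision.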
3255 */ 3256 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3257 __LINE__, 3); 3258 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3259 rack_validate_multipliers_at_or_below_100(rack); 3260 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3261 } else if ((last_bw_est < low_bnd) && !losses) { 3262 /* 3263 * We are decreasing this is a bit complicated this 3264 * means we are loosing ground. This could be 3265 * because another flow entered and we are competing 3266 * for b/w with it. This will push the RTT up which 3267 * makes timely unusable unless we want to get shoved 3268 * into a corner and just be backed off (the age 3269 * old problem with delay based CC). 3270 * 3271 * On the other hand if it was a route change we 3272 * would like to stay somewhat contained and not 3273 * blow out the buffers. 3274 */ 3275 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3276 __LINE__, 3); 3277 rack->r_ctl.last_gp_comp_bw = cur_bw; 3278 if (rack->rc_gp_bwred == 0) { 3279 /* Go into reduction counting */ 3280 rack->rc_gp_bwred = 1; 3281 rack->rc_gp_timely_dec_cnt = 0; 3282 } 3283 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3284 (timely_says == 0)) { 3285 /* 3286 * Push another time with a faster pacing 3287 * to try to gain back (we include override to 3288 * get a full raise factor). 3289 */ 3290 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3291 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3292 (timely_says == 0) || 3293 (rack_down_raise_thresh == 0)) { 3294 /* 3295 * Do an override up in b/w if we were 3296 * below the threshold or if the threshold 3297 * is zero we always do the raise. 3298 */ 3299 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3300 } else { 3301 /* Log it stays the same */ 3302 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3303 __LINE__, 11); 3304 3305 } 3306 rack->rc_gp_timely_dec_cnt++; 3307 /* We are not incrementing really no-count */ 3308 rack->rc_gp_incr = 0; 3309 rack->rc_gp_timely_inc_cnt = 0; 3310 } else { 3311 /* 3312 * Lets just use the RTT 3313 * information and give up 3314 * pushing. 3315 */ 3316 goto use_timely; 3317 } 3318 } else if ((timely_says != 2) && 3319 !losses && 3320 (last_bw_est > up_bnd)) { 3321 /* 3322 * We are increasing b/w lets keep going, updating 3323 * our b/w and ignoring any timely input, unless 3324 * of course we are at our max raise (if there is one). 3325 */ 3326 3327 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3328 __LINE__, 3); 3329 rack->r_ctl.last_gp_comp_bw = cur_bw; 3330 if (rack->rc_gp_saw_ss && 3331 rack_per_upper_bound_ss && 3332 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3333 /* 3334 * In cases where we can't go higher 3335 * we should just use timely. 3336 */ 3337 goto use_timely; 3338 } 3339 if (rack->rc_gp_saw_ca && 3340 rack_per_upper_bound_ca && 3341 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3342 /* 3343 * In cases where we can't go higher 3344 * we should just use timely. 
3345 */ 3346 goto use_timely; 3347 } 3348 rack->rc_gp_bwred = 0; 3349 rack->rc_gp_timely_dec_cnt = 0; 3350 /* You get a set number of pushes if timely is trying to reduce */ 3351 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 3352 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3353 } else { 3354 /* Log it stays the same */ 3355 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 3356 __LINE__, 12); 3357 3358 } 3359 return; 3360 } else { 3361 /* 3362 * We are staying between the lower and upper range bounds 3363 * so use timely to decide. 3364 */ 3365 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3366 __LINE__, 3); 3367 use_timely: 3368 if (timely_says) { 3369 rack->rc_gp_incr = 0; 3370 rack->rc_gp_timely_inc_cnt = 0; 3371 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 3372 !losses && 3373 (last_bw_est < low_bnd)) { 3374 /* We are loosing ground */ 3375 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3376 rack->rc_gp_timely_dec_cnt++; 3377 /* We are not incrementing really no-count */ 3378 rack->rc_gp_incr = 0; 3379 rack->rc_gp_timely_inc_cnt = 0; 3380 } else 3381 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3382 } else { 3383 rack->rc_gp_bwred = 0; 3384 rack->rc_gp_timely_dec_cnt = 0; 3385 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 3386 } 3387 } 3388 } 3389 3390 static int32_t 3391 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 3392 { 3393 int32_t timely_says; 3394 uint64_t log_mult, log_rtt_a_diff; 3395 3396 log_rtt_a_diff = rtt; 3397 log_rtt_a_diff <<= 32; 3398 log_rtt_a_diff |= (uint32_t)rtt_diff; 3399 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 3400 rack_gp_rtt_maxmul)) { 3401 /* Reduce the b/w multipler */ 3402 timely_says = 2; 3403 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3404 log_mult <<= 32; 3405 log_mult |= prev_rtt; 3406 rack_log_timely(rack, timely_says, log_mult, 3407 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3408 log_rtt_a_diff, __LINE__, 4); 3409 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 3410 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 3411 max(rack_gp_rtt_mindiv , 1)))) { 3412 /* Increase the b/w multipler */ 3413 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 3414 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 3415 max(rack_gp_rtt_mindiv , 1)); 3416 log_mult <<= 32; 3417 log_mult |= prev_rtt; 3418 timely_says = 0; 3419 rack_log_timely(rack, timely_says, log_mult , 3420 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3421 log_rtt_a_diff, __LINE__, 5); 3422 } else { 3423 /* 3424 * Use a gradient to find it the timely gradient 3425 * is: 3426 * grad = rc_rtt_diff / min_rtt; 3427 * 3428 * anything below or equal to 0 will be 3429 * a increase indication. Anything above 3430 * zero is a decrease. Note we take care 3431 * of the actual gradient calculation 3432 * in the reduction (its not needed for 3433 * increase). 
3434 */ 3435 log_mult = prev_rtt; 3436 if (rtt_diff <= 0) { 3437 /* 3438 * Rttdiff is less than zero, increase the 3439 * b/w multipler (its 0 or negative) 3440 */ 3441 timely_says = 0; 3442 rack_log_timely(rack, timely_says, log_mult, 3443 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 3444 } else { 3445 /* Reduce the b/w multipler */ 3446 timely_says = 1; 3447 rack_log_timely(rack, timely_says, log_mult, 3448 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 3449 } 3450 } 3451 return (timely_says); 3452 } 3453 3454 static void 3455 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 3456 tcp_seq th_ack, int line) 3457 { 3458 uint64_t tim, bytes_ps, ltim, stim, utim; 3459 uint32_t segsiz, bytes, reqbytes, us_cts; 3460 int32_t gput, new_rtt_diff, timely_says; 3461 3462 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3463 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3464 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 3465 tim = us_cts - tp->gput_ts; 3466 else 3467 tim = 0; 3468 3469 if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) 3470 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 3471 else 3472 stim = 0; 3473 /* 3474 * Use the larger of the send time or ack time. This prevents us 3475 * from being influenced by ack artifacts to come up with too 3476 * high of measurement. Note that since we are spanning over many more 3477 * bytes in most of our measurements hopefully that is less likely to 3478 * occur. 3479 */ 3480 if (tim > stim) 3481 utim = max(tim, 1); 3482 else 3483 utim = max(stim, 1); 3484 /* Lets validate utim */ 3485 ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); 3486 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 3487 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 3488 if ((tim == 0) && (stim == 0)) { 3489 /* 3490 * Invalid measurement time, maybe 3491 * all on one ack/one send? 3492 */ 3493 bytes = 0; 3494 bytes_ps = 0; 3495 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3496 0, 0, 0, 10, __LINE__, NULL); 3497 goto skip_measurement; 3498 } 3499 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 3500 /* We never made a us_rtt measurement? */ 3501 bytes = 0; 3502 bytes_ps = 0; 3503 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3504 0, 0, 0, 10, __LINE__, NULL); 3505 goto skip_measurement; 3506 } 3507 /* 3508 * Calculate the maximum possible b/w this connection 3509 * could have. We base our calculation on the lowest 3510 * rtt we have seen during the measurement and the 3511 * largest rwnd the client has given us in that time. This 3512 * forms a BDP that is the maximum that we could ever 3513 * get to the client. Anything larger is not valid. 3514 * 3515 * I originally had code here that rejected measurements 3516 * where the time was less than 1/2 the latest us_rtt. 3517 * But after thinking on that I realized its wrong since 3518 * say you had a 150Mbps or even 1Gbps link, and you 3519 * were a long way away.. example I am in Europe (100ms rtt) 3520 * talking to my 1Gbps link in S.C. Now measuring say 150,000 3521 * bytes my time would be 1.2ms, and yet my rtt would say 3522 * the measurement was invalid the time was < 50ms. The 3523 * same thing is true for 150Mb (8ms of time). 3524 * 3525 * A better way I realized is to look at what the maximum 3526 * the connection could possibly do. This is gated on 3527 * the lowest RTT we have seen and the highest rwnd. 
3528 * We should in theory never exceed that, if we are 3529 * then something on the path is storing up packets 3530 * and then feeding them all at once to our endpoint 3531 * messing up our measurement. 3532 */ 3533 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 3534 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 3535 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 3536 if (SEQ_LT(th_ack, tp->gput_seq)) { 3537 /* No measurement can be made */ 3538 bytes = 0; 3539 bytes_ps = 0; 3540 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3541 0, 0, 0, 10, __LINE__, NULL); 3542 goto skip_measurement; 3543 } else 3544 bytes = (th_ack - tp->gput_seq); 3545 bytes_ps = (uint64_t)bytes; 3546 /* 3547 * Don't measure a b/w for pacing unless we have gotten at least 3548 * an initial windows worth of data in this measurement interval. 3549 * 3550 * Small numbers of bytes get badly influenced by delayed ack and 3551 * other artifacts. Note we take the initial window or our 3552 * defined minimum GP (defaulting to 10 which hopefully is the 3553 * IW). 3554 */ 3555 if (rack->rc_gp_filled == 0) { 3556 /* 3557 * The initial estimate is special. We 3558 * have blasted out an IW worth of packets 3559 * without a real valid ack ts results. We 3560 * then setup the app_limited_needs_set flag, 3561 * this should get the first ack in (probably 2 3562 * MSS worth) to be recorded as the timestamp. 3563 * We thus allow a smaller number of bytes i.e. 3564 * IW - 2MSS. 3565 */ 3566 reqbytes -= (2 * segsiz); 3567 /* Also lets fill previous for our first measurement to be neutral */ 3568 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3569 } 3570 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 3571 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3572 rack->r_ctl.rc_app_limited_cnt, 3573 0, 0, 10, __LINE__, NULL); 3574 goto skip_measurement; 3575 } 3576 /* 3577 * We now need to calculate the Timely like status so 3578 * we can update (possibly) the b/w multipliers. 3579 */ 3580 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 3581 if (rack->rc_gp_filled == 0) { 3582 /* No previous reading */ 3583 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 3584 } else { 3585 if (rack->measure_saw_probe_rtt == 0) { 3586 /* 3587 * We don't want a probertt to be counted 3588 * since it will be negative incorrectly. We 3589 * expect to be reducing the RTT when we 3590 * pace at a slower rate. 3591 */ 3592 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 3593 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 3594 } 3595 } 3596 timely_says = rack_make_timely_judgement(rack, 3597 rack->r_ctl.rc_gp_srtt, 3598 rack->r_ctl.rc_rtt_diff, 3599 rack->r_ctl.rc_prev_gp_srtt 3600 ); 3601 bytes_ps *= HPTS_USEC_IN_SEC; 3602 bytes_ps /= utim; 3603 if (bytes_ps > rack->r_ctl.last_max_bw) { 3604 /* 3605 * Something is on path playing 3606 * since this b/w is not possible based 3607 * on our BDP (highest rwnd and lowest rtt 3608 * we saw in the measurement window). 3609 * 3610 * Another option here would be to 3611 * instead skip the measurement. 
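 *
 * As a concrete (made up) example of the cap: with a largest
 * rwnd of 1,000,000 bytes and a lowest measured RTT of 10000us
 * in this window, the rwnd scaled to a per-second rate gives
 * last_max_bw = 1,000,000 * 1,000,000 / 10,000 = 100,000,000
 * bytes/sec, and a sample above that is clamped to it here
 * rather than discarded.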
3612 */ 3613 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 3614 bytes_ps, rack->r_ctl.last_max_bw, 0, 3615 11, __LINE__, NULL); 3616 bytes_ps = rack->r_ctl.last_max_bw; 3617 } 3618 /* We store gp for b/w in bytes per second */ 3619 if (rack->rc_gp_filled == 0) { 3620 /* Initial measurment */ 3621 if (bytes_ps) { 3622 rack->r_ctl.gp_bw = bytes_ps; 3623 rack->rc_gp_filled = 1; 3624 rack->r_ctl.num_avg = 1; 3625 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 3626 } else { 3627 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3628 rack->r_ctl.rc_app_limited_cnt, 3629 0, 0, 10, __LINE__, NULL); 3630 } 3631 if (rack->rc_inp->inp_in_hpts && 3632 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 3633 /* 3634 * Ok we can't trust the pacer in this case 3635 * where we transition from un-paced to paced. 3636 * Or for that matter when the burst mitigation 3637 * was making a wild guess and got it wrong. 3638 * Stop the pacer and clear up all the aggregate 3639 * delays etc. 3640 */ 3641 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3642 rack->r_ctl.rc_hpts_flags = 0; 3643 rack->r_ctl.rc_last_output_to = 0; 3644 } 3645 } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) { 3646 /* Still a small number run an average */ 3647 rack->r_ctl.gp_bw += bytes_ps; 3648 rack->r_ctl.num_avg++; 3649 if (rack->r_ctl.num_avg >= RACK_REQ_AVG) { 3650 /* We have collected enought to move forward */ 3651 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg; 3652 } 3653 } else { 3654 /* 3655 * We want to take 1/wma of the goodput and add in to 7/8th 3656 * of the old value weighted by the srtt. So if your measurement 3657 * period is say 2 SRTT's long you would get 1/4 as the 3658 * value, if it was like 1/2 SRTT then you would get 1/16th. 3659 * 3660 * But we must be careful not to take too much i.e. if the 3661 * srtt is say 20ms and the measurement is taken over 3662 * 400ms our weight would be 400/20 i.e. 20. On the 3663 * other hand if we get a measurement over 1ms with a 3664 * 10ms rtt we only want to take a much smaller portion. 3665 */ 3666 uint64_t resid_bw, subpart, addpart, srtt; 3667 3668 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 3669 if (srtt == 0) { 3670 /* 3671 * Strange why did t_srtt go back to zero? 3672 */ 3673 if (rack->r_ctl.rc_rack_min_rtt) 3674 srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC); 3675 else 3676 srtt = HPTS_USEC_IN_MSEC; 3677 } 3678 /* 3679 * XXXrrs: Note for reviewers, in playing with 3680 * dynamic pacing I discovered this GP calculation 3681 * as done originally leads to some undesired results. 3682 * Basically you can get longer measurements contributing 3683 * too much to the WMA. Thus I changed it if you are doing 3684 * dynamic adjustments to only do the aportioned adjustment 3685 * if we have a very small (time wise) measurement. Longer 3686 * measurements just get there weight (defaulting to 1/8) 3687 * add to the WMA. We may want to think about changing 3688 * this to always do that for both sides i.e. dynamic 3689 * and non-dynamic... but considering lots of folks 3690 * were playing with this I did not want to change the 3691 * calculation per.se. without your thoughts.. Lawerence? 3692 * Peter?? 3693 */ 3694 if (rack->rc_gp_dyn_mul == 0) { 3695 subpart = rack->r_ctl.gp_bw * utim; 3696 subpart /= (srtt * 8); 3697 if (subpart < (rack->r_ctl.gp_bw / 2)) { 3698 /* 3699 * The b/w update takes no more 3700 * away then 1/2 our running total 3701 * so factor it in. 
3702 */ 3703 addpart = bytes_ps * utim; 3704 addpart /= (srtt * 8); 3705 } else { 3706 /* 3707 * Don't allow a single measurement 3708 * to account for more than 1/2 of the 3709 * WMA. This could happen on a retransmission 3710 * where utim becomes huge compared to 3711 * srtt (multiple retransmissions when using 3712 * the sending rate which factors in all the 3713 * transmissions from the first one). 3714 */ 3715 subpart = rack->r_ctl.gp_bw / 2; 3716 addpart = bytes_ps / 2; 3717 } 3718 resid_bw = rack->r_ctl.gp_bw - subpart; 3719 rack->r_ctl.gp_bw = resid_bw + addpart; 3720 } else { 3721 if ((utim / srtt) <= 1) { 3722 /* 3723 * The b/w update was over a small period 3724 * of time. The idea here is to prevent a small 3725 * measurement time period from counting 3726 * too much. So we scale it based on the 3727 * time so it attributes less than 1/rack_wma_divisor 3728 * of its measurement. 3729 */ 3730 subpart = rack->r_ctl.gp_bw * utim; 3731 subpart /= (srtt * rack_wma_divisor); 3732 addpart = bytes_ps * utim; 3733 addpart /= (srtt * rack_wma_divisor); 3734 } else { 3735 /* 3736 * The scaled measurement was long 3737 * enough so lets just add in the 3738 * portion of the measurment i.e. 1/rack_wma_divisor 3739 */ 3740 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 3741 addpart = bytes_ps / rack_wma_divisor; 3742 } 3743 if ((rack->measure_saw_probe_rtt == 0) || 3744 (bytes_ps > rack->r_ctl.gp_bw)) { 3745 /* 3746 * For probe-rtt we only add it in 3747 * if its larger, all others we just 3748 * add in. 3749 */ 3750 resid_bw = rack->r_ctl.gp_bw - subpart; 3751 rack->r_ctl.gp_bw = resid_bw + addpart; 3752 } 3753 } 3754 } 3755 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 3756 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 3757 rack_update_multiplier(rack, timely_says, bytes_ps, 3758 rack->r_ctl.rc_gp_srtt, 3759 rack->r_ctl.rc_rtt_diff); 3760 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 3761 rack_get_bw(rack), 3, line, NULL); 3762 /* reset the gp srtt and setup the new prev */ 3763 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3764 /* Record the lost count for the next measurement */ 3765 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 3766 /* 3767 * We restart our diffs based on the gpsrtt in the 3768 * measurement window. 3769 */ 3770 rack->rc_gp_rtt_set = 0; 3771 rack->rc_gp_saw_rec = 0; 3772 rack->rc_gp_saw_ca = 0; 3773 rack->rc_gp_saw_ss = 0; 3774 rack->rc_dragged_bottom = 0; 3775 skip_measurement: 3776 3777 #ifdef STATS 3778 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3779 gput); 3780 /* 3781 * XXXLAS: This is a temporary hack, and should be 3782 * chained off VOI_TCP_GPUT when stats(9) grows an 3783 * API to deal with chained VOIs. 3784 */ 3785 if (tp->t_stats_gput_prev > 0) 3786 stats_voi_update_abs_s32(tp->t_stats, 3787 VOI_TCP_GPUT_ND, 3788 ((gput - tp->t_stats_gput_prev) * 100) / 3789 tp->t_stats_gput_prev); 3790 #endif 3791 tp->t_flags &= ~TF_GPUTINPROG; 3792 tp->t_stats_gput_prev = gput; 3793 /* 3794 * Now are we app limited now and there is space from where we 3795 * were to where we want to go? 3796 * 3797 * We don't do the other case i.e. non-applimited here since 3798 * the next send will trigger us picking up the missing data. 
3799 */ 3800 if (rack->r_ctl.rc_first_appl && 3801 TCPS_HAVEESTABLISHED(tp->t_state) && 3802 rack->r_ctl.rc_app_limited_cnt && 3803 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 3804 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 3805 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3806 /* 3807 * Yep there is enough outstanding to make a measurement here. 3808 */ 3809 struct rack_sendmap *rsm, fe; 3810 3811 tp->t_flags |= TF_GPUTINPROG; 3812 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 3813 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 3814 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3815 rack->app_limited_needs_set = 0; 3816 tp->gput_seq = th_ack; 3817 if (rack->in_probe_rtt) 3818 rack->measure_saw_probe_rtt = 1; 3819 else if ((rack->measure_saw_probe_rtt) && 3820 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 3821 rack->measure_saw_probe_rtt = 0; 3822 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 3823 /* There is a full window to gain info from */ 3824 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 3825 } else { 3826 /* We can only measure up to the applimited point */ 3827 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 3828 } 3829 /* 3830 * Now we need to find the timestamp of the send at tp->gput_seq 3831 * for the send based measurement. 3832 */ 3833 fe.r_start = tp->gput_seq; 3834 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3835 if (rsm) { 3836 /* Ok send-based limit is set */ 3837 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 3838 /* 3839 * Move back to include the earlier part 3840 * so our ack time lines up right (this may 3841 * make an overlapping measurement but thats 3842 * ok). 3843 */ 3844 tp->gput_seq = rsm->r_start; 3845 } 3846 if (rsm->r_flags & RACK_ACKED) 3847 tp->gput_ts = rsm->r_ack_arrival; 3848 else 3849 rack->app_limited_needs_set = 1; 3850 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 3851 } else { 3852 /* 3853 * If we don't find the rsm due to some 3854 * send-limit set the current time, which 3855 * basically disables the send-limit. 
3856 */ 3857 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 3858 } 3859 rack_log_pacing_delay_calc(rack, 3860 tp->gput_seq, 3861 tp->gput_ack, 3862 (uint64_t)rsm, 3863 tp->gput_ts, 3864 rack->r_ctl.rc_app_limited_cnt, 3865 9, 3866 __LINE__, NULL); 3867 } 3868 } 3869 3870 /* 3871 * CC wrapper hook functions 3872 */ 3873 static void 3874 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 3875 uint16_t type, int32_t recovery) 3876 { 3877 INP_WLOCK_ASSERT(tp->t_inpcb); 3878 tp->ccv->nsegs = nsegs; 3879 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 3880 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 3881 uint32_t max; 3882 3883 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 3884 if (tp->ccv->bytes_this_ack > max) { 3885 tp->ccv->bytes_this_ack = max; 3886 } 3887 } 3888 if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) 3889 tp->ccv->flags |= CCF_CWND_LIMITED; 3890 else 3891 tp->ccv->flags &= ~CCF_CWND_LIMITED; 3892 #ifdef STATS 3893 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 3894 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 3895 #endif 3896 if ((tp->t_flags & TF_GPUTINPROG) && 3897 rack_enough_for_measurement(tp, rack, th->th_ack)) { 3898 /* Measure the Goodput */ 3899 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); 3900 #ifdef NETFLIX_PEAKRATE 3901 if ((type == CC_ACK) && 3902 (tp->t_maxpeakrate)) { 3903 /* 3904 * We update t_peakrate_thr. This gives us roughly 3905 * one update per round trip time. Note 3906 * it will only be used if pace_always is off i.e 3907 * we don't do this for paced flows. 3908 */ 3909 tcp_update_peakrate_thr(tp); 3910 } 3911 #endif 3912 } 3913 if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { 3914 tp->t_bytes_acked += tp->ccv->bytes_this_ack; 3915 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 3916 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 3917 tp->ccv->flags |= CCF_ABC_SENTAWND; 3918 } 3919 } else { 3920 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3921 tp->t_bytes_acked = 0; 3922 } 3923 if (CC_ALGO(tp)->ack_received != NULL) { 3924 /* XXXLAS: Find a way to live without this */ 3925 tp->ccv->curack = th->th_ack; 3926 CC_ALGO(tp)->ack_received(tp->ccv, type); 3927 } 3928 #ifdef STATS 3929 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 3930 #endif 3931 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 3932 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 3933 } 3934 #ifdef NETFLIX_PEAKRATE 3935 /* we enforce max peak rate if it is set and we are not pacing */ 3936 if ((rack->rc_always_pace == 0) && 3937 tp->t_peakrate_thr && 3938 (tp->snd_cwnd > tp->t_peakrate_thr)) { 3939 tp->snd_cwnd = tp->t_peakrate_thr; 3940 } 3941 #endif 3942 } 3943 3944 static void 3945 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 3946 { 3947 struct tcp_rack *rack; 3948 3949 rack = (struct tcp_rack *)tp->t_fb_ptr; 3950 INP_WLOCK_ASSERT(tp->t_inpcb); 3951 /* 3952 * If we are doing PRR and have enough 3953 * room to send <or> we are pacing and prr 3954 * is disabled we will want to see if we 3955 * can send data (by setting r_wanted_output to 3956 * true). 
3957 */ 3958 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 3959 rack->rack_no_prr) 3960 rack->r_wanted_output = 1; 3961 } 3962 3963 static void 3964 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 3965 { 3966 struct tcp_rack *rack; 3967 uint32_t orig_cwnd; 3968 3969 3970 orig_cwnd = tp->snd_cwnd; 3971 INP_WLOCK_ASSERT(tp->t_inpcb); 3972 rack = (struct tcp_rack *)tp->t_fb_ptr; 3973 if (rack->rc_not_backing_off == 0) { 3974 /* only alert CC if we alerted when we entered */ 3975 if (CC_ALGO(tp)->post_recovery != NULL) { 3976 tp->ccv->curack = th->th_ack; 3977 CC_ALGO(tp)->post_recovery(tp->ccv); 3978 } 3979 if (tp->snd_cwnd > tp->snd_ssthresh) { 3980 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 3981 tp->snd_cwnd = tp->snd_ssthresh; 3982 } 3983 } 3984 if ((rack->rack_no_prr == 0) && 3985 (rack->r_ctl.rc_prr_sndcnt > 0)) { 3986 /* Suck the next prr cnt back into cwnd */ 3987 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 3988 rack->r_ctl.rc_prr_sndcnt = 0; 3989 rack_log_to_prr(rack, 1, 0); 3990 } 3991 rack_log_to_prr(rack, 14, orig_cwnd); 3992 tp->snd_recover = tp->snd_una; 3993 EXIT_RECOVERY(tp->t_flags); 3994 } 3995 3996 static void 3997 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 3998 { 3999 struct tcp_rack *rack; 4000 4001 INP_WLOCK_ASSERT(tp->t_inpcb); 4002 4003 rack = (struct tcp_rack *)tp->t_fb_ptr; 4004 switch (type) { 4005 case CC_NDUPACK: 4006 tp->t_flags &= ~TF_WASFRECOVERY; 4007 tp->t_flags &= ~TF_WASCRECOVERY; 4008 if (!IN_FASTRECOVERY(tp->t_flags)) { 4009 rack->r_ctl.rc_prr_delivered = 0; 4010 rack->r_ctl.rc_prr_out = 0; 4011 if (rack->rack_no_prr == 0) { 4012 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4013 rack_log_to_prr(rack, 2, 0); 4014 } 4015 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4016 tp->snd_recover = tp->snd_max; 4017 if (tp->t_flags2 & TF2_ECN_PERMIT) 4018 tp->t_flags2 |= TF2_ECN_SND_CWR; 4019 } 4020 break; 4021 case CC_ECN: 4022 if (!IN_CONGRECOVERY(tp->t_flags) || 4023 /* 4024 * Allow ECN reaction on ACK to CWR, if 4025 * that data segment was also CE marked. 4026 */ 4027 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 4028 EXIT_CONGRECOVERY(tp->t_flags); 4029 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4030 tp->snd_recover = tp->snd_max + 1; 4031 if (tp->t_flags2 & TF2_ECN_PERMIT) 4032 tp->t_flags2 |= TF2_ECN_SND_CWR; 4033 } 4034 break; 4035 case CC_RTO: 4036 tp->t_dupacks = 0; 4037 tp->t_bytes_acked = 0; 4038 EXIT_RECOVERY(tp->t_flags); 4039 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4040 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4041 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4042 if (tp->t_flags2 & TF2_ECN_PERMIT) 4043 tp->t_flags2 |= TF2_ECN_SND_CWR; 4044 break; 4045 case CC_RTO_ERR: 4046 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4047 /* RTO was unnecessary, so reset everything. */ 4048 tp->snd_cwnd = tp->snd_cwnd_prev; 4049 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4050 tp->snd_recover = tp->snd_recover_prev; 4051 if (tp->t_flags & TF_WASFRECOVERY) { 4052 ENTER_FASTRECOVERY(tp->t_flags); 4053 tp->t_flags &= ~TF_WASFRECOVERY; 4054 } 4055 if (tp->t_flags & TF_WASCRECOVERY) { 4056 ENTER_CONGRECOVERY(tp->t_flags); 4057 tp->t_flags &= ~TF_WASCRECOVERY; 4058 } 4059 tp->snd_nxt = tp->snd_max; 4060 tp->t_badrxtwin = 0; 4061 break; 4062 } 4063 /* 4064 * If we are below our max rtt, don't 4065 * signal the CC control to change things. 4066 * instead set it up so that we are in 4067 * recovery but not going to back off. 
4068 */ 4069 4070 if (rack->rc_highly_buffered) { 4071 /* 4072 * Do we use the higher rtt for 4073 * our threshold to not backoff (like CDG)? 4074 */ 4075 uint32_t rtt_mul, rtt_div; 4076 4077 if (rack_use_max_for_nobackoff) { 4078 rtt_mul = (rack_gp_rtt_maxmul - 1); 4079 rtt_div = 1; 4080 } else { 4081 rtt_mul = rack_gp_rtt_minmul; 4082 rtt_div = max(rack_gp_rtt_mindiv , 1); 4083 } 4084 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + 4085 ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / 4086 rtt_div))) { 4087 /* below our min threshold */ 4088 rack->rc_not_backing_off = 1; 4089 ENTER_RECOVERY(rack->rc_tp->t_flags); 4090 rack_log_rtt_shrinks(rack, 0, 4091 rtt_mul, 4092 rtt_div, 4093 RACK_RTTS_NOBACKOFF); 4094 return; 4095 } 4096 } 4097 rack->rc_not_backing_off = 0; 4098 if (CC_ALGO(tp)->cong_signal != NULL) { 4099 if (th != NULL) 4100 tp->ccv->curack = th->th_ack; 4101 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4102 } 4103 } 4104 4105 4106 4107 static inline void 4108 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4109 { 4110 uint32_t i_cwnd; 4111 4112 INP_WLOCK_ASSERT(tp->t_inpcb); 4113 4114 #ifdef NETFLIX_STATS 4115 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4116 if (tp->t_state == TCPS_ESTABLISHED) 4117 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4118 #endif 4119 if (CC_ALGO(tp)->after_idle != NULL) 4120 CC_ALGO(tp)->after_idle(tp->ccv); 4121 4122 if (tp->snd_cwnd == 1) 4123 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4124 else 4125 i_cwnd = rc_init_window(rack); 4126 4127 /* 4128 * Being idle is no differnt than the initial window. If the cc 4129 * clamps it down below the initial window raise it to the initial 4130 * window. 4131 */ 4132 if (tp->snd_cwnd < i_cwnd) { 4133 tp->snd_cwnd = i_cwnd; 4134 } 4135 } 4136 4137 4138 /* 4139 * Indicate whether this ack should be delayed. We can delay the ack if 4140 * following conditions are met: 4141 * - There is no delayed ack timer in progress. 4142 * - Our last ack wasn't a 0-sized window. We never want to delay 4143 * the ack that opens up a 0-sized window. 4144 * - LRO wasn't used for this segment. We make sure by checking that the 4145 * segment size is not larger than the MSS. 4146 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4147 * connection. 4148 */ 4149 #define DELAY_ACK(tp, tlen) \ 4150 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4151 ((tp->t_flags & TF_DELACK) == 0) && \ 4152 (tlen <= tp->t_maxseg) && \ 4153 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4154 4155 static struct rack_sendmap * 4156 rack_find_lowest_rsm(struct tcp_rack *rack) 4157 { 4158 struct rack_sendmap *rsm; 4159 4160 /* 4161 * Walk the time-order transmitted list looking for an rsm that is 4162 * not acked. This will be the one that was sent the longest time 4163 * ago that is still outstanding. 4164 */ 4165 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4166 if (rsm->r_flags & RACK_ACKED) { 4167 continue; 4168 } 4169 goto finish; 4170 } 4171 finish: 4172 return (rsm); 4173 } 4174 4175 static struct rack_sendmap * 4176 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 4177 { 4178 struct rack_sendmap *prsm; 4179 4180 /* 4181 * Walk the sequence order list backward until we hit and arrive at 4182 * the highest seq not acked. In theory when this is called it 4183 * should be the last segment (which it was not). 
4184 */ 4185 counter_u64_add(rack_find_high, 1); 4186 prsm = rsm; 4187 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 4188 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 4189 continue; 4190 } 4191 return (prsm); 4192 } 4193 return (NULL); 4194 } 4195 4196 4197 static uint32_t 4198 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 4199 { 4200 int32_t lro; 4201 uint32_t thresh; 4202 4203 /* 4204 * lro is the flag we use to determine if we have seen reordering. 4205 * If it gets set we have seen reordering. The reorder logic either 4206 * works in one of two ways: 4207 * 4208 * If reorder-fade is configured, then we track the last time we saw 4209 * re-ordering occur. If we reach the point where enough time as 4210 * passed we no longer consider reordering has occuring. 4211 * 4212 * Or if reorder-face is 0, then once we see reordering we consider 4213 * the connection to alway be subject to reordering and just set lro 4214 * to 1. 4215 * 4216 * In the end if lro is non-zero we add the extra time for 4217 * reordering in. 4218 */ 4219 if (srtt == 0) 4220 srtt = 1; 4221 if (rack->r_ctl.rc_reorder_ts) { 4222 if (rack->r_ctl.rc_reorder_fade) { 4223 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 4224 lro = cts - rack->r_ctl.rc_reorder_ts; 4225 if (lro == 0) { 4226 /* 4227 * No time as passed since the last 4228 * reorder, mark it as reordering. 4229 */ 4230 lro = 1; 4231 } 4232 } else { 4233 /* Negative time? */ 4234 lro = 0; 4235 } 4236 if (lro > rack->r_ctl.rc_reorder_fade) { 4237 /* Turn off reordering seen too */ 4238 rack->r_ctl.rc_reorder_ts = 0; 4239 lro = 0; 4240 } 4241 } else { 4242 /* Reodering does not fade */ 4243 lro = 1; 4244 } 4245 } else { 4246 lro = 0; 4247 } 4248 thresh = srtt + rack->r_ctl.rc_pkt_delay; 4249 if (lro) { 4250 /* It must be set, if not you get 1/4 rtt */ 4251 if (rack->r_ctl.rc_reorder_shift) 4252 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 4253 else 4254 thresh += (srtt >> 2); 4255 } else { 4256 thresh += 1; 4257 } 4258 /* We don't let the rack timeout be above a RTO */ 4259 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 4260 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 4261 } 4262 /* And we don't want it above the RTO max either */ 4263 if (thresh > rack_rto_max) { 4264 thresh = rack_rto_max; 4265 } 4266 return (thresh); 4267 } 4268 4269 static uint32_t 4270 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 4271 struct rack_sendmap *rsm, uint32_t srtt) 4272 { 4273 struct rack_sendmap *prsm; 4274 uint32_t thresh, len; 4275 int segsiz; 4276 4277 if (srtt == 0) 4278 srtt = 1; 4279 if (rack->r_ctl.rc_tlp_threshold) 4280 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 4281 else 4282 thresh = (srtt * 2); 4283 4284 /* Get the previous sent packet, if any */ 4285 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4286 counter_u64_add(rack_enter_tlp_calc, 1); 4287 len = rsm->r_end - rsm->r_start; 4288 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 4289 /* Exactly like the ID */ 4290 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 4291 uint32_t alt_thresh; 4292 /* 4293 * Compensate for delayed-ack with the d-ack time. 
4294 */ 4295 counter_u64_add(rack_used_tlpmethod, 1); 4296 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4297 if (alt_thresh > thresh) 4298 thresh = alt_thresh; 4299 } 4300 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 4301 /* 2.1 behavior */ 4302 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 4303 if (prsm && (len <= segsiz)) { 4304 /* 4305 * Two packets outstanding, thresh should be (2*srtt) + 4306 * possible inter-packet delay (if any). 4307 */ 4308 uint32_t inter_gap = 0; 4309 int idx, nidx; 4310 4311 counter_u64_add(rack_used_tlpmethod, 1); 4312 idx = rsm->r_rtr_cnt - 1; 4313 nidx = prsm->r_rtr_cnt - 1; 4314 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4315 /* Yes it was sent later (or at the same time) */ 4316 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4317 } 4318 thresh += inter_gap; 4319 } else if (len <= segsiz) { 4320 /* 4321 * Possibly compensate for delayed-ack. 4322 */ 4323 uint32_t alt_thresh; 4324 4325 counter_u64_add(rack_used_tlpmethod2, 1); 4326 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4327 if (alt_thresh > thresh) 4328 thresh = alt_thresh; 4329 } 4330 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 4331 /* 2.2 behavior */ 4332 if (len <= segsiz) { 4333 uint32_t alt_thresh; 4334 /* 4335 * Compensate for delayed-ack with the d-ack time. 4336 */ 4337 counter_u64_add(rack_used_tlpmethod, 1); 4338 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4339 if (alt_thresh > thresh) 4340 thresh = alt_thresh; 4341 } 4342 } 4343 /* Not above an RTO */ 4344 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 4345 thresh = TICKS_2_MSEC(tp->t_rxtcur); 4346 } 4347 /* Not above a RTO max */ 4348 if (thresh > rack_rto_max) { 4349 thresh = rack_rto_max; 4350 } 4351 /* Apply user supplied min TLP */ 4352 if (thresh < rack_tlp_min) { 4353 thresh = rack_tlp_min; 4354 } 4355 return (thresh); 4356 } 4357 4358 static uint32_t 4359 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 4360 { 4361 /* 4362 * We want the rack_rtt which is the 4363 * last rtt we measured. However if that 4364 * does not exist we fallback to the srtt (which 4365 * we probably will never do) and then as a last 4366 * resort we use RACK_INITIAL_RTO if no srtt is 4367 * yet set. 4368 */ 4369 if (rack->rc_rack_rtt) 4370 return(rack->rc_rack_rtt); 4371 else if (tp->t_srtt == 0) 4372 return(RACK_INITIAL_RTO); 4373 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 4374 } 4375 4376 static struct rack_sendmap * 4377 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 4378 { 4379 /* 4380 * Check to see that we don't need to fall into recovery. We will 4381 * need to do so if our oldest transmit is past the time we should 4382 * have had an ack. 
4383 */ 4384 struct tcp_rack *rack; 4385 struct rack_sendmap *rsm; 4386 int32_t idx; 4387 uint32_t srtt, thresh; 4388 4389 rack = (struct tcp_rack *)tp->t_fb_ptr; 4390 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 4391 return (NULL); 4392 } 4393 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4394 if (rsm == NULL) 4395 return (NULL); 4396 4397 if (rsm->r_flags & RACK_ACKED) { 4398 rsm = rack_find_lowest_rsm(rack); 4399 if (rsm == NULL) 4400 return (NULL); 4401 } 4402 idx = rsm->r_rtr_cnt - 1; 4403 srtt = rack_grab_rtt(tp, rack); 4404 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 4405 if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { 4406 return (NULL); 4407 } 4408 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 4409 return (NULL); 4410 } 4411 /* Ok if we reach here we are over-due and this guy can be sent */ 4412 if (IN_RECOVERY(tp->t_flags) == 0) { 4413 /* 4414 * For the one that enters us into recovery record undo 4415 * info. 4416 */ 4417 rack->r_ctl.rc_rsm_start = rsm->r_start; 4418 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4419 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4420 } 4421 rack_cong_signal(tp, NULL, CC_NDUPACK); 4422 return (rsm); 4423 } 4424 4425 static uint32_t 4426 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 4427 { 4428 int32_t t; 4429 int32_t tt; 4430 uint32_t ret_val; 4431 4432 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 4433 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 4434 rack_persist_min, rack_persist_max); 4435 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 4436 tp->t_rxtshift++; 4437 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 4438 ret_val = (uint32_t)tt; 4439 return (ret_val); 4440 } 4441 4442 static uint32_t 4443 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 4444 { 4445 /* 4446 * Start the FR timer, we do this based on getting the first one in 4447 * the rc_tmap. Note that if its NULL we must stop the timer. in all 4448 * events we need to stop the running timer (if its running) before 4449 * starting the new one. 
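 *
 * A small worked case for the retransmit leg below (hypothetical
 * values): with t_rxtcur equivalent to 400ms and the oldest
 * outstanding data last (re)sent 150ms ago, the RXT timer is
 * armed for 250ms.  If the elapsed time already exceeds the RTO
 * we fall back to the minimum timeout rc_min_to rather than
 * arming a zero-length timer.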
4450 */ 4451 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 4452 uint32_t srtt_cur; 4453 int32_t idx; 4454 int32_t is_tlp_timer = 0; 4455 struct rack_sendmap *rsm; 4456 4457 if (rack->t_timers_stopped) { 4458 /* All timers have been stopped none are to run */ 4459 return (0); 4460 } 4461 if (rack->rc_in_persist) { 4462 /* We can't start any timer in persists */ 4463 return (rack_get_persists_timer_val(tp, rack)); 4464 } 4465 rack->rc_on_min_to = 0; 4466 if ((tp->t_state < TCPS_ESTABLISHED) || 4467 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 4468 goto activate_rxt; 4469 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4470 if ((rsm == NULL) || sup_rack) { 4471 /* Nothing on the send map */ 4472 activate_rxt: 4473 time_since_sent = 0; 4474 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4475 if (rsm) { 4476 idx = rsm->r_rtr_cnt - 1; 4477 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4478 tstmp_touse = rsm->r_tim_lastsent[idx]; 4479 else 4480 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4481 if (TSTMP_GT(cts, tstmp_touse)) 4482 time_since_sent = cts - tstmp_touse; 4483 } 4484 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4485 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 4486 to = TICKS_2_MSEC(tp->t_rxtcur); 4487 if (to > time_since_sent) 4488 to -= time_since_sent; 4489 else 4490 to = rack->r_ctl.rc_min_to; 4491 if (to == 0) 4492 to = 1; 4493 return (to); 4494 } 4495 return (0); 4496 } 4497 if (rsm->r_flags & RACK_ACKED) { 4498 rsm = rack_find_lowest_rsm(rack); 4499 if (rsm == NULL) { 4500 /* No lowest? */ 4501 goto activate_rxt; 4502 } 4503 } 4504 if (rack->sack_attack_disable) { 4505 /* 4506 * We don't want to do 4507 * any TLP's if you are an attacker. 4508 * Though if you are doing what 4509 * is expected you may still have 4510 * SACK-PASSED marks. 4511 */ 4512 goto activate_rxt; 4513 } 4514 /* Convert from ms to usecs */ 4515 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 4516 if ((tp->t_flags & TF_SENTFIN) && 4517 ((tp->snd_max - tp->snd_una) == 1) && 4518 (rsm->r_flags & RACK_HAS_FIN)) { 4519 /* 4520 * We don't start a rack timer if all we have is a 4521 * FIN outstanding. 4522 */ 4523 goto activate_rxt; 4524 } 4525 if ((rack->use_rack_rr == 0) && 4526 (IN_RECOVERY(tp->t_flags)) && 4527 (rack->rack_no_prr == 0) && 4528 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 4529 /* 4530 * We are not cheating, in recovery and 4531 * not enough ack's to yet get our next 4532 * retransmission out. 4533 * 4534 * Note that classified attackers do not 4535 * get to use the rack-cheat. 4536 */ 4537 goto activate_tlp; 4538 } 4539 srtt = rack_grab_rtt(tp, rack); 4540 thresh = rack_calc_thresh_rack(rack, srtt, cts); 4541 idx = rsm->r_rtr_cnt - 1; 4542 exp = rsm->r_tim_lastsent[idx] + thresh; 4543 if (SEQ_GEQ(exp, cts)) { 4544 to = exp - cts; 4545 if (to < rack->r_ctl.rc_min_to) { 4546 to = rack->r_ctl.rc_min_to; 4547 if (rack->r_rr_config == 3) 4548 rack->rc_on_min_to = 1; 4549 } 4550 } else { 4551 to = rack->r_ctl.rc_min_to; 4552 if (rack->r_rr_config == 3) 4553 rack->rc_on_min_to = 1; 4554 } 4555 } else { 4556 /* Ok we need to do a TLP not RACK */ 4557 activate_tlp: 4558 if ((rack->rc_tlp_in_progress != 0) && 4559 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 4560 /* 4561 * The previous send was a TLP and we have sent 4562 * N TLP's without sending new data. 
4563 */ 4564 goto activate_rxt; 4565 } 4566 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 4567 if (rsm == NULL) { 4568 /* We found no rsm to TLP with. */ 4569 goto activate_rxt; 4570 } 4571 if (rsm->r_flags & RACK_HAS_FIN) { 4572 /* If its a FIN we dont do TLP */ 4573 rsm = NULL; 4574 goto activate_rxt; 4575 } 4576 idx = rsm->r_rtr_cnt - 1; 4577 time_since_sent = 0; 4578 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4579 tstmp_touse = rsm->r_tim_lastsent[idx]; 4580 else 4581 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4582 if (TSTMP_GT(cts, tstmp_touse)) 4583 time_since_sent = cts - tstmp_touse; 4584 is_tlp_timer = 1; 4585 if (tp->t_srtt) { 4586 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 4587 srtt = TICKS_2_MSEC(srtt_cur); 4588 } else 4589 srtt = RACK_INITIAL_RTO; 4590 /* 4591 * If the SRTT is not keeping up and the 4592 * rack RTT has spiked we want to use 4593 * the last RTT not the smoothed one. 4594 */ 4595 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) 4596 srtt = rack_grab_rtt(tp, rack); 4597 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 4598 if (thresh > time_since_sent) 4599 to = thresh - time_since_sent; 4600 else { 4601 to = rack->r_ctl.rc_min_to; 4602 rack_log_alt_to_to_cancel(rack, 4603 thresh, /* flex1 */ 4604 time_since_sent, /* flex2 */ 4605 tstmp_touse, /* flex3 */ 4606 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 4607 rsm->r_tim_lastsent[idx], 4608 srtt, 4609 idx, 99); 4610 } 4611 if (to > TCPTV_REXMTMAX) { 4612 /* 4613 * If the TLP time works out to larger than the max 4614 * RTO lets not do TLP.. just RTO. 4615 */ 4616 goto activate_rxt; 4617 } 4618 } 4619 if (is_tlp_timer == 0) { 4620 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 4621 } else { 4622 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 4623 } 4624 if (to == 0) 4625 to = 1; 4626 return (to); 4627 } 4628 4629 static void 4630 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4631 { 4632 if (rack->rc_in_persist == 0) { 4633 if (tp->t_flags & TF_GPUTINPROG) { 4634 /* 4635 * Stop the goodput now, the calling of the 4636 * measurement function clears the flag. 4637 */ 4638 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 4639 } 4640 #ifdef NETFLIX_SHARED_CWND 4641 if (rack->r_ctl.rc_scw) { 4642 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4643 rack->rack_scwnd_is_idle = 1; 4644 } 4645 #endif 4646 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 4647 if (rack->r_ctl.rc_went_idle_time == 0) 4648 rack->r_ctl.rc_went_idle_time = 1; 4649 rack_timer_cancel(tp, rack, cts, __LINE__); 4650 tp->t_rxtshift = 0; 4651 rack->rc_in_persist = 1; 4652 } 4653 } 4654 4655 static void 4656 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4657 { 4658 if (rack->rc_inp->inp_in_hpts) { 4659 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4660 rack->r_ctl.rc_hpts_flags = 0; 4661 } 4662 #ifdef NETFLIX_SHARED_CWND 4663 if (rack->r_ctl.rc_scw) { 4664 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4665 rack->rack_scwnd_is_idle = 0; 4666 } 4667 #endif 4668 if (rack->rc_gp_dyn_mul && 4669 (rack->use_fixed_rate == 0) && 4670 (rack->rc_always_pace)) { 4671 /* 4672 * Do we count this as if a probe-rtt just 4673 * finished? 
4674 */ 4675 uint32_t time_idle, idle_min; 4676 4677 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 4678 idle_min = rack_min_probertt_hold; 4679 if (rack_probertt_gpsrtt_cnt_div) { 4680 uint64_t extra; 4681 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 4682 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 4683 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 4684 idle_min += (uint32_t)extra; 4685 } 4686 if (time_idle >= idle_min) { 4687 /* Yes, we count it as a probe-rtt. */ 4688 uint32_t us_cts; 4689 4690 us_cts = tcp_get_usecs(NULL); 4691 if (rack->in_probe_rtt == 0) { 4692 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4693 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 4694 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 4695 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 4696 } else { 4697 rack_exit_probertt(rack, us_cts); 4698 } 4699 } 4700 4701 } 4702 rack->rc_in_persist = 0; 4703 rack->r_ctl.rc_went_idle_time = 0; 4704 tp->t_rxtshift = 0; 4705 rack->r_ctl.rc_agg_delayed = 0; 4706 rack->r_early = 0; 4707 rack->r_late = 0; 4708 rack->r_ctl.rc_agg_early = 0; 4709 } 4710 4711 static void 4712 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 4713 struct hpts_diag *diag, struct timeval *tv) 4714 { 4715 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 4716 union tcp_log_stackspecific log; 4717 4718 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4719 log.u_bbr.flex1 = diag->p_nxt_slot; 4720 log.u_bbr.flex2 = diag->p_cur_slot; 4721 log.u_bbr.flex3 = diag->slot_req; 4722 log.u_bbr.flex4 = diag->inp_hptsslot; 4723 log.u_bbr.flex5 = diag->slot_remaining; 4724 log.u_bbr.flex6 = diag->need_new_to; 4725 log.u_bbr.flex7 = diag->p_hpts_active; 4726 log.u_bbr.flex8 = diag->p_on_min_sleep; 4727 /* Hijack other fields as needed */ 4728 log.u_bbr.epoch = diag->have_slept; 4729 log.u_bbr.lt_epoch = diag->yet_to_sleep; 4730 log.u_bbr.pkts_out = diag->co_ret; 4731 log.u_bbr.applimited = diag->hpts_sleep_time; 4732 log.u_bbr.delivered = diag->p_prev_slot; 4733 log.u_bbr.inflight = diag->p_runningtick; 4734 log.u_bbr.bw_inuse = diag->wheel_tick; 4735 log.u_bbr.rttProp = diag->wheel_cts; 4736 log.u_bbr.timeStamp = cts; 4737 log.u_bbr.delRate = diag->maxticks; 4738 log.u_bbr.cur_del_rate = diag->p_curtick; 4739 log.u_bbr.cur_del_rate <<= 32; 4740 log.u_bbr.cur_del_rate |= diag->p_lasttick; 4741 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4742 &rack->rc_inp->inp_socket->so_rcv, 4743 &rack->rc_inp->inp_socket->so_snd, 4744 BBR_LOG_HPTSDIAG, 0, 4745 0, &log, false, tv); 4746 } 4747 4748 } 4749 4750 static void 4751 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 4752 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 4753 { 4754 struct hpts_diag diag; 4755 struct inpcb *inp; 4756 struct timeval tv; 4757 uint32_t delayed_ack = 0; 4758 uint32_t hpts_timeout; 4759 uint8_t stopped; 4760 uint32_t left = 0; 4761 uint32_t us_cts; 4762 4763 inp = tp->t_inpcb; 4764 if ((tp->t_state == TCPS_CLOSED) || 4765 (tp->t_state == TCPS_LISTEN)) { 4766 return; 4767 } 4768 if (inp->inp_in_hpts) { 4769 /* Already on the pacer */ 4770 return; 4771 } 4772 stopped = rack->rc_tmr_stopped; 4773 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 4774 left = rack->r_ctl.rc_timer_exp - cts; 4775 } 4776 rack->r_ctl.rc_timer_exp = 0; 4777 rack->r_ctl.rc_hpts_flags = 0; 4778 us_cts = tcp_get_usecs(&tv); 4779 /* Now early/late accounting */ 4780 if (rack->r_early) { 4781 /* 4782 * We have a early carry over set, 
4783 * we can always add more time so we 4784 * can always make this compensation. 4785 */ 4786 slot += rack->r_ctl.rc_agg_early; 4787 rack->r_early = 0; 4788 rack->r_ctl.rc_agg_early = 0; 4789 } 4790 if (rack->r_late) { 4791 /* 4792 * This is harder, we can 4793 * compensate some but it 4794 * really depends on what 4795 * the current pacing time is. 4796 */ 4797 if (rack->r_ctl.rc_agg_delayed >= slot) { 4798 /* 4799 * We can't compensate for it all. 4800 * And we have to have some time 4801 * on the clock. We always have a min 4802 * 10 slots (10 x 10 i.e. 100 usecs). 4803 */ 4804 if (slot <= HPTS_TICKS_PER_USEC) { 4805 /* We gain delay */ 4806 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 4807 slot = HPTS_TICKS_PER_USEC; 4808 } else { 4809 /* We take off some */ 4810 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 4811 slot = HPTS_TICKS_PER_USEC; 4812 } 4813 } else { 4814 4815 slot -= rack->r_ctl.rc_agg_delayed; 4816 rack->r_ctl.rc_agg_delayed = 0; 4817 /* Make sure we have 100 useconds at minimum */ 4818 if (slot < HPTS_TICKS_PER_USEC) { 4819 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 4820 slot = HPTS_TICKS_PER_USEC; 4821 } 4822 if (rack->r_ctl.rc_agg_delayed == 0) 4823 rack->r_late = 0; 4824 } 4825 } 4826 if (slot) { 4827 /* We are pacing too */ 4828 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 4829 } 4830 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 4831 #ifdef NETFLIX_EXP_DETECTION 4832 if (rack->sack_attack_disable && 4833 (slot < tcp_sad_pacing_interval)) { 4834 /* 4835 * We have a potential attacker on 4836 * the line. We have possibly some 4837 * (or now) pacing time set. We want to 4838 * slow down the processing of sacks by some 4839 * amount (if it is an attacker). Set the default 4840 * slot for attackers in place (unless the orginal 4841 * interval is longer). Its stored in 4842 * micro-seconds, so lets convert to msecs. 4843 */ 4844 slot = tcp_sad_pacing_interval; 4845 } 4846 #endif 4847 if (tp->t_flags & TF_DELACK) { 4848 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 4849 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 4850 } 4851 if (delayed_ack && ((hpts_timeout == 0) || 4852 (delayed_ack < hpts_timeout))) 4853 hpts_timeout = delayed_ack; 4854 else 4855 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4856 /* 4857 * If no timers are going to run and we will fall off the hptsi 4858 * wheel, we resort to a keep-alive timer if its configured. 4859 */ 4860 if ((hpts_timeout == 0) && 4861 (slot == 0)) { 4862 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4863 (tp->t_state <= TCPS_CLOSING)) { 4864 /* 4865 * Ok we have no timer (persists, rack, tlp, rxt or 4866 * del-ack), we don't have segments being paced. So 4867 * all that is left is the keepalive timer. 4868 */ 4869 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 4870 /* Get the established keep-alive time */ 4871 hpts_timeout = TP_KEEPIDLE(tp); 4872 } else { 4873 /* Get the initial setup keep-alive time */ 4874 hpts_timeout = TP_KEEPINIT(tp); 4875 } 4876 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 4877 if (rack->in_probe_rtt) { 4878 /* 4879 * We want to instead not wake up a long time from 4880 * now but to wake up about the time we would 4881 * exit probe-rtt and initiate a keep-alive ack. 4882 * This will get us out of probe-rtt and update 4883 * our min-rtt. 
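 *
 * In other words, instead of waking up at the much longer
 * keep-alive interval, the wheel is set to roughly
 * rack_min_probertt_hold (converted from usecs to msecs below);
 * with a hypothetical 200000us hold that is a 200ms timeout, so
 * an otherwise idle connection still sends something soon enough
 * to finish probe-rtt.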
4884 */ 4885 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); 4886 } 4887 } 4888 } 4889 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 4890 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 4891 /* 4892 * RACK, TLP, persists and RXT timers all are restartable 4893 * based on actions input .. i.e we received a packet (ack 4894 * or sack) and that changes things (rw, or snd_una etc). 4895 * Thus we can restart them with a new value. For 4896 * keep-alive, delayed_ack we keep track of what was left 4897 * and restart the timer with a smaller value. 4898 */ 4899 if (left < hpts_timeout) 4900 hpts_timeout = left; 4901 } 4902 if (hpts_timeout) { 4903 /* 4904 * Hack alert for now we can't time-out over 2,147,483 4905 * seconds (a bit more than 596 hours), which is probably ok 4906 * :). 4907 */ 4908 if (hpts_timeout > 0x7ffffffe) 4909 hpts_timeout = 0x7ffffffe; 4910 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 4911 } 4912 if ((rack->rc_gp_filled == 0) && 4913 (hpts_timeout < slot) && 4914 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 4915 /* 4916 * We have no good estimate yet for the 4917 * old clunky burst mitigation or the 4918 * real pacing. And the tlp or rxt is smaller 4919 * than the pacing calculation. Lets not 4920 * pace that long since we know the calculation 4921 * so far is not accurate. 4922 */ 4923 slot = hpts_timeout; 4924 } 4925 rack->r_ctl.last_pacing_time = slot; 4926 if (slot) { 4927 rack->r_ctl.rc_last_output_to = us_cts + slot; 4928 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4929 if ((rack->rc_gp_filled == 0) || 4930 rack->pacing_longer_than_rtt) { 4931 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 4932 } else { 4933 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4934 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 4935 (rack->r_rr_config != 3)) 4936 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4937 else 4938 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4939 } 4940 } 4941 if ((rack->use_rack_rr) && 4942 (rack->r_rr_config < 2) && 4943 ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { 4944 /* 4945 * Arrange for the hpts to kick back in after the 4946 * t-o if the t-o does not cause a send. 
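			 * Note we insert for the timer expiration here
			 * (HPTS_MS_TO_SLOTS(hpts_timeout)) rather than for the
			 * pacing slot used in the else branch below.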
4947 */ 4948 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4949 __LINE__, &diag); 4950 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4951 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4952 } else { 4953 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 4954 __LINE__, &diag); 4955 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4956 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 4957 } 4958 } else if (hpts_timeout) { 4959 if (rack->rc_always_pace || rack->r_mbuf_queue) { 4960 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 4961 /* For a rack timer, don't wake us */ 4962 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 4963 if (rack->r_rr_config != 3) 4964 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 4965 else 4966 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4967 } else { 4968 /* All other timers wake us up */ 4969 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 4970 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 4971 } 4972 } 4973 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 4974 __LINE__, &diag); 4975 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 4976 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 4977 } else { 4978 /* No timer starting */ 4979 #ifdef INVARIANTS 4980 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 4981 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 4982 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 4983 } 4984 #endif 4985 } 4986 rack->rc_tmr_stopped = 0; 4987 if (slot) 4988 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 4989 } 4990 4991 /* 4992 * RACK Timer, here we simply do logging and house keeping. 4993 * the normal rack_output() function will call the 4994 * appropriate thing to check if we need to do a RACK retransmit. 4995 * We return 1, saying don't proceed with rack_output only 4996 * when all timers have been stopped (destroyed PCB?). 4997 */ 4998 static int 4999 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5000 { 5001 /* 5002 * This timer simply provides an internal trigger to send out data. 5003 * The check_recovery_mode call will see if there are needed 5004 * retransmissions, if so we will enter fast-recovery. The output 5005 * call may or may not do the same thing depending on sysctl 5006 * settings. 5007 */ 5008 struct rack_sendmap *rsm; 5009 int32_t recovery; 5010 5011 if (tp->t_timers->tt_flags & TT_STOPPED) { 5012 return (1); 5013 } 5014 recovery = IN_RECOVERY(tp->t_flags); 5015 counter_u64_add(rack_to_tot, 1); 5016 if (rack->r_state && (rack->r_state != tp->t_state)) 5017 rack_set_state(tp, rack); 5018 rack->rc_on_min_to = 0; 5019 rsm = rack_check_recovery_mode(tp, cts); 5020 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5021 if (rsm) { 5022 uint32_t rtt; 5023 5024 rack->r_ctl.rc_resend = rsm; 5025 if (rack->use_rack_rr) { 5026 /* 5027 * Don't accumulate extra pacing delay 5028 * we are allowing the rack timer to 5029 * over-ride pacing i.e. rrr takes precedence 5030 * if the pacing interval is longer than the rrr 5031 * time (in other words we get the min pacing 5032 * time versus rrr pacing time). 
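			 * So we simply flag an output override and clear the
			 * PACE_PKT_OUTPUT flag.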
5033 */ 5034 rack->r_timer_override = 1; 5035 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5036 } 5037 rtt = rack->rc_rack_rtt; 5038 if (rtt == 0) 5039 rtt = 1; 5040 if (rack->rack_no_prr == 0) { 5041 if ((recovery == 0) && 5042 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5043 /* 5044 * The rack-timeout that enter's us into recovery 5045 * will force out one MSS and set us up so that we 5046 * can do one more send in 2*rtt (transitioning the 5047 * rack timeout into a rack-tlp). 5048 */ 5049 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5050 rack->r_timer_override = 1; 5051 rack_log_to_prr(rack, 3, 0); 5052 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 5053 rack->use_rack_rr) { 5054 /* 5055 * When a rack timer goes, if the rack rr is 5056 * on, arrange it so we can send a full segment 5057 * overriding prr (though we pay a price for this 5058 * for future new sends). 5059 */ 5060 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5061 rack_log_to_prr(rack, 4, 0); 5062 } 5063 } 5064 } 5065 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5066 if (rsm == NULL) { 5067 /* restart a timer and return 1 */ 5068 rack_start_hpts_timer(rack, tp, cts, 5069 0, 0, 0); 5070 return (1); 5071 } 5072 return (0); 5073 } 5074 5075 static __inline void 5076 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5077 struct rack_sendmap *rsm, uint32_t start) 5078 { 5079 int idx; 5080 5081 nrsm->r_start = start; 5082 nrsm->r_end = rsm->r_end; 5083 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5084 nrsm->r_flags = rsm->r_flags; 5085 nrsm->r_dupack = rsm->r_dupack; 5086 nrsm->usec_orig_send = rsm->usec_orig_send; 5087 nrsm->r_rtr_bytes = 0; 5088 rsm->r_end = nrsm->r_start; 5089 nrsm->r_just_ret = rsm->r_just_ret; 5090 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 5091 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 5092 } 5093 } 5094 5095 static struct rack_sendmap * 5096 rack_merge_rsm(struct tcp_rack *rack, 5097 struct rack_sendmap *l_rsm, 5098 struct rack_sendmap *r_rsm) 5099 { 5100 /* 5101 * We are merging two ack'd RSM's, 5102 * the l_rsm is on the left (lower seq 5103 * values) and the r_rsm is on the right 5104 * (higher seq value). The simplest way 5105 * to merge these is to move the right 5106 * one into the left. I don't think there 5107 * is any reason we need to try to find 5108 * the oldest (or last oldest retransmitted). 5109 */ 5110 struct rack_sendmap *rm; 5111 5112 l_rsm->r_end = r_rsm->r_end; 5113 if (l_rsm->r_dupack < r_rsm->r_dupack) 5114 l_rsm->r_dupack = r_rsm->r_dupack; 5115 if (r_rsm->r_rtr_bytes) 5116 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 5117 if (r_rsm->r_in_tmap) { 5118 /* This really should not happen */ 5119 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 5120 r_rsm->r_in_tmap = 0; 5121 } 5122 5123 /* Now the flags */ 5124 if (r_rsm->r_flags & RACK_HAS_FIN) 5125 l_rsm->r_flags |= RACK_HAS_FIN; 5126 if (r_rsm->r_flags & RACK_TLP) 5127 l_rsm->r_flags |= RACK_TLP; 5128 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 5129 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 5130 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 5131 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 5132 /* 5133 * If both are app-limited then let the 5134 * free lower the count. If right is app 5135 * limited and left is not, transfer. 
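		 * (and keep rc_first_appl pointing at a map entry that
		 * stays allocated).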
5136 */ 5137 l_rsm->r_flags |= RACK_APP_LIMITED; 5138 r_rsm->r_flags &= ~RACK_APP_LIMITED; 5139 if (r_rsm == rack->r_ctl.rc_first_appl) 5140 rack->r_ctl.rc_first_appl = l_rsm; 5141 } 5142 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 5143 #ifdef INVARIANTS 5144 if (rm != r_rsm) { 5145 panic("removing head in rack:%p rsm:%p rm:%p", 5146 rack, r_rsm, rm); 5147 } 5148 #endif 5149 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 5150 /* Transfer the split limit to the map we free */ 5151 r_rsm->r_limit_type = l_rsm->r_limit_type; 5152 l_rsm->r_limit_type = 0; 5153 } 5154 rack_free(rack, r_rsm); 5155 return(l_rsm); 5156 } 5157 5158 /* 5159 * TLP Timer, here we simply setup what segment we want to 5160 * have the TLP expire on, the normal rack_output() will then 5161 * send it out. 5162 * 5163 * We return 1, saying don't proceed with rack_output only 5164 * when all timers have been stopped (destroyed PCB?). 5165 */ 5166 static int 5167 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5168 { 5169 /* 5170 * Tail Loss Probe. 5171 */ 5172 struct rack_sendmap *rsm = NULL; 5173 struct rack_sendmap *insret; 5174 struct socket *so; 5175 uint32_t amm, old_prr_snd = 0; 5176 uint32_t out, avail; 5177 int collapsed_win = 0; 5178 5179 if (tp->t_timers->tt_flags & TT_STOPPED) { 5180 return (1); 5181 } 5182 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5183 /* Its not time yet */ 5184 return (0); 5185 } 5186 if (ctf_progress_timeout_check(tp, true)) { 5187 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5188 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5189 return (1); 5190 } 5191 /* 5192 * A TLP timer has expired. We have been idle for 2 rtts. So we now 5193 * need to figure out how to force a full MSS segment out. 5194 */ 5195 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 5196 counter_u64_add(rack_tlp_tot, 1); 5197 if (rack->r_state && (rack->r_state != tp->t_state)) 5198 rack_set_state(tp, rack); 5199 so = tp->t_inpcb->inp_socket; 5200 avail = sbavail(&so->so_snd); 5201 out = tp->snd_max - tp->snd_una; 5202 if (out > tp->snd_wnd) { 5203 /* special case, we need a retransmission */ 5204 collapsed_win = 1; 5205 goto need_retran; 5206 } 5207 /* 5208 * Check our send oldest always settings, and if 5209 * there is an oldest to send jump to the need_retran. 5210 */ 5211 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 5212 goto need_retran; 5213 5214 if (avail > out) { 5215 /* New data is available */ 5216 amm = avail - out; 5217 if (amm > ctf_fixed_maxseg(tp)) { 5218 amm = ctf_fixed_maxseg(tp); 5219 if ((amm + out) > tp->snd_wnd) { 5220 /* We are rwnd limited */ 5221 goto need_retran; 5222 } 5223 } else if (amm < ctf_fixed_maxseg(tp)) { 5224 /* not enough to fill a MTU */ 5225 goto need_retran; 5226 } 5227 if (IN_RECOVERY(tp->t_flags)) { 5228 /* Unlikely */ 5229 if (rack->rack_no_prr == 0) { 5230 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 5231 if (out + amm <= tp->snd_wnd) { 5232 rack->r_ctl.rc_prr_sndcnt = amm; 5233 rack_log_to_prr(rack, 4, 0); 5234 } 5235 } else 5236 goto need_retran; 5237 } else { 5238 /* Set the send-new override */ 5239 if (out + amm <= tp->snd_wnd) 5240 rack->r_ctl.rc_tlp_new_data = amm; 5241 else 5242 goto need_retran; 5243 } 5244 rack->r_ctl.rc_tlpsend = NULL; 5245 counter_u64_add(rack_tlp_newdata, 1); 5246 goto send; 5247 } 5248 need_retran: 5249 /* 5250 * Ok we need to arrange the last un-acked segment to be re-sent, or 5251 * optionally the first un-acked segment. 
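	 * When the receive window has collapsed we instead hunt for the last
	 * segment the peer can still accept (one without RACK_RWND_COLLAPSED set).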
5252 */ 5253 if (collapsed_win == 0) { 5254 if (rack_always_send_oldest) 5255 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5256 else { 5257 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5258 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 5259 rsm = rack_find_high_nonack(rack, rsm); 5260 } 5261 } 5262 if (rsm == NULL) { 5263 counter_u64_add(rack_tlp_does_nada, 1); 5264 #ifdef TCP_BLACKBOX 5265 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5266 #endif 5267 goto out; 5268 } 5269 } else { 5270 /* 5271 * We must find the last segment 5272 * that was acceptable by the client. 5273 */ 5274 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5275 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 5276 /* Found one */ 5277 break; 5278 } 5279 } 5280 if (rsm == NULL) { 5281 /* None? if so send the first */ 5282 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5283 if (rsm == NULL) { 5284 counter_u64_add(rack_tlp_does_nada, 1); 5285 #ifdef TCP_BLACKBOX 5286 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5287 #endif 5288 goto out; 5289 } 5290 } 5291 } 5292 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 5293 /* 5294 * We need to split this the last segment in two. 5295 */ 5296 struct rack_sendmap *nrsm; 5297 5298 5299 nrsm = rack_alloc_full_limit(rack); 5300 if (nrsm == NULL) { 5301 /* 5302 * No memory to split, we will just exit and punt 5303 * off to the RXT timer. 5304 */ 5305 counter_u64_add(rack_tlp_does_nada, 1); 5306 goto out; 5307 } 5308 rack_clone_rsm(rack, nrsm, rsm, 5309 (rsm->r_end - ctf_fixed_maxseg(tp))); 5310 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5311 #ifdef INVARIANTS 5312 if (insret != NULL) { 5313 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5314 nrsm, insret, rack, rsm); 5315 } 5316 #endif 5317 if (rsm->r_in_tmap) { 5318 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5319 nrsm->r_in_tmap = 1; 5320 } 5321 rsm->r_flags &= (~RACK_HAS_FIN); 5322 rsm = nrsm; 5323 } 5324 rack->r_ctl.rc_tlpsend = rsm; 5325 send: 5326 rack->r_timer_override = 1; 5327 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5328 return (0); 5329 out: 5330 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5331 return (0); 5332 } 5333 5334 /* 5335 * Delayed ack Timer, here we simply need to setup the 5336 * ACK_NOW flag and remove the DELACK flag. From there 5337 * the output routine will send the ack out. 5338 * 5339 * We only return 1, saying don't proceed, if all timers 5340 * are stopped (destroyed PCB?). 5341 */ 5342 static int 5343 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5344 { 5345 if (tp->t_timers->tt_flags & TT_STOPPED) { 5346 return (1); 5347 } 5348 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 5349 tp->t_flags &= ~TF_DELACK; 5350 tp->t_flags |= TF_ACKNOW; 5351 KMOD_TCPSTAT_INC(tcps_delack); 5352 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5353 return (0); 5354 } 5355 5356 /* 5357 * Persists timer, here we simply send the 5358 * same thing as a keepalive will. 5359 * the one byte send. 5360 * 5361 * We only return 1, saying don't proceed, if all timers 5362 * are stopped (destroyed PCB?). 
5363 */ 5364 static int 5365 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5366 { 5367 struct tcptemp *t_template; 5368 struct inpcb *inp; 5369 int32_t retval = 1; 5370 5371 inp = tp->t_inpcb; 5372 5373 if (tp->t_timers->tt_flags & TT_STOPPED) { 5374 return (1); 5375 } 5376 if (rack->rc_in_persist == 0) 5377 return (0); 5378 if (ctf_progress_timeout_check(tp, false)) { 5379 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5380 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5381 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5382 return (1); 5383 } 5384 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 5385 /* 5386 * Persistence timer into zero window. Force a byte to be output, if 5387 * possible. 5388 */ 5389 KMOD_TCPSTAT_INC(tcps_persisttimeo); 5390 /* 5391 * Hack: if the peer is dead/unreachable, we do not time out if the 5392 * window is closed. After a full backoff, drop the connection if 5393 * the idle time (no responses to probes) reaches the maximum 5394 * backoff that we would use if retransmitting. 5395 */ 5396 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 5397 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 5398 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 5399 KMOD_TCPSTAT_INC(tcps_persistdrop); 5400 retval = 1; 5401 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5402 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5403 goto out; 5404 } 5405 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 5406 tp->snd_una == tp->snd_max) 5407 rack_exit_persist(tp, rack, cts); 5408 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 5409 /* 5410 * If the user has closed the socket then drop a persisting 5411 * connection after a much reduced timeout. 5412 */ 5413 if (tp->t_state > TCPS_CLOSE_WAIT && 5414 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 5415 retval = 1; 5416 KMOD_TCPSTAT_INC(tcps_persistdrop); 5417 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5418 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5419 goto out; 5420 } 5421 t_template = tcpip_maketemplate(rack->rc_inp); 5422 if (t_template) { 5423 /* only set it if we were answered */ 5424 if (rack->forced_ack == 0) { 5425 rack->forced_ack = 1; 5426 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5427 } 5428 tcp_respond(tp, t_template->tt_ipgen, 5429 &t_template->tt_t, (struct mbuf *)NULL, 5430 tp->rcv_nxt, tp->snd_una - 1, 0); 5431 /* This sends an ack */ 5432 if (tp->t_flags & TF_DELACK) 5433 tp->t_flags &= ~TF_DELACK; 5434 free(t_template, M_TEMP); 5435 } 5436 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5437 tp->t_rxtshift++; 5438 out: 5439 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 5440 rack_start_hpts_timer(rack, tp, cts, 5441 0, 0, 0); 5442 return (retval); 5443 } 5444 5445 /* 5446 * If a keepalive goes off, we had no other timers 5447 * happening. We always return 1 here since this 5448 * routine either drops the connection or sends 5449 * out a segment with respond. 5450 */ 5451 static int 5452 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5453 { 5454 struct tcptemp *t_template; 5455 struct inpcb *inp; 5456 5457 if (tp->t_timers->tt_flags & TT_STOPPED) { 5458 return (1); 5459 } 5460 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 5461 inp = tp->t_inpcb; 5462 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 5463 /* 5464 * Keep-alive timer went off; send something or drop connection if 5465 * idle for too long. 
5466 */ 5467 KMOD_TCPSTAT_INC(tcps_keeptimeo); 5468 if (tp->t_state < TCPS_ESTABLISHED) 5469 goto dropit; 5470 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5471 tp->t_state <= TCPS_CLOSING) { 5472 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 5473 goto dropit; 5474 /* 5475 * Send a packet designed to force a response if the peer is 5476 * up and reachable: either an ACK if the connection is 5477 * still alive, or an RST if the peer has closed the 5478 * connection due to timeout or reboot. Using sequence 5479 * number tp->snd_una-1 causes the transmitted zero-length 5480 * segment to lie outside the receive window; by the 5481 * protocol spec, this requires the correspondent TCP to 5482 * respond. 5483 */ 5484 KMOD_TCPSTAT_INC(tcps_keepprobe); 5485 t_template = tcpip_maketemplate(inp); 5486 if (t_template) { 5487 if (rack->forced_ack == 0) { 5488 rack->forced_ack = 1; 5489 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5490 } 5491 tcp_respond(tp, t_template->tt_ipgen, 5492 &t_template->tt_t, (struct mbuf *)NULL, 5493 tp->rcv_nxt, tp->snd_una - 1, 0); 5494 free(t_template, M_TEMP); 5495 } 5496 } 5497 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 5498 return (1); 5499 dropit: 5500 KMOD_TCPSTAT_INC(tcps_keepdrops); 5501 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 5502 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5503 return (1); 5504 } 5505 5506 /* 5507 * Retransmit helper function, clear up all the ack 5508 * flags and take care of important book keeping. 5509 */ 5510 static void 5511 rack_remxt_tmr(struct tcpcb *tp) 5512 { 5513 /* 5514 * The retransmit timer went off, all sack'd blocks must be 5515 * un-acked. 5516 */ 5517 struct rack_sendmap *rsm, *trsm = NULL; 5518 struct tcp_rack *rack; 5519 int32_t cnt = 0; 5520 5521 rack = (struct tcp_rack *)tp->t_fb_ptr; 5522 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 5523 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 5524 if (rack->r_state && (rack->r_state != tp->t_state)) 5525 rack_set_state(tp, rack); 5526 /* 5527 * Ideally we would like to be able to 5528 * mark SACK-PASS on anything not acked here. 5529 * However, if we do that we would burst out 5530 * all that data 1ms apart. This would be unwise, 5531 * so for now we will just let the normal rxt timer 5532 * and tlp timer take care of it. 
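	 * Here we just walk the whole map, clear the ACKED/SACK state and
	 * make sure every entry is back on the transmit list.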
5533 */ 5534 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5535 if (rsm->r_flags & RACK_ACKED) { 5536 cnt++; 5537 rsm->r_dupack = 0; 5538 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5539 if (rsm->r_in_tmap == 0) { 5540 /* We must re-add it back to the tlist */ 5541 if (trsm == NULL) { 5542 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5543 } else { 5544 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 5545 } 5546 rsm->r_in_tmap = 1; 5547 } 5548 } 5549 trsm = rsm; 5550 if (rsm->r_flags & RACK_ACKED) 5551 rsm->r_flags |= RACK_WAS_ACKED; 5552 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 5553 } 5554 /* Clear the count (we just un-acked them) */ 5555 rack->r_ctl.rc_sacked = 0; 5556 rack->r_ctl.rc_agg_delayed = 0; 5557 rack->r_early = 0; 5558 rack->r_ctl.rc_agg_early = 0; 5559 rack->r_late = 0; 5560 /* Clear the tlp rtx mark */ 5561 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5562 rack->r_ctl.rc_prr_sndcnt = 0; 5563 rack_log_to_prr(rack, 6, 0); 5564 rack->r_timer_override = 1; 5565 } 5566 5567 static void 5568 rack_cc_conn_init(struct tcpcb *tp) 5569 { 5570 struct tcp_rack *rack; 5571 5572 5573 rack = (struct tcp_rack *)tp->t_fb_ptr; 5574 cc_conn_init(tp); 5575 /* 5576 * We want a chance to stay in slowstart as 5577 * we create a connection. TCP spec says that 5578 * initially ssthresh is infinite. For our 5579 * purposes that is the snd_wnd. 5580 */ 5581 if (tp->snd_ssthresh < tp->snd_wnd) { 5582 tp->snd_ssthresh = tp->snd_wnd; 5583 } 5584 /* 5585 * We also want to assure a IW worth of 5586 * data can get inflight. 5587 */ 5588 if (rc_init_window(rack) < tp->snd_cwnd) 5589 tp->snd_cwnd = rc_init_window(rack); 5590 } 5591 5592 /* 5593 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 5594 * we will setup to retransmit the lowest seq number outstanding. 5595 */ 5596 static int 5597 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5598 { 5599 int32_t rexmt; 5600 struct inpcb *inp; 5601 int32_t retval = 0; 5602 bool isipv6; 5603 5604 inp = tp->t_inpcb; 5605 if (tp->t_timers->tt_flags & TT_STOPPED) { 5606 return (1); 5607 } 5608 if (ctf_progress_timeout_check(tp, false)) { 5609 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5610 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5611 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5612 return (1); 5613 } 5614 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 5615 if (TCPS_HAVEESTABLISHED(tp->t_state) && 5616 (tp->snd_una == tp->snd_max)) { 5617 /* Nothing outstanding .. nothing to do */ 5618 return (0); 5619 } 5620 /* 5621 * Retransmission timer went off. Message has not been acked within 5622 * retransmit interval. Back off to a longer retransmit interval 5623 * and retransmit one segment. 5624 */ 5625 rack_remxt_tmr(tp); 5626 if ((rack->r_ctl.rc_resend == NULL) || 5627 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 5628 /* 5629 * If the rwnd collapsed on 5630 * the one we are retransmitting 5631 * it does not count against the 5632 * rxt count. 5633 */ 5634 tp->t_rxtshift++; 5635 } 5636 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5637 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5638 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5639 retval = 1; 5640 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5641 tcp_set_inp_to_drop(rack->rc_inp, 5642 (tp->t_softerror ? 
(uint16_t) tp->t_softerror : ETIMEDOUT)); 5643 goto out; 5644 } 5645 if (tp->t_state == TCPS_SYN_SENT) { 5646 /* 5647 * If the SYN was retransmitted, indicate CWND to be limited 5648 * to 1 segment in cc_conn_init(). 5649 */ 5650 tp->snd_cwnd = 1; 5651 } else if (tp->t_rxtshift == 1) { 5652 /* 5653 * first retransmit; record ssthresh and cwnd so they can be 5654 * recovered if this turns out to be a "bad" retransmit. A 5655 * retransmit is considered "bad" if an ACK for this segment 5656 * is received within RTT/2 interval; the assumption here is 5657 * that the ACK was already in flight. See "On Estimating 5658 * End-to-End Network Path Properties" by Allman and Paxson 5659 * for more details. 5660 */ 5661 tp->snd_cwnd_prev = tp->snd_cwnd; 5662 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5663 tp->snd_recover_prev = tp->snd_recover; 5664 if (IN_FASTRECOVERY(tp->t_flags)) 5665 tp->t_flags |= TF_WASFRECOVERY; 5666 else 5667 tp->t_flags &= ~TF_WASFRECOVERY; 5668 if (IN_CONGRECOVERY(tp->t_flags)) 5669 tp->t_flags |= TF_WASCRECOVERY; 5670 else 5671 tp->t_flags &= ~TF_WASCRECOVERY; 5672 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5673 tp->t_flags |= TF_PREVVALID; 5674 } else 5675 tp->t_flags &= ~TF_PREVVALID; 5676 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5677 if ((tp->t_state == TCPS_SYN_SENT) || 5678 (tp->t_state == TCPS_SYN_RECEIVED)) 5679 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 5680 else 5681 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5682 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5683 max(MSEC_2_TICKS(rack_rto_min), rexmt), 5684 MSEC_2_TICKS(rack_rto_max)); 5685 /* 5686 * We enter the path for PLMTUD if connection is established or, if 5687 * connection is FIN_WAIT_1 status, reason for the last is that if 5688 * amount of data we send is very small, we could send it in couple 5689 * of packets and process straight to FIN. In that case we won't 5690 * catch ESTABLISHED state. 5691 */ 5692 #ifdef INET6 5693 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5694 #else 5695 isipv6 = false; 5696 #endif 5697 if (((V_tcp_pmtud_blackhole_detect == 1) || 5698 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5699 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5700 ((tp->t_state == TCPS_ESTABLISHED) || 5701 (tp->t_state == TCPS_FIN_WAIT_1))) { 5702 5703 /* 5704 * Idea here is that at each stage of mtu probe (usually, 5705 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5706 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5707 * should take care of that. 5708 */ 5709 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5710 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5711 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5712 tp->t_rxtshift % 2 == 0)) { 5713 /* 5714 * Enter Path MTU Black-hole Detection mechanism: - 5715 * Disable Path MTU Discovery (IP "DF" bit). - 5716 * Reduce MTU to lower value than what we negotiated 5717 * with peer. 5718 */ 5719 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5720 /* Record that we may have found a black hole. */ 5721 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5722 /* Keep track of previous MSS. */ 5723 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5724 } 5725 5726 /* 5727 * Reduce the MSS to blackhole value or to the 5728 * default in an attempt to retransmit. 5729 */ 5730 #ifdef INET6 5731 if (isipv6 && 5732 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5733 /* Use the sysctl tuneable blackhole MSS. 
*/ 5734 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5735 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5736 } else if (isipv6) { 5737 /* Use the default MSS. */ 5738 tp->t_maxseg = V_tcp_v6mssdflt; 5739 /* 5740 * Disable Path MTU Discovery when we switch 5741 * to minmss. 5742 */ 5743 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5744 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5745 } 5746 #endif 5747 #if defined(INET6) && defined(INET) 5748 else 5749 #endif 5750 #ifdef INET 5751 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5752 /* Use the sysctl tuneable blackhole MSS. */ 5753 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5754 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5755 } else { 5756 /* Use the default MSS. */ 5757 tp->t_maxseg = V_tcp_mssdflt; 5758 /* 5759 * Disable Path MTU Discovery when we switch 5760 * to minmss. 5761 */ 5762 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5763 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5764 } 5765 #endif 5766 } else { 5767 /* 5768 * If further retransmissions are still unsuccessful 5769 * with a lowered MTU, maybe this isn't a blackhole 5770 * and we restore the previous MSS and blackhole 5771 * detection flags. The limit '6' is determined by 5772 * giving each probe stage (1448, 1188, 524) 2 5773 * chances to recover. 5774 */ 5775 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5776 (tp->t_rxtshift >= 6)) { 5777 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5778 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5779 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5780 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5781 } 5782 } 5783 } 5784 /* 5785 * If we backed off this far, our srtt estimate is probably bogus. 5786 * Clobber it so we'll take the next rtt measurement as our srtt; 5787 * move the current srtt into rttvar to keep the current retransmit 5788 * times until then. 5789 */ 5790 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5791 #ifdef INET6 5792 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 5793 in6_losing(tp->t_inpcb); 5794 else 5795 #endif 5796 in_losing(tp->t_inpcb); 5797 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5798 tp->t_srtt = 0; 5799 } 5800 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5801 tp->snd_recover = tp->snd_max; 5802 tp->t_flags |= TF_ACKNOW; 5803 tp->t_rtttime = 0; 5804 rack_cong_signal(tp, NULL, CC_RTO); 5805 out: 5806 return (retval); 5807 } 5808 5809 static int 5810 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 5811 { 5812 int32_t ret = 0; 5813 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5814 5815 if (timers == 0) { 5816 return (0); 5817 } 5818 if (tp->t_state == TCPS_LISTEN) { 5819 /* no timers on listen sockets */ 5820 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5821 return (0); 5822 return (1); 5823 } 5824 if ((timers & PACE_TMR_RACK) && 5825 rack->rc_on_min_to) { 5826 /* 5827 * For the rack timer when we 5828 * are on a min-timeout (which means rrr_conf = 3) 5829 * we don't want to check the timer. It may 5830 * be going off for a pace and thats ok we 5831 * want to send the retransmit (if its ready). 5832 * 5833 * If its on a normal rack timer (non-min) then 5834 * we will check if its expired. 
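		 * (the goto below only bypasses the expiration check; the
		 * timer itself is still processed).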
5835 */ 5836 goto skip_time_check; 5837 } 5838 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5839 uint32_t left; 5840 5841 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5842 ret = -1; 5843 rack_log_to_processing(rack, cts, ret, 0); 5844 return (0); 5845 } 5846 if (hpts_calling == 0) { 5847 /* 5848 * A user send or queued mbuf (sack) has called us? We 5849 * return 0 and let the pacing guards 5850 * deal with it if they should or 5851 * should not cause a send. 5852 */ 5853 ret = -2; 5854 rack_log_to_processing(rack, cts, ret, 0); 5855 return (0); 5856 } 5857 /* 5858 * Ok our timer went off early and we are not paced false 5859 * alarm, go back to sleep. 5860 */ 5861 ret = -3; 5862 left = rack->r_ctl.rc_timer_exp - cts; 5863 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 5864 rack_log_to_processing(rack, cts, ret, left); 5865 return (1); 5866 } 5867 skip_time_check: 5868 rack->rc_tmr_stopped = 0; 5869 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5870 if (timers & PACE_TMR_DELACK) { 5871 ret = rack_timeout_delack(tp, rack, cts); 5872 } else if (timers & PACE_TMR_RACK) { 5873 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5874 ret = rack_timeout_rack(tp, rack, cts); 5875 } else if (timers & PACE_TMR_TLP) { 5876 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5877 ret = rack_timeout_tlp(tp, rack, cts); 5878 } else if (timers & PACE_TMR_RXT) { 5879 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5880 ret = rack_timeout_rxt(tp, rack, cts); 5881 } else if (timers & PACE_TMR_PERSIT) { 5882 ret = rack_timeout_persist(tp, rack, cts); 5883 } else if (timers & PACE_TMR_KEEP) { 5884 ret = rack_timeout_keepalive(tp, rack, cts); 5885 } 5886 rack_log_to_processing(rack, cts, ret, timers); 5887 return (ret); 5888 } 5889 5890 static void 5891 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 5892 { 5893 struct timeval tv; 5894 uint32_t us_cts, flags_on_entry; 5895 uint8_t hpts_removed = 0; 5896 5897 5898 flags_on_entry = rack->r_ctl.rc_hpts_flags; 5899 us_cts = tcp_get_usecs(&tv); 5900 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 5901 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 5902 ((tp->snd_max - tp->snd_una) == 0))) { 5903 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5904 hpts_removed = 1; 5905 /* If we were not delayed cancel out the flag. */ 5906 if ((tp->snd_max - tp->snd_una) == 0) 5907 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5908 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5909 } 5910 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5911 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5912 if (rack->rc_inp->inp_in_hpts && 5913 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 5914 /* 5915 * Canceling timer's when we have no output being 5916 * paced. We also must remove ourselves from the 5917 * hpts. 
5918 			 */
5919 			tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
5920 			hpts_removed = 1;
5921 		}
5922 		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
5923 	}
5924 	if (hpts_removed == 0)
5925 		rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
5926 }
5927 
5928 static void
5929 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
5930 {
5931 	return;
5932 }
5933 
5934 static int
5935 rack_stopall(struct tcpcb *tp)
5936 {
5937 	struct tcp_rack *rack;
5938 	rack = (struct tcp_rack *)tp->t_fb_ptr;
5939 	rack->t_timers_stopped = 1;
5940 	return (0);
5941 }
5942 
5943 static void
5944 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
5945 {
5946 	return;
5947 }
5948 
5949 static int
5950 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
5951 {
5952 	return (0);
5953 }
5954 
5955 static void
5956 rack_stop_all_timers(struct tcpcb *tp)
5957 {
5958 	struct tcp_rack *rack;
5959 
5960 	/*
5961 	 * Assure no timers are running.
5962 	 */
5963 	if (tcp_timer_active(tp, TT_PERSIST)) {
5964 		/* We enter in persists, set the flag appropriately */
5965 		rack = (struct tcp_rack *)tp->t_fb_ptr;
5966 		rack->rc_in_persist = 1;
5967 	}
5968 	tcp_timer_suspend(tp, TT_PERSIST);
5969 	tcp_timer_suspend(tp, TT_REXMT);
5970 	tcp_timer_suspend(tp, TT_KEEP);
5971 	tcp_timer_suspend(tp, TT_DELACK);
5972 }
5973 
5974 static void
5975 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
5976     struct rack_sendmap *rsm, uint32_t ts)
5977 {
5978 	int32_t idx;
5979 
5980 	rsm->r_rtr_cnt++;
5981 	rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
5982 	rsm->r_dupack = 0;
5983 	if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
5984 		rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
5985 		rsm->r_flags |= RACK_OVERMAX;
5986 	}
5987 	if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
5988 		rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
5989 		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
5990 	}
5991 	idx = rsm->r_rtr_cnt - 1;
5992 	rsm->r_tim_lastsent[idx] = ts;
5993 	if (rsm->r_flags & RACK_ACKED) {
5994 		/* Probably MTU discovery messing with us */
5995 		rsm->r_flags &= ~RACK_ACKED;
5996 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
5997 	}
5998 	if (rsm->r_in_tmap) {
5999 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6000 		rsm->r_in_tmap = 0;
6001 	}
6002 	TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6003 	rsm->r_in_tmap = 1;
6004 	if (rsm->r_flags & RACK_SACK_PASSED) {
6005 		/* We have retransmitted due to the SACK pass */
6006 		rsm->r_flags &= ~RACK_SACK_PASSED;
6007 		rsm->r_flags |= RACK_WAS_SACKPASS;
6008 	}
6009 }
6010 
6011 
6012 static uint32_t
6013 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
6014     struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp)
6015 {
6016 	/*
6017 	 * We (re-)transmitted starting at rsm->r_start for some length
6018 	 * (possibly less than r_end).
6019 	 */
6020 	struct rack_sendmap *nrsm, *insret;
6021 	uint32_t c_end;
6022 	int32_t len;
6023 
6024 	len = *lenp;
6025 	c_end = rsm->r_start + len;
6026 	if (SEQ_GEQ(c_end, rsm->r_end)) {
6027 		/*
6028 		 * We retransmitted the whole piece or more than the whole
6029 		 * slopping into the next rsm.
6030 		 */
6031 		rack_update_rsm(tp, rack, rsm, ts);
6032 		if (c_end == rsm->r_end) {
6033 			*lenp = 0;
6034 			return (0);
6035 		} else {
6036 			int32_t act_len;
6037 
6038 			/* Hangs over the end, return what's left */
6039 			act_len = rsm->r_end - rsm->r_start;
6040 			*lenp = (len - act_len);
6041 			return (rsm->r_end);
6042 		}
6043 		/* We don't get out of this block.
*/ 6044 } 6045 /* 6046 * Here we retransmitted less than the whole thing which means we 6047 * have to split this into what was transmitted and what was not. 6048 */ 6049 nrsm = rack_alloc_full_limit(rack); 6050 if (nrsm == NULL) { 6051 /* 6052 * We can't get memory, so lets not proceed. 6053 */ 6054 *lenp = 0; 6055 return (0); 6056 } 6057 /* 6058 * So here we are going to take the original rsm and make it what we 6059 * retransmitted. nrsm will be the tail portion we did not 6060 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 6061 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 6062 * 1, 6 and the new piece will be 6, 11. 6063 */ 6064 rack_clone_rsm(rack, nrsm, rsm, c_end); 6065 nrsm->r_dupack = 0; 6066 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 6067 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6068 #ifdef INVARIANTS 6069 if (insret != NULL) { 6070 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6071 nrsm, insret, rack, rsm); 6072 } 6073 #endif 6074 if (rsm->r_in_tmap) { 6075 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6076 nrsm->r_in_tmap = 1; 6077 } 6078 rsm->r_flags &= (~RACK_HAS_FIN); 6079 rack_update_rsm(tp, rack, rsm, ts); 6080 *lenp = 0; 6081 return (0); 6082 } 6083 6084 6085 static void 6086 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 6087 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 6088 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) 6089 { 6090 struct tcp_rack *rack; 6091 struct rack_sendmap *rsm, *nrsm, *insret, fe; 6092 register uint32_t snd_max, snd_una; 6093 6094 /* 6095 * Add to the RACK log of packets in flight or retransmitted. If 6096 * there is a TS option we will use the TS echoed, if not we will 6097 * grab a TS. 6098 * 6099 * Retransmissions will increment the count and move the ts to its 6100 * proper place. Note that if options do not include TS's then we 6101 * won't be able to effectively use the ACK for an RTT on a retran. 6102 * 6103 * Notes about r_start and r_end. Lets consider a send starting at 6104 * sequence 1 for 10 bytes. In such an example the r_start would be 6105 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 6106 * This means that r_end is actually the first sequence for the next 6107 * slot (11). 6108 * 6109 */ 6110 /* 6111 * If err is set what do we do XXXrrs? should we not add the thing? 6112 * -- i.e. return if err != 0 or should we pretend we sent it? -- 6113 * i.e. proceed with add ** do this for now. 6114 */ 6115 INP_WLOCK_ASSERT(tp->t_inpcb); 6116 if (err) 6117 /* 6118 * We don't log errors -- we could but snd_max does not 6119 * advance in this case either. 6120 */ 6121 return; 6122 6123 if (th_flags & TH_RST) { 6124 /* 6125 * We don't log resets and we return immediately from 6126 * sending 6127 */ 6128 return; 6129 } 6130 rack = (struct tcp_rack *)tp->t_fb_ptr; 6131 snd_una = tp->snd_una; 6132 if (SEQ_LEQ((seq_out + len), snd_una)) { 6133 /* Are sending an old segment to induce an ack (keep-alive)? */ 6134 return; 6135 } 6136 if (SEQ_LT(seq_out, snd_una)) { 6137 /* huh? should we panic? */ 6138 uint32_t end; 6139 6140 end = seq_out + len; 6141 seq_out = snd_una; 6142 if (SEQ_GEQ(end, seq_out)) 6143 len = end - seq_out; 6144 else 6145 len = 0; 6146 } 6147 snd_max = tp->snd_max; 6148 if (th_flags & (TH_SYN | TH_FIN)) { 6149 /* 6150 * The call to rack_log_output is made before bumping 6151 * snd_max. 
This means we can record one extra byte on a SYN 6152 * or FIN if seq_out is adding more on and a FIN is present 6153 * (and we are not resending). 6154 */ 6155 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 6156 len++; 6157 if (th_flags & TH_FIN) 6158 len++; 6159 if (SEQ_LT(snd_max, tp->snd_nxt)) { 6160 /* 6161 * The add/update as not been done for the FIN/SYN 6162 * yet. 6163 */ 6164 snd_max = tp->snd_nxt; 6165 } 6166 } 6167 if (len == 0) { 6168 /* We don't log zero window probes */ 6169 return; 6170 } 6171 rack->r_ctl.rc_time_last_sent = ts; 6172 if (IN_RECOVERY(tp->t_flags)) { 6173 rack->r_ctl.rc_prr_out += len; 6174 } 6175 /* First question is it a retransmission or new? */ 6176 if (seq_out == snd_max) { 6177 /* Its new */ 6178 again: 6179 rsm = rack_alloc(rack); 6180 if (rsm == NULL) { 6181 /* 6182 * Hmm out of memory and the tcb got destroyed while 6183 * we tried to wait. 6184 */ 6185 return; 6186 } 6187 if (th_flags & TH_FIN) { 6188 rsm->r_flags = RACK_HAS_FIN; 6189 } else { 6190 rsm->r_flags = 0; 6191 } 6192 rsm->r_tim_lastsent[0] = ts; 6193 rsm->r_rtr_cnt = 1; 6194 rsm->r_rtr_bytes = 0; 6195 rsm->usec_orig_send = us_cts; 6196 if (th_flags & TH_SYN) { 6197 /* The data space is one beyond snd_una */ 6198 rsm->r_flags |= RACK_HAS_SIN; 6199 rsm->r_start = seq_out + 1; 6200 rsm->r_end = rsm->r_start + (len - 1); 6201 } else { 6202 /* Normal case */ 6203 rsm->r_start = seq_out; 6204 rsm->r_end = rsm->r_start + len; 6205 } 6206 rsm->r_dupack = 0; 6207 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6208 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6209 #ifdef INVARIANTS 6210 if (insret != NULL) { 6211 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6212 nrsm, insret, rack, rsm); 6213 } 6214 #endif 6215 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6216 rsm->r_in_tmap = 1; 6217 /* 6218 * Special case detection, is there just a single 6219 * packet outstanding when we are not in recovery? 6220 * 6221 * If this is true mark it so. 6222 */ 6223 if ((IN_RECOVERY(tp->t_flags) == 0) && 6224 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 6225 struct rack_sendmap *prsm; 6226 6227 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6228 if (prsm) 6229 prsm->r_one_out_nr = 1; 6230 } 6231 return; 6232 } 6233 /* 6234 * If we reach here its a retransmission and we need to find it. 6235 */ 6236 memset(&fe, 0, sizeof(fe)); 6237 more: 6238 if (hintrsm && (hintrsm->r_start == seq_out)) { 6239 rsm = hintrsm; 6240 hintrsm = NULL; 6241 } else { 6242 /* No hints sorry */ 6243 rsm = NULL; 6244 } 6245 if ((rsm) && (rsm->r_start == seq_out)) { 6246 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6247 if (len == 0) { 6248 return; 6249 } else { 6250 goto more; 6251 } 6252 } 6253 /* Ok it was not the last pointer go through it the hard way. 
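	 * That means an rb-tree lookup on r_start, splitting a map entry
	 * if seq_out lands inside it.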
*/ 6254 refind: 6255 fe.r_start = seq_out; 6256 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6257 if (rsm) { 6258 if (rsm->r_start == seq_out) { 6259 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6260 if (len == 0) { 6261 return; 6262 } else { 6263 goto refind; 6264 } 6265 } 6266 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6267 /* Transmitted within this piece */ 6268 /* 6269 * Ok we must split off the front and then let the 6270 * update do the rest 6271 */ 6272 nrsm = rack_alloc_full_limit(rack); 6273 if (nrsm == NULL) { 6274 rack_update_rsm(tp, rack, rsm, ts); 6275 return; 6276 } 6277 /* 6278 * copy rsm to nrsm and then trim the front of rsm 6279 * to not include this part. 6280 */ 6281 rack_clone_rsm(rack, nrsm, rsm, seq_out); 6282 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6283 #ifdef INVARIANTS 6284 if (insret != NULL) { 6285 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6286 nrsm, insret, rack, rsm); 6287 } 6288 #endif 6289 if (rsm->r_in_tmap) { 6290 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6291 nrsm->r_in_tmap = 1; 6292 } 6293 rsm->r_flags &= (~RACK_HAS_FIN); 6294 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 6295 if (len == 0) { 6296 return; 6297 } else if (len > 0) 6298 goto refind; 6299 } 6300 } 6301 /* 6302 * Hmm not found in map did they retransmit both old and on into the 6303 * new? 6304 */ 6305 if (seq_out == tp->snd_max) { 6306 goto again; 6307 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6308 #ifdef INVARIANTS 6309 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6310 seq_out, len, tp->snd_una, tp->snd_max); 6311 printf("Starting Dump of all rack entries\n"); 6312 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6313 printf("rsm:%p start:%u end:%u\n", 6314 rsm, rsm->r_start, rsm->r_end); 6315 } 6316 printf("Dump complete\n"); 6317 panic("seq_out not found rack:%p tp:%p", 6318 rack, tp); 6319 #endif 6320 } else { 6321 #ifdef INVARIANTS 6322 /* 6323 * Hmm beyond sndmax? (only if we are using the new rtt-pack 6324 * flag) 6325 */ 6326 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6327 seq_out, len, tp->snd_max, tp); 6328 #endif 6329 } 6330 } 6331 6332 /* 6333 * Record one of the RTT updates from an ack into 6334 * our sample structure. 6335 */ 6336 6337 static void 6338 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 6339 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 6340 { 6341 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6342 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 6343 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 6344 } 6345 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6346 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 6347 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 6348 } 6349 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 6350 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 6351 rack->r_ctl.rc_gp_lowrtt = us_rtt; 6352 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 6353 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 6354 } 6355 if ((confidence == 1) && 6356 ((rsm == NULL) || 6357 (rsm->r_just_ret) || 6358 (rsm->r_one_out_nr && 6359 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 6360 /* 6361 * If the rsm had a just return 6362 * hit it then we can't trust the 6363 * rtt measurement for buffer deterimination 6364 * Note that a confidence of 2, indicates 6365 * SACK'd which overrides the r_just_ret or 6366 * the r_one_out_nr. 
If it was a CUM-ACK and 6367 * we had only two outstanding, but get an 6368 * ack for only 1. Then that also lowers our 6369 * confidence. 6370 */ 6371 confidence = 0; 6372 } 6373 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6374 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 6375 if (rack->r_ctl.rack_rs.confidence == 0) { 6376 /* 6377 * We take anything with no current confidence 6378 * saved. 6379 */ 6380 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6381 rack->r_ctl.rack_rs.confidence = confidence; 6382 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6383 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 6384 /* 6385 * Once we have a confident number, 6386 * we can update it with a smaller 6387 * value since this confident number 6388 * may include the DSACK time until 6389 * the next segment (the second one) arrived. 6390 */ 6391 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6392 rack->r_ctl.rack_rs.confidence = confidence; 6393 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6394 } 6395 6396 } 6397 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 6398 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 6399 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 6400 rack->r_ctl.rack_rs.rs_rtt_cnt++; 6401 } 6402 6403 /* 6404 * Collect new round-trip time estimate 6405 * and update averages and current timeout. 6406 */ 6407 static void 6408 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 6409 { 6410 int32_t delta; 6411 uint32_t o_srtt, o_var; 6412 int32_t hrtt_up = 0; 6413 int32_t rtt; 6414 6415 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 6416 /* No valid sample */ 6417 return; 6418 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 6419 /* We are to use the lowest RTT seen in a single ack */ 6420 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 6421 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 6422 /* We are to use the highest RTT seen in a single ack */ 6423 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 6424 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 6425 /* We are to use the average RTT seen in a single ack */ 6426 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 6427 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 6428 } else { 6429 #ifdef INVARIANTS 6430 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 6431 #endif 6432 return; 6433 } 6434 if (rtt == 0) 6435 rtt = 1; 6436 if (rack->rc_gp_rtt_set == 0) { 6437 /* 6438 * With no RTT we have to accept 6439 * even one we are not confident of. 6440 */ 6441 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 6442 rack->rc_gp_rtt_set = 1; 6443 } else if (rack->r_ctl.rack_rs.confidence) { 6444 /* update the running gp srtt */ 6445 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 6446 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 6447 } 6448 if (rack->r_ctl.rack_rs.confidence) { 6449 /* 6450 * record the low and high for highly buffered path computation, 6451 * we only do this if we are confident (not a retransmission). 6452 */ 6453 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 6454 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6455 hrtt_up = 1; 6456 } 6457 if (rack->rc_highly_buffered == 0) { 6458 /* 6459 * Currently once we declare a path has 6460 * highly buffered there is no going 6461 * back, which may be a problem... 
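			 * We only flag it once the highest/lowest us-rtt
			 * ratio exceeds rack_hbp_thresh.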
6462 			 */
6463 			if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
6464 				rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
6465 				    rack->r_ctl.rc_highest_us_rtt,
6466 				    rack->r_ctl.rc_lowest_us_rtt,
6467 				    RACK_RTTS_SEEHBP);
6468 				rack->rc_highly_buffered = 1;
6469 			}
6470 		}
6471 	}
6472 	if ((rack->r_ctl.rack_rs.confidence) ||
6473 	    (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
6474 		/*
6475 		 * If we are highly confident of it <or> it was
6476 		 * never retransmitted we accept it as the last us_rtt.
6477 		 */
6478 		rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
6479 		/* The lowest rtt can be set if it was not retransmitted */
6480 		if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
6481 			rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
6482 			if (rack->r_ctl.rc_lowest_us_rtt == 0)
6483 				rack->r_ctl.rc_lowest_us_rtt = 1;
6484 		}
6485 	}
6486 	rack_log_rtt_sample(rack, rtt);
6487 	o_srtt = tp->t_srtt;
6488 	o_var = tp->t_rttvar;
6489 	rack = (struct tcp_rack *)tp->t_fb_ptr;
6490 	if (tp->t_srtt != 0) {
6491 		/*
6492 		 * srtt is stored as fixed point with 5 bits after the
6493 		 * binary point (i.e., scaled by 32). The following magic is
6494 		 * equivalent to the smoothing algorithm in rfc793 with an
6495 		 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
6496 		 * Adjust rtt to origin 0.
6497 		 */
6498 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
6499 		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
6500 
6501 		tp->t_srtt += delta;
6502 		if (tp->t_srtt <= 0)
6503 			tp->t_srtt = 1;
6504 
6505 		/*
6506 		 * We accumulate a smoothed rtt variance (actually, a
6507 		 * smoothed mean difference), then set the retransmit timer
6508 		 * to smoothed rtt + 4 times the smoothed variance. rttvar
6509 		 * is stored as fixed point with 4 bits after the binary
6510 		 * point (scaled by 16). The following is equivalent to
6511 		 * rfc793 smoothing with an alpha of .75 (rttvar =
6512 		 * rttvar*3/4 + |delta| / 4). This replaces rfc793's
6513 		 * wired-in beta.
6514 		 */
6515 		if (delta < 0)
6516 			delta = -delta;
6517 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
6518 		tp->t_rttvar += delta;
6519 		if (tp->t_rttvar <= 0)
6520 			tp->t_rttvar = 1;
6521 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
6522 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6523 	} else {
6524 		/*
6525 		 * No rtt measurement yet - use the unsmoothed rtt. Set the
6526 		 * variance to half the rtt (so our first retransmit happens
6527 		 * at 3*rtt).
6528 		 */
6529 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
6530 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
6531 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6532 	}
6533 	KMOD_TCPSTAT_INC(tcps_rttupdated);
6534 	tp->t_rttupdated++;
6535 #ifdef STATS
6536 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
6537 #endif
6538 	tp->t_rxtshift = 0;
6539 
6540 	/*
6541 	 * The retransmit should happen at rtt + 4 * rttvar. Because of the
6542 	 * way we do the smoothing, srtt and rttvar will each average +1/2
6543 	 * tick of bias. When we compute the retransmit timer, we want 1/2
6544 	 * tick of rounding and 1 extra tick because of +-1/2 tick
6545 	 * uncertainty in the firing of the timer. The bias will give us
6546 	 * exactly the 1.5 tick we need. But, because the bias is
6547 	 * statistical, we have to test that we don't drop below the minimum
6548 	 * feasible timer (which is 2 ticks).
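	 * In unscaled terms the value set below is srtt + 4 * rttvar,
	 * clamped to lie between max(rack_rto_min, rtt + 2) and rack_rto_max.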
6549 	 */
6550 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
6551 	    max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
6552 	tp->t_softerror = 0;
6553 }
6554 
6555 static void
6556 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
6557     uint32_t t, uint32_t cts)
6558 {
6559 	/*
6560 	 * For this RSM, we acknowledged the data from a previous
6561 	 * transmission, not the last one we made. This means we did a false
6562 	 * retransmit.
6563 	 */
6564 	struct tcp_rack *rack;
6565 
6566 	if (rsm->r_flags & RACK_HAS_FIN) {
6567 		/*
6568 		 * The FIN is often sent multiple times when we
6569 		 * have everything outstanding ack'd. We ignore this case
6570 		 * since it's over now.
6571 		 */
6572 		return;
6573 	}
6574 	if (rsm->r_flags & RACK_TLP) {
6575 		/*
6576 		 * We expect TLP's to have this occur.
6577 		 */
6578 		return;
6579 	}
6580 	rack = (struct tcp_rack *)tp->t_fb_ptr;
6581 	/* should we undo cc changes and exit recovery? */
6582 	if (IN_RECOVERY(tp->t_flags)) {
6583 		if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
6584 			/*
6585 			 * Undo what we ratcheted down and exit recovery if
6586 			 * possible
6587 			 */
6588 			EXIT_RECOVERY(tp->t_flags);
6589 			tp->snd_recover = tp->snd_una;
6590 			if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
6591 				tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
6592 			if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
6593 				tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
6594 		}
6595 	}
6596 	if (rsm->r_flags & RACK_WAS_SACKPASS) {
6597 		/*
6598 		 * We retransmitted based on a sack and the earlier
6599 		 * retransmission ack'd it - re-ordering is occurring.
6600 		 */
6601 		counter_u64_add(rack_reorder_seen, 1);
6602 		rack->r_ctl.rc_reorder_ts = cts;
6603 	}
6604 	counter_u64_add(rack_badfr, 1);
6605 	counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
6606 }
6607 
6608 static void
6609 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
6610 {
6611 	/*
6612 	 * Apply the inbound us-rtt to the filter at time us_cts.
6613 	 */
6614 	uint32_t old_rtt;
6615 
6616 	old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
6617 	apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
6618 	    us_rtt, us_cts);
6619 	if (rack->r_ctl.last_pacing_time &&
6620 	    rack->rc_gp_dyn_mul &&
6621 	    (rack->r_ctl.last_pacing_time > us_rtt))
6622 		rack->pacing_longer_than_rtt = 1;
6623 	else
6624 		rack->pacing_longer_than_rtt = 0;
6625 	if (old_rtt > us_rtt) {
6626 		/* We just hit a new lower rtt time */
6627 		rack_log_rtt_shrinks(rack, us_cts, old_rtt,
6628 		    __LINE__, RACK_RTTS_NEWRTT);
6629 		/*
6630 		 * Only count it if it's lower than what we saw within our
6631 		 * calculated range.
6632 		 */
6633 		if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
6634 			if (rack_probertt_lower_within &&
6635 			    rack->rc_gp_dyn_mul &&
6636 			    (rack->use_fixed_rate == 0) &&
6637 			    (rack->rc_always_pace)) {
6638 				/*
6639 				 * We are seeing a new lower rtt very close
6640 				 * to the time that we would have entered probe-rtt.
6641 				 * This is probably due to the fact that a peer flow
6642 				 * has entered probe-rtt. Let's go in now too.
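				 * (provided we are already within the last
				 * rack_probertt_lower_within percent of the normal
				 * interval between probe-rtts).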
6643 */ 6644 uint32_t val; 6645 6646 val = rack_probertt_lower_within * rack_time_between_probertt; 6647 val /= 100; 6648 if ((rack->in_probe_rtt == 0) && 6649 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 6650 rack_enter_probertt(rack, us_cts); 6651 } 6652 } 6653 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6654 } 6655 } 6656 } 6657 6658 static int 6659 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 6660 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 6661 { 6662 int32_t i; 6663 uint32_t t, len_acked; 6664 6665 if ((rsm->r_flags & RACK_ACKED) || 6666 (rsm->r_flags & RACK_WAS_ACKED)) 6667 /* Already done */ 6668 return (0); 6669 6670 if (ack_type == CUM_ACKED) { 6671 if (SEQ_GT(th_ack, rsm->r_end)) 6672 len_acked = rsm->r_end - rsm->r_start; 6673 else 6674 len_acked = th_ack - rsm->r_start; 6675 } else 6676 len_acked = rsm->r_end - rsm->r_start; 6677 if (rsm->r_rtr_cnt == 1) { 6678 uint32_t us_rtt; 6679 6680 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6681 if ((int)t <= 0) 6682 t = 1; 6683 if (!tp->t_rttlow || tp->t_rttlow > t) 6684 tp->t_rttlow = t; 6685 if (!rack->r_ctl.rc_rack_min_rtt || 6686 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6687 rack->r_ctl.rc_rack_min_rtt = t; 6688 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6689 rack->r_ctl.rc_rack_min_rtt = 1; 6690 } 6691 } 6692 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; 6693 if (us_rtt == 0) 6694 us_rtt = 1; 6695 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 6696 if (ack_type == SACKED) 6697 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 6698 else { 6699 /* 6700 * For cum-ack we are only confident if what 6701 * is being acked is included in a measurement. 6702 * Otherwise it could be an idle period that 6703 * includes Delayed-ack time. 6704 */ 6705 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 6706 (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); 6707 } 6708 if ((rsm->r_flags & RACK_TLP) && 6709 (!IN_RECOVERY(tp->t_flags))) { 6710 /* Segment was a TLP and our retrans matched */ 6711 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 6712 rack->r_ctl.rc_rsm_start = tp->snd_max; 6713 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 6714 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 6715 rack_cong_signal(tp, NULL, CC_NDUPACK); 6716 /* 6717 * When we enter recovery we need to assure 6718 * we send one packet. 6719 */ 6720 if (rack->rack_no_prr == 0) { 6721 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6722 rack_log_to_prr(rack, 7, 0); 6723 } 6724 } 6725 } 6726 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6727 /* New more recent rack_tmit_time */ 6728 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6729 rack->rc_rack_rtt = t; 6730 } 6731 return (1); 6732 } 6733 /* 6734 * We clear the soft/rxtshift since we got an ack. 6735 * There is no assurance we will call the commit() function 6736 * so we need to clear these to avoid incorrect handling. 6737 */ 6738 tp->t_rxtshift = 0; 6739 tp->t_softerror = 0; 6740 if ((to->to_flags & TOF_TS) && 6741 (ack_type == CUM_ACKED) && 6742 (to->to_tsecr) && 6743 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 6744 /* 6745 * Now which timestamp does it match? In this block the ACK 6746 * must be coming from a previous transmission. 
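* Each (re)transmission of this rsm recorded its send time in
* r_tim_lastsent[], so matching the echoed timestamp (to->to_tsecr)
* against those entries tells us which transmission is being acked; a
* match on anything but the most recent entry means the later
* retransmission was almost certainly unnecessary, which is what
* rack_earlier_retran() accounts for.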
6747 */ 6748 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6749 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 6750 t = cts - rsm->r_tim_lastsent[i]; 6751 if ((int)t <= 0) 6752 t = 1; 6753 if ((i + 1) < rsm->r_rtr_cnt) { 6754 /* Likely */ 6755 rack_earlier_retran(tp, rsm, t, cts); 6756 } 6757 if (!tp->t_rttlow || tp->t_rttlow > t) 6758 tp->t_rttlow = t; 6759 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6760 rack->r_ctl.rc_rack_min_rtt = t; 6761 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6762 rack->r_ctl.rc_rack_min_rtt = 1; 6763 } 6764 } 6765 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 6766 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6767 /* New more recent rack_tmit_time */ 6768 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6769 rack->rc_rack_rtt = t; 6770 } 6771 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, 6772 rsm->r_rtr_cnt); 6773 return (1); 6774 } 6775 } 6776 goto ts_not_found; 6777 } else { 6778 /* 6779 * Ok its a SACK block that we retransmitted. or a windows 6780 * machine without timestamps. We can tell nothing from the 6781 * time-stamp since its not there or the time the peer last 6782 * recieved a segment that moved forward its cum-ack point. 6783 */ 6784 ts_not_found: 6785 i = rsm->r_rtr_cnt - 1; 6786 t = cts - rsm->r_tim_lastsent[i]; 6787 if ((int)t <= 0) 6788 t = 1; 6789 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6790 /* 6791 * We retransmitted and the ack came back in less 6792 * than the smallest rtt we have observed. We most 6793 * likey did an improper retransmit as outlined in 6794 * 4.2 Step 3 point 2 in the rack-draft. 6795 */ 6796 i = rsm->r_rtr_cnt - 2; 6797 t = cts - rsm->r_tim_lastsent[i]; 6798 rack_earlier_retran(tp, rsm, t, cts); 6799 } else if (rack->r_ctl.rc_rack_min_rtt) { 6800 /* 6801 * We retransmitted it and the retransmit did the 6802 * job. 6803 */ 6804 if (!rack->r_ctl.rc_rack_min_rtt || 6805 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6806 rack->r_ctl.rc_rack_min_rtt = t; 6807 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6808 rack->r_ctl.rc_rack_min_rtt = 1; 6809 } 6810 } 6811 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 6812 /* New more recent rack_tmit_time */ 6813 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 6814 rack->rc_rack_rtt = t; 6815 } 6816 return (1); 6817 } 6818 } 6819 return (0); 6820 } 6821 6822 /* 6823 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 6824 */ 6825 static void 6826 rack_log_sack_passed(struct tcpcb *tp, 6827 struct tcp_rack *rack, struct rack_sendmap *rsm) 6828 { 6829 struct rack_sendmap *nrsm; 6830 6831 nrsm = rsm; 6832 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 6833 rack_head, r_tnext) { 6834 if (nrsm == rsm) { 6835 /* Skip orginal segment he is acked */ 6836 continue; 6837 } 6838 if (nrsm->r_flags & RACK_ACKED) { 6839 /* 6840 * Skip ack'd segments, though we 6841 * should not see these, since tmap 6842 * should not have ack'd segments. 6843 */ 6844 continue; 6845 } 6846 if (nrsm->r_flags & RACK_SACK_PASSED) { 6847 /* 6848 * We found one that is already marked 6849 * passed, we have been here before and 6850 * so all others below this are marked. 
6851 */ 6852 break; 6853 } 6854 nrsm->r_flags |= RACK_SACK_PASSED; 6855 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 6856 } 6857 } 6858 6859 static void 6860 rack_need_set_test(struct tcpcb *tp, 6861 struct tcp_rack *rack, 6862 struct rack_sendmap *rsm, 6863 tcp_seq th_ack, 6864 int line, 6865 int use_which) 6866 { 6867 6868 if ((tp->t_flags & TF_GPUTINPROG) && 6869 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6870 /* 6871 * We were app limited, and this ack 6872 * butts up or goes beyond the point where we want 6873 * to start our next measurement. We need 6874 * to record the new gput_ts as here and 6875 * possibly update the start sequence. 6876 */ 6877 uint32_t seq, ts; 6878 6879 if (rsm->r_rtr_cnt > 1) { 6880 /* 6881 * This is a retransmit, can we 6882 * really make any assessment at this 6883 * point? We are not really sure of 6884 * the timestamp, is it this or the 6885 * previous transmission? 6886 * 6887 * Lets wait for something better that 6888 * is not retransmitted. 6889 */ 6890 return; 6891 } 6892 seq = tp->gput_seq; 6893 ts = tp->gput_ts; 6894 rack->app_limited_needs_set = 0; 6895 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 6896 /* Do we start at a new end? */ 6897 if ((use_which == RACK_USE_BEG) && 6898 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 6899 /* 6900 * When we get an ACK that just eats 6901 * up some of the rsm, we set RACK_USE_BEG 6902 * since whats at r_start (i.e. th_ack) 6903 * is left unacked and thats where the 6904 * measurement not starts. 6905 */ 6906 tp->gput_seq = rsm->r_start; 6907 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6908 } 6909 if ((use_which == RACK_USE_END) && 6910 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6911 /* 6912 * We use the end when the cumack 6913 * is moving forward and completely 6914 * deleting the rsm passed so basically 6915 * r_end holds th_ack. 6916 * 6917 * For SACK's we also want to use the end 6918 * since this piece just got sacked and 6919 * we want to target anything after that 6920 * in our measurement. 6921 */ 6922 tp->gput_seq = rsm->r_end; 6923 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6924 } 6925 if (use_which == RACK_USE_END_OR_THACK) { 6926 /* 6927 * special case for ack moving forward, 6928 * not a sack, we need to move all the 6929 * way up to where this ack cum-ack moves 6930 * to. 6931 */ 6932 if (SEQ_GT(th_ack, rsm->r_end)) 6933 tp->gput_seq = th_ack; 6934 else 6935 tp->gput_seq = rsm->r_end; 6936 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6937 } 6938 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 6939 /* 6940 * We moved beyond this guy's range, re-calculate 6941 * the new end point. 6942 */ 6943 if (rack->rc_gp_filled == 0) { 6944 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 6945 } else { 6946 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 6947 } 6948 } 6949 /* 6950 * We are moving the goal post, we may be able to clear the 6951 * measure_saw_probe_rtt flag. 6952 */ 6953 if ((rack->in_probe_rtt == 0) && 6954 (rack->measure_saw_probe_rtt) && 6955 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 6956 rack->measure_saw_probe_rtt = 0; 6957 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 6958 seq, tp->gput_seq, 0, 5, line, NULL); 6959 if (rack->rc_gp_filled && 6960 ((tp->gput_ack - tp->gput_seq) < 6961 max(rc_init_window(rack), (MIN_GP_WIN * 6962 ctf_fixed_maxseg(tp))))) { 6963 /* 6964 * There is no sense of continuing this measurement 6965 * because its too small to gain us anything we 6966 * trust. 
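* (Here "too small" means the remaining window, gput_ack - gput_seq, is
* below max(rc_init_window(rack), MIN_GP_WIN * ctf_fixed_maxseg(tp)),
* the same expression used above as the minimum size when the end point
* is recalculated.)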
Skip it and that way we can start a new 6967 * measurement quicker. 6968 */ 6969 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 6970 0, 0, 0, 6, __LINE__, NULL); 6971 tp->t_flags &= ~TF_GPUTINPROG; 6972 } 6973 } 6974 } 6975 6976 static uint32_t 6977 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 6978 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 6979 { 6980 uint32_t start, end, changed = 0; 6981 struct rack_sendmap stack_map; 6982 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 6983 int32_t used_ref = 1; 6984 int moved = 0; 6985 6986 start = sack->start; 6987 end = sack->end; 6988 rsm = *prsm; 6989 memset(&fe, 0, sizeof(fe)); 6990 do_rest_ofb: 6991 if ((rsm == NULL) || 6992 (SEQ_LT(end, rsm->r_start)) || 6993 (SEQ_GEQ(start, rsm->r_end)) || 6994 (SEQ_LT(start, rsm->r_start))) { 6995 /* 6996 * We are not in the right spot, 6997 * find the correct spot in the tree. 6998 */ 6999 used_ref = 0; 7000 fe.r_start = start; 7001 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7002 moved++; 7003 } 7004 if (rsm == NULL) { 7005 /* TSNH */ 7006 goto out; 7007 } 7008 /* Ok we have an ACK for some piece of this rsm */ 7009 if (rsm->r_start != start) { 7010 if ((rsm->r_flags & RACK_ACKED) == 0) { 7011 /** 7012 * Need to split this in two pieces the before and after, 7013 * the before remains in the map, the after must be 7014 * added. In other words we have: 7015 * rsm |--------------| 7016 * sackblk |-------> 7017 * rsm will become 7018 * rsm |---| 7019 * and nrsm will be the sacked piece 7020 * nrsm |----------| 7021 * 7022 * But before we start down that path lets 7023 * see if the sack spans over on top of 7024 * the next guy and it is already sacked. 7025 */ 7026 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7027 if (next && (next->r_flags & RACK_ACKED) && 7028 SEQ_GEQ(end, next->r_start)) { 7029 /** 7030 * So the next one is already acked, and 7031 * we can thus by hookery use our stack_map 7032 * to reflect the piece being sacked and 7033 * then adjust the two tree entries moving 7034 * the start and ends around. So we start like: 7035 * rsm |------------| (not-acked) 7036 * next |-----------| (acked) 7037 * sackblk |--------> 7038 * We want to end like so: 7039 * rsm |------| (not-acked) 7040 * next |-----------------| (acked) 7041 * nrsm |-----| 7042 * Where nrsm is a temporary stack piece we 7043 * use to update all the gizmos. 7044 */ 7045 /* Copy up our fudge block */ 7046 nrsm = &stack_map; 7047 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7048 /* Now adjust our tree blocks */ 7049 rsm->r_end = start; 7050 next->r_start = start; 7051 /* Clear out the dup ack count of the remainder */ 7052 rsm->r_dupack = 0; 7053 rsm->r_just_ret = 0; 7054 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7055 /* Now lets make sure our fudge block is right */ 7056 nrsm->r_start = start; 7057 /* Now lets update all the stats and such */ 7058 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7059 if (rack->app_limited_needs_set) 7060 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7061 changed += (nrsm->r_end - nrsm->r_start); 7062 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7063 if (nrsm->r_flags & RACK_SACK_PASSED) { 7064 counter_u64_add(rack_reorder_seen, 1); 7065 rack->r_ctl.rc_reorder_ts = cts; 7066 } 7067 /* 7068 * Now we want to go up from rsm (the 7069 * one left un-acked) to the next one 7070 * in the tmap. 
We do this so when 7071 * we walk backwards we include marking 7072 * sack-passed on rsm (The one passed in 7073 * is skipped since it is generally called 7074 * on something sacked before removing it 7075 * from the tmap). 7076 */ 7077 if (rsm->r_in_tmap) { 7078 nrsm = TAILQ_NEXT(rsm, r_tnext); 7079 /* 7080 * Now that we have the next 7081 * one walk backwards from there. 7082 */ 7083 if (nrsm && nrsm->r_in_tmap) 7084 rack_log_sack_passed(tp, rack, nrsm); 7085 } 7086 /* Now are we done? */ 7087 if (SEQ_LT(end, next->r_end) || 7088 (end == next->r_end)) { 7089 /* Done with block */ 7090 goto out; 7091 } 7092 counter_u64_add(rack_sack_used_next_merge, 1); 7093 /* Postion for the next block */ 7094 start = next->r_end; 7095 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 7096 if (rsm == NULL) 7097 goto out; 7098 } else { 7099 /** 7100 * We can't use any hookery here, so we 7101 * need to split the map. We enter like 7102 * so: 7103 * rsm |--------| 7104 * sackblk |-----> 7105 * We will add the new block nrsm and 7106 * that will be the new portion, and then 7107 * fall through after reseting rsm. So we 7108 * split and look like this: 7109 * rsm |----| 7110 * sackblk |-----> 7111 * nrsm |---| 7112 * We then fall through reseting 7113 * rsm to nrsm, so the next block 7114 * picks it up. 7115 */ 7116 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7117 if (nrsm == NULL) { 7118 /* 7119 * failed XXXrrs what can we do but loose the sack 7120 * info? 7121 */ 7122 goto out; 7123 } 7124 counter_u64_add(rack_sack_splits, 1); 7125 rack_clone_rsm(rack, nrsm, rsm, start); 7126 rsm->r_just_ret = 0; 7127 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7128 #ifdef INVARIANTS 7129 if (insret != NULL) { 7130 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7131 nrsm, insret, rack, rsm); 7132 } 7133 #endif 7134 if (rsm->r_in_tmap) { 7135 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7136 nrsm->r_in_tmap = 1; 7137 } 7138 rsm->r_flags &= (~RACK_HAS_FIN); 7139 /* Position us to point to the new nrsm that starts the sack blk */ 7140 rsm = nrsm; 7141 } 7142 } else { 7143 /* Already sacked this piece */ 7144 counter_u64_add(rack_sack_skipped_acked, 1); 7145 moved++; 7146 if (end == rsm->r_end) { 7147 /* Done with block */ 7148 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7149 goto out; 7150 } else if (SEQ_LT(end, rsm->r_end)) { 7151 /* A partial sack to a already sacked block */ 7152 moved++; 7153 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7154 goto out; 7155 } else { 7156 /* 7157 * The end goes beyond this guy 7158 * repostion the start to the 7159 * next block. 7160 */ 7161 start = rsm->r_end; 7162 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7163 if (rsm == NULL) 7164 goto out; 7165 } 7166 } 7167 } 7168 if (SEQ_GEQ(end, rsm->r_end)) { 7169 /** 7170 * The end of this block is either beyond this guy or right 7171 * at this guy. I.e.: 7172 * rsm --- |-----| 7173 * end |-----| 7174 * <or> 7175 * end |---------| 7176 */ 7177 if ((rsm->r_flags & RACK_ACKED) == 0) { 7178 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7179 changed += (rsm->r_end - rsm->r_start); 7180 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7181 if (rsm->r_in_tmap) /* should be true */ 7182 rack_log_sack_passed(tp, rack, rsm); 7183 /* Is Reordering occuring? 
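* If RACK_SACK_PASSED is set on this rsm, a SACK for later-sent data has
* already passed over it, yet the segment is only now being sacked
* itself; the later data must have arrived first, so count it as
* reordering and remember when we saw it.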
*/ 7184 if (rsm->r_flags & RACK_SACK_PASSED) { 7185 rsm->r_flags &= ~RACK_SACK_PASSED; 7186 counter_u64_add(rack_reorder_seen, 1); 7187 rack->r_ctl.rc_reorder_ts = cts; 7188 } 7189 if (rack->app_limited_needs_set) 7190 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7191 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7192 rsm->r_flags |= RACK_ACKED; 7193 rsm->r_flags &= ~RACK_TLP; 7194 if (rsm->r_in_tmap) { 7195 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7196 rsm->r_in_tmap = 0; 7197 } 7198 } else { 7199 counter_u64_add(rack_sack_skipped_acked, 1); 7200 moved++; 7201 } 7202 if (end == rsm->r_end) { 7203 /* This block only - done, setup for next */ 7204 goto out; 7205 } 7206 /* 7207 * There is more not coverend by this rsm move on 7208 * to the next block in the RB tree. 7209 */ 7210 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7211 start = rsm->r_end; 7212 rsm = nrsm; 7213 if (rsm == NULL) 7214 goto out; 7215 goto do_rest_ofb; 7216 } 7217 /** 7218 * The end of this sack block is smaller than 7219 * our rsm i.e.: 7220 * rsm --- |-----| 7221 * end |--| 7222 */ 7223 if ((rsm->r_flags & RACK_ACKED) == 0) { 7224 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7225 if (prev && (prev->r_flags & RACK_ACKED)) { 7226 /** 7227 * Goal, we want the right remainder of rsm to shrink 7228 * in place and span from (rsm->r_start = end) to rsm->r_end. 7229 * We want to expand prev to go all the way 7230 * to prev->r_end <- end. 7231 * so in the tree we have before: 7232 * prev |--------| (acked) 7233 * rsm |-------| (non-acked) 7234 * sackblk |-| 7235 * We churn it so we end up with 7236 * prev |----------| (acked) 7237 * rsm |-----| (non-acked) 7238 * nrsm |-| (temporary) 7239 */ 7240 nrsm = &stack_map; 7241 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7242 prev->r_end = end; 7243 rsm->r_start = end; 7244 /* Now adjust nrsm (stack copy) to be 7245 * the one that is the small 7246 * piece that was "sacked". 7247 */ 7248 nrsm->r_end = end; 7249 rsm->r_dupack = 0; 7250 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7251 /* 7252 * Now nrsm is our new little piece 7253 * that is acked (which was merged 7254 * to prev). Update the rtt and changed 7255 * based on that. Also check for reordering. 7256 */ 7257 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7258 if (rack->app_limited_needs_set) 7259 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7260 changed += (nrsm->r_end - nrsm->r_start); 7261 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7262 if (nrsm->r_flags & RACK_SACK_PASSED) { 7263 counter_u64_add(rack_reorder_seen, 1); 7264 rack->r_ctl.rc_reorder_ts = cts; 7265 } 7266 rsm = prev; 7267 counter_u64_add(rack_sack_used_prev_merge, 1); 7268 } else { 7269 /** 7270 * This is the case where our previous 7271 * block is not acked either, so we must 7272 * split the block in two. 7273 */ 7274 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7275 if (nrsm == NULL) { 7276 /* failed rrs what can we do but loose the sack info? */ 7277 goto out; 7278 } 7279 /** 7280 * In this case nrsm becomes 7281 * nrsm->r_start = end; 7282 * nrsm->r_end = rsm->r_end; 7283 * which is un-acked. 7284 * <and> 7285 * rsm->r_end = nrsm->r_start; 7286 * i.e. the remaining un-acked 7287 * piece is left on the left 7288 * hand side. 
7289 * 7290 * So we start like this 7291 * rsm |----------| (not acked) 7292 * sackblk |---| 7293 * build it so we have 7294 * rsm |---| (acked) 7295 * nrsm |------| (not acked) 7296 */ 7297 counter_u64_add(rack_sack_splits, 1); 7298 rack_clone_rsm(rack, nrsm, rsm, end); 7299 rsm->r_flags &= (~RACK_HAS_FIN); 7300 rsm->r_just_ret = 0; 7301 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7302 #ifdef INVARIANTS 7303 if (insret != NULL) { 7304 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7305 nrsm, insret, rack, rsm); 7306 } 7307 #endif 7308 if (rsm->r_in_tmap) { 7309 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7310 nrsm->r_in_tmap = 1; 7311 } 7312 nrsm->r_dupack = 0; 7313 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7314 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7315 changed += (rsm->r_end - rsm->r_start); 7316 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7317 if (rsm->r_in_tmap) /* should be true */ 7318 rack_log_sack_passed(tp, rack, rsm); 7319 /* Is Reordering occuring? */ 7320 if (rsm->r_flags & RACK_SACK_PASSED) { 7321 rsm->r_flags &= ~RACK_SACK_PASSED; 7322 counter_u64_add(rack_reorder_seen, 1); 7323 rack->r_ctl.rc_reorder_ts = cts; 7324 } 7325 if (rack->app_limited_needs_set) 7326 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7327 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7328 rsm->r_flags |= RACK_ACKED; 7329 rsm->r_flags &= ~RACK_TLP; 7330 if (rsm->r_in_tmap) { 7331 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7332 rsm->r_in_tmap = 0; 7333 } 7334 } 7335 } else if (start != end){ 7336 /* 7337 * The block was already acked. 7338 */ 7339 counter_u64_add(rack_sack_skipped_acked, 1); 7340 moved++; 7341 } 7342 out: 7343 if (rsm && (rsm->r_flags & RACK_ACKED)) { 7344 /* 7345 * Now can we merge where we worked 7346 * with either the previous or 7347 * next block? 7348 */ 7349 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7350 while (next) { 7351 if (next->r_flags & RACK_ACKED) { 7352 /* yep this and next can be merged */ 7353 rsm = rack_merge_rsm(rack, rsm, next); 7354 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7355 } else 7356 break; 7357 } 7358 /* Now what about the previous? */ 7359 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7360 while (prev) { 7361 if (prev->r_flags & RACK_ACKED) { 7362 /* yep the previous and this can be merged */ 7363 rsm = rack_merge_rsm(rack, prev, rsm); 7364 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7365 } else 7366 break; 7367 } 7368 } 7369 if (used_ref == 0) { 7370 counter_u64_add(rack_sack_proc_all, 1); 7371 } else { 7372 counter_u64_add(rack_sack_proc_short, 1); 7373 } 7374 /* Save off the next one for quick reference. */ 7375 if (rsm) 7376 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7377 else 7378 nrsm = NULL; 7379 *prsm = rack->r_ctl.rc_sacklast = nrsm; 7380 /* Pass back the moved. 
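* ('moved' counts how often we had to reposition in the tree or skip
* over already-sacked data while processing this block; the caller feeds
* it into the sack-attack detection heuristics via sack_moved_extra.)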
*/ 7381 *moved_two = moved; 7382 return (changed); 7383 } 7384 7385 static void inline 7386 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 7387 { 7388 struct rack_sendmap *tmap; 7389 7390 tmap = NULL; 7391 while (rsm && (rsm->r_flags & RACK_ACKED)) { 7392 /* Its no longer sacked, mark it so */ 7393 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7394 #ifdef INVARIANTS 7395 if (rsm->r_in_tmap) { 7396 panic("rack:%p rsm:%p flags:0x%x in tmap?", 7397 rack, rsm, rsm->r_flags); 7398 } 7399 #endif 7400 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 7401 /* Rebuild it into our tmap */ 7402 if (tmap == NULL) { 7403 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7404 tmap = rsm; 7405 } else { 7406 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7407 tmap = rsm; 7408 } 7409 tmap->r_in_tmap = 1; 7410 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7411 } 7412 /* 7413 * Now lets possibly clear the sack filter so we start 7414 * recognizing sacks that cover this area. 7415 */ 7416 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 7417 7418 } 7419 7420 static void 7421 rack_do_decay(struct tcp_rack *rack) 7422 { 7423 struct timeval res; 7424 7425 #define timersub(tvp, uvp, vvp) \ 7426 do { \ 7427 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 7428 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 7429 if ((vvp)->tv_usec < 0) { \ 7430 (vvp)->tv_sec--; \ 7431 (vvp)->tv_usec += 1000000; \ 7432 } \ 7433 } while (0) 7434 7435 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 7436 #undef timersub 7437 7438 rack->r_ctl.input_pkt++; 7439 if ((rack->rc_in_persist) || 7440 (res.tv_sec >= 1) || 7441 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 7442 /* 7443 * Check for decay of non-SAD, 7444 * we want all SAD detection metrics to 7445 * decay 1/4 per second (or more) passed. 7446 */ 7447 uint32_t pkt_delta; 7448 7449 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 7450 /* Update our saved tracking values */ 7451 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 7452 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 7453 /* Now do we escape without decay? */ 7454 #ifdef NETFLIX_EXP_DETECTION 7455 if (rack->rc_in_persist || 7456 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 7457 (pkt_delta < tcp_sad_low_pps)){ 7458 /* 7459 * We don't decay idle connections 7460 * or ones that have a low input pps. 
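* Otherwise each of the SAD counters (ack_count, sack_count,
* sack_moved_extra and sack_noextra_move) is run through
* ctf_decay_count() below so that detection state from an earlier burst
* ages out.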
7461 */ 7462 return; 7463 } 7464 /* Decay the counters */ 7465 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 7466 tcp_sad_decay_val); 7467 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 7468 tcp_sad_decay_val); 7469 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 7470 tcp_sad_decay_val); 7471 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 7472 tcp_sad_decay_val); 7473 #endif 7474 } 7475 } 7476 7477 static void 7478 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 7479 { 7480 uint32_t changed, entered_recovery = 0; 7481 struct tcp_rack *rack; 7482 struct rack_sendmap *rsm, *rm; 7483 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7484 register uint32_t th_ack; 7485 int32_t i, j, k, num_sack_blks = 0; 7486 uint32_t cts, acked, ack_point, sack_changed = 0; 7487 int loop_start = 0, moved_two = 0; 7488 uint32_t tsused; 7489 7490 7491 INP_WLOCK_ASSERT(tp->t_inpcb); 7492 if (th->th_flags & TH_RST) { 7493 /* We don't log resets */ 7494 return; 7495 } 7496 rack = (struct tcp_rack *)tp->t_fb_ptr; 7497 cts = tcp_ts_getticks(); 7498 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7499 changed = 0; 7500 th_ack = th->th_ack; 7501 if (rack->sack_attack_disable == 0) 7502 rack_do_decay(rack); 7503 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 7504 /* 7505 * You only get credit for 7506 * MSS and greater (and you get extra 7507 * credit for larger cum-ack moves). 7508 */ 7509 int ac; 7510 7511 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 7512 rack->r_ctl.ack_count += ac; 7513 counter_u64_add(rack_ack_total, ac); 7514 } 7515 if (rack->r_ctl.ack_count > 0xfff00000) { 7516 /* 7517 * reduce the number to keep us under 7518 * a uint32_t. 7519 */ 7520 rack->r_ctl.ack_count /= 2; 7521 rack->r_ctl.sack_count /= 2; 7522 } 7523 if (SEQ_GT(th_ack, tp->snd_una)) { 7524 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 7525 tp->t_acktime = ticks; 7526 } 7527 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 7528 changed = th_ack - rsm->r_start; 7529 if (changed) { 7530 /* 7531 * The ACK point is advancing to th_ack, we must drop off 7532 * the packets in the rack log and calculate any eligble 7533 * RTT's. 7534 */ 7535 rack->r_wanted_output = 1; 7536 more: 7537 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7538 if (rsm == NULL) { 7539 if ((th_ack - 1) == tp->iss) { 7540 /* 7541 * For the SYN incoming case we will not 7542 * have called tcp_output for the sending of 7543 * the SYN, so there will be no map. All 7544 * other cases should probably be a panic. 7545 */ 7546 goto proc_sack; 7547 } 7548 if (tp->t_flags & TF_SENTFIN) { 7549 /* if we send a FIN we will not hav a map */ 7550 goto proc_sack; 7551 } 7552 #ifdef INVARIANTS 7553 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 7554 tp, 7555 th, tp->t_state, rack, 7556 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 7557 #endif 7558 goto proc_sack; 7559 } 7560 if (SEQ_LT(th_ack, rsm->r_start)) { 7561 /* Huh map is missing this */ 7562 #ifdef INVARIANTS 7563 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 7564 rsm->r_start, 7565 th_ack, tp->t_state, rack->r_state); 7566 #endif 7567 goto proc_sack; 7568 } 7569 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 7570 /* Now do we consume the whole thing? */ 7571 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7572 /* Its all consumed. 
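* The cum-ack covers this entire rsm: remove it from the RB tree (and
* the tmap if it is there), adjust the sacked/retransmit accounting, and
* free it back to the zone. If th_ack runs past r_end, 'left' is
* non-zero and we loop back to 'more' to consume the next rsm as well.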
*/ 7573 uint32_t left; 7574 uint8_t newly_acked; 7575 7576 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7577 rsm->r_rtr_bytes = 0; 7578 /* Record the time of highest cumack sent */ 7579 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7580 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7581 #ifdef INVARIANTS 7582 if (rm != rsm) { 7583 panic("removing head in rack:%p rsm:%p rm:%p", 7584 rack, rsm, rm); 7585 } 7586 #endif 7587 if (rsm->r_in_tmap) { 7588 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7589 rsm->r_in_tmap = 0; 7590 } 7591 newly_acked = 1; 7592 if (rsm->r_flags & RACK_ACKED) { 7593 /* 7594 * It was acked on the scoreboard -- remove 7595 * it from total 7596 */ 7597 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7598 newly_acked = 0; 7599 } else if (rsm->r_flags & RACK_SACK_PASSED) { 7600 /* 7601 * There are segments ACKED on the 7602 * scoreboard further up. We are seeing 7603 * reordering. 7604 */ 7605 rsm->r_flags &= ~RACK_SACK_PASSED; 7606 counter_u64_add(rack_reorder_seen, 1); 7607 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7608 rsm->r_flags |= RACK_ACKED; 7609 rack->r_ctl.rc_reorder_ts = cts; 7610 } 7611 left = th_ack - rsm->r_end; 7612 if (rack->app_limited_needs_set && newly_acked) 7613 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 7614 /* Free back to zone */ 7615 rack_free(rack, rsm); 7616 if (left) { 7617 goto more; 7618 } 7619 goto proc_sack; 7620 } 7621 if (rsm->r_flags & RACK_ACKED) { 7622 /* 7623 * It was acked on the scoreboard -- remove it from 7624 * total for the part being cum-acked. 7625 */ 7626 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7627 } 7628 /* 7629 * Clear the dup ack count for 7630 * the piece that remains. 7631 */ 7632 rsm->r_dupack = 0; 7633 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7634 if (rsm->r_rtr_bytes) { 7635 /* 7636 * It was retransmitted adjust the 7637 * sack holes for what was acked. 7638 */ 7639 int ack_am; 7640 7641 ack_am = (th_ack - rsm->r_start); 7642 if (ack_am >= rsm->r_rtr_bytes) { 7643 rack->r_ctl.rc_holes_rxt -= ack_am; 7644 rsm->r_rtr_bytes -= ack_am; 7645 } 7646 } 7647 /* 7648 * Update where the piece starts and record 7649 * the time of send of highest cumack sent. 7650 */ 7651 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7652 rsm->r_start = th_ack; 7653 if (rack->app_limited_needs_set) 7654 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 7655 7656 } 7657 proc_sack: 7658 /* Check for reneging */ 7659 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7660 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 7661 /* 7662 * The peer has moved snd_una up to 7663 * the edge of this send, i.e. one 7664 * that it had previously acked. The only 7665 * way that can be true if the peer threw 7666 * away data (space issues) that it had 7667 * previously sacked (else it would have 7668 * given us snd_una up to (rsm->r_end). 7669 * We need to undo the acked markings here. 7670 * 7671 * Note we have to look to make sure th_ack is 7672 * our rsm->r_start in case we get an old ack 7673 * where th_ack is behind snd_una. 
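* rack_peer_reneges() below strips the RACK_ACKED markings, links the
* affected rsm's back onto the tmap so they can be retransmitted, and
* clears the sack filter from th_ack forward.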
7674 */ 7675 rack_peer_reneges(rack, rsm, th->th_ack); 7676 } 7677 if ((to->to_flags & TOF_SACK) == 0) { 7678 /* We are done nothing left */ 7679 goto out; 7680 } 7681 /* Sack block processing */ 7682 if (SEQ_GT(th_ack, tp->snd_una)) 7683 ack_point = th_ack; 7684 else 7685 ack_point = tp->snd_una; 7686 for (i = 0; i < to->to_nsacks; i++) { 7687 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7688 &sack, sizeof(sack)); 7689 sack.start = ntohl(sack.start); 7690 sack.end = ntohl(sack.end); 7691 if (SEQ_GT(sack.end, sack.start) && 7692 SEQ_GT(sack.start, ack_point) && 7693 SEQ_LT(sack.start, tp->snd_max) && 7694 SEQ_GT(sack.end, ack_point) && 7695 SEQ_LEQ(sack.end, tp->snd_max)) { 7696 sack_blocks[num_sack_blks] = sack; 7697 num_sack_blks++; 7698 #ifdef NETFLIX_STATS 7699 } else if (SEQ_LEQ(sack.start, th_ack) && 7700 SEQ_LEQ(sack.end, th_ack)) { 7701 /* 7702 * Its a D-SACK block. 7703 */ 7704 tcp_record_dsack(sack.start, sack.end); 7705 #endif 7706 } 7707 7708 } 7709 /* 7710 * Sort the SACK blocks so we can update the rack scoreboard with 7711 * just one pass. 7712 */ 7713 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 7714 num_sack_blks, th->th_ack); 7715 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 7716 if (num_sack_blks == 0) { 7717 /* Nothing to sack (DSACKs?) */ 7718 goto out_with_totals; 7719 } 7720 if (num_sack_blks < 2) { 7721 /* Only one, we don't need to sort */ 7722 goto do_sack_work; 7723 } 7724 /* Sort the sacks */ 7725 for (i = 0; i < num_sack_blks; i++) { 7726 for (j = i + 1; j < num_sack_blks; j++) { 7727 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7728 sack = sack_blocks[i]; 7729 sack_blocks[i] = sack_blocks[j]; 7730 sack_blocks[j] = sack; 7731 } 7732 } 7733 } 7734 /* 7735 * Now are any of the sack block ends the same (yes some 7736 * implementations send these)? 7737 */ 7738 again: 7739 if (num_sack_blks == 0) 7740 goto out_with_totals; 7741 if (num_sack_blks > 1) { 7742 for (i = 0; i < num_sack_blks; i++) { 7743 for (j = i + 1; j < num_sack_blks; j++) { 7744 if (sack_blocks[i].end == sack_blocks[j].end) { 7745 /* 7746 * Ok these two have the same end we 7747 * want the smallest end and then 7748 * throw away the larger and start 7749 * again. 7750 */ 7751 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7752 /* 7753 * The second block covers 7754 * more area use that 7755 */ 7756 sack_blocks[i].start = sack_blocks[j].start; 7757 } 7758 /* 7759 * Now collapse out the dup-sack and 7760 * lower the count 7761 */ 7762 for (k = (j + 1); k < num_sack_blks; k++) { 7763 sack_blocks[j].start = sack_blocks[k].start; 7764 sack_blocks[j].end = sack_blocks[k].end; 7765 j++; 7766 } 7767 num_sack_blks--; 7768 goto again; 7769 } 7770 } 7771 } 7772 } 7773 do_sack_work: 7774 /* 7775 * First lets look to see if 7776 * we have retransmitted and 7777 * can use the transmit next? 7778 */ 7779 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7780 if (rsm && 7781 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 7782 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 7783 /* 7784 * We probably did the FR and the next 7785 * SACK in continues as we would expect. 
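* (The head of the tmap is the oldest transmission that has not yet been
* sacked or acked, so if the first sack block overlaps it we can hand
* that block straight to rack_proc_sack_blk() with rsm as the starting
* hint instead of searching the tree from scratch.)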
7786 */ 7787 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 7788 if (acked) { 7789 rack->r_wanted_output = 1; 7790 changed += acked; 7791 sack_changed += acked; 7792 } 7793 if (num_sack_blks == 1) { 7794 /* 7795 * This is what we would expect from 7796 * a normal implementation to happen 7797 * after we have retransmitted the FR, 7798 * i.e the sack-filter pushes down 7799 * to 1 block and the next to be retransmitted 7800 * is the sequence in the sack block (has more 7801 * are acked). Count this as ACK'd data to boost 7802 * up the chances of recovering any false positives. 7803 */ 7804 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 7805 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 7806 counter_u64_add(rack_express_sack, 1); 7807 if (rack->r_ctl.ack_count > 0xfff00000) { 7808 /* 7809 * reduce the number to keep us under 7810 * a uint32_t. 7811 */ 7812 rack->r_ctl.ack_count /= 2; 7813 rack->r_ctl.sack_count /= 2; 7814 } 7815 goto out_with_totals; 7816 } else { 7817 /* 7818 * Start the loop through the 7819 * rest of blocks, past the first block. 7820 */ 7821 moved_two = 0; 7822 loop_start = 1; 7823 } 7824 } 7825 /* Its a sack of some sort */ 7826 rack->r_ctl.sack_count++; 7827 if (rack->r_ctl.sack_count > 0xfff00000) { 7828 /* 7829 * reduce the number to keep us under 7830 * a uint32_t. 7831 */ 7832 rack->r_ctl.ack_count /= 2; 7833 rack->r_ctl.sack_count /= 2; 7834 } 7835 counter_u64_add(rack_sack_total, 1); 7836 if (rack->sack_attack_disable) { 7837 /* An attacker disablement is in place */ 7838 if (num_sack_blks > 1) { 7839 rack->r_ctl.sack_count += (num_sack_blks - 1); 7840 rack->r_ctl.sack_moved_extra++; 7841 counter_u64_add(rack_move_some, 1); 7842 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 7843 rack->r_ctl.sack_moved_extra /= 2; 7844 rack->r_ctl.sack_noextra_move /= 2; 7845 } 7846 } 7847 goto out; 7848 } 7849 rsm = rack->r_ctl.rc_sacklast; 7850 for (i = loop_start; i < num_sack_blks; i++) { 7851 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 7852 if (acked) { 7853 rack->r_wanted_output = 1; 7854 changed += acked; 7855 sack_changed += acked; 7856 } 7857 if (moved_two) { 7858 /* 7859 * If we did not get a SACK for at least a MSS and 7860 * had to move at all, or if we moved more than our 7861 * threshold, it counts against the "extra" move. 7862 */ 7863 rack->r_ctl.sack_moved_extra += moved_two; 7864 counter_u64_add(rack_move_some, 1); 7865 } else { 7866 /* 7867 * else we did not have to move 7868 * any more than we would expect. 7869 */ 7870 rack->r_ctl.sack_noextra_move++; 7871 counter_u64_add(rack_move_none, 1); 7872 } 7873 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 7874 /* 7875 * If the SACK was not a full MSS then 7876 * we add to sack_count the number of 7877 * MSS's (or possibly more than 7878 * a MSS if its a TSO send) we had to skip by. 7879 */ 7880 rack->r_ctl.sack_count += moved_two; 7881 counter_u64_add(rack_sack_total, moved_two); 7882 } 7883 /* 7884 * Now we need to setup for the next 7885 * round. First we make sure we won't 7886 * exceed the size of our uint32_t on 7887 * the various counts, and then clear out 7888 * moved_two. 
7889 */ 7890 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 7891 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 7892 rack->r_ctl.sack_moved_extra /= 2; 7893 rack->r_ctl.sack_noextra_move /= 2; 7894 } 7895 if (rack->r_ctl.sack_count > 0xfff00000) { 7896 rack->r_ctl.ack_count /= 2; 7897 rack->r_ctl.sack_count /= 2; 7898 } 7899 moved_two = 0; 7900 } 7901 out_with_totals: 7902 if (num_sack_blks > 1) { 7903 /* 7904 * You get an extra stroke if 7905 * you have more than one sack-blk, this 7906 * could be where we are skipping forward 7907 * and the sack-filter is still working, or 7908 * it could be an attacker constantly 7909 * moving us. 7910 */ 7911 rack->r_ctl.sack_moved_extra++; 7912 counter_u64_add(rack_move_some, 1); 7913 } 7914 out: 7915 #ifdef NETFLIX_EXP_DETECTION 7916 if ((rack->do_detection || tcp_force_detection) && 7917 tcp_sack_to_ack_thresh && 7918 tcp_sack_to_move_thresh && 7919 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 7920 /* 7921 * We have thresholds set to find 7922 * possible attackers and disable sack. 7923 * Check them. 7924 */ 7925 uint64_t ackratio, moveratio, movetotal; 7926 7927 /* Log detecting */ 7928 rack_log_sad(rack, 1); 7929 ackratio = (uint64_t)(rack->r_ctl.sack_count); 7930 ackratio *= (uint64_t)(1000); 7931 if (rack->r_ctl.ack_count) 7932 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 7933 else { 7934 /* We really should not hit here */ 7935 ackratio = 1000; 7936 } 7937 if ((rack->sack_attack_disable == 0) && 7938 (ackratio > rack_highest_sack_thresh_seen)) 7939 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 7940 movetotal = rack->r_ctl.sack_moved_extra; 7941 movetotal += rack->r_ctl.sack_noextra_move; 7942 moveratio = rack->r_ctl.sack_moved_extra; 7943 moveratio *= (uint64_t)1000; 7944 if (movetotal) 7945 moveratio /= movetotal; 7946 else { 7947 /* No moves, thats pretty good */ 7948 moveratio = 0; 7949 } 7950 if ((rack->sack_attack_disable == 0) && 7951 (moveratio > rack_highest_move_thresh_seen)) 7952 rack_highest_move_thresh_seen = (uint32_t)moveratio; 7953 if (rack->sack_attack_disable == 0) { 7954 if ((ackratio > tcp_sack_to_ack_thresh) && 7955 (moveratio > tcp_sack_to_move_thresh)) { 7956 /* Disable sack processing */ 7957 rack->sack_attack_disable = 1; 7958 if (rack->r_rep_attack == 0) { 7959 rack->r_rep_attack = 1; 7960 counter_u64_add(rack_sack_attacks_detected, 1); 7961 } 7962 if (tcp_attack_on_turns_on_logging) { 7963 /* 7964 * Turn on logging, used for debugging 7965 * false positives. 
7966 */ 7967 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 7968 } 7969 /* Clamp the cwnd at flight size */ 7970 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 7971 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 7972 rack_log_sad(rack, 2); 7973 } 7974 } else { 7975 /* We are sack-disabled check for false positives */ 7976 if ((ackratio <= tcp_restoral_thresh) || 7977 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 7978 rack->sack_attack_disable = 0; 7979 rack_log_sad(rack, 3); 7980 /* Restart counting */ 7981 rack->r_ctl.sack_count = 0; 7982 rack->r_ctl.sack_moved_extra = 0; 7983 rack->r_ctl.sack_noextra_move = 1; 7984 rack->r_ctl.ack_count = max(1, 7985 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 7986 7987 if (rack->r_rep_reverse == 0) { 7988 rack->r_rep_reverse = 1; 7989 counter_u64_add(rack_sack_attacks_reversed, 1); 7990 } 7991 /* Restore the cwnd */ 7992 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 7993 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 7994 } 7995 } 7996 } 7997 #endif 7998 if (changed) { 7999 /* Something changed cancel the rack timer */ 8000 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8001 } 8002 tsused = tcp_ts_getticks(); 8003 rsm = tcp_rack_output(tp, rack, tsused); 8004 if ((!IN_RECOVERY(tp->t_flags)) && 8005 rsm) { 8006 /* Enter recovery */ 8007 rack->r_ctl.rc_rsm_start = rsm->r_start; 8008 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 8009 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 8010 entered_recovery = 1; 8011 rack_cong_signal(tp, NULL, CC_NDUPACK); 8012 /* 8013 * When we enter recovery we need to assure we send 8014 * one packet. 8015 */ 8016 if (rack->rack_no_prr == 0) { 8017 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 8018 rack_log_to_prr(rack, 8, 0); 8019 } 8020 rack->r_timer_override = 1; 8021 rack->r_early = 0; 8022 rack->r_ctl.rc_agg_early = 0; 8023 } else if (IN_RECOVERY(tp->t_flags) && 8024 rsm && 8025 (rack->r_rr_config == 3)) { 8026 /* 8027 * Assure we can output and we get no 8028 * remembered pace time except the retransmit. 
8029 */ 8030 rack->r_timer_override = 1; 8031 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8032 rack->r_ctl.rc_resend = rsm; 8033 } 8034 if (IN_RECOVERY(tp->t_flags) && 8035 (rack->rack_no_prr == 0) && 8036 (entered_recovery == 0)) { 8037 /* Deal with PRR here (in recovery only) */ 8038 uint32_t pipe, snd_una; 8039 8040 rack->r_ctl.rc_prr_delivered += changed; 8041 /* Compute prr_sndcnt */ 8042 if (SEQ_GT(tp->snd_una, th_ack)) { 8043 snd_una = tp->snd_una; 8044 } else { 8045 snd_una = th_ack; 8046 } 8047 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 8048 if (pipe > tp->snd_ssthresh) { 8049 long sndcnt; 8050 8051 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 8052 if (rack->r_ctl.rc_prr_recovery_fs > 0) 8053 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 8054 else { 8055 rack->r_ctl.rc_prr_sndcnt = 0; 8056 rack_log_to_prr(rack, 9, 0); 8057 sndcnt = 0; 8058 } 8059 sndcnt++; 8060 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 8061 sndcnt -= rack->r_ctl.rc_prr_out; 8062 else 8063 sndcnt = 0; 8064 rack->r_ctl.rc_prr_sndcnt = sndcnt; 8065 rack_log_to_prr(rack, 10, 0); 8066 } else { 8067 uint32_t limit; 8068 8069 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 8070 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 8071 else 8072 limit = 0; 8073 if (changed > limit) 8074 limit = changed; 8075 limit += ctf_fixed_maxseg(tp); 8076 if (tp->snd_ssthresh > pipe) { 8077 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 8078 rack_log_to_prr(rack, 11, 0); 8079 } else { 8080 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 8081 rack_log_to_prr(rack, 12, 0); 8082 } 8083 } 8084 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 8085 ((rack->rc_inp->inp_in_hpts == 0) && 8086 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 8087 /* 8088 * If you are pacing output you don't want 8089 * to override. 8090 */ 8091 rack->r_early = 0; 8092 rack->r_ctl.rc_agg_early = 0; 8093 rack->r_timer_override = 1; 8094 } 8095 } 8096 } 8097 8098 static void 8099 rack_strike_dupack(struct tcp_rack *rack) 8100 { 8101 struct rack_sendmap *rsm; 8102 8103 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8104 if (rsm && (rsm->r_dupack < 0xff)) { 8105 rsm->r_dupack++; 8106 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 8107 rack->r_wanted_output = 1; 8108 rack->r_timer_override = 1; 8109 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 8110 } else { 8111 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 8112 } 8113 } 8114 } 8115 8116 static void 8117 rack_check_bottom_drag(struct tcpcb *tp, 8118 struct tcp_rack *rack, 8119 struct socket *so, int32_t acked) 8120 { 8121 uint32_t segsiz, minseg; 8122 8123 segsiz = ctf_fixed_maxseg(tp); 8124 minseg = segsiz; 8125 8126 if (tp->snd_max == tp->snd_una) { 8127 /* 8128 * We are doing dynamic pacing and we are way 8129 * under. Basically everything got acked while 8130 * we were still waiting on the pacer to expire. 8131 * 8132 * This means we need to boost the b/w in 8133 * addition to any earlier boosting of 8134 * the multipler. 8135 */ 8136 rack->rc_dragged_bottom = 1; 8137 rack_validate_multipliers_at_or_above100(rack); 8138 /* 8139 * Lets use the segment bytes acked plus 8140 * the lowest RTT seen as the basis to 8141 * form a b/w estimate. This will be off 8142 * due to the fact that the true estimate 8143 * should be around 1/2 the time of the RTT 8144 * but we can settle for that. 
8145 */ 8146 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 8147 acked) { 8148 uint64_t bw, calc_bw, rtt; 8149 8150 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8151 bw = acked; 8152 calc_bw = bw * 1000000; 8153 calc_bw /= rtt; 8154 if (rack->r_ctl.last_max_bw && 8155 (rack->r_ctl.last_max_bw < calc_bw)) { 8156 /* 8157 * If we have a last calculated max bw 8158 * enforce it. 8159 */ 8160 calc_bw = rack->r_ctl.last_max_bw; 8161 } 8162 /* now plop it in */ 8163 if (rack->rc_gp_filled == 0) { 8164 if (calc_bw > ONE_POINT_TWO_MEG) { 8165 /* 8166 * If we have no measurement 8167 * don't let us set in more than 8168 * 1.2Mbps. If we are still too 8169 * low after pacing with this we 8170 * will hopefully have a max b/w 8171 * available to sanity check things. 8172 */ 8173 calc_bw = ONE_POINT_TWO_MEG; 8174 } 8175 rack->r_ctl.rc_rtt_diff = 0; 8176 rack->r_ctl.gp_bw = calc_bw; 8177 rack->rc_gp_filled = 1; 8178 rack->r_ctl.num_avg = RACK_REQ_AVG; 8179 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8180 } else if (calc_bw > rack->r_ctl.gp_bw) { 8181 rack->r_ctl.rc_rtt_diff = 0; 8182 rack->r_ctl.num_avg = RACK_REQ_AVG; 8183 rack->r_ctl.gp_bw = calc_bw; 8184 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8185 } else 8186 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8187 /* 8188 * For acks over 1mss we do a extra boost to simulate 8189 * where we would get 2 acks (we want 110 for the mul). 8190 */ 8191 if (acked > segsiz) 8192 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8193 } else { 8194 /* 8195 * Huh, this should not be, settle 8196 * for just an old increase. 8197 */ 8198 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8199 } 8200 } else if ((IN_RECOVERY(tp->t_flags) == 0) && 8201 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 8202 minseg)) && 8203 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 8204 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 8205 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 8206 (segsiz * rack_req_segs))) { 8207 /* 8208 * We are doing dynamic GP pacing and 8209 * we have everything except 1MSS or less 8210 * bytes left out. We are still pacing away. 8211 * And there is data that could be sent, This 8212 * means we are inserting delayed ack time in 8213 * our measurements because we are pacing too slow. 8214 */ 8215 rack_validate_multipliers_at_or_above100(rack); 8216 rack->rc_dragged_bottom = 1; 8217 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8218 } 8219 } 8220 8221 /* 8222 * Return value of 1, we do not need to call rack_process_data(). 8223 * return value of 0, rack_process_data can be called. 8224 * For ret_val if its 0 the TCP is locked, if its non-zero 8225 * its unlocked and probably unsafe to touch the TCB. 
8226 */ 8227 static int 8228 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8229 struct tcpcb *tp, struct tcpopt *to, 8230 uint32_t tiwin, int32_t tlen, 8231 int32_t * ofia, int32_t thflags, int32_t * ret_val) 8232 { 8233 int32_t ourfinisacked = 0; 8234 int32_t nsegs, acked_amount; 8235 int32_t acked; 8236 struct mbuf *mfree; 8237 struct tcp_rack *rack; 8238 int32_t under_pacing = 0; 8239 int32_t recovery = 0; 8240 8241 rack = (struct tcp_rack *)tp->t_fb_ptr; 8242 if (SEQ_GT(th->th_ack, tp->snd_max)) { 8243 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 8244 rack->r_wanted_output = 1; 8245 return (1); 8246 } 8247 if (rack->rc_gp_filled && 8248 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 8249 under_pacing = 1; 8250 } 8251 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 8252 if (rack->rc_in_persist) 8253 tp->t_rxtshift = 0; 8254 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 8255 rack_strike_dupack(rack); 8256 rack_log_ack(tp, to, th); 8257 } 8258 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8259 /* 8260 * Old ack, behind (or duplicate to) the last one rcv'd 8261 * Note: Should mark reordering is occuring! We should also 8262 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 8263 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 8264 * retran and> ack 3 8265 */ 8266 return (0); 8267 } 8268 /* 8269 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 8270 * something we sent. 8271 */ 8272 if (tp->t_flags & TF_NEEDSYN) { 8273 /* 8274 * T/TCP: Connection was half-synchronized, and our SYN has 8275 * been ACK'd (so connection is now fully synchronized). Go 8276 * to non-starred state, increment snd_una for ACK of SYN, 8277 * and check if we can do window scaling. 8278 */ 8279 tp->t_flags &= ~TF_NEEDSYN; 8280 tp->snd_una++; 8281 /* Do window scaling? */ 8282 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8283 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8284 tp->rcv_scale = tp->request_r_scale; 8285 /* Send window already scaled. */ 8286 } 8287 } 8288 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8289 INP_WLOCK_ASSERT(tp->t_inpcb); 8290 8291 acked = BYTES_THIS_ACK(tp, th); 8292 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 8293 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8294 /* 8295 * If we just performed our first retransmit, and the ACK arrives 8296 * within our recovery window, then it was a mistake to do the 8297 * retransmit in the first place. Recover our original cwnd and 8298 * ssthresh, and proceed to transmit where we left off. 8299 */ 8300 if (tp->t_flags & TF_PREVVALID) { 8301 tp->t_flags &= ~TF_PREVVALID; 8302 if (tp->t_rxtshift == 1 && 8303 (int)(ticks - tp->t_badrxtwin) < 0) 8304 rack_cong_signal(tp, th, CC_RTO_ERR); 8305 } 8306 if (acked) { 8307 /* assure we are not backed off */ 8308 tp->t_rxtshift = 0; 8309 rack->rc_tlp_in_progress = 0; 8310 rack->r_ctl.rc_tlp_cnt_out = 0; 8311 /* 8312 * If it is the RXT timer we want to 8313 * stop it, so we can restart a TLP. 8314 */ 8315 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 8316 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8317 #ifdef NETFLIX_HTTP_LOGGING 8318 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 8319 #endif 8320 } 8321 /* 8322 * If we have a timestamp reply, update smoothed round trip time. If 8323 * no timestamp is present but transmit timer is running and timed 8324 * sequence number was acked, update smoothed round trip time. 
Since 8325 * we now have an rtt measurement, cancel the timer backoff (cf., 8326 * Phil Karn's retransmit alg.). Recompute the initial retransmit 8327 * timer. 8328 * 8329 * Some boxes send broken timestamp replies during the SYN+ACK 8330 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8331 * and blow up the retransmit timer. 8332 */ 8333 /* 8334 * If all outstanding data is acked, stop retransmit timer and 8335 * remember to restart (more output or persist). If there is more 8336 * data to be acked, restart retransmit timer, using current 8337 * (possibly backed-off) value. 8338 */ 8339 if (acked == 0) { 8340 if (ofia) 8341 *ofia = ourfinisacked; 8342 return (0); 8343 } 8344 if (rack->r_ctl.rc_early_recovery) { 8345 if (IN_RECOVERY(tp->t_flags)) { 8346 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8347 (SEQ_LT(th->th_ack, tp->snd_max))) { 8348 tcp_rack_partialack(tp, th); 8349 } else { 8350 rack_post_recovery(tp, th); 8351 recovery = 1; 8352 } 8353 } 8354 } 8355 /* 8356 * Let the congestion control algorithm update congestion control 8357 * related information. This typically means increasing the 8358 * congestion window. 8359 */ 8360 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 8361 SOCKBUF_LOCK(&so->so_snd); 8362 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 8363 tp->snd_wnd -= acked_amount; 8364 mfree = sbcut_locked(&so->so_snd, acked_amount); 8365 if ((sbused(&so->so_snd) == 0) && 8366 (acked > acked_amount) && 8367 (tp->t_state >= TCPS_FIN_WAIT_1) && 8368 (tp->t_flags & TF_SENTFIN)) { 8369 /* 8370 * We must be sure our fin 8371 * was sent and acked (we can be 8372 * in FIN_WAIT_1 without having 8373 * sent the fin). 8374 */ 8375 ourfinisacked = 1; 8376 } 8377 /* NB: sowwakeup_locked() does an implicit unlock. */ 8378 sowwakeup_locked(so); 8379 m_freem(mfree); 8380 if (rack->r_ctl.rc_early_recovery == 0) { 8381 if (IN_RECOVERY(tp->t_flags)) { 8382 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8383 (SEQ_LT(th->th_ack, tp->snd_max))) { 8384 tcp_rack_partialack(tp, th); 8385 } else { 8386 rack_post_recovery(tp, th); 8387 } 8388 } 8389 } 8390 tp->snd_una = th->th_ack; 8391 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 8392 tp->snd_recover = tp->snd_una; 8393 8394 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 8395 tp->snd_nxt = tp->snd_una; 8396 } 8397 if (under_pacing && 8398 (rack->use_fixed_rate == 0) && 8399 (rack->in_probe_rtt == 0) && 8400 rack->rc_gp_dyn_mul && 8401 rack->rc_always_pace) { 8402 /* Check if we are dragging bottom */ 8403 rack_check_bottom_drag(tp, rack, so, acked); 8404 } 8405 if (tp->snd_una == tp->snd_max) { 8406 /* Nothing left outstanding */ 8407 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 8408 if (rack->r_ctl.rc_went_idle_time == 0) 8409 rack->r_ctl.rc_went_idle_time = 1; 8410 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 8411 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8412 tp->t_acktime = 0; 8413 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8414 /* Set need output so persist might get set */ 8415 rack->r_wanted_output = 1; 8416 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8417 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8418 (sbavail(&so->so_snd) == 0) && 8419 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 8420 /* 8421 * The socket was gone and the 8422 * peer sent data, time to 8423 * reset him. 
8424 */ 8425 *ret_val = 1; 8426 /* tcp_close will kill the inp pre-log the Reset */ 8427 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 8428 tp = tcp_close(tp); 8429 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 8430 return (1); 8431 8432 } 8433 } 8434 if (ofia) 8435 *ofia = ourfinisacked; 8436 return (0); 8437 } 8438 8439 static void 8440 rack_collapsed_window(struct tcp_rack *rack) 8441 { 8442 /* 8443 * Now we must walk the 8444 * send map and divide the 8445 * ones left stranded. These 8446 * guys can't cause us to abort 8447 * the connection and are really 8448 * "unsent". However if a buggy 8449 * client actually did keep some 8450 * of the data i.e. collapsed the win 8451 * and refused to ack and then opened 8452 * the win and acked that data. We would 8453 * get into an ack war, the simplier 8454 * method then of just pretending we 8455 * did not send those segments something 8456 * won't work. 8457 */ 8458 struct rack_sendmap *rsm, *nrsm, fe, *insret; 8459 tcp_seq max_seq; 8460 8461 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 8462 memset(&fe, 0, sizeof(fe)); 8463 fe.r_start = max_seq; 8464 /* Find the first seq past or at maxseq */ 8465 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8466 if (rsm == NULL) { 8467 /* Nothing to do strange */ 8468 rack->rc_has_collapsed = 0; 8469 return; 8470 } 8471 /* 8472 * Now do we need to split at 8473 * the collapse point? 8474 */ 8475 if (SEQ_GT(max_seq, rsm->r_start)) { 8476 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8477 if (nrsm == NULL) { 8478 /* We can't get a rsm, mark all? */ 8479 nrsm = rsm; 8480 goto no_split; 8481 } 8482 /* Clone it */ 8483 rack_clone_rsm(rack, nrsm, rsm, max_seq); 8484 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8485 #ifdef INVARIANTS 8486 if (insret != NULL) { 8487 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8488 nrsm, insret, rack, rsm); 8489 } 8490 #endif 8491 if (rsm->r_in_tmap) { 8492 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8493 nrsm->r_in_tmap = 1; 8494 } 8495 /* 8496 * Set in the new RSM as the 8497 * collapsed starting point 8498 */ 8499 rsm = nrsm; 8500 } 8501 no_split: 8502 counter_u64_add(rack_collapsed_win, 1); 8503 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 8504 nrsm->r_flags |= RACK_RWND_COLLAPSED; 8505 rack->rc_has_collapsed = 1; 8506 } 8507 } 8508 8509 static void 8510 rack_un_collapse_window(struct tcp_rack *rack) 8511 { 8512 struct rack_sendmap *rsm; 8513 8514 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 8515 if (rsm->r_flags & RACK_RWND_COLLAPSED) 8516 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8517 else 8518 break; 8519 } 8520 rack->rc_has_collapsed = 0; 8521 } 8522 8523 static void 8524 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 8525 int32_t tlen, int32_t tfo_syn) 8526 { 8527 if (DELAY_ACK(tp, tlen) || tfo_syn) { 8528 if (rack->rc_dack_mode && 8529 (tlen > 500) && 8530 (rack->rc_dack_toggle == 1)) { 8531 goto no_delayed_ack; 8532 } 8533 rack_timer_cancel(tp, rack, 8534 rack->r_ctl.rc_rcvtime, __LINE__); 8535 tp->t_flags |= TF_DELACK; 8536 } else { 8537 no_delayed_ack: 8538 rack->r_wanted_output = 1; 8539 tp->t_flags |= TF_ACKNOW; 8540 if (rack->rc_dack_mode) { 8541 if (tp->t_flags & TF_DELACK) 8542 rack->rc_dack_toggle = 1; 8543 else 8544 rack->rc_dack_toggle = 0; 8545 } 8546 } 8547 } 8548 /* 8549 * Return value of 1, the TCB is unlocked and most 8550 * likely gone, return value of 0, the TCP is still 8551 * locked. 
8552 */ 8553 static int 8554 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8555 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8556 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8557 { 8558 /* 8559 * Update window information. Don't look at window if no ACK: TAC's 8560 * send garbage on first SYN. 8561 */ 8562 int32_t nsegs; 8563 int32_t tfo_syn; 8564 struct tcp_rack *rack; 8565 8566 rack = (struct tcp_rack *)tp->t_fb_ptr; 8567 INP_WLOCK_ASSERT(tp->t_inpcb); 8568 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8569 if ((thflags & TH_ACK) && 8570 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8571 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8572 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8573 /* keep track of pure window updates */ 8574 if (tlen == 0 && 8575 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8576 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8577 tp->snd_wnd = tiwin; 8578 tp->snd_wl1 = th->th_seq; 8579 tp->snd_wl2 = th->th_ack; 8580 if (tp->snd_wnd > tp->max_sndwnd) 8581 tp->max_sndwnd = tp->snd_wnd; 8582 rack->r_wanted_output = 1; 8583 } else if (thflags & TH_ACK) { 8584 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8585 tp->snd_wnd = tiwin; 8586 tp->snd_wl1 = th->th_seq; 8587 tp->snd_wl2 = th->th_ack; 8588 } 8589 } 8590 if (tp->snd_wnd < ctf_outstanding(tp)) 8591 /* The peer collapsed the window */ 8592 rack_collapsed_window(rack); 8593 else if (rack->rc_has_collapsed) 8594 rack_un_collapse_window(rack); 8595 /* Was persist timer active and now we have window space? */ 8596 if ((rack->rc_in_persist != 0) && 8597 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8598 rack->r_ctl.rc_pace_min_segs))) { 8599 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8600 tp->snd_nxt = tp->snd_max; 8601 /* Make sure we output to start the timer */ 8602 rack->r_wanted_output = 1; 8603 } 8604 /* Do we enter persists? */ 8605 if ((rack->rc_in_persist == 0) && 8606 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8607 TCPS_HAVEESTABLISHED(tp->t_state) && 8608 (tp->snd_max == tp->snd_una) && 8609 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8610 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8611 /* 8612 * Here the rwnd is less than 8613 * the pacing size, we are established, 8614 * nothing is outstanding, and there is 8615 * data to send. Enter persists. 8616 */ 8617 tp->snd_nxt = tp->snd_una; 8618 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8619 } 8620 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8621 m_freem(m); 8622 return (0); 8623 } 8624 /* 8625 * don't process the URG bit, ignore them drag 8626 * along the up. 8627 */ 8628 tp->rcv_up = tp->rcv_nxt; 8629 INP_WLOCK_ASSERT(tp->t_inpcb); 8630 8631 /* 8632 * Process the segment text, merging it into the TCP sequencing 8633 * queue, and arranging for acknowledgment of receipt if necessary. 8634 * This process logically involves adjusting tp->rcv_wnd as data is 8635 * presented to the user (this happens in tcp_usrreq.c, case 8636 * PRU_RCVD). If a FIN has already been received on this connection 8637 * then we just ignore the text. 
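* In-order data takes the inline path below, where rack_handle_delayed_ack() chooses between TF_DELACK and TF_ACKNOW; out-of-order data goes through tcp_reass() and always forces TF_ACKNOW.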
8638 */ 8639 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8640 IS_FASTOPEN(tp->t_flags)); 8641 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8642 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8643 tcp_seq save_start = th->th_seq; 8644 tcp_seq save_rnxt = tp->rcv_nxt; 8645 int save_tlen = tlen; 8646 8647 m_adj(m, drop_hdrlen); /* delayed header drop */ 8648 /* 8649 * Insert segment which includes th into TCP reassembly 8650 * queue with control block tp. Set thflags to whether 8651 * reassembly now includes a segment with FIN. This handles 8652 * the common case inline (segment is the next to be 8653 * received on an established connection, and the queue is 8654 * empty), avoiding linkage into and removal from the queue 8655 * and repetition of various conversions. Set DELACK for 8656 * segments received in order, but ack immediately when 8657 * segments are out of order (so fast retransmit can work). 8658 */ 8659 if (th->th_seq == tp->rcv_nxt && 8660 SEGQ_EMPTY(tp) && 8661 (TCPS_HAVEESTABLISHED(tp->t_state) || 8662 tfo_syn)) { 8663 #ifdef NETFLIX_SB_LIMITS 8664 u_int mcnt, appended; 8665 8666 if (so->so_rcv.sb_shlim) { 8667 mcnt = m_memcnt(m); 8668 appended = 0; 8669 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8670 CFO_NOSLEEP, NULL) == false) { 8671 counter_u64_add(tcp_sb_shlim_fails, 1); 8672 m_freem(m); 8673 return (0); 8674 } 8675 } 8676 #endif 8677 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 8678 tp->rcv_nxt += tlen; 8679 if (tlen && 8680 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8681 (tp->t_fbyte_in == 0)) { 8682 tp->t_fbyte_in = ticks; 8683 if (tp->t_fbyte_in == 0) 8684 tp->t_fbyte_in = 1; 8685 if (tp->t_fbyte_out && tp->t_fbyte_in) 8686 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8687 } 8688 thflags = th->th_flags & TH_FIN; 8689 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8690 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8691 SOCKBUF_LOCK(&so->so_rcv); 8692 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8693 m_freem(m); 8694 } else 8695 #ifdef NETFLIX_SB_LIMITS 8696 appended = 8697 #endif 8698 sbappendstream_locked(&so->so_rcv, m, 0); 8699 /* NB: sorwakeup_locked() does an implicit unlock. */ 8700 sorwakeup_locked(so); 8701 #ifdef NETFLIX_SB_LIMITS 8702 if (so->so_rcv.sb_shlim && appended != mcnt) 8703 counter_fo_release(so->so_rcv.sb_shlim, 8704 mcnt - appended); 8705 #endif 8706 } else { 8707 /* 8708 * XXX: Due to the header drop above "th" is 8709 * theoretically invalid by now. Fortunately 8710 * m_adj() doesn't actually frees any mbufs when 8711 * trimming from the head. 8712 */ 8713 tcp_seq temp = save_start; 8714 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8715 tp->t_flags |= TF_ACKNOW; 8716 } 8717 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 8718 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8719 /* 8720 * DSACK actually handled in the fastpath 8721 * above. 8722 */ 8723 RACK_OPTS_INC(tcp_sack_path_1); 8724 tcp_update_sack_list(tp, save_start, 8725 save_start + save_tlen); 8726 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8727 if ((tp->rcv_numsacks >= 1) && 8728 (tp->sackblks[0].end == save_start)) { 8729 /* 8730 * Partial overlap, recorded at todrop 8731 * above. 8732 */ 8733 RACK_OPTS_INC(tcp_sack_path_2a); 8734 tcp_update_sack_list(tp, 8735 tp->sackblks[0].start, 8736 tp->sackblks[0].end); 8737 } else { 8738 RACK_OPTS_INC(tcp_sack_path_2b); 8739 tcp_update_dsack_list(tp, save_start, 8740 save_start + save_tlen); 8741 } 8742 } else if (tlen >= save_tlen) { 8743 /* Update of sackblks. 
*/ 8744 RACK_OPTS_INC(tcp_sack_path_3); 8745 tcp_update_dsack_list(tp, save_start, 8746 save_start + save_tlen); 8747 } else if (tlen > 0) { 8748 RACK_OPTS_INC(tcp_sack_path_4); 8749 tcp_update_dsack_list(tp, save_start, 8750 save_start + tlen); 8751 } 8752 } 8753 } else { 8754 m_freem(m); 8755 thflags &= ~TH_FIN; 8756 } 8757 8758 /* 8759 * If FIN is received ACK the FIN and let the user know that the 8760 * connection is closing. 8761 */ 8762 if (thflags & TH_FIN) { 8763 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8764 socantrcvmore(so); 8765 /* 8766 * If connection is half-synchronized (ie NEEDSYN 8767 * flag on) then delay ACK, so it may be piggybacked 8768 * when SYN is sent. Otherwise, since we received a 8769 * FIN then no more input can be expected, send ACK 8770 * now. 8771 */ 8772 if (tp->t_flags & TF_NEEDSYN) { 8773 rack_timer_cancel(tp, rack, 8774 rack->r_ctl.rc_rcvtime, __LINE__); 8775 tp->t_flags |= TF_DELACK; 8776 } else { 8777 tp->t_flags |= TF_ACKNOW; 8778 } 8779 tp->rcv_nxt++; 8780 } 8781 switch (tp->t_state) { 8782 8783 /* 8784 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8785 * CLOSE_WAIT state. 8786 */ 8787 case TCPS_SYN_RECEIVED: 8788 tp->t_starttime = ticks; 8789 /* FALLTHROUGH */ 8790 case TCPS_ESTABLISHED: 8791 rack_timer_cancel(tp, rack, 8792 rack->r_ctl.rc_rcvtime, __LINE__); 8793 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8794 break; 8795 8796 /* 8797 * If still in FIN_WAIT_1 STATE FIN has not been 8798 * acked so enter the CLOSING state. 8799 */ 8800 case TCPS_FIN_WAIT_1: 8801 rack_timer_cancel(tp, rack, 8802 rack->r_ctl.rc_rcvtime, __LINE__); 8803 tcp_state_change(tp, TCPS_CLOSING); 8804 break; 8805 8806 /* 8807 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8808 * starting the time-wait timer, turning off the 8809 * other standard timers. 8810 */ 8811 case TCPS_FIN_WAIT_2: 8812 rack_timer_cancel(tp, rack, 8813 rack->r_ctl.rc_rcvtime, __LINE__); 8814 tcp_twstart(tp); 8815 return (1); 8816 } 8817 } 8818 /* 8819 * Return any desired output. 8820 */ 8821 if ((tp->t_flags & TF_ACKNOW) || 8822 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 8823 rack->r_wanted_output = 1; 8824 } 8825 INP_WLOCK_ASSERT(tp->t_inpcb); 8826 return (0); 8827 } 8828 8829 /* 8830 * Here nothing is really faster, its just that we 8831 * have broken out the fast-data path also just like 8832 * the fast-ack. 8833 */ 8834 static int 8835 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8836 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8837 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8838 { 8839 int32_t nsegs; 8840 int32_t newsize = 0; /* automatic sockbuf scaling */ 8841 struct tcp_rack *rack; 8842 #ifdef NETFLIX_SB_LIMITS 8843 u_int mcnt, appended; 8844 #endif 8845 #ifdef TCPDEBUG 8846 /* 8847 * The size of tcp_saveipgen must be the size of the max ip header, 8848 * now IPv6. 8849 */ 8850 u_char tcp_saveipgen[IP6_HDR_LEN]; 8851 struct tcphdr tcp_savetcp; 8852 short ostate = 0; 8853 8854 #endif 8855 /* 8856 * If last ACK falls within this segment's sequence numbers, record 8857 * the timestamp. NOTE that the test is modified according to the 8858 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
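* Before doing so, the __predict_false() checks below return 0 (punt to the slow path) for anything that is not pure, in-sequence new data.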
8859 */ 8860 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 8861 return (0); 8862 } 8863 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8864 return (0); 8865 } 8866 if (tiwin && tiwin != tp->snd_wnd) { 8867 return (0); 8868 } 8869 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8870 return (0); 8871 } 8872 if (__predict_false((to->to_flags & TOF_TS) && 8873 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8874 return (0); 8875 } 8876 if (__predict_false((th->th_ack != tp->snd_una))) { 8877 return (0); 8878 } 8879 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8880 return (0); 8881 } 8882 if ((to->to_flags & TOF_TS) != 0 && 8883 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8884 tp->ts_recent_age = tcp_ts_getticks(); 8885 tp->ts_recent = to->to_tsval; 8886 } 8887 rack = (struct tcp_rack *)tp->t_fb_ptr; 8888 /* 8889 * This is a pure, in-sequence data packet with nothing on the 8890 * reassembly queue and we have enough buffer space to take it. 8891 */ 8892 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8893 8894 #ifdef NETFLIX_SB_LIMITS 8895 if (so->so_rcv.sb_shlim) { 8896 mcnt = m_memcnt(m); 8897 appended = 0; 8898 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8899 CFO_NOSLEEP, NULL) == false) { 8900 counter_u64_add(tcp_sb_shlim_fails, 1); 8901 m_freem(m); 8902 return (1); 8903 } 8904 } 8905 #endif 8906 /* Clean receiver SACK report if present */ 8907 if (tp->rcv_numsacks) 8908 tcp_clean_sackreport(tp); 8909 KMOD_TCPSTAT_INC(tcps_preddat); 8910 tp->rcv_nxt += tlen; 8911 if (tlen && 8912 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8913 (tp->t_fbyte_in == 0)) { 8914 tp->t_fbyte_in = ticks; 8915 if (tp->t_fbyte_in == 0) 8916 tp->t_fbyte_in = 1; 8917 if (tp->t_fbyte_out && tp->t_fbyte_in) 8918 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8919 } 8920 /* 8921 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 8922 */ 8923 tp->snd_wl1 = th->th_seq; 8924 /* 8925 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 8926 */ 8927 tp->rcv_up = tp->rcv_nxt; 8928 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8929 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8930 #ifdef TCPDEBUG 8931 if (so->so_options & SO_DEBUG) 8932 tcp_trace(TA_INPUT, ostate, tp, 8933 (void *)tcp_saveipgen, &tcp_savetcp, 0); 8934 #endif 8935 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 8936 8937 /* Add data to socket buffer. */ 8938 SOCKBUF_LOCK(&so->so_rcv); 8939 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8940 m_freem(m); 8941 } else { 8942 /* 8943 * Set new socket buffer size. Give up when limit is 8944 * reached. 8945 */ 8946 if (newsize) 8947 if (!sbreserve_locked(&so->so_rcv, 8948 newsize, so, NULL)) 8949 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 8950 m_adj(m, drop_hdrlen); /* delayed header drop */ 8951 #ifdef NETFLIX_SB_LIMITS 8952 appended = 8953 #endif 8954 sbappendstream_locked(&so->so_rcv, m, 0); 8955 ctf_calc_rwin(so, tp); 8956 } 8957 /* NB: sorwakeup_locked() does an implicit unlock. */ 8958 sorwakeup_locked(so); 8959 #ifdef NETFLIX_SB_LIMITS 8960 if (so->so_rcv.sb_shlim && mcnt != appended) 8961 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 8962 #endif 8963 rack_handle_delayed_ack(tp, rack, tlen, 0); 8964 if (tp->snd_una == tp->snd_max) 8965 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8966 return (1); 8967 } 8968 8969 /* 8970 * This subfunction is used to try to highly optimize the 8971 * fast path. We again allow window updates that are 8972 * in sequence to remain in the fast-path. We also add 8973 * in the __predict's to attempt to help the compiler. 
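* Any failed check below simply returns 0 without modifying connection state.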
8974 * Note that if we return a 0, then we can *not* process 8975 * it and the caller should push the packet into the 8976 * slow-path. 8977 */ 8978 static int 8979 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8980 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8981 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 8982 { 8983 int32_t acked; 8984 int32_t nsegs; 8985 #ifdef TCPDEBUG 8986 /* 8987 * The size of tcp_saveipgen must be the size of the max ip header, 8988 * now IPv6. 8989 */ 8990 u_char tcp_saveipgen[IP6_HDR_LEN]; 8991 struct tcphdr tcp_savetcp; 8992 short ostate = 0; 8993 #endif 8994 int32_t under_pacing = 0; 8995 struct tcp_rack *rack; 8996 8997 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8998 /* Old ack, behind (or duplicate to) the last one rcv'd */ 8999 return (0); 9000 } 9001 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 9002 /* Above what we have sent? */ 9003 return (0); 9004 } 9005 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 9006 /* We are retransmitting */ 9007 return (0); 9008 } 9009 if (__predict_false(tiwin == 0)) { 9010 /* zero window */ 9011 return (0); 9012 } 9013 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 9014 /* We need a SYN or a FIN, unlikely.. */ 9015 return (0); 9016 } 9017 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 9018 /* Timestamp is behind .. old ack with seq wrap? */ 9019 return (0); 9020 } 9021 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 9022 /* Still recovering */ 9023 return (0); 9024 } 9025 rack = (struct tcp_rack *)tp->t_fb_ptr; 9026 if (rack->r_ctl.rc_sacked) { 9027 /* We have sack holes on our scoreboard */ 9028 return (0); 9029 } 9030 /* Ok if we reach here, we can process a fast-ack */ 9031 if (rack->rc_gp_filled && 9032 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9033 under_pacing = 1; 9034 } 9035 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9036 rack_log_ack(tp, to, th); 9037 /* Did the window get updated? */ 9038 if (tiwin != tp->snd_wnd) { 9039 tp->snd_wnd = tiwin; 9040 tp->snd_wl1 = th->th_seq; 9041 if (tp->snd_wnd > tp->max_sndwnd) 9042 tp->max_sndwnd = tp->snd_wnd; 9043 } 9044 /* Do we exit persists? */ 9045 if ((rack->rc_in_persist != 0) && 9046 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 9047 rack->r_ctl.rc_pace_min_segs))) { 9048 rack_exit_persist(tp, rack, cts); 9049 } 9050 /* Do we enter persists? */ 9051 if ((rack->rc_in_persist == 0) && 9052 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 9053 TCPS_HAVEESTABLISHED(tp->t_state) && 9054 (tp->snd_max == tp->snd_una) && 9055 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 9056 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 9057 /* 9058 * Here the rwnd is less than 9059 * the pacing size, we are established, 9060 * nothing is outstanding, and there is 9061 * data to send. Enter persists. 9062 */ 9063 tp->snd_nxt = tp->snd_una; 9064 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 9065 } 9066 /* 9067 * If last ACK falls within this segment's sequence numbers, record 9068 * the timestamp. NOTE that the test is modified according to the 9069 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 9070 */ 9071 if ((to->to_flags & TOF_TS) != 0 && 9072 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 9073 tp->ts_recent_age = tcp_ts_getticks(); 9074 tp->ts_recent = to->to_tsval; 9075 } 9076 /* 9077 * This is a pure ack for outstanding data. 
9078 */ 9079 KMOD_TCPSTAT_INC(tcps_predack); 9080 9081 /* 9082 * "bad retransmit" recovery. 9083 */ 9084 if (tp->t_flags & TF_PREVVALID) { 9085 tp->t_flags &= ~TF_PREVVALID; 9086 if (tp->t_rxtshift == 1 && 9087 (int)(ticks - tp->t_badrxtwin) < 0) 9088 rack_cong_signal(tp, th, CC_RTO_ERR); 9089 } 9090 /* 9091 * Recalculate the transmit timer / rtt. 9092 * 9093 * Some boxes send broken timestamp replies during the SYN+ACK 9094 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9095 * and blow up the retransmit timer. 9096 */ 9097 acked = BYTES_THIS_ACK(tp, th); 9098 9099 #ifdef TCP_HHOOK 9100 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 9101 hhook_run_tcp_est_in(tp, th, to); 9102 #endif 9103 9104 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9105 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9106 sbdrop(&so->so_snd, acked); 9107 if (acked) { 9108 /* assure we are not backed off */ 9109 tp->t_rxtshift = 0; 9110 rack->rc_tlp_in_progress = 0; 9111 rack->r_ctl.rc_tlp_cnt_out = 0; 9112 /* 9113 * If it is the RXT timer we want to 9114 * stop it, so we can restart a TLP. 9115 */ 9116 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9117 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9118 #ifdef NETFLIX_HTTP_LOGGING 9119 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9120 #endif 9121 } 9122 /* 9123 * Let the congestion control algorithm update congestion control 9124 * related information. This typically means increasing the 9125 * congestion window. 9126 */ 9127 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 9128 9129 tp->snd_una = th->th_ack; 9130 if (tp->snd_wnd < ctf_outstanding(tp)) { 9131 /* The peer collapsed the window */ 9132 rack_collapsed_window(rack); 9133 } else if (rack->rc_has_collapsed) 9134 rack_un_collapse_window(rack); 9135 9136 /* 9137 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 9138 */ 9139 tp->snd_wl2 = th->th_ack; 9140 tp->t_dupacks = 0; 9141 m_freem(m); 9142 /* ND6_HINT(tp); *//* Some progress has been made. */ 9143 9144 /* 9145 * If all outstanding data are acked, stop retransmit timer, 9146 * otherwise restart timer using current (possibly backed-off) 9147 * value. If process is waiting for space, wakeup/selwakeup/signal. 9148 * If data are ready to send, let tcp_output decide between more 9149 * output or persist. 9150 */ 9151 #ifdef TCPDEBUG 9152 if (so->so_options & SO_DEBUG) 9153 tcp_trace(TA_INPUT, ostate, tp, 9154 (void *)tcp_saveipgen, 9155 &tcp_savetcp, 0); 9156 #endif 9157 if (under_pacing && 9158 (rack->use_fixed_rate == 0) && 9159 (rack->in_probe_rtt == 0) && 9160 rack->rc_gp_dyn_mul && 9161 rack->rc_always_pace) { 9162 /* Check if we are dragging bottom */ 9163 rack_check_bottom_drag(tp, rack, so, acked); 9164 } 9165 if (tp->snd_una == tp->snd_max) { 9166 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9167 if (rack->r_ctl.rc_went_idle_time == 0) 9168 rack->r_ctl.rc_went_idle_time = 1; 9169 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9170 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9171 tp->t_acktime = 0; 9172 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9173 } 9174 /* Wake up the socket if we have room to write more */ 9175 sowwakeup(so); 9176 if (sbavail(&so->so_snd)) { 9177 rack->r_wanted_output = 1; 9178 } 9179 return (1); 9180 } 9181 9182 /* 9183 * Return value of 1, the TCB is unlocked and most 9184 * likely gone, return value of 0, the TCP is still 9185 * locked. 
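* rack_do_syn_sent() handles segments arriving in the SYN_SENT state: it validates the ACK/RST/SYN combination, completes the handshake (or records a simultaneous open) and then falls into the common ack/data processing.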
9186 */ 9187 static int 9188 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 9189 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9190 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9191 { 9192 int32_t ret_val = 0; 9193 int32_t todrop; 9194 int32_t ourfinisacked = 0; 9195 struct tcp_rack *rack; 9196 9197 ctf_calc_rwin(so, tp); 9198 /* 9199 * If the state is SYN_SENT: if seg contains an ACK, but not for our 9200 * SYN, drop the input. if seg contains a RST, then drop the 9201 * connection. if seg does not contain SYN, then drop it. Otherwise 9202 * this is an acceptable SYN segment initialize tp->rcv_nxt and 9203 * tp->irs if seg contains ack then advance tp->snd_una if seg 9204 * contains an ECE and ECN support is enabled, the stream is ECN 9205 * capable. if SYN has been acked change to ESTABLISHED else 9206 * SYN_RCVD state arrange for segment to be acked (eventually) 9207 * continue processing rest of data/controls. 9208 */ 9209 if ((thflags & TH_ACK) && 9210 (SEQ_LEQ(th->th_ack, tp->iss) || 9211 SEQ_GT(th->th_ack, tp->snd_max))) { 9212 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9213 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9214 return (1); 9215 } 9216 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 9217 TCP_PROBE5(connect__refused, NULL, tp, 9218 mtod(m, const char *), tp, th); 9219 tp = tcp_drop(tp, ECONNREFUSED); 9220 ctf_do_drop(m, tp); 9221 return (1); 9222 } 9223 if (thflags & TH_RST) { 9224 ctf_do_drop(m, tp); 9225 return (1); 9226 } 9227 if (!(thflags & TH_SYN)) { 9228 ctf_do_drop(m, tp); 9229 return (1); 9230 } 9231 tp->irs = th->th_seq; 9232 tcp_rcvseqinit(tp); 9233 rack = (struct tcp_rack *)tp->t_fb_ptr; 9234 if (thflags & TH_ACK) { 9235 int tfo_partial = 0; 9236 9237 KMOD_TCPSTAT_INC(tcps_connects); 9238 soisconnected(so); 9239 #ifdef MAC 9240 mac_socketpeer_set_from_mbuf(m, so); 9241 #endif 9242 /* Do window scaling on this connection? */ 9243 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9244 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9245 tp->rcv_scale = tp->request_r_scale; 9246 } 9247 tp->rcv_adv += min(tp->rcv_wnd, 9248 TCP_MAXWIN << tp->rcv_scale); 9249 /* 9250 * If not all the data that was sent in the TFO SYN 9251 * has been acked, resend the remainder right away. 9252 */ 9253 if (IS_FASTOPEN(tp->t_flags) && 9254 (tp->snd_una != tp->snd_max)) { 9255 tp->snd_nxt = th->th_ack; 9256 tfo_partial = 1; 9257 } 9258 /* 9259 * If there's data, delay ACK; if there's also a FIN ACKNOW 9260 * will be turned on later. 9261 */ 9262 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 9263 rack_timer_cancel(tp, rack, 9264 rack->r_ctl.rc_rcvtime, __LINE__); 9265 tp->t_flags |= TF_DELACK; 9266 } else { 9267 rack->r_wanted_output = 1; 9268 tp->t_flags |= TF_ACKNOW; 9269 rack->rc_dack_toggle = 0; 9270 } 9271 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 9272 (V_tcp_do_ecn == 1)) { 9273 tp->t_flags2 |= TF2_ECN_PERMIT; 9274 KMOD_TCPSTAT_INC(tcps_ecn_shs); 9275 } 9276 if (SEQ_GT(th->th_ack, tp->snd_una)) { 9277 /* 9278 * We advance snd_una for the 9279 * fast open case. If th_ack is 9280 * acknowledging data beyond 9281 * snd_una we can't just call 9282 * ack-processing since the 9283 * data stream in our send-map 9284 * will start at snd_una + 1 (one 9285 * beyond the SYN). If its just 9286 * equal we don't need to do that 9287 * and there is no send_map. 9288 */ 9289 tp->snd_una++; 9290 } 9291 /* 9292 * Received <SYN,ACK> in SYN_SENT[*] state. 
Transitions: 9293 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 9294 */ 9295 tp->t_starttime = ticks; 9296 if (tp->t_flags & TF_NEEDFIN) { 9297 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9298 tp->t_flags &= ~TF_NEEDFIN; 9299 thflags &= ~TH_SYN; 9300 } else { 9301 tcp_state_change(tp, TCPS_ESTABLISHED); 9302 TCP_PROBE5(connect__established, NULL, tp, 9303 mtod(m, const char *), tp, th); 9304 rack_cc_conn_init(tp); 9305 } 9306 } else { 9307 /* 9308 * Received initial SYN in SYN-SENT[*] state => simultaneous 9309 * open. If segment contains CC option and there is a 9310 * cached CC, apply TAO test. If it succeeds, connection is * 9311 * half-synchronized. Otherwise, do 3-way handshake: 9312 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 9313 * there was no CC option, clear cached CC value. 9314 */ 9315 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 9316 tcp_state_change(tp, TCPS_SYN_RECEIVED); 9317 } 9318 INP_WLOCK_ASSERT(tp->t_inpcb); 9319 /* 9320 * Advance th->th_seq to correspond to first data byte. If data, 9321 * trim to stay within window, dropping FIN if necessary. 9322 */ 9323 th->th_seq++; 9324 if (tlen > tp->rcv_wnd) { 9325 todrop = tlen - tp->rcv_wnd; 9326 m_adj(m, -todrop); 9327 tlen = tp->rcv_wnd; 9328 thflags &= ~TH_FIN; 9329 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 9330 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 9331 } 9332 tp->snd_wl1 = th->th_seq - 1; 9333 tp->rcv_up = th->th_seq; 9334 /* 9335 * Client side of transaction: already sent SYN and data. If the 9336 * remote host used T/TCP to validate the SYN, our data will be 9337 * ACK'd; if so, enter normal data segment processing in the middle 9338 * of step 5, ack processing. Otherwise, goto step 6. 9339 */ 9340 if (thflags & TH_ACK) { 9341 /* For syn-sent we need to possibly update the rtt */ 9342 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9343 uint32_t t; 9344 9345 t = tcp_ts_getticks() - to->to_tsecr; 9346 if (!tp->t_rttlow || tp->t_rttlow > t) 9347 tp->t_rttlow = t; 9348 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9349 tcp_rack_xmit_timer_commit(rack, tp); 9350 } 9351 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 9352 return (ret_val); 9353 /* We may have changed to FIN_WAIT_1 above */ 9354 if (tp->t_state == TCPS_FIN_WAIT_1) { 9355 /* 9356 * In FIN_WAIT_1 STATE in addition to the processing 9357 * for the ESTABLISHED state if our FIN is now 9358 * acknowledged then enter FIN_WAIT_2. 9359 */ 9360 if (ourfinisacked) { 9361 /* 9362 * If we can't receive any more data, then 9363 * closing user can proceed. Starting the 9364 * timer is contrary to the specification, 9365 * but if we don't get a FIN we'll hang 9366 * forever. 9367 * 9368 * XXXjl: we should release the tp also, and 9369 * use a compressed state. 9370 */ 9371 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9372 soisdisconnected(so); 9373 tcp_timer_activate(tp, TT_2MSL, 9374 (tcp_fast_finwait2_recycle ? 9375 tcp_finwait2_timeout : 9376 TP_MAXIDLE(tp))); 9377 } 9378 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9379 } 9380 } 9381 } 9382 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9383 tiwin, thflags, nxt_pkt)); 9384 } 9385 9386 /* 9387 * Return value of 1, the TCB is unlocked and most 9388 * likely gone, return value of 0, the TCP is still 9389 * locked. 
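* rack_do_syn_recv() handles segments arriving in SYN_RECEIVED, including the stricter validation used for TFO connections and the IRS sequence check that guards against LAND-style spoofed segments.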
9390 */ 9391 static int 9392 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 9393 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9394 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9395 { 9396 struct tcp_rack *rack; 9397 int32_t ret_val = 0; 9398 int32_t ourfinisacked = 0; 9399 9400 ctf_calc_rwin(so, tp); 9401 if ((thflags & TH_ACK) && 9402 (SEQ_LEQ(th->th_ack, tp->snd_una) || 9403 SEQ_GT(th->th_ack, tp->snd_max))) { 9404 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9405 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9406 return (1); 9407 } 9408 rack = (struct tcp_rack *)tp->t_fb_ptr; 9409 if (IS_FASTOPEN(tp->t_flags)) { 9410 /* 9411 * When a TFO connection is in SYN_RECEIVED, the 9412 * only valid packets are the initial SYN, a 9413 * retransmit/copy of the initial SYN (possibly with 9414 * a subset of the original data), a valid ACK, a 9415 * FIN, or a RST. 9416 */ 9417 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9418 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9419 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9420 return (1); 9421 } else if (thflags & TH_SYN) { 9422 /* non-initial SYN is ignored */ 9423 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9424 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9425 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9426 ctf_do_drop(m, NULL); 9427 return (0); 9428 } 9429 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9430 ctf_do_drop(m, NULL); 9431 return (0); 9432 } 9433 } 9434 if ((thflags & TH_RST) || 9435 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9436 return (ctf_process_rst(m, th, so, tp)); 9437 /* 9438 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9439 * it's less than ts_recent, drop it. 9440 */ 9441 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9442 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9443 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9444 return (ret_val); 9445 } 9446 /* 9447 * In the SYN-RECEIVED state, validate that the packet belongs to 9448 * this connection before trimming the data to fit the receive 9449 * window. Check the sequence number versus IRS since we know the 9450 * sequence numbers haven't wrapped. This is a partial fix for the 9451 * "LAND" DoS attack. 9452 */ 9453 if (SEQ_LT(th->th_seq, tp->irs)) { 9454 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9455 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9456 return (1); 9457 } 9458 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9459 return (ret_val); 9460 } 9461 /* 9462 * If last ACK falls within this segment's sequence numbers, record 9463 * its timestamp. NOTE: 1) That the test incorporates suggestions 9464 * from the latest proposal of the tcplw@cray.com list (Braden 9465 * 1993/04/26). 2) That updating only on newer timestamps interferes 9466 * with our earlier PAWS tests, so this check should be solely 9467 * predicated on the sequence space of this segment. 3) That we 9468 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9469 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9470 * SEG.Len, This modified check allows us to overcome RFC1323's 9471 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9472 * p.869. In such cases, we can still calculate the RTT correctly 9473 * when RCV.NXT == Last.ACK.Sent. 
9474 */ 9475 if ((to->to_flags & TOF_TS) != 0 && 9476 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9477 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9478 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9479 tp->ts_recent_age = tcp_ts_getticks(); 9480 tp->ts_recent = to->to_tsval; 9481 } 9482 tp->snd_wnd = tiwin; 9483 /* 9484 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9485 * is on (half-synchronized state), then queue data for later 9486 * processing; else drop segment and return. 9487 */ 9488 if ((thflags & TH_ACK) == 0) { 9489 if (IS_FASTOPEN(tp->t_flags)) { 9490 rack_cc_conn_init(tp); 9491 } 9492 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9493 tiwin, thflags, nxt_pkt)); 9494 } 9495 KMOD_TCPSTAT_INC(tcps_connects); 9496 soisconnected(so); 9497 /* Do window scaling? */ 9498 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9499 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9500 tp->rcv_scale = tp->request_r_scale; 9501 } 9502 /* 9503 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9504 * FIN-WAIT-1 9505 */ 9506 tp->t_starttime = ticks; 9507 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9508 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9509 tp->t_tfo_pending = NULL; 9510 } 9511 if (tp->t_flags & TF_NEEDFIN) { 9512 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9513 tp->t_flags &= ~TF_NEEDFIN; 9514 } else { 9515 tcp_state_change(tp, TCPS_ESTABLISHED); 9516 TCP_PROBE5(accept__established, NULL, tp, 9517 mtod(m, const char *), tp, th); 9518 /* 9519 * TFO connections call cc_conn_init() during SYN 9520 * processing. Calling it again here for such connections 9521 * is not harmless as it would undo the snd_cwnd reduction 9522 * that occurs when a TFO SYN|ACK is retransmitted. 9523 */ 9524 if (!IS_FASTOPEN(tp->t_flags)) 9525 rack_cc_conn_init(tp); 9526 } 9527 /* 9528 * Account for the ACK of our SYN prior to 9529 * regular ACK processing below, except for 9530 * simultaneous SYN, which is handled later. 9531 */ 9532 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9533 tp->snd_una++; 9534 /* 9535 * If segment contains data or ACK, will call tcp_reass() later; if 9536 * not, do so now to pass queued data to user. 9537 */ 9538 if (tlen == 0 && (thflags & TH_FIN) == 0) 9539 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9540 (struct mbuf *)0); 9541 tp->snd_wl1 = th->th_seq - 1; 9542 /* For syn-recv we need to possibly update the rtt */ 9543 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9544 uint32_t t; 9545 9546 t = tcp_ts_getticks() - to->to_tsecr; 9547 if (!tp->t_rttlow || tp->t_rttlow > t) 9548 tp->t_rttlow = t; 9549 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9550 tcp_rack_xmit_timer_commit(rack, tp); 9551 } 9552 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9553 return (ret_val); 9554 } 9555 if (tp->t_state == TCPS_FIN_WAIT_1) { 9556 /* We could have went to FIN_WAIT_1 (or EST) above */ 9557 /* 9558 * In FIN_WAIT_1 STATE in addition to the processing for the 9559 * ESTABLISHED state if our FIN is now acknowledged then 9560 * enter FIN_WAIT_2. 9561 */ 9562 if (ourfinisacked) { 9563 /* 9564 * If we can't receive any more data, then closing 9565 * user can proceed. Starting the timer is contrary 9566 * to the specification, but if we don't get a FIN 9567 * we'll hang forever. 9568 * 9569 * XXXjl: we should release the tp also, and use a 9570 * compressed state. 
9571 */ 9572 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9573 soisdisconnected(so); 9574 tcp_timer_activate(tp, TT_2MSL, 9575 (tcp_fast_finwait2_recycle ? 9576 tcp_finwait2_timeout : 9577 TP_MAXIDLE(tp))); 9578 } 9579 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9580 } 9581 } 9582 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9583 tiwin, thflags, nxt_pkt)); 9584 } 9585 9586 /* 9587 * Return value of 1, the TCB is unlocked and most 9588 * likely gone, return value of 0, the TCP is still 9589 * locked. 9590 */ 9591 static int 9592 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9593 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9594 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9595 { 9596 int32_t ret_val = 0; 9597 struct tcp_rack *rack; 9598 9599 /* 9600 * Header prediction: check for the two common cases of a 9601 * uni-directional data xfer. If the packet has no control flags, 9602 * is in-sequence, the window didn't change and we're not 9603 * retransmitting, it's a candidate. If the length is zero and the 9604 * ack moved forward, we're the sender side of the xfer. Just free 9605 * the data acked & wake any higher level process that was blocked 9606 * waiting for space. If the length is non-zero and the ack didn't 9607 * move, we're the receiver side. If we're getting packets in-order 9608 * (the reassembly queue is empty), add the data to the socket 9609 * buffer and note that we need a delayed ack. Make sure that the 9610 * hidden state-flags are also off. Since we check for 9611 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 9612 */ 9613 rack = (struct tcp_rack *)tp->t_fb_ptr; 9614 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9615 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 9616 __predict_true(SEGQ_EMPTY(tp)) && 9617 __predict_true(th->th_seq == tp->rcv_nxt)) { 9618 if (tlen == 0) { 9619 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9620 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 9621 return (0); 9622 } 9623 } else { 9624 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9625 tiwin, nxt_pkt, iptos)) { 9626 return (0); 9627 } 9628 } 9629 } 9630 ctf_calc_rwin(so, tp); 9631 9632 if ((thflags & TH_RST) || 9633 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9634 return (ctf_process_rst(m, th, so, tp)); 9635 9636 /* 9637 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9638 * synchronized state. 9639 */ 9640 if (thflags & TH_SYN) { 9641 ctf_challenge_ack(m, th, tp, &ret_val); 9642 return (ret_val); 9643 } 9644 /* 9645 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9646 * it's less than ts_recent, drop it. 9647 */ 9648 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9649 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9650 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9651 return (ret_val); 9652 } 9653 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9654 return (ret_val); 9655 } 9656 /* 9657 * If last ACK falls within this segment's sequence numbers, record 9658 * its timestamp. NOTE: 1) That the test incorporates suggestions 9659 * from the latest proposal of the tcplw@cray.com list (Braden 9660 * 1993/04/26). 2) That updating only on newer timestamps interferes 9661 * with our earlier PAWS tests, so this check should be solely 9662 * predicated on the sequence space of this segment.
3) That we 9663 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9664 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9665 * SEG.Len, This modified check allows us to overcome RFC1323's 9666 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9667 * p.869. In such cases, we can still calculate the RTT correctly 9668 * when RCV.NXT == Last.ACK.Sent. 9669 */ 9670 if ((to->to_flags & TOF_TS) != 0 && 9671 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9672 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9673 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9674 tp->ts_recent_age = tcp_ts_getticks(); 9675 tp->ts_recent = to->to_tsval; 9676 } 9677 /* 9678 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9679 * is on (half-synchronized state), then queue data for later 9680 * processing; else drop segment and return. 9681 */ 9682 if ((thflags & TH_ACK) == 0) { 9683 if (tp->t_flags & TF_NEEDSYN) { 9684 9685 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9686 tiwin, thflags, nxt_pkt)); 9687 9688 } else if (tp->t_flags & TF_ACKNOW) { 9689 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9690 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 9691 return (ret_val); 9692 } else { 9693 ctf_do_drop(m, NULL); 9694 return (0); 9695 } 9696 } 9697 /* 9698 * Ack processing. 9699 */ 9700 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9701 return (ret_val); 9702 } 9703 if (sbavail(&so->so_snd)) { 9704 if (ctf_progress_timeout_check(tp, true)) { 9705 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 9706 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9707 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9708 return (1); 9709 } 9710 } 9711 /* State changes only happen in rack_process_data() */ 9712 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9713 tiwin, thflags, nxt_pkt)); 9714 } 9715 9716 /* 9717 * Return value of 1, the TCB is unlocked and most 9718 * likely gone, return value of 0, the TCP is still 9719 * locked. 9720 */ 9721 static int 9722 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9723 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9724 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9725 { 9726 int32_t ret_val = 0; 9727 9728 ctf_calc_rwin(so, tp); 9729 if ((thflags & TH_RST) || 9730 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9731 return (ctf_process_rst(m, th, so, tp)); 9732 /* 9733 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9734 * synchronized state. 9735 */ 9736 if (thflags & TH_SYN) { 9737 ctf_challenge_ack(m, th, tp, &ret_val); 9738 return (ret_val); 9739 } 9740 /* 9741 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9742 * it's less than ts_recent, drop it. 9743 */ 9744 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9745 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9746 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9747 return (ret_val); 9748 } 9749 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9750 return (ret_val); 9751 } 9752 /* 9753 * If last ACK falls within this segment's sequence numbers, record 9754 * its timestamp. NOTE: 1) That the test incorporates suggestions 9755 * from the latest proposal of the tcplw@cray.com list (Braden 9756 * 1993/04/26). 
2) That updating only on newer timestamps interferes 9757 * with our earlier PAWS tests, so this check should be solely 9758 * predicated on the sequence space of this segment. 3) That we 9759 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9760 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9761 * SEG.Len, This modified check allows us to overcome RFC1323's 9762 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9763 * p.869. In such cases, we can still calculate the RTT correctly 9764 * when RCV.NXT == Last.ACK.Sent. 9765 */ 9766 if ((to->to_flags & TOF_TS) != 0 && 9767 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9768 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9769 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9770 tp->ts_recent_age = tcp_ts_getticks(); 9771 tp->ts_recent = to->to_tsval; 9772 } 9773 /* 9774 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9775 * is on (half-synchronized state), then queue data for later 9776 * processing; else drop segment and return. 9777 */ 9778 if ((thflags & TH_ACK) == 0) { 9779 if (tp->t_flags & TF_NEEDSYN) { 9780 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9781 tiwin, thflags, nxt_pkt)); 9782 9783 } else if (tp->t_flags & TF_ACKNOW) { 9784 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9785 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9786 return (ret_val); 9787 } else { 9788 ctf_do_drop(m, NULL); 9789 return (0); 9790 } 9791 } 9792 /* 9793 * Ack processing. 9794 */ 9795 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9796 return (ret_val); 9797 } 9798 if (sbavail(&so->so_snd)) { 9799 if (ctf_progress_timeout_check(tp, true)) { 9800 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9801 tp, tick, PROGRESS_DROP, __LINE__); 9802 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9803 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9804 return (1); 9805 } 9806 } 9807 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9808 tiwin, thflags, nxt_pkt)); 9809 } 9810 9811 static int 9812 rack_check_data_after_close(struct mbuf *m, 9813 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 9814 { 9815 struct tcp_rack *rack; 9816 9817 rack = (struct tcp_rack *)tp->t_fb_ptr; 9818 if (rack->rc_allow_data_af_clo == 0) { 9819 close_now: 9820 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9821 /* tcp_close will kill the inp pre-log the Reset */ 9822 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9823 tp = tcp_close(tp); 9824 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9825 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9826 return (1); 9827 } 9828 if (sbavail(&so->so_snd) == 0) 9829 goto close_now; 9830 /* Ok we allow data that is ignored and a followup reset */ 9831 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9832 tp->rcv_nxt = th->th_seq + *tlen; 9833 tp->t_flags2 |= TF2_DROP_AF_DATA; 9834 rack->r_wanted_output = 1; 9835 *tlen = 0; 9836 return (0); 9837 } 9838 9839 /* 9840 * Return value of 1, the TCB is unlocked and most 9841 * likely gone, return value of 0, the TCP is still 9842 * locked. 
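* rack_do_fin_wait_1() performs the usual RST/SYN/PAWS validation and, once our FIN has been acknowledged, moves the connection to FIN_WAIT_2 (marking the socket disconnected if the receive side is already shut down).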
9843 */ 9844 static int 9845 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9846 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9847 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9848 { 9849 int32_t ret_val = 0; 9850 int32_t ourfinisacked = 0; 9851 9852 ctf_calc_rwin(so, tp); 9853 9854 if ((thflags & TH_RST) || 9855 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9856 return (ctf_process_rst(m, th, so, tp)); 9857 /* 9858 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9859 * synchronized state. 9860 */ 9861 if (thflags & TH_SYN) { 9862 ctf_challenge_ack(m, th, tp, &ret_val); 9863 return (ret_val); 9864 } 9865 /* 9866 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9867 * it's less than ts_recent, drop it. 9868 */ 9869 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9870 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9871 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9872 return (ret_val); 9873 } 9874 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9875 return (ret_val); 9876 } 9877 /* 9878 * If new data are received on a connection after the user processes 9879 * are gone, then RST the other end. 9880 */ 9881 if ((so->so_state & SS_NOFDREF) && tlen) { 9882 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9883 return (1); 9884 } 9885 /* 9886 * If last ACK falls within this segment's sequence numbers, record 9887 * its timestamp. NOTE: 1) That the test incorporates suggestions 9888 * from the latest proposal of the tcplw@cray.com list (Braden 9889 * 1993/04/26). 2) That updating only on newer timestamps interferes 9890 * with our earlier PAWS tests, so this check should be solely 9891 * predicated on the sequence space of this segment. 3) That we 9892 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9893 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9894 * SEG.Len, This modified check allows us to overcome RFC1323's 9895 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9896 * p.869. In such cases, we can still calculate the RTT correctly 9897 * when RCV.NXT == Last.ACK.Sent. 9898 */ 9899 if ((to->to_flags & TOF_TS) != 0 && 9900 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9901 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9902 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9903 tp->ts_recent_age = tcp_ts_getticks(); 9904 tp->ts_recent = to->to_tsval; 9905 } 9906 /* 9907 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9908 * is on (half-synchronized state), then queue data for later 9909 * processing; else drop segment and return. 9910 */ 9911 if ((thflags & TH_ACK) == 0) { 9912 if (tp->t_flags & TF_NEEDSYN) { 9913 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9914 tiwin, thflags, nxt_pkt)); 9915 } else if (tp->t_flags & TF_ACKNOW) { 9916 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9917 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9918 return (ret_val); 9919 } else { 9920 ctf_do_drop(m, NULL); 9921 return (0); 9922 } 9923 } 9924 /* 9925 * Ack processing. 9926 */ 9927 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9928 return (ret_val); 9929 } 9930 if (ourfinisacked) { 9931 /* 9932 * If we can't receive any more data, then closing user can 9933 * proceed. Starting the timer is contrary to the 9934 * specification, but if we don't get a FIN we'll hang 9935 * forever. 
9936 * 9937 * XXXjl: we should release the tp also, and use a 9938 * compressed state. 9939 */ 9940 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9941 soisdisconnected(so); 9942 tcp_timer_activate(tp, TT_2MSL, 9943 (tcp_fast_finwait2_recycle ? 9944 tcp_finwait2_timeout : 9945 TP_MAXIDLE(tp))); 9946 } 9947 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9948 } 9949 if (sbavail(&so->so_snd)) { 9950 if (ctf_progress_timeout_check(tp, true)) { 9951 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9952 tp, tick, PROGRESS_DROP, __LINE__); 9953 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9954 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9955 return (1); 9956 } 9957 } 9958 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9959 tiwin, thflags, nxt_pkt)); 9960 } 9961 9962 /* 9963 * Return value of 1, the TCB is unlocked and most 9964 * likely gone, return value of 0, the TCP is still 9965 * locked. 9966 */ 9967 static int 9968 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 9969 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9970 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9971 { 9972 int32_t ret_val = 0; 9973 int32_t ourfinisacked = 0; 9974 9975 ctf_calc_rwin(so, tp); 9976 9977 if ((thflags & TH_RST) || 9978 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9979 return (ctf_process_rst(m, th, so, tp)); 9980 /* 9981 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9982 * synchronized state. 9983 */ 9984 if (thflags & TH_SYN) { 9985 ctf_challenge_ack(m, th, tp, &ret_val); 9986 return (ret_val); 9987 } 9988 /* 9989 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9990 * it's less than ts_recent, drop it. 9991 */ 9992 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9993 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9994 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9995 return (ret_val); 9996 } 9997 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9998 return (ret_val); 9999 } 10000 /* 10001 * If new data are received on a connection after the user processes 10002 * are gone, then RST the other end. 10003 */ 10004 if ((so->so_state & SS_NOFDREF) && tlen) { 10005 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10006 return (1); 10007 } 10008 /* 10009 * If last ACK falls within this segment's sequence numbers, record 10010 * its timestamp. NOTE: 1) That the test incorporates suggestions 10011 * from the latest proposal of the tcplw@cray.com list (Braden 10012 * 1993/04/26). 2) That updating only on newer timestamps interferes 10013 * with our earlier PAWS tests, so this check should be solely 10014 * predicated on the sequence space of this segment. 3) That we 10015 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10016 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10017 * SEG.Len, This modified check allows us to overcome RFC1323's 10018 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10019 * p.869. In such cases, we can still calculate the RTT correctly 10020 * when RCV.NXT == Last.ACK.Sent. 
10021 */ 10022 if ((to->to_flags & TOF_TS) != 0 && 10023 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10024 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10025 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10026 tp->ts_recent_age = tcp_ts_getticks(); 10027 tp->ts_recent = to->to_tsval; 10028 } 10029 /* 10030 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10031 * is on (half-synchronized state), then queue data for later 10032 * processing; else drop segment and return. 10033 */ 10034 if ((thflags & TH_ACK) == 0) { 10035 if (tp->t_flags & TF_NEEDSYN) { 10036 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10037 tiwin, thflags, nxt_pkt)); 10038 } else if (tp->t_flags & TF_ACKNOW) { 10039 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10040 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 10041 return (ret_val); 10042 } else { 10043 ctf_do_drop(m, NULL); 10044 return (0); 10045 } 10046 } 10047 /* 10048 * Ack processing. 10049 */ 10050 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10051 return (ret_val); 10052 } 10053 if (ourfinisacked) { 10054 tcp_twstart(tp); 10055 m_freem(m); 10056 return (1); 10057 } 10058 if (sbavail(&so->so_snd)) { 10059 if (ctf_progress_timeout_check(tp, true)) { 10060 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10061 tp, tick, PROGRESS_DROP, __LINE__); 10062 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10063 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10064 return (1); 10065 } 10066 } 10067 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10068 tiwin, thflags, nxt_pkt)); 10069 } 10070 10071 /* 10072 * Return value of 1, the TCB is unlocked and most 10073 * likely gone, return value of 0, the TCP is still 10074 * locked. 10075 */ 10076 static int 10077 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10078 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10079 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10080 { 10081 int32_t ret_val = 0; 10082 int32_t ourfinisacked = 0; 10083 10084 ctf_calc_rwin(so, tp); 10085 10086 if ((thflags & TH_RST) || 10087 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10088 return (ctf_process_rst(m, th, so, tp)); 10089 /* 10090 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10091 * synchronized state. 10092 */ 10093 if (thflags & TH_SYN) { 10094 ctf_challenge_ack(m, th, tp, &ret_val); 10095 return (ret_val); 10096 } 10097 /* 10098 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10099 * it's less than ts_recent, drop it. 10100 */ 10101 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10102 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10103 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10104 return (ret_val); 10105 } 10106 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10107 return (ret_val); 10108 } 10109 /* 10110 * If new data are received on a connection after the user processes 10111 * are gone, then RST the other end. 10112 */ 10113 if ((so->so_state & SS_NOFDREF) && tlen) { 10114 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10115 return (1); 10116 } 10117 /* 10118 * If last ACK falls within this segment's sequence numbers, record 10119 * its timestamp. NOTE: 1) That the test incorporates suggestions 10120 * from the latest proposal of the tcplw@cray.com list (Braden 10121 * 1993/04/26). 
2) That updating only on newer timestamps interferes 10122 * with our earlier PAWS tests, so this check should be solely 10123 * predicated on the sequence space of this segment. 3) That we 10124 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10125 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10126 * SEG.Len, This modified check allows us to overcome RFC1323's 10127 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10128 * p.869. In such cases, we can still calculate the RTT correctly 10129 * when RCV.NXT == Last.ACK.Sent. 10130 */ 10131 if ((to->to_flags & TOF_TS) != 0 && 10132 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10133 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10134 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10135 tp->ts_recent_age = tcp_ts_getticks(); 10136 tp->ts_recent = to->to_tsval; 10137 } 10138 /* 10139 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10140 * is on (half-synchronized state), then queue data for later 10141 * processing; else drop segment and return. 10142 */ 10143 if ((thflags & TH_ACK) == 0) { 10144 if (tp->t_flags & TF_NEEDSYN) { 10145 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10146 tiwin, thflags, nxt_pkt)); 10147 } else if (tp->t_flags & TF_ACKNOW) { 10148 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10149 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10150 return (ret_val); 10151 } else { 10152 ctf_do_drop(m, NULL); 10153 return (0); 10154 } 10155 } 10156 /* 10157 * case TCPS_LAST_ACK: Ack processing. 10158 */ 10159 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10160 return (ret_val); 10161 } 10162 if (ourfinisacked) { 10163 tp = tcp_close(tp); 10164 ctf_do_drop(m, tp); 10165 return (1); 10166 } 10167 if (sbavail(&so->so_snd)) { 10168 if (ctf_progress_timeout_check(tp, true)) { 10169 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10170 tp, tick, PROGRESS_DROP, __LINE__); 10171 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10172 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10173 return (1); 10174 } 10175 } 10176 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10177 tiwin, thflags, nxt_pkt)); 10178 } 10179 10180 10181 /* 10182 * Return value of 1, the TCB is unlocked and most 10183 * likely gone, return value of 0, the TCP is still 10184 * locked. 10185 */ 10186 static int 10187 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 10188 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10189 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10190 { 10191 int32_t ret_val = 0; 10192 int32_t ourfinisacked = 0; 10193 10194 ctf_calc_rwin(so, tp); 10195 10196 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 10197 if ((thflags & TH_RST) || 10198 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10199 return (ctf_process_rst(m, th, so, tp)); 10200 /* 10201 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10202 * synchronized state. 10203 */ 10204 if (thflags & TH_SYN) { 10205 ctf_challenge_ack(m, th, tp, &ret_val); 10206 return (ret_val); 10207 } 10208 /* 10209 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10210 * it's less than ts_recent, drop it. 
10211 */ 10212 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10213 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10214 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10215 return (ret_val); 10216 } 10217 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10218 return (ret_val); 10219 } 10220 /* 10221 * If new data are received on a connection after the user processes 10222 * are gone, then RST the other end. 10223 */ 10224 if ((so->so_state & SS_NOFDREF) && 10225 tlen) { 10226 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10227 return (1); 10228 } 10229 /* 10230 * If last ACK falls within this segment's sequence numbers, record 10231 * its timestamp. NOTE: 1) That the test incorporates suggestions 10232 * from the latest proposal of the tcplw@cray.com list (Braden 10233 * 1993/04/26). 2) That updating only on newer timestamps interferes 10234 * with our earlier PAWS tests, so this check should be solely 10235 * predicated on the sequence space of this segment. 3) That we 10236 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10237 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10238 * SEG.Len, This modified check allows us to overcome RFC1323's 10239 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10240 * p.869. In such cases, we can still calculate the RTT correctly 10241 * when RCV.NXT == Last.ACK.Sent. 10242 */ 10243 if ((to->to_flags & TOF_TS) != 0 && 10244 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10245 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10246 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10247 tp->ts_recent_age = tcp_ts_getticks(); 10248 tp->ts_recent = to->to_tsval; 10249 } 10250 /* 10251 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10252 * is on (half-synchronized state), then queue data for later 10253 * processing; else drop segment and return. 10254 */ 10255 if ((thflags & TH_ACK) == 0) { 10256 if (tp->t_flags & TF_NEEDSYN) { 10257 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10258 tiwin, thflags, nxt_pkt)); 10259 } else if (tp->t_flags & TF_ACKNOW) { 10260 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10261 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10262 return (ret_val); 10263 } else { 10264 ctf_do_drop(m, NULL); 10265 return (0); 10266 } 10267 } 10268 /* 10269 * Ack processing. 
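 * In FIN_WAIT_2 our FIN has already been acknowledged, so unlike
 * the other closing-state handlers there is no ourfinisacked
 * check after the ACK is processed.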
10270 */ 10271 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10272 return (ret_val); 10273 } 10274 if (sbavail(&so->so_snd)) { 10275 if (ctf_progress_timeout_check(tp, true)) { 10276 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10277 tp, tick, PROGRESS_DROP, __LINE__); 10278 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10279 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10280 return (1); 10281 } 10282 } 10283 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10284 tiwin, thflags, nxt_pkt)); 10285 } 10286 10287 static void inline 10288 rack_clear_rate_sample(struct tcp_rack *rack) 10289 { 10290 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 10291 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 10292 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 10293 } 10294 10295 static void 10296 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) 10297 { 10298 uint64_t bw_est, rate_wanted; 10299 int chged = 0; 10300 uint32_t user_max; 10301 10302 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 10303 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 10304 chged = 1; 10305 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 10306 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 10307 if (user_max != rack->r_ctl.rc_pace_max_segs) 10308 chged = 1; 10309 } 10310 if (rack->rc_force_max_seg) { 10311 rack->r_ctl.rc_pace_max_segs = user_max; 10312 } else if (rack->use_fixed_rate) { 10313 bw_est = rack_get_bw(rack); 10314 if ((rack->r_ctl.crte == NULL) || 10315 (bw_est != rack->r_ctl.crte->rate)) { 10316 rack->r_ctl.rc_pace_max_segs = user_max; 10317 } else { 10318 /* We are pacing right at the hardware rate */ 10319 uint32_t segsiz; 10320 10321 segsiz = min(ctf_fixed_maxseg(tp), 10322 rack->r_ctl.rc_pace_min_segs); 10323 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 10324 bw_est, segsiz, 0, 10325 rack->r_ctl.crte, NULL); 10326 } 10327 } else if (rack->rc_always_pace) { 10328 if (rack->r_ctl.gp_bw || 10329 #ifdef NETFLIX_PEAKRATE 10330 rack->rc_tp->t_maxpeakrate || 10331 #endif 10332 rack->r_ctl.init_rate) { 10333 /* We have a rate of some sort set */ 10334 uint32_t orig; 10335 10336 bw_est = rack_get_bw(rack); 10337 orig = rack->r_ctl.rc_pace_max_segs; 10338 rate_wanted = rack_get_output_bw(rack, bw_est, NULL); 10339 if (rate_wanted) { 10340 /* We have something */ 10341 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 10342 rate_wanted, 10343 ctf_fixed_maxseg(rack->rc_tp)); 10344 } else 10345 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 10346 if (orig != rack->r_ctl.rc_pace_max_segs) 10347 chged = 1; 10348 } else if ((rack->r_ctl.gp_bw == 0) && 10349 (rack->r_ctl.rc_pace_max_segs == 0)) { 10350 /* 10351 * If we have nothing limit us to bursting 10352 * out IW sized pieces. 10353 */ 10354 chged = 1; 10355 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 10356 } 10357 } 10358 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 10359 chged = 1; 10360 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 10361 } 10362 if (chged) 10363 rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); 10364 } 10365 10366 static int 10367 rack_init(struct tcpcb *tp) 10368 { 10369 struct tcp_rack *rack = NULL; 10370 struct rack_sendmap *insret; 10371 uint32_t iwin, snt, us_cts; 10372 10373 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 10374 if (tp->t_fb_ptr == NULL) { 10375 /* 10376 * We need to allocate memory but cant. 
The INP and INP_INFO 10377 * locks and they are recusive (happens during setup. So a 10378 * scheme to drop the locks fails :( 10379 * 10380 */ 10381 return (ENOMEM); 10382 } 10383 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 10384 10385 rack = (struct tcp_rack *)tp->t_fb_ptr; 10386 RB_INIT(&rack->r_ctl.rc_mtree); 10387 TAILQ_INIT(&rack->r_ctl.rc_free); 10388 TAILQ_INIT(&rack->r_ctl.rc_tmap); 10389 rack->rc_tp = tp; 10390 if (tp->t_inpcb) { 10391 rack->rc_inp = tp->t_inpcb; 10392 } 10393 /* Probably not needed but lets be sure */ 10394 rack_clear_rate_sample(rack); 10395 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 10396 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 10397 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 10398 if (use_rack_rr) 10399 rack->use_rack_rr = 1; 10400 if (V_tcp_delack_enabled) 10401 tp->t_delayed_ack = 1; 10402 else 10403 tp->t_delayed_ack = 0; 10404 if (rack_enable_shared_cwnd) 10405 rack->rack_enable_scwnd = 1; 10406 rack->rc_user_set_max_segs = rack_hptsi_segments; 10407 rack->rc_force_max_seg = 0; 10408 if (rack_use_imac_dack) 10409 rack->rc_dack_mode = 1; 10410 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 10411 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 10412 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 10413 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 10414 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 10415 rack->r_ctl.rc_early_recovery = rack_early_recovery; 10416 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 10417 rack->r_ctl.rc_highest_us_rtt = 0; 10418 if (rack_disable_prr) 10419 rack->rack_no_prr = 1; 10420 if (rack_gp_no_rec_chg) 10421 rack->rc_gp_no_rec_chg = 1; 10422 rack->rc_always_pace = rack_pace_every_seg; 10423 if (rack_enable_mqueue_for_nonpaced) 10424 rack->r_mbuf_queue = 1; 10425 else 10426 rack->r_mbuf_queue = 0; 10427 if (rack->r_mbuf_queue || rack->rc_always_pace) 10428 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 10429 else 10430 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10431 rack_set_pace_segments(tp, rack, __LINE__); 10432 if (rack_limits_scwnd) 10433 rack->r_limit_scw = 1; 10434 else 10435 rack->r_limit_scw = 0; 10436 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 10437 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 10438 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 10439 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 10440 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 10441 rack->r_ctl.rc_min_to = rack_min_to; 10442 microuptime(&rack->r_ctl.act_rcv_time); 10443 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10444 rack->r_running_late = 0; 10445 rack->r_running_early = 0; 10446 rack->rc_init_win = rack_default_init_window; 10447 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 10448 if (rack_do_dyn_mul) { 10449 /* When dynamic adjustment is on CA needs to start at 100% */ 10450 rack->rc_gp_dyn_mul = 1; 10451 if (rack_do_dyn_mul >= 100) 10452 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 10453 } else 10454 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 10455 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 10456 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 10457 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 10458 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 10459 rack_probertt_filter_life); 10460 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10461 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 10462 rack->r_ctl.rc_time_of_last_probertt = us_cts; 
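	/*
	 * Seed the probe-rtt bookkeeping from the current receive time;
	 * rc_time_probertt_starts stays at 0 until a probe-rtt round is
	 * actually entered.
	 */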
10463 rack->r_ctl.rc_time_probertt_starts = 0; 10464 /* Do we force on detection? */ 10465 #ifdef NETFLIX_EXP_DETECTION 10466 if (tcp_force_detection) 10467 rack->do_detection = 1; 10468 else 10469 #endif 10470 rack->do_detection = 0; 10471 if (rack_non_rxt_use_cr) 10472 rack->rack_rec_nonrxt_use_cr = 1; 10473 if (tp->snd_una != tp->snd_max) { 10474 /* Create a send map for the current outstanding data */ 10475 struct rack_sendmap *rsm; 10476 10477 rsm = rack_alloc(rack); 10478 if (rsm == NULL) { 10479 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10480 tp->t_fb_ptr = NULL; 10481 return (ENOMEM); 10482 } 10483 rsm->r_flags = RACK_OVERMAX; 10484 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 10485 rsm->r_rtr_cnt = 1; 10486 rsm->r_rtr_bytes = 0; 10487 rsm->r_start = tp->snd_una; 10488 rsm->r_end = tp->snd_max; 10489 rsm->usec_orig_send = us_cts; 10490 rsm->r_dupack = 0; 10491 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10492 #ifdef INVARIANTS 10493 if (insret != NULL) { 10494 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 10495 insret, rack, rsm); 10496 } 10497 #endif 10498 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10499 rsm->r_in_tmap = 1; 10500 } 10501 /* Cancel the GP measurement in progress */ 10502 tp->t_flags &= ~TF_GPUTINPROG; 10503 if (SEQ_GT(tp->snd_max, tp->iss)) 10504 snt = tp->snd_max - tp->iss; 10505 else 10506 snt = 0; 10507 iwin = rc_init_window(rack); 10508 if (snt < iwin) { 10509 /* We are not past the initial window 10510 * so we need to make sure cwnd is 10511 * correct. 10512 */ 10513 if (tp->snd_cwnd < iwin) 10514 tp->snd_cwnd = iwin; 10515 /* 10516 * If we are within the initial window 10517 * we want ssthresh to be unlimited. Setting 10518 * it to the rwnd (which the default stack does 10519 * and older racks) is not really a good idea 10520 * since we want to be in SS and grow both the 10521 * cwnd and the rwnd (via dynamic rwnd growth). If 10522 * we set it to the rwnd then as the peer grows its 10523 * rwnd we will be stuck in CA and never hit SS. 10524 * 10525 * Its far better to raise it up high (this takes the 10526 * risk that there as been a loss already, probably 10527 * we should have an indicator in all stacks of loss 10528 * but we don't), but considering the normal use this 10529 * is a risk worth taking. The consequences of not 10530 * hitting SS are far worse than going one more time 10531 * into it early on (before we have sent even a IW). 10532 * It is highly unlikely that we will have had a loss 10533 * before getting the IW out. 10534 */ 10535 tp->snd_ssthresh = 0xffffffff; 10536 } 10537 rack_stop_all_timers(tp); 10538 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10539 rack_log_rtt_shrinks(rack, us_cts, 0, 10540 __LINE__, RACK_RTTS_INIT); 10541 return (0); 10542 } 10543 10544 static int 10545 rack_handoff_ok(struct tcpcb *tp) 10546 { 10547 if ((tp->t_state == TCPS_CLOSED) || 10548 (tp->t_state == TCPS_LISTEN)) { 10549 /* Sure no problem though it may not stick */ 10550 return (0); 10551 } 10552 if ((tp->t_state == TCPS_SYN_SENT) || 10553 (tp->t_state == TCPS_SYN_RECEIVED)) { 10554 /* 10555 * We really don't know you have to get to ESTAB or beyond 10556 * to tell. 10557 */ 10558 return (EAGAIN); 10559 } 10560 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 10561 return (0); 10562 } 10563 /* 10564 * If we reach here we don't do SACK on this connection so we can 10565 * never do rack. 
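 * RACK's loss detection and TLP machinery is driven by the SACK
 * scoreboard, so unless TF_SACK_PERMIT is set (or the
 * rack_sack_not_required override is on) the handoff is refused.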
10566 */ 10567 return (EINVAL); 10568 } 10569 10570 static void 10571 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10572 { 10573 if (tp->t_fb_ptr) { 10574 struct tcp_rack *rack; 10575 struct rack_sendmap *rsm, *nrsm, *rm; 10576 10577 rack = (struct tcp_rack *)tp->t_fb_ptr; 10578 #ifdef NETFLIX_SHARED_CWND 10579 if (rack->r_ctl.rc_scw) { 10580 uint32_t limit; 10581 10582 if (rack->r_limit_scw) 10583 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 10584 else 10585 limit = 0; 10586 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 10587 rack->r_ctl.rc_scw_index, 10588 limit); 10589 rack->r_ctl.rc_scw = NULL; 10590 } 10591 #endif 10592 /* rack does not use force data but other stacks may clear it */ 10593 tp->t_flags &= ~TF_FORCEDATA; 10594 if (tp->t_inpcb) { 10595 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10596 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10597 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 10598 } 10599 #ifdef TCP_BLACKBOX 10600 tcp_log_flowend(tp); 10601 #endif 10602 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 10603 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10604 #ifdef INVARIANTS 10605 if (rm != rsm) { 10606 panic("At fini, rack:%p rsm:%p rm:%p", 10607 rack, rsm, rm); 10608 } 10609 #endif 10610 uma_zfree(rack_zone, rsm); 10611 } 10612 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10613 while (rsm) { 10614 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 10615 uma_zfree(rack_zone, rsm); 10616 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10617 } 10618 rack->rc_free_cnt = 0; 10619 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10620 tp->t_fb_ptr = NULL; 10621 } 10622 /* Cancel the GP measurement in progress */ 10623 tp->t_flags &= ~TF_GPUTINPROG; 10624 /* Make sure snd_nxt is correctly set */ 10625 tp->snd_nxt = tp->snd_max; 10626 } 10627 10628 10629 static void 10630 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 10631 { 10632 switch (tp->t_state) { 10633 case TCPS_SYN_SENT: 10634 rack->r_state = TCPS_SYN_SENT; 10635 rack->r_substate = rack_do_syn_sent; 10636 break; 10637 case TCPS_SYN_RECEIVED: 10638 rack->r_state = TCPS_SYN_RECEIVED; 10639 rack->r_substate = rack_do_syn_recv; 10640 break; 10641 case TCPS_ESTABLISHED: 10642 rack_set_pace_segments(tp, rack, __LINE__); 10643 rack->r_state = TCPS_ESTABLISHED; 10644 rack->r_substate = rack_do_established; 10645 break; 10646 case TCPS_CLOSE_WAIT: 10647 rack->r_state = TCPS_CLOSE_WAIT; 10648 rack->r_substate = rack_do_close_wait; 10649 break; 10650 case TCPS_FIN_WAIT_1: 10651 rack->r_state = TCPS_FIN_WAIT_1; 10652 rack->r_substate = rack_do_fin_wait_1; 10653 break; 10654 case TCPS_CLOSING: 10655 rack->r_state = TCPS_CLOSING; 10656 rack->r_substate = rack_do_closing; 10657 break; 10658 case TCPS_LAST_ACK: 10659 rack->r_state = TCPS_LAST_ACK; 10660 rack->r_substate = rack_do_lastack; 10661 break; 10662 case TCPS_FIN_WAIT_2: 10663 rack->r_state = TCPS_FIN_WAIT_2; 10664 rack->r_substate = rack_do_fin_wait_2; 10665 break; 10666 case TCPS_LISTEN: 10667 case TCPS_CLOSED: 10668 case TCPS_TIME_WAIT: 10669 default: 10670 break; 10671 }; 10672 } 10673 10674 10675 static void 10676 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 10677 { 10678 /* 10679 * We received an ack, and then did not 10680 * call send or were bounced out due to the 10681 * hpts was running. Now a timer is up as well, is 10682 * it the right timer? 
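 * Compare the timer type armed in rc_hpts_flags against what the
 * current state calls for (persist, RXT, keep-alive, delayed ack,
 * rack/TLP) and, if they do not match, cancel it and restart the
 * hpts timer.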
10683 */ 10684 struct rack_sendmap *rsm; 10685 int tmr_up; 10686 10687 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 10688 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 10689 return; 10690 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10691 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 10692 (tmr_up == PACE_TMR_RXT)) { 10693 /* Should be an RXT */ 10694 return; 10695 } 10696 if (rsm == NULL) { 10697 /* Nothing outstanding? */ 10698 if (tp->t_flags & TF_DELACK) { 10699 if (tmr_up == PACE_TMR_DELACK) 10700 /* We are supposed to have delayed ack up and we do */ 10701 return; 10702 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 10703 /* 10704 * if we hit enobufs then we would expect the possiblity 10705 * of nothing outstanding and the RXT up (and the hptsi timer). 10706 */ 10707 return; 10708 } else if (((V_tcp_always_keepalive || 10709 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 10710 (tp->t_state <= TCPS_CLOSING)) && 10711 (tmr_up == PACE_TMR_KEEP) && 10712 (tp->snd_max == tp->snd_una)) { 10713 /* We should have keep alive up and we do */ 10714 return; 10715 } 10716 } 10717 if (SEQ_GT(tp->snd_max, tp->snd_una) && 10718 ((tmr_up == PACE_TMR_TLP) || 10719 (tmr_up == PACE_TMR_RACK) || 10720 (tmr_up == PACE_TMR_RXT))) { 10721 /* 10722 * Either a Rack, TLP or RXT is fine if we 10723 * have outstanding data. 10724 */ 10725 return; 10726 } else if (tmr_up == PACE_TMR_DELACK) { 10727 /* 10728 * If the delayed ack was going to go off 10729 * before the rtx/tlp/rack timer were going to 10730 * expire, then that would be the timer in control. 10731 * Note we don't check the time here trusting the 10732 * code is correct. 10733 */ 10734 return; 10735 } 10736 /* 10737 * Ok the timer originally started is not what we want now. 10738 * We will force the hpts to be stopped if any, and restart 10739 * with the slot set to what was in the saved slot. 10740 */ 10741 if (rack->rc_inp->inp_in_hpts) { 10742 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 10743 uint32_t us_cts; 10744 10745 us_cts = tcp_get_usecs(NULL); 10746 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 10747 rack->r_early = 1; 10748 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 10749 } 10750 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 10751 } 10752 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 10753 } 10754 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10755 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10756 } 10757 10758 static int 10759 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 10760 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 10761 int32_t nxt_pkt, struct timeval *tv) 10762 { 10763 int32_t thflags, retval, did_out = 0; 10764 int32_t way_out = 0; 10765 uint32_t cts; 10766 uint32_t tiwin; 10767 struct timespec ts; 10768 struct tcpopt to; 10769 struct tcp_rack *rack; 10770 struct rack_sendmap *rsm; 10771 int32_t prev_state = 0; 10772 uint32_t us_cts; 10773 /* 10774 * tv passed from common code is from either M_TSTMP_LRO or 10775 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The 10776 * rack_pacing stack assumes tv always refers to 'now', so we overwrite 10777 * tv here to guarantee that. 
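 * The original arrival time is not lost; act_rcv_time below is
 * still taken from the mbuf's M_TSTMP/M_TSTMP_LRO timestamp when
 * one is present.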
10778 */ 10779 if (m->m_flags & M_TSTMP_LRO) 10780 tcp_get_usecs(tv); 10781 10782 cts = tcp_tv_to_mssectick(tv); 10783 rack = (struct tcp_rack *)tp->t_fb_ptr; 10784 10785 if ((m->m_flags & M_TSTMP) || 10786 (m->m_flags & M_TSTMP_LRO)) { 10787 mbuf_tstmp2timespec(m, &ts); 10788 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 10789 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 10790 } else 10791 rack->r_ctl.act_rcv_time = *tv; 10792 kern_prefetch(rack, &prev_state); 10793 prev_state = 0; 10794 thflags = th->th_flags; 10795 10796 NET_EPOCH_ASSERT(); 10797 INP_WLOCK_ASSERT(tp->t_inpcb); 10798 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 10799 __func__)); 10800 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 10801 __func__)); 10802 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 10803 union tcp_log_stackspecific log; 10804 struct timeval ltv; 10805 #ifdef NETFLIX_HTTP_LOGGING 10806 struct http_sendfile_track *http_req; 10807 10808 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10809 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 10810 } else { 10811 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 10812 } 10813 #endif 10814 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 10815 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 10816 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 10817 if (rack->rack_no_prr == 0) 10818 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 10819 else 10820 log.u_bbr.flex1 = 0; 10821 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 10822 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10823 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 10824 log.u_bbr.flex3 = m->m_flags; 10825 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 10826 if (m->m_flags & M_TSTMP) { 10827 /* Record the hardware timestamp if present */ 10828 mbuf_tstmp2timespec(m, &ts); 10829 ltv.tv_sec = ts.tv_sec; 10830 ltv.tv_usec = ts.tv_nsec / 1000; 10831 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 10832 } else if (m->m_flags & M_TSTMP_LRO) { 10833 /* Record the LRO the arrival timestamp */ 10834 mbuf_tstmp2timespec(m, &ts); 10835 ltv.tv_sec = ts.tv_sec; 10836 ltv.tv_usec = ts.tv_nsec / 1000; 10837 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 10838 } 10839 log.u_bbr.timeStamp = tcp_get_usecs(<v); 10840 /* Log the rcv time */ 10841 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 10842 #ifdef NETFLIX_HTTP_LOGGING 10843 log.u_bbr.applimited = tp->t_http_closed; 10844 log.u_bbr.applimited <<= 8; 10845 log.u_bbr.applimited |= tp->t_http_open; 10846 log.u_bbr.applimited <<= 8; 10847 log.u_bbr.applimited |= tp->t_http_req; 10848 if (http_req) { 10849 /* Copy out any client req info */ 10850 /* seconds */ 10851 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 10852 /* useconds */ 10853 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 10854 log.u_bbr.rttProp = http_req->timestamp; 10855 log.u_bbr.cur_del_rate = http_req->start; 10856 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 10857 log.u_bbr.flex8 |= 1; 10858 } else { 10859 log.u_bbr.flex8 |= 2; 10860 log.u_bbr.bw_inuse = http_req->end; 10861 } 10862 log.u_bbr.flex6 = http_req->start_seq; 10863 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 10864 log.u_bbr.flex8 |= 4; 10865 log.u_bbr.epoch = http_req->end_seq; 10866 } 10867 } 10868 #endif 10869 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 10870 tlen, &log, true, <v); 10871 } 10872 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 10873 way_out = 4; 10874 retval = 0; 10875 goto done_with_input; 10876 } 10877 /* 
10878 * If a segment with the ACK-bit set arrives in the SYN-SENT state 10879 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 10880 */ 10881 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 10882 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 10883 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10884 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10885 return(1); 10886 } 10887 /* 10888 * Segment received on connection. Reset idle time and keep-alive 10889 * timer. XXX: This should be done after segment validation to 10890 * ignore broken/spoofed segs. 10891 */ 10892 if (tp->t_idle_reduce && 10893 (tp->snd_max == tp->snd_una) && 10894 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 10895 counter_u64_add(rack_input_idle_reduces, 1); 10896 rack_cc_after_idle(rack, tp); 10897 } 10898 tp->t_rcvtime = ticks; 10899 /* 10900 * Unscale the window into a 32-bit value. For the SYN_SENT state 10901 * the scale is zero. 10902 */ 10903 tiwin = th->th_win << tp->snd_scale; 10904 #ifdef STATS 10905 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 10906 #endif 10907 if (tiwin > rack->r_ctl.rc_high_rwnd) 10908 rack->r_ctl.rc_high_rwnd = tiwin; 10909 /* 10910 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 10911 * this to occur after we've validated the segment. 10912 */ 10913 if (tp->t_flags2 & TF2_ECN_PERMIT) { 10914 if (thflags & TH_CWR) { 10915 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 10916 tp->t_flags |= TF_ACKNOW; 10917 } 10918 switch (iptos & IPTOS_ECN_MASK) { 10919 case IPTOS_ECN_CE: 10920 tp->t_flags2 |= TF2_ECN_SND_ECE; 10921 KMOD_TCPSTAT_INC(tcps_ecn_ce); 10922 break; 10923 case IPTOS_ECN_ECT0: 10924 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 10925 break; 10926 case IPTOS_ECN_ECT1: 10927 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 10928 break; 10929 } 10930 10931 /* Process a packet differently from RFC3168. */ 10932 cc_ecnpkt_handler(tp, th, iptos); 10933 10934 /* Congestion experienced. */ 10935 if (thflags & TH_ECE) { 10936 rack_cong_signal(tp, th, CC_ECN); 10937 } 10938 } 10939 /* 10940 * Parse options on any incoming segment. 10941 */ 10942 tcp_dooptions(&to, (u_char *)(th + 1), 10943 (th->th_off << 2) - sizeof(struct tcphdr), 10944 (thflags & TH_SYN) ? TO_SYN : 0); 10945 10946 /* 10947 * If echoed timestamp is later than the current time, fall back to 10948 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 10949 * were used when this connection was established. 10950 */ 10951 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 10952 to.to_tsecr -= tp->ts_offset; 10953 if (TSTMP_GT(to.to_tsecr, cts)) 10954 to.to_tsecr = 0; 10955 } 10956 10957 /* 10958 * If its the first time in we need to take care of options and 10959 * verify we can do SACK for rack! 10960 */ 10961 if (rack->r_state == 0) { 10962 /* Should be init'd by rack_init() */ 10963 KASSERT(rack->rc_inp != NULL, 10964 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 10965 if (rack->rc_inp == NULL) { 10966 rack->rc_inp = tp->t_inpcb; 10967 } 10968 10969 /* 10970 * Process options only when we get SYN/ACK back. The SYN 10971 * case for incoming connections is handled in tcp_syncache. 10972 * According to RFC1323 the window field in a SYN (i.e., a 10973 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 10974 * this is traditional behavior, may need to be cleaned up. 
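 * For that reason snd_wnd below is seeded with the raw th->th_win
 * and only becomes a scaled value on the next incoming segment.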
10975 */ 10976 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 10977 /* Handle parallel SYN for ECN */ 10978 if (!(thflags & TH_ACK) && 10979 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 10980 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 10981 tp->t_flags2 |= TF2_ECN_PERMIT; 10982 tp->t_flags2 |= TF2_ECN_SND_ECE; 10983 TCPSTAT_INC(tcps_ecn_shs); 10984 } 10985 if ((to.to_flags & TOF_SCALE) && 10986 (tp->t_flags & TF_REQ_SCALE)) { 10987 tp->t_flags |= TF_RCVD_SCALE; 10988 tp->snd_scale = to.to_wscale; 10989 } else 10990 tp->t_flags &= ~TF_REQ_SCALE; 10991 /* 10992 * Initial send window. It will be updated with the 10993 * next incoming segment to the scaled value. 10994 */ 10995 tp->snd_wnd = th->th_win; 10996 if ((to.to_flags & TOF_TS) && 10997 (tp->t_flags & TF_REQ_TSTMP)) { 10998 tp->t_flags |= TF_RCVD_TSTMP; 10999 tp->ts_recent = to.to_tsval; 11000 tp->ts_recent_age = cts; 11001 } else 11002 tp->t_flags &= ~TF_REQ_TSTMP; 11003 if (to.to_flags & TOF_MSS) 11004 tcp_mss(tp, to.to_mss); 11005 if ((tp->t_flags & TF_SACK_PERMIT) && 11006 (to.to_flags & TOF_SACKPERM) == 0) 11007 tp->t_flags &= ~TF_SACK_PERMIT; 11008 if (IS_FASTOPEN(tp->t_flags)) { 11009 if (to.to_flags & TOF_FASTOPEN) { 11010 uint16_t mss; 11011 11012 if (to.to_flags & TOF_MSS) 11013 mss = to.to_mss; 11014 else 11015 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 11016 mss = TCP6_MSS; 11017 else 11018 mss = TCP_MSS; 11019 tcp_fastopen_update_cache(tp, mss, 11020 to.to_tfo_len, to.to_tfo_cookie); 11021 } else 11022 tcp_fastopen_disable_path(tp); 11023 } 11024 } 11025 /* 11026 * At this point we are at the initial call. Here we decide 11027 * if we are doing RACK or not. We do this by seeing if 11028 * TF_SACK_PERMIT is set and the sack-not-required is clear. 11029 * The code now does do dup-ack counting so if you don't 11030 * switch back you won't get rack & TLP, but you will still 11031 * get this stack. 11032 */ 11033 11034 if ((rack_sack_not_required == 0) && 11035 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 11036 tcp_switch_back_to_default(tp); 11037 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11038 tlen, iptos); 11039 return (1); 11040 } 11041 /* Set the flag */ 11042 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11043 tcp_set_hpts(tp->t_inpcb); 11044 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 11045 } 11046 if (thflags & TH_FIN) 11047 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11048 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11049 if ((rack->rc_gp_dyn_mul) && 11050 (rack->use_fixed_rate == 0) && 11051 (rack->rc_always_pace)) { 11052 /* Check in on probertt */ 11053 rack_check_probe_rtt(rack, us_cts); 11054 } 11055 if (rack->forced_ack) { 11056 uint32_t us_rtt; 11057 11058 /* 11059 * A persist or keep-alive was forced out, update our 11060 * min rtt time. Note we do not worry about lost 11061 * retransmissions since KEEP-ALIVES and persists 11062 * are usually way long on times of sending (though 11063 * if we were really paranoid or worried we could 11064 * at least use timestamps if available to validate). 11065 */ 11066 rack->forced_ack = 0; 11067 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 11068 if (us_rtt == 0) 11069 us_rtt = 1; 11070 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 11071 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 11072 } 11073 /* 11074 * This is the one exception case where we set the rack state 11075 * always. 
All other times (timers etc) we must have a rack-state 11076 * set (so we assure we have done the checks above for SACK). 11077 */ 11078 rack->r_ctl.rc_rcvtime = cts; 11079 if (rack->r_state != tp->t_state) 11080 rack_set_state(tp, rack); 11081 if (SEQ_GT(th->th_ack, tp->snd_una) && 11082 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 11083 kern_prefetch(rsm, &prev_state); 11084 prev_state = rack->r_state; 11085 rack_clear_rate_sample(rack); 11086 retval = (*rack->r_substate) (m, th, so, 11087 tp, &to, drop_hdrlen, 11088 tlen, tiwin, thflags, nxt_pkt, iptos); 11089 #ifdef INVARIANTS 11090 if ((retval == 0) && 11091 (tp->t_inpcb == NULL)) { 11092 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11093 retval, tp, prev_state); 11094 } 11095 #endif 11096 if (retval == 0) { 11097 /* 11098 * If retval is 1 the tcb is unlocked and most likely the tp 11099 * is gone. 11100 */ 11101 INP_WLOCK_ASSERT(tp->t_inpcb); 11102 if ((rack->rc_gp_dyn_mul) && 11103 (rack->rc_always_pace) && 11104 (rack->use_fixed_rate == 0) && 11105 rack->in_probe_rtt && 11106 (rack->r_ctl.rc_time_probertt_starts == 0)) { 11107 /* 11108 * If we are going for target, lets recheck before 11109 * we output. 11110 */ 11111 rack_check_probe_rtt(rack, us_cts); 11112 } 11113 if (rack->set_pacing_done_a_iw == 0) { 11114 /* How much has been acked? */ 11115 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 11116 /* We have enough to set in the pacing segment size */ 11117 rack->set_pacing_done_a_iw = 1; 11118 rack_set_pace_segments(tp, rack, __LINE__); 11119 } 11120 } 11121 tcp_rack_xmit_timer_commit(rack, tp); 11122 if (nxt_pkt == 0) { 11123 if (rack->r_wanted_output != 0) { 11124 do_output_now: 11125 did_out = 1; 11126 (void)tp->t_fb->tfb_tcp_output(tp); 11127 } 11128 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 11129 } 11130 if ((nxt_pkt == 0) && 11131 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11132 (SEQ_GT(tp->snd_max, tp->snd_una) || 11133 (tp->t_flags & TF_DELACK) || 11134 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11135 (tp->t_state <= TCPS_CLOSING)))) { 11136 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 11137 if ((tp->snd_max == tp->snd_una) && 11138 ((tp->t_flags & TF_DELACK) == 0) && 11139 (rack->rc_inp->inp_in_hpts) && 11140 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11141 /* keep alive not needed if we are hptsi output yet */ 11142 ; 11143 } else { 11144 int late = 0; 11145 if (rack->rc_inp->inp_in_hpts) { 11146 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 11147 us_cts = tcp_get_usecs(NULL); 11148 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11149 rack->r_early = 1; 11150 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 11151 } else 11152 late = 1; 11153 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11154 } 11155 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11156 } 11157 if (late && (did_out == 0)) { 11158 /* 11159 * We are late in the sending 11160 * and we did not call the output 11161 * (this probably should not happen). 11162 */ 11163 goto do_output_now; 11164 } 11165 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 11166 } 11167 way_out = 1; 11168 } else if (nxt_pkt == 0) { 11169 /* Do we have the correct timer running? 
*/ 11170 rack_timer_audit(tp, rack, &so->so_snd); 11171 way_out = 2; 11172 } 11173 done_with_input: 11174 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 11175 if (did_out) 11176 rack->r_wanted_output = 0; 11177 #ifdef INVARIANTS 11178 if (tp->t_inpcb == NULL) { 11179 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11180 did_out, 11181 retval, tp, prev_state); 11182 } 11183 #endif 11184 } 11185 return (retval); 11186 } 11187 11188 void 11189 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11190 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11191 { 11192 struct timeval tv; 11193 11194 /* First lets see if we have old packets */ 11195 if (tp->t_in_pkt) { 11196 if (ctf_do_queued_segments(so, tp, 1)) { 11197 m_freem(m); 11198 return; 11199 } 11200 } 11201 if (m->m_flags & M_TSTMP_LRO) { 11202 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 11203 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 11204 } else { 11205 /* Should not be should we kassert instead? */ 11206 tcp_get_usecs(&tv); 11207 } 11208 if(rack_do_segment_nounlock(m, th, so, tp, 11209 drop_hdrlen, tlen, iptos, 0, &tv) == 0) 11210 INP_WUNLOCK(tp->t_inpcb); 11211 } 11212 11213 struct rack_sendmap * 11214 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 11215 { 11216 struct rack_sendmap *rsm = NULL; 11217 int32_t idx; 11218 uint32_t srtt = 0, thresh = 0, ts_low = 0; 11219 11220 /* Return the next guy to be re-transmitted */ 11221 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 11222 return (NULL); 11223 } 11224 if (tp->t_flags & TF_SENTFIN) { 11225 /* retran the end FIN? */ 11226 return (NULL); 11227 } 11228 /* ok lets look at this one */ 11229 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11230 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 11231 goto check_it; 11232 } 11233 rsm = rack_find_lowest_rsm(rack); 11234 if (rsm == NULL) { 11235 return (NULL); 11236 } 11237 check_it: 11238 if (rsm->r_flags & RACK_ACKED) { 11239 return (NULL); 11240 } 11241 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 11242 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 11243 /* Its not yet ready */ 11244 return (NULL); 11245 } 11246 srtt = rack_grab_rtt(tp, rack); 11247 idx = rsm->r_rtr_cnt - 1; 11248 ts_low = rsm->r_tim_lastsent[idx]; 11249 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 11250 if ((tsused == ts_low) || 11251 (TSTMP_LT(tsused, ts_low))) { 11252 /* No time since sending */ 11253 return (NULL); 11254 } 11255 if ((tsused - ts_low) < thresh) { 11256 /* It has not been long enough yet */ 11257 return (NULL); 11258 } 11259 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11260 ((rsm->r_flags & RACK_SACK_PASSED) && 11261 (rack->sack_attack_disable == 0))) { 11262 /* 11263 * We have passed the dup-ack threshold <or> 11264 * a SACK has indicated this is missing. 11265 * Note that if you are a declared attacker 11266 * it is only the dup-ack threshold that 11267 * will cause retransmits. 
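 * (The "declared attacker" case corresponds to sack_attack_disable
 * being set, which is why that flag is part of the test above.)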
11268 */ 11269 /* log retransmit reason */ 11270 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 11271 return (rsm); 11272 } 11273 return (NULL); 11274 } 11275 11276 static void 11277 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 11278 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 11279 int line, struct rack_sendmap *rsm) 11280 { 11281 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11282 union tcp_log_stackspecific log; 11283 struct timeval tv; 11284 11285 memset(&log, 0, sizeof(log)); 11286 log.u_bbr.flex1 = slot; 11287 log.u_bbr.flex2 = len; 11288 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 11289 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 11290 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 11291 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 11292 log.u_bbr.use_lt_bw = rack->app_limited_needs_set; 11293 log.u_bbr.use_lt_bw <<= 1; 11294 log.u_bbr.use_lt_bw = rack->rc_gp_filled; 11295 log.u_bbr.use_lt_bw <<= 1; 11296 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 11297 log.u_bbr.use_lt_bw <<= 1; 11298 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 11299 log.u_bbr.pkt_epoch = line; 11300 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 11301 log.u_bbr.bw_inuse = bw_est; 11302 log.u_bbr.delRate = bw; 11303 if (rack->r_ctl.gp_bw == 0) 11304 log.u_bbr.cur_del_rate = 0; 11305 else 11306 log.u_bbr.cur_del_rate = rack_get_bw(rack); 11307 log.u_bbr.rttProp = len_time; 11308 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 11309 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 11310 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 11311 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 11312 /* We are in slow start */ 11313 log.u_bbr.flex7 = 1; 11314 } else { 11315 /* we are on congestion avoidance */ 11316 log.u_bbr.flex7 = 0; 11317 } 11318 log.u_bbr.flex8 = method; 11319 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11320 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11321 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 11322 log.u_bbr.cwnd_gain <<= 1; 11323 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 11324 log.u_bbr.cwnd_gain <<= 1; 11325 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 11326 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11327 &rack->rc_inp->inp_socket->so_rcv, 11328 &rack->rc_inp->inp_socket->so_snd, 11329 BBR_LOG_HPTSI_CALC, 0, 11330 0, &log, false, &tv); 11331 } 11332 } 11333 11334 static uint32_t 11335 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 11336 { 11337 uint32_t new_tso, user_max; 11338 11339 user_max = rack->rc_user_set_max_segs * mss; 11340 if (rack->rc_force_max_seg) { 11341 return (user_max); 11342 } 11343 if (rack->use_fixed_rate && 11344 ((rack->r_ctl.crte == NULL) || 11345 (bw != rack->r_ctl.crte->rate))) { 11346 /* Use the user mss since we are not exactly matched */ 11347 return (user_max); 11348 } 11349 new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 11350 if (new_tso > user_max) 11351 new_tso = user_max; 11352 return(new_tso); 11353 } 11354 11355 static void 11356 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, 11357 uint64_t rate, uint64_t hw_rate, int line, 11358 int error) 11359 { 11360 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11361 union tcp_log_stackspecific log; 11362 struct timeval tv; 11363 11364 memset(&log, 0, sizeof(log)); 11365 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 11366 log.u_bbr.flex2 = (hw_rate & 
0x00000000ffffffff); 11367 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 11368 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 11369 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11370 log.u_bbr.bw_inuse = rate; 11371 log.u_bbr.flex5 = line; 11372 log.u_bbr.flex6 = error; 11373 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 11374 log.u_bbr.flex8 = rack->use_fixed_rate; 11375 log.u_bbr.flex8 <<= 1; 11376 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 11377 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 11378 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11379 &rack->rc_inp->inp_socket->so_rcv, 11380 &rack->rc_inp->inp_socket->so_snd, 11381 BBR_LOG_HDWR_PACE, 0, 11382 0, &log, false, &tv); 11383 } 11384 } 11385 11386 static int32_t 11387 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) 11388 { 11389 uint64_t lentim, fill_bw; 11390 11391 /* Lets first see if we are full, if so continue with normal rate */ 11392 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 11393 return (slot); 11394 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 11395 return (slot); 11396 if (rack->r_ctl.rc_last_us_rtt == 0) 11397 return (slot); 11398 if (rack->rc_pace_fill_if_rttin_range && 11399 (rack->r_ctl.rc_last_us_rtt >= 11400 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 11401 /* The rtt is huge, N * smallest, lets not fill */ 11402 return (slot); 11403 } 11404 /* 11405 * first lets calculate the b/w based on the last us-rtt 11406 * and the sndwnd. 11407 */ 11408 fill_bw = rack->r_ctl.cwnd_to_use; 11409 /* Take the rwnd if its smaller */ 11410 if (fill_bw > rack->rc_tp->snd_wnd) 11411 fill_bw = rack->rc_tp->snd_wnd; 11412 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 11413 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 11414 /* We are below the min b/w */ 11415 if (fill_bw < RACK_MIN_BW) 11416 return (slot); 11417 /* 11418 * Ok fill_bw holds our mythical b/w to fill the cwnd 11419 * in a rtt, what does that time wise equate too? 11420 */ 11421 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 11422 lentim /= fill_bw; 11423 if (lentim < slot) { 11424 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 11425 0, lentim, 12, __LINE__, NULL); 11426 return ((int32_t)lentim); 11427 } else 11428 return (slot); 11429 } 11430 11431 static int32_t 11432 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 11433 { 11434 struct rack_sendmap *lrsm; 11435 int32_t slot = 0; 11436 int err; 11437 11438 if (rack->rc_always_pace == 0) { 11439 /* 11440 * We use the most optimistic possible cwnd/srtt for 11441 * sending calculations. This will make our 11442 * calculation anticipate getting more through 11443 * quicker then possible. But thats ok we don't want 11444 * the peer to have a gap in data sending. 11445 */ 11446 uint32_t srtt, cwnd, tr_perms = 0; 11447 int32_t reduce = 0; 11448 11449 old_method: 11450 /* 11451 * We keep no precise pacing with the old method 11452 * instead we use the pacer to mitigate bursts. 
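 * The math below works in bytes per millisecond: tr_perms =
 * cwnd / srtt(ms), slot = len / tr_perms, optionally trimmed by
 * rack_slot_reduction and then converted to microseconds. For
 * illustration only: a 64000 byte cwnd over a 40 ms srtt gives
 * tr_perms = 1600 bytes/ms, so a 32000 byte send yields a
 * pre-reduction slot of 20 ms.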
11453 */ 11454 rack->r_ctl.rc_agg_delayed = 0; 11455 rack->r_early = 0; 11456 rack->r_late = 0; 11457 rack->r_ctl.rc_agg_early = 0; 11458 if (rack->r_ctl.rc_rack_min_rtt) 11459 srtt = rack->r_ctl.rc_rack_min_rtt; 11460 else 11461 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 11462 if (rack->r_ctl.rc_rack_largest_cwnd) 11463 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 11464 else 11465 cwnd = rack->r_ctl.cwnd_to_use; 11466 tr_perms = cwnd / srtt; 11467 if (tr_perms == 0) { 11468 tr_perms = ctf_fixed_maxseg(tp); 11469 } 11470 /* 11471 * Calculate how long this will take to drain, if 11472 * the calculation comes out to zero, thats ok we 11473 * will use send_a_lot to possibly spin around for 11474 * more increasing tot_len_this_send to the point 11475 * that its going to require a pace, or we hit the 11476 * cwnd. Which in that case we are just waiting for 11477 * a ACK. 11478 */ 11479 slot = len / tr_perms; 11480 /* Now do we reduce the time so we don't run dry? */ 11481 if (slot && rack_slot_reduction) { 11482 reduce = (slot / rack_slot_reduction); 11483 if (reduce < slot) { 11484 slot -= reduce; 11485 } else 11486 slot = 0; 11487 } 11488 slot *= HPTS_USEC_IN_MSEC; 11489 if (rsm == NULL) { 11490 /* 11491 * We always consider ourselves app limited with old style 11492 * that are not retransmits. This could be the initial 11493 * measurement, but thats ok its all setup and specially 11494 * handled. If another send leaks out, then that too will 11495 * be mark app-limited. 11496 */ 11497 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11498 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { 11499 rack->r_ctl.rc_first_appl = lrsm; 11500 lrsm->r_flags |= RACK_APP_LIMITED; 11501 rack->r_ctl.rc_app_limited_cnt++; 11502 } 11503 } 11504 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); 11505 } else { 11506 uint64_t bw_est, res, lentim, rate_wanted; 11507 uint32_t orig_val, srtt, segs, oh; 11508 11509 if ((rack->r_rr_config == 1) && rsm) { 11510 return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); 11511 } 11512 if (rack->use_fixed_rate) { 11513 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 11514 } else if ((rack->r_ctl.init_rate == 0) && 11515 #ifdef NETFLIX_PEAKRATE 11516 (rack->rc_tp->t_maxpeakrate == 0) && 11517 #endif 11518 (rack->r_ctl.gp_bw == 0)) { 11519 /* no way to yet do an estimate */ 11520 bw_est = rate_wanted = 0; 11521 } else { 11522 bw_est = rack_get_bw(rack); 11523 rate_wanted = rack_get_output_bw(rack, bw_est, rsm); 11524 } 11525 if ((bw_est == 0) || (rate_wanted == 0)) { 11526 /* 11527 * No way yet to make a b/w estimate or 11528 * our raise is set incorrectly. 11529 */ 11530 goto old_method; 11531 } 11532 /* We need to account for all the overheads */ 11533 segs = (len + segsiz - 1) / segsiz; 11534 /* 11535 * We need the diff between 1514 bytes (e-mtu with e-hdr) 11536 * and how much data we put in each packet. Yes this 11537 * means we may be off if we are larger than 1500 bytes 11538 * or smaller. But this just makes us more conservative. 
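 * For illustration only, taking the 1514 byte wire size noted
 * above with a 1448 byte payload per segment: each segment is
 * charged 1514 - 1448 = 66 bytes of overhead, segs becomes the
 * total overhead in bytes, and it is added to len before the byte
 * count is converted into a pacing time at rate_wanted.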
11539 */ 11540 if (ETHERNET_SEGMENT_SIZE > segsiz) 11541 oh = ETHERNET_SEGMENT_SIZE - segsiz; 11542 else 11543 oh = 0; 11544 segs *= oh; 11545 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 11546 res = lentim / rate_wanted; 11547 slot = (uint32_t)res; 11548 orig_val = rack->r_ctl.rc_pace_max_segs; 11549 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11550 /* Did we change the TSO size, if so log it */ 11551 if (rack->r_ctl.rc_pace_max_segs != orig_val) 11552 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 11553 if ((rack->rc_pace_to_cwnd) && 11554 (rack->in_probe_rtt == 0) && 11555 (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 11556 /* 11557 * We want to pace at our rate *or* faster to 11558 * fill the cwnd to the max if its not full. 11559 */ 11560 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); 11561 } 11562 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 11563 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 11564 if ((rack->rack_hdw_pace_ena) && 11565 (rack->rack_hdrw_pacing == 0) && 11566 (rack->rack_attempt_hdwr_pace == 0)) { 11567 /* 11568 * Lets attempt to turn on hardware pacing 11569 * if we can. 11570 */ 11571 rack->rack_attempt_hdwr_pace = 1; 11572 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 11573 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11574 rate_wanted, 11575 RS_PACING_GEQ, 11576 &err); 11577 if (rack->r_ctl.crte) { 11578 rack->rack_hdrw_pacing = 1; 11579 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz, 11580 0, rack->r_ctl.crte, 11581 NULL); 11582 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11583 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11584 err); 11585 } 11586 } else if (rack->rack_hdrw_pacing && 11587 (rack->r_ctl.crte->rate != rate_wanted)) { 11588 /* Do we need to adjust our rate? */ 11589 const struct tcp_hwrate_limit_table *nrte; 11590 11591 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 11592 rack->rc_tp, 11593 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11594 rate_wanted, 11595 RS_PACING_GEQ, 11596 &err); 11597 if (nrte == NULL) { 11598 /* Lost the rate */ 11599 rack->rack_hdrw_pacing = 0; 11600 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11601 } else if (nrte != rack->r_ctl.crte) { 11602 rack->r_ctl.crte = nrte; 11603 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, 11604 segsiz, 0, 11605 rack->r_ctl.crte, 11606 NULL); 11607 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11608 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11609 err); 11610 } 11611 11612 } 11613 } 11614 if (rack_limit_time_with_srtt && 11615 (rack->use_fixed_rate == 0) && 11616 #ifdef NETFLIX_PEAKRATE 11617 (rack->rc_tp->t_maxpeakrate == 0) && 11618 #endif 11619 (rack->rack_hdrw_pacing == 0)) { 11620 /* 11621 * Sanity check, we do not allow the pacing delay 11622 * to be longer than the SRTT of the path. If it is 11623 * a slow path, then adding a packet should increase 11624 * the RTT and compensate for this i.e. the srtt will 11625 * be greater so the allowed pacing time will be greater. 11626 * 11627 * Note this restriction is not for where a peak rate 11628 * is set, we are doing fixed pacing or hardware pacing. 
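 * Both srtt and slot are in microseconds at this point; when the
 * computed pacing delay exceeds srtt it is simply clamped to srtt
 * (and the clamp is logged as method 99).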
11629 */ 11630 if (rack->rc_tp->t_srtt) 11631 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 11632 else 11633 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 11634 if (srtt < slot) { 11635 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 11636 slot = srtt; 11637 } 11638 } 11639 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 11640 } 11641 if (slot) 11642 counter_u64_add(rack_calc_nonzero, 1); 11643 else 11644 counter_u64_add(rack_calc_zero, 1); 11645 return (slot); 11646 } 11647 11648 static void 11649 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 11650 tcp_seq startseq, uint32_t sb_offset) 11651 { 11652 struct rack_sendmap *my_rsm = NULL; 11653 struct rack_sendmap fe; 11654 11655 if (tp->t_state < TCPS_ESTABLISHED) { 11656 /* 11657 * We don't start any measurements if we are 11658 * not at least established. 11659 */ 11660 return; 11661 } 11662 tp->t_flags |= TF_GPUTINPROG; 11663 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 11664 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 11665 tp->gput_seq = startseq; 11666 rack->app_limited_needs_set = 0; 11667 if (rack->in_probe_rtt) 11668 rack->measure_saw_probe_rtt = 1; 11669 else if ((rack->measure_saw_probe_rtt) && 11670 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 11671 rack->measure_saw_probe_rtt = 0; 11672 if (rack->rc_gp_filled) 11673 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11674 else { 11675 /* Special case initial measurement */ 11676 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); 11677 } 11678 /* 11679 * We take a guess out into the future, 11680 * if we have no measurement and no 11681 * initial rate, we measure the first 11682 * initial-windows worth of data to 11683 * speed up getting some GP measurement and 11684 * thus start pacing. 11685 */ 11686 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 11687 rack->app_limited_needs_set = 1; 11688 tp->gput_ack = startseq + max(rc_init_window(rack), 11689 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 11690 rack_log_pacing_delay_calc(rack, 11691 tp->gput_seq, 11692 tp->gput_ack, 11693 0, 11694 tp->gput_ts, 11695 rack->r_ctl.rc_app_limited_cnt, 11696 9, 11697 __LINE__, NULL); 11698 return; 11699 } 11700 if (sb_offset) { 11701 /* 11702 * We are out somewhere in the sb 11703 * can we use the already outstanding data? 11704 */ 11705 11706 if (rack->r_ctl.rc_app_limited_cnt == 0) { 11707 /* 11708 * Yes first one is good and in this case 11709 * the tp->gput_ts is correctly set based on 11710 * the last ack that arrived (no need to 11711 * set things up when an ack comes in). 11712 */ 11713 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11714 if ((my_rsm == NULL) || 11715 (my_rsm->r_rtr_cnt != 1)) { 11716 /* retransmission? */ 11717 goto use_latest; 11718 } 11719 } else { 11720 if (rack->r_ctl.rc_first_appl == NULL) { 11721 /* 11722 * If rc_first_appl is NULL 11723 * then the cnt should be 0. 11724 * This is probably an error, maybe 11725 * a KASSERT would be approprate. 11726 */ 11727 goto use_latest; 11728 } 11729 /* 11730 * If we have a marker pointer to the last one that is 11731 * app limited we can use that, but we need to set 11732 * things up so that when it gets ack'ed we record 11733 * the ack time (if its not already acked). 11734 */ 11735 rack->app_limited_needs_set = 1; 11736 /* 11737 * We want to get to the rsm that is either 11738 * next with space i.e. 
over 1 MSS or the one 11739 * after that (after the app-limited). 11740 */ 11741 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11742 rack->r_ctl.rc_first_appl); 11743 if (my_rsm) { 11744 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 11745 /* Have to use the next one */ 11746 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11747 my_rsm); 11748 else { 11749 /* Use after the first MSS of it is acked */ 11750 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 11751 goto start_set; 11752 } 11753 } 11754 if ((my_rsm == NULL) || 11755 (my_rsm->r_rtr_cnt != 1)) { 11756 /* 11757 * Either its a retransmit or 11758 * the last is the app-limited one. 11759 */ 11760 goto use_latest; 11761 } 11762 } 11763 tp->gput_seq = my_rsm->r_start; 11764 start_set: 11765 if (my_rsm->r_flags & RACK_ACKED) { 11766 /* 11767 * This one has been acked use the arrival ack time 11768 */ 11769 tp->gput_ts = my_rsm->r_ack_arrival; 11770 rack->app_limited_needs_set = 0; 11771 } 11772 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11773 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 11774 rack_log_pacing_delay_calc(rack, 11775 tp->gput_seq, 11776 tp->gput_ack, 11777 (uint64_t)my_rsm, 11778 tp->gput_ts, 11779 rack->r_ctl.rc_app_limited_cnt, 11780 9, 11781 __LINE__, NULL); 11782 return; 11783 } 11784 11785 use_latest: 11786 /* 11787 * We don't know how long we may have been 11788 * idle or if this is the first-send. Lets 11789 * setup the flag so we will trim off 11790 * the first ack'd data so we get a true 11791 * measurement. 11792 */ 11793 rack->app_limited_needs_set = 1; 11794 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 11795 /* Find this guy so we can pull the send time */ 11796 fe.r_start = startseq; 11797 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 11798 if (my_rsm) { 11799 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11800 if (my_rsm->r_flags & RACK_ACKED) { 11801 /* 11802 * Unlikely since its probably what was 11803 * just transmitted (but I am paranoid). 11804 */ 11805 tp->gput_ts = my_rsm->r_ack_arrival; 11806 rack->app_limited_needs_set = 0; 11807 } 11808 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 11809 /* This also is unlikely */ 11810 tp->gput_seq = my_rsm->r_start; 11811 } 11812 } else { 11813 /* 11814 * TSNH unless we have some send-map limit, 11815 * and even at that it should not be hitting 11816 * that limit (we should have stopped sending). 11817 */ 11818 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 11819 } 11820 rack_log_pacing_delay_calc(rack, 11821 tp->gput_seq, 11822 tp->gput_ack, 11823 (uint64_t)my_rsm, 11824 tp->gput_ts, 11825 rack->r_ctl.rc_app_limited_cnt, 11826 9, __LINE__, NULL); 11827 } 11828 11829 static inline uint32_t 11830 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 11831 uint32_t avail, int32_t sb_offset) 11832 { 11833 uint32_t len; 11834 uint32_t sendwin; 11835 11836 if (tp->snd_wnd > cwnd_to_use) 11837 sendwin = cwnd_to_use; 11838 else 11839 sendwin = tp->snd_wnd; 11840 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11841 /* We never want to go over our peers rcv-window */ 11842 len = 0; 11843 } else { 11844 uint32_t flight; 11845 11846 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 11847 if (flight >= sendwin) { 11848 /* 11849 * We have in flight what we are allowed by cwnd (if 11850 * it was rwnd blocking it would have hit above out 11851 * >= tp->snd_wnd). 
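 * When we fall through instead, len below ends up as the least of
 * (sendwin - flight), the room left under the peer's rwnd, and
 * the data actually available in the socket buffer past sb_offset.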
11852 */ 11853 return (0); 11854 } 11855 len = sendwin - flight; 11856 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11857 /* We would send too much (beyond the rwnd) */ 11858 len = tp->snd_wnd - ctf_outstanding(tp); 11859 } 11860 if ((len + sb_offset) > avail) { 11861 /* 11862 * We don't have that much in the SB, how much is 11863 * there? 11864 */ 11865 len = avail - sb_offset; 11866 } 11867 } 11868 return (len); 11869 } 11870 11871 static int 11872 rack_output(struct tcpcb *tp) 11873 { 11874 struct socket *so; 11875 uint32_t recwin; 11876 uint32_t sb_offset; 11877 int32_t len, flags, error = 0; 11878 struct mbuf *m; 11879 struct mbuf *mb; 11880 uint32_t if_hw_tsomaxsegcount = 0; 11881 uint32_t if_hw_tsomaxsegsize; 11882 int32_t segsiz, minseg; 11883 long tot_len_this_send = 0; 11884 struct ip *ip = NULL; 11885 #ifdef TCPDEBUG 11886 struct ipovly *ipov = NULL; 11887 #endif 11888 struct udphdr *udp = NULL; 11889 struct tcp_rack *rack; 11890 struct tcphdr *th; 11891 uint8_t pass = 0; 11892 uint8_t mark = 0; 11893 uint8_t wanted_cookie = 0; 11894 u_char opt[TCP_MAXOLEN]; 11895 unsigned ipoptlen, optlen, hdrlen, ulen=0; 11896 uint32_t rack_seq; 11897 11898 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 11899 unsigned ipsec_optlen = 0; 11900 11901 #endif 11902 int32_t idle, sendalot; 11903 int32_t sub_from_prr = 0; 11904 volatile int32_t sack_rxmit; 11905 struct rack_sendmap *rsm = NULL; 11906 int32_t tso, mtu; 11907 struct tcpopt to; 11908 int32_t slot = 0; 11909 int32_t sup_rack = 0; 11910 uint32_t cts, us_cts, delayed, early; 11911 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 11912 uint32_t cwnd_to_use; 11913 int32_t do_a_prefetch; 11914 int32_t prefetch_rsm = 0; 11915 int32_t orig_len; 11916 struct timeval tv; 11917 int32_t prefetch_so_done = 0; 11918 struct tcp_log_buffer *lgb = NULL; 11919 struct inpcb *inp; 11920 struct sockbuf *sb; 11921 #ifdef INET6 11922 struct ip6_hdr *ip6 = NULL; 11923 int32_t isipv6; 11924 #endif 11925 uint8_t filled_all = 0; 11926 bool hw_tls = false; 11927 11928 /* setup and take the cache hits here */ 11929 rack = (struct tcp_rack *)tp->t_fb_ptr; 11930 inp = rack->rc_inp; 11931 so = inp->inp_socket; 11932 sb = &so->so_snd; 11933 kern_prefetch(sb, &do_a_prefetch); 11934 do_a_prefetch = 1; 11935 hpts_calling = inp->inp_hpts_calls; 11936 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 11937 11938 NET_EPOCH_ASSERT(); 11939 INP_WLOCK_ASSERT(inp); 11940 #ifdef TCP_OFFLOAD 11941 if (tp->t_flags & TF_TOE) 11942 return (tcp_offload_output(tp)); 11943 #endif 11944 /* 11945 * For TFO connections in SYN_RECEIVED, only allow the initial 11946 * SYN|ACK and those sent by the retransmit timer. 11947 */ 11948 if (IS_FASTOPEN(tp->t_flags) && 11949 (tp->t_state == TCPS_SYN_RECEIVED) && 11950 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 11951 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 11952 return (0); 11953 #ifdef INET6 11954 if (rack->r_state) { 11955 /* Use the cache line loaded if possible */ 11956 isipv6 = rack->r_is_v6; 11957 } else { 11958 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 11959 } 11960 #endif 11961 early = 0; 11962 us_cts = tcp_get_usecs(&tv); 11963 cts = tcp_tv_to_mssectick(&tv); 11964 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 11965 inp->inp_in_hpts) { 11966 /* 11967 * We are on the hpts for some timer but not hptsi output. 11968 * Remove from the hpts unconditionally. 11969 */ 11970 rack_timer_cancel(tp, rack, cts, __LINE__); 11971 } 11972 /* Are we pacing and late? 
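 * If the pacer deadline (rc_last_output_to) has already passed,
 * record by how many microseconds; a non-zero delay pulls us off
 * the hpts wheel below (if we are still queued) and is folded
 * into rc_agg_delayed.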
*/ 11973 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 11974 TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { 11975 /* We are delayed */ 11976 delayed = us_cts - rack->r_ctl.rc_last_output_to; 11977 } else { 11978 delayed = 0; 11979 } 11980 /* Do the timers, which may override the pacer */ 11981 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 11982 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 11983 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 11984 return (0); 11985 } 11986 } 11987 if ((rack->r_timer_override) || 11988 (delayed) || 11989 (tp->t_state < TCPS_ESTABLISHED)) { 11990 if (tp->t_inpcb->inp_in_hpts) 11991 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11992 } else if (tp->t_inpcb->inp_in_hpts) { 11993 /* 11994 * On the hpts you can't pass even if ACKNOW is on, we will 11995 * when the hpts fires. 11996 */ 11997 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 11998 return (0); 11999 } 12000 inp->inp_hpts_calls = 0; 12001 /* Finish out both pacing early and late accounting */ 12002 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12003 TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12004 early = rack->r_ctl.rc_last_output_to - us_cts; 12005 } else 12006 early = 0; 12007 if (delayed) { 12008 rack->r_ctl.rc_agg_delayed += delayed; 12009 rack->r_late = 1; 12010 } else if (early) { 12011 rack->r_ctl.rc_agg_early += early; 12012 rack->r_early = 1; 12013 } 12014 /* Now that early/late accounting is done turn off the flag */ 12015 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12016 rack->r_wanted_output = 0; 12017 rack->r_timer_override = 0; 12018 /* 12019 * For TFO connections in SYN_SENT or SYN_RECEIVED, 12020 * only allow the initial SYN or SYN|ACK and those sent 12021 * by the retransmit timer. 12022 */ 12023 if (IS_FASTOPEN(tp->t_flags) && 12024 ((tp->t_state == TCPS_SYN_RECEIVED) || 12025 (tp->t_state == TCPS_SYN_SENT)) && 12026 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12027 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12028 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12029 goto just_return_nolock; 12030 } 12031 /* 12032 * Determine length of data that should be transmitted, and flags 12033 * that will be used. If there is some data or critical controls 12034 * (SYN, RST) to send, then transmit; otherwise, investigate 12035 * further. 12036 */ 12037 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 12038 if (tp->t_idle_reduce) { 12039 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 12040 rack_cc_after_idle(rack, tp); 12041 } 12042 tp->t_flags &= ~TF_LASTIDLE; 12043 if (idle) { 12044 if (tp->t_flags & TF_MORETOCOME) { 12045 tp->t_flags |= TF_LASTIDLE; 12046 idle = 0; 12047 } 12048 } 12049 if ((tp->snd_una == tp->snd_max) && 12050 rack->r_ctl.rc_went_idle_time && 12051 TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { 12052 idle = us_cts - rack->r_ctl.rc_went_idle_time; 12053 if (idle > rack_min_probertt_hold) { 12054 /* Count as a probe rtt */ 12055 if (rack->in_probe_rtt == 0) { 12056 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12057 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 12058 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 12059 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 12060 } else { 12061 rack_exit_probertt(rack, us_cts); 12062 } 12063 } 12064 idle = 0; 12065 } 12066 again: 12067 /* 12068 * If we've recently taken a timeout, snd_max will be greater than 12069 * snd_nxt. 
There may be SACK information that allows us to avoid 12070 * resending already delivered data. Adjust snd_nxt accordingly. 12071 */ 12072 sendalot = 0; 12073 us_cts = tcp_get_usecs(&tv); 12074 cts = tcp_tv_to_mssectick(&tv); 12075 tso = 0; 12076 mtu = 0; 12077 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12078 minseg = segsiz; 12079 sb_offset = tp->snd_max - tp->snd_una; 12080 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12081 #ifdef NETFLIX_SHARED_CWND 12082 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 12083 rack->rack_enable_scwnd) { 12084 /* We are doing cwnd sharing */ 12085 if (rack->rc_gp_filled && 12086 (rack->rack_attempted_scwnd == 0) && 12087 (rack->r_ctl.rc_scw == NULL) && 12088 tp->t_lib) { 12089 /* The pcbid is in, lets make an attempt */ 12090 counter_u64_add(rack_try_scwnd, 1); 12091 rack->rack_attempted_scwnd = 1; 12092 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 12093 &rack->r_ctl.rc_scw_index, 12094 segsiz); 12095 } 12096 if (rack->r_ctl.rc_scw && 12097 (rack->rack_scwnd_is_idle == 1) && 12098 (rack->rc_in_persist == 0) && 12099 sbavail(sb)) { 12100 /* we are no longer out of data */ 12101 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12102 rack->rack_scwnd_is_idle = 0; 12103 } 12104 if (rack->r_ctl.rc_scw) { 12105 /* First lets update and get the cwnd */ 12106 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 12107 rack->r_ctl.rc_scw_index, 12108 tp->snd_cwnd, tp->snd_wnd, segsiz); 12109 } 12110 } 12111 #endif 12112 flags = tcp_outflags[tp->t_state]; 12113 while (rack->rc_free_cnt < rack_free_cache) { 12114 rsm = rack_alloc(rack); 12115 if (rsm == NULL) { 12116 if (inp->inp_hpts_calls) 12117 /* Retry in a ms */ 12118 slot = (1 * HPTS_USEC_IN_MSEC); 12119 goto just_return_nolock; 12120 } 12121 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 12122 rack->rc_free_cnt++; 12123 rsm = NULL; 12124 } 12125 if (inp->inp_hpts_calls) 12126 inp->inp_hpts_calls = 0; 12127 sack_rxmit = 0; 12128 len = 0; 12129 rsm = NULL; 12130 if (flags & TH_RST) { 12131 SOCKBUF_LOCK(sb); 12132 goto send; 12133 } 12134 if (rack->r_ctl.rc_resend) { 12135 /* Retransmit timer */ 12136 rsm = rack->r_ctl.rc_resend; 12137 rack->r_ctl.rc_resend = NULL; 12138 rsm->r_flags &= ~RACK_TLP; 12139 len = rsm->r_end - rsm->r_start; 12140 sack_rxmit = 1; 12141 sendalot = 0; 12142 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12143 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12144 __func__, __LINE__, 12145 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12146 sb_offset = rsm->r_start - tp->snd_una; 12147 if (len >= segsiz) 12148 len = segsiz; 12149 } else if ((rack->rc_in_persist == 0) && 12150 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 12151 /* We have a retransmit that takes precedence */ 12152 rsm->r_flags &= ~RACK_TLP; 12153 if ((!IN_RECOVERY(tp->t_flags)) && 12154 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 12155 /* Enter recovery if not induced by a time-out */ 12156 rack->r_ctl.rc_rsm_start = rsm->r_start; 12157 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 12158 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 12159 rack_cong_signal(tp, NULL, CC_NDUPACK); 12160 /* 12161 * When we enter recovery we need to assure we send 12162 * one packet. 
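 * With PRR enabled we also seed rc_prr_sndcnt with a single
 * segment so this first retransmission is not held back
 * waiting on further acknowledgements to arrive.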
12163 */ 12164 if (rack->rack_no_prr == 0) { 12165 rack->r_ctl.rc_prr_sndcnt = segsiz; 12166 rack_log_to_prr(rack, 13, 0); 12167 } 12168 } 12169 #ifdef INVARIANTS 12170 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 12171 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 12172 tp, rack, rsm, rsm->r_start, tp->snd_una); 12173 } 12174 #endif 12175 len = rsm->r_end - rsm->r_start; 12176 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12177 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12178 __func__, __LINE__, 12179 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12180 sb_offset = rsm->r_start - tp->snd_una; 12181 /* Can we send it within the PRR boundary? */ 12182 if (rack->rack_no_prr == 0) { 12183 if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 12184 /* It does not fit */ 12185 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 12186 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12187 /* 12188 * prr is less than a segment, we 12189 * have more acks due in besides 12190 * what we need to resend. Lets not send 12191 * to avoid sending small pieces of 12192 * what we need to retransmit. 12193 */ 12194 len = 0; 12195 goto just_return_nolock; 12196 } 12197 len = rack->r_ctl.rc_prr_sndcnt; 12198 } 12199 } 12200 sendalot = 0; 12201 if (len >= segsiz) 12202 len = segsiz; 12203 if (len > 0) { 12204 sub_from_prr = 1; 12205 sack_rxmit = 1; 12206 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12207 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12208 min(len, segsiz)); 12209 counter_u64_add(rack_rtm_prr_retran, 1); 12210 } 12211 } else if (rack->r_ctl.rc_tlpsend) { 12212 /* Tail loss probe */ 12213 long cwin; 12214 long tlen; 12215 12216 doing_tlp = 1; 12217 /* 12218 * Check if we can do a TLP with a RACK'd packet 12219 * this can happen if we are not doing the rack 12220 * cheat and we skipped to a TLP and it 12221 * went off. 12222 */ 12223 rsm = rack->r_ctl.rc_tlpsend; 12224 rsm->r_flags |= RACK_TLP; 12225 rack->r_ctl.rc_tlpsend = NULL; 12226 sack_rxmit = 1; 12227 tlen = rsm->r_end - rsm->r_start; 12228 if (tlen > segsiz) 12229 tlen = segsiz; 12230 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12231 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12232 __func__, __LINE__, 12233 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12234 sb_offset = rsm->r_start - tp->snd_una; 12235 cwin = min(tp->snd_wnd, tlen); 12236 len = cwin; 12237 } 12238 /* 12239 * Enforce a connection sendmap count limit if set 12240 * as long as we are not retransmiting. 12241 */ 12242 if ((rsm == NULL) && 12243 (rack->do_detection == 0) && 12244 (V_tcp_map_entries_limit > 0) && 12245 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12246 counter_u64_add(rack_to_alloc_limited, 1); 12247 if (!rack->alloc_limit_reported) { 12248 rack->alloc_limit_reported = 1; 12249 counter_u64_add(rack_alloc_limited_conns, 1); 12250 } 12251 goto just_return_nolock; 12252 } 12253 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 12254 /* we are retransmitting the fin */ 12255 len--; 12256 if (len) { 12257 /* 12258 * When retransmitting data do *not* include the 12259 * FIN. This could happen from a TLP probe. 12260 */ 12261 flags &= ~TH_FIN; 12262 } 12263 } 12264 #ifdef INVARIANTS 12265 /* For debugging */ 12266 rack->r_ctl.rc_rsm_at_retran = rsm; 12267 #endif 12268 /* 12269 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12270 * state flags. 
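 * (These request controls that the per-state tcp_outflags[]
 * entry used above does not by itself carry.)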
12271 */ 12272 if (tp->t_flags & TF_NEEDFIN) 12273 flags |= TH_FIN; 12274 if (tp->t_flags & TF_NEEDSYN) 12275 flags |= TH_SYN; 12276 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12277 void *end_rsm; 12278 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 12279 if (end_rsm) 12280 kern_prefetch(end_rsm, &prefetch_rsm); 12281 prefetch_rsm = 1; 12282 } 12283 SOCKBUF_LOCK(sb); 12284 /* 12285 * If snd_nxt == snd_max and we have transmitted a FIN, the 12286 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12287 * negative length. This can also occur when TCP opens up its 12288 * congestion window while receiving additional duplicate acks after 12289 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12290 * the fast-retransmit. 12291 * 12292 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12293 * set to snd_una, the sb_offset will be 0, and the length may wind 12294 * up 0. 12295 * 12296 * If sack_rxmit is true we are retransmitting from the scoreboard 12297 * in which case len is already set. 12298 */ 12299 if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 12300 uint32_t avail; 12301 12302 avail = sbavail(sb); 12303 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 12304 sb_offset = tp->snd_nxt - tp->snd_una; 12305 else 12306 sb_offset = 0; 12307 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 12308 if (rack->r_ctl.rc_tlp_new_data) { 12309 /* TLP is forcing out new data */ 12310 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 12311 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 12312 } 12313 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 12314 len = tp->snd_wnd; 12315 else 12316 len = rack->r_ctl.rc_tlp_new_data; 12317 rack->r_ctl.rc_tlp_new_data = 0; 12318 new_data_tlp = doing_tlp = 1; 12319 } else 12320 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 12321 if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { 12322 /* 12323 * For prr=off, we need to send only 1 MSS 12324 * at a time. We do this because another sack could 12325 * be arriving that causes us to send retransmits and 12326 * we don't want to be on a long pace due to a larger send 12327 * that keeps us from sending out the retransmit. 12328 */ 12329 len = segsiz; 12330 } 12331 } else { 12332 uint32_t outstanding; 12333 12334 /* 12335 * We are inside of a SACK recovery episode and are 12336 * sending new data, having retransmitted all the 12337 * data possible so far in the scoreboard. 12338 */ 12339 outstanding = tp->snd_max - tp->snd_una; 12340 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 12341 if (tp->snd_wnd > outstanding) { 12342 len = tp->snd_wnd - outstanding; 12343 /* Check to see if we have the data */ 12344 if ((sb_offset + len) > avail) { 12345 /* It does not all fit */ 12346 if (avail > sb_offset) 12347 len = avail - sb_offset; 12348 else 12349 len = 0; 12350 } 12351 } else 12352 len = 0; 12353 } else if (avail > sb_offset) 12354 len = avail - sb_offset; 12355 else 12356 len = 0; 12357 if (len > 0) { 12358 if (len > rack->r_ctl.rc_prr_sndcnt) 12359 len = rack->r_ctl.rc_prr_sndcnt; 12360 if (len > 0) { 12361 sub_from_prr = 1; 12362 counter_u64_add(rack_rtm_prr_newdata, 1); 12363 } 12364 } 12365 if (len > segsiz) { 12366 /* 12367 * We should never send more than a MSS when 12368 * retransmitting or sending new data in prr 12369 * mode unless the override flag is on. 
Most 12370 * likely the PRR algorithm is not going to 12371 * let us send a lot as well :-) 12372 */ 12373 if (rack->r_ctl.rc_prr_sendalot == 0) 12374 len = segsiz; 12375 } else if (len < segsiz) { 12376 /* 12377 * Do we send any? The idea here is if the 12378 * send empty's the socket buffer we want to 12379 * do it. However if not then lets just wait 12380 * for our prr_sndcnt to get bigger. 12381 */ 12382 long leftinsb; 12383 12384 leftinsb = sbavail(sb) - sb_offset; 12385 if (leftinsb > len) { 12386 /* This send does not empty the sb */ 12387 len = 0; 12388 } 12389 } 12390 } 12391 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 12392 /* 12393 * If you have not established 12394 * and are not doing FAST OPEN 12395 * no data please. 12396 */ 12397 if ((sack_rxmit == 0) && 12398 (!IS_FASTOPEN(tp->t_flags))){ 12399 len = 0; 12400 sb_offset = 0; 12401 } 12402 } 12403 if (prefetch_so_done == 0) { 12404 kern_prefetch(so, &prefetch_so_done); 12405 prefetch_so_done = 1; 12406 } 12407 /* 12408 * Lop off SYN bit if it has already been sent. However, if this is 12409 * SYN-SENT state and if segment contains data and if we don't know 12410 * that foreign host supports TAO, suppress sending segment. 12411 */ 12412 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 12413 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 12414 /* 12415 * When sending additional segments following a TFO SYN|ACK, 12416 * do not include the SYN bit. 12417 */ 12418 if (IS_FASTOPEN(tp->t_flags) && 12419 (tp->t_state == TCPS_SYN_RECEIVED)) 12420 flags &= ~TH_SYN; 12421 } 12422 /* 12423 * Be careful not to send data and/or FIN on SYN segments. This 12424 * measure is needed to prevent interoperability problems with not 12425 * fully conformant TCP implementations. 12426 */ 12427 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12428 len = 0; 12429 flags &= ~TH_FIN; 12430 } 12431 /* 12432 * On TFO sockets, ensure no data is sent in the following cases: 12433 * 12434 * - When retransmitting SYN|ACK on a passively-created socket 12435 * 12436 * - When retransmitting SYN on an actively created socket 12437 * 12438 * - When sending a zero-length cookie (cookie request) on an 12439 * actively created socket 12440 * 12441 * - When the socket is in the CLOSED state (RST is being sent) 12442 */ 12443 if (IS_FASTOPEN(tp->t_flags) && 12444 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12445 ((tp->t_state == TCPS_SYN_SENT) && 12446 (tp->t_tfo_client_cookie_len == 0)) || 12447 (flags & TH_RST))) { 12448 sack_rxmit = 0; 12449 len = 0; 12450 } 12451 /* Without fast-open there should never be data sent on a SYN */ 12452 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 12453 tp->snd_nxt = tp->iss; 12454 len = 0; 12455 } 12456 orig_len = len; 12457 if (len <= 0) { 12458 /* 12459 * If FIN has been sent but not acked, but we haven't been 12460 * called to retransmit, len will be < 0. Otherwise, window 12461 * shrank after we sent into it. If window shrank to 0, 12462 * cancel pending retransmit, pull snd_nxt back to (closed) 12463 * window, and set the persist timer if it isn't already 12464 * going. If the window didn't close completely, just wait 12465 * for an ACK. 12466 * 12467 * We also do a general check here to ensure that we will 12468 * set the persist timer when we have data to send, but a 12469 * 0-byte window. This makes sure the persist timer is set 12470 * even if the packet hits one of the "goto send" lines 12471 * below. 
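 * Concretely: an established connection with a zero send
 * window, nothing outstanding and data still queued arms the
 * persist timer via rack_enter_persist() in the block below.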
12472 */ 12473 len = 0; 12474 if ((tp->snd_wnd == 0) && 12475 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12476 (tp->snd_una == tp->snd_max) && 12477 (sb_offset < (int)sbavail(sb))) { 12478 tp->snd_nxt = tp->snd_una; 12479 rack_enter_persist(tp, rack, cts); 12480 } 12481 } else if ((rsm == NULL) && 12482 ((doing_tlp == 0) || (new_data_tlp == 1)) && 12483 (len < rack->r_ctl.rc_pace_max_segs)) { 12484 /* 12485 * We are not sending a maximum sized segment for 12486 * some reason. Should we not send anything (think 12487 * sws or persists)? 12488 */ 12489 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && 12490 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12491 (len < minseg) && 12492 (len < (int)(sbavail(sb) - sb_offset))) { 12493 /* 12494 * Here the rwnd is less than 12495 * the minimum pacing size, this is not a retransmit, 12496 * we are established and 12497 * the send is not the last in the socket buffer 12498 * we send nothing, and we may enter persists 12499 * if nothing is outstanding. 12500 */ 12501 len = 0; 12502 if (tp->snd_max == tp->snd_una) { 12503 /* 12504 * Nothing out we can 12505 * go into persists. 12506 */ 12507 rack_enter_persist(tp, rack, cts); 12508 tp->snd_nxt = tp->snd_una; 12509 } 12510 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 12511 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12512 (len < (int)(sbavail(sb) - sb_offset)) && 12513 (len < minseg)) { 12514 /* 12515 * Here we are not retransmitting, and 12516 * the cwnd is not so small that we could 12517 * not send at least a min size (rxt timer 12518 * not having gone off), We have 2 segments or 12519 * more already in flight, its not the tail end 12520 * of the socket buffer and the cwnd is blocking 12521 * us from sending out a minimum pacing segment size. 12522 * Lets not send anything. 12523 */ 12524 len = 0; 12525 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 12526 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 12527 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12528 (len < (int)(sbavail(sb) - sb_offset)) && 12529 (TCPS_HAVEESTABLISHED(tp->t_state))) { 12530 /* 12531 * Here we have a send window but we have 12532 * filled it up and we can't send another pacing segment. 12533 * We also have in flight more than 2 segments 12534 * and we are not completing the sb i.e. we allow 12535 * the last bytes of the sb to go out even if 12536 * its not a full pacing segment. 12537 */ 12538 len = 0; 12539 } 12540 } 12541 /* len will be >= 0 after this point. */ 12542 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 12543 tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); 12544 /* 12545 * Decide if we can use TCP Segmentation Offloading (if supported by 12546 * hardware). 12547 * 12548 * TSO may only be used if we are in a pure bulk sending state. The 12549 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 12550 * options prevent using TSO. With TSO the TCP header is the same 12551 * (except for the sequence number) for all generated packets. This 12552 * makes it impossible to transmit any options which vary per 12553 * generated segment or packet. 12554 * 12555 * IPv4 handling has a clear separation of ip options and ip header 12556 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 12557 * the right thing below to provide length of just ip options and thus 12558 * checking for ipoptlen is enough to decide if ip options are present. 
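 * Summarizing the test below: TSO is only attempted for more
 * than one segment of plain bulk data -- no UDP tunneling port,
 * no TCP-MD5 signature, no SACK blocks to advertise, not a SACK
 * retransmission, and no IP or IPsec options.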
12559 */ 12560 12561 #ifdef INET6 12562 if (isipv6) 12563 ipoptlen = ip6_optlen(tp->t_inpcb); 12564 else 12565 #endif 12566 if (tp->t_inpcb->inp_options) 12567 ipoptlen = tp->t_inpcb->inp_options->m_len - 12568 offsetof(struct ipoption, ipopt_list); 12569 else 12570 ipoptlen = 0; 12571 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12572 /* 12573 * Pre-calculate here as we save another lookup into the darknesses 12574 * of IPsec that way and can actually decide if TSO is ok. 12575 */ 12576 #ifdef INET6 12577 if (isipv6 && IPSEC_ENABLED(ipv6)) 12578 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 12579 #ifdef INET 12580 else 12581 #endif 12582 #endif /* INET6 */ 12583 #ifdef INET 12584 if (IPSEC_ENABLED(ipv4)) 12585 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 12586 #endif /* INET */ 12587 #endif 12588 12589 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12590 ipoptlen += ipsec_optlen; 12591 #endif 12592 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 12593 (tp->t_port == 0) && 12594 ((tp->t_flags & TF_SIGNATURE) == 0) && 12595 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 12596 ipoptlen == 0) 12597 tso = 1; 12598 { 12599 uint32_t outstanding; 12600 12601 outstanding = tp->snd_max - tp->snd_una; 12602 if (tp->t_flags & TF_SENTFIN) { 12603 /* 12604 * If we sent a fin, snd_max is 1 higher than 12605 * snd_una 12606 */ 12607 outstanding--; 12608 } 12609 if (sack_rxmit) { 12610 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 12611 flags &= ~TH_FIN; 12612 } else { 12613 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 12614 sbused(sb))) 12615 flags &= ~TH_FIN; 12616 } 12617 } 12618 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 12619 (long)TCP_MAXWIN << tp->rcv_scale); 12620 12621 /* 12622 * Sender silly window avoidance. We transmit under the following 12623 * conditions when len is non-zero: 12624 * 12625 * - We have a full segment (or more with TSO) - This is the last 12626 * buffer in a write()/send() and we are either idle or running 12627 * NODELAY - we've timed out (e.g. persist timer) - we have more 12628 * then 1/2 the maximum send window's worth of data (receiver may be 12629 * limited the window size) - we need to retransmit 12630 */ 12631 if (len) { 12632 if (len >= segsiz) { 12633 goto send; 12634 } 12635 /* 12636 * NOTE! on localhost connections an 'ack' from the remote 12637 * end may occur synchronously with the output and cause us 12638 * to flush a buffer queued with moretocome. XXX 12639 * 12640 */ 12641 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 12642 (idle || (tp->t_flags & TF_NODELAY)) && 12643 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12644 (tp->t_flags & TF_NOPUSH) == 0) { 12645 pass = 2; 12646 goto send; 12647 } 12648 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 12649 pass = 22; 12650 goto send; 12651 } 12652 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 12653 pass = 4; 12654 goto send; 12655 } 12656 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 12657 pass = 5; 12658 goto send; 12659 } 12660 if (sack_rxmit) { 12661 pass = 6; 12662 goto send; 12663 } 12664 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 12665 (ctf_outstanding(tp) < (segsiz * 2))) { 12666 /* 12667 * We have less than two MSS outstanding (delayed ack) 12668 * and our rwnd will not let us send a full sized 12669 * MSS. Lets go ahead and let this small segment 12670 * out because we want to try to have at least two 12671 * packets inflight to not be caught by delayed ack. 
12672 */ 12673 pass = 12; 12674 goto send; 12675 } 12676 } 12677 /* 12678 * Sending of standalone window updates. 12679 * 12680 * Window updates are important when we close our window due to a 12681 * full socket buffer and are opening it again after the application 12682 * reads data from it. Once the window has opened again and the 12683 * remote end starts to send again the ACK clock takes over and 12684 * provides the most current window information. 12685 * 12686 * We must avoid the silly window syndrome whereas every read from 12687 * the receive buffer, no matter how small, causes a window update 12688 * to be sent. We also should avoid sending a flurry of window 12689 * updates when the socket buffer had queued a lot of data and the 12690 * application is doing small reads. 12691 * 12692 * Prevent a flurry of pointless window updates by only sending an 12693 * update when we can increase the advertized window by more than 12694 * 1/4th of the socket buffer capacity. When the buffer is getting 12695 * full or is very small be more aggressive and send an update 12696 * whenever we can increase by two mss sized segments. In all other 12697 * situations the ACK's to new incoming data will carry further 12698 * window increases. 12699 * 12700 * Don't send an independent window update if a delayed ACK is 12701 * pending (it will get piggy-backed on it) or the remote side 12702 * already has done a half-close and won't send more data. Skip 12703 * this if the connection is in T/TCP half-open state. 12704 */ 12705 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 12706 !(tp->t_flags & TF_DELACK) && 12707 !TCPS_HAVERCVDFIN(tp->t_state)) { 12708 /* 12709 * "adv" is the amount we could increase the window, taking 12710 * into account that we are limited by TCP_MAXWIN << 12711 * tp->rcv_scale. 12712 */ 12713 int32_t adv; 12714 int oldwin; 12715 12716 adv = recwin; 12717 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 12718 oldwin = (tp->rcv_adv - tp->rcv_nxt); 12719 if (adv > oldwin) 12720 adv -= oldwin; 12721 else { 12722 /* We can't increase the window */ 12723 adv = 0; 12724 } 12725 } else 12726 oldwin = 0; 12727 12728 /* 12729 * If the new window size ends up being the same as or less 12730 * than the old size when it is scaled, then don't force 12731 * a window update. 12732 */ 12733 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 12734 goto dontupdate; 12735 12736 if (adv >= (int32_t)(2 * segsiz) && 12737 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 12738 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 12739 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 12740 pass = 7; 12741 goto send; 12742 } 12743 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 12744 pass = 23; 12745 goto send; 12746 } 12747 } 12748 dontupdate: 12749 12750 /* 12751 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 12752 * is also a catch-all for the retransmit timer timeout case. 12753 */ 12754 if (tp->t_flags & TF_ACKNOW) { 12755 pass = 8; 12756 goto send; 12757 } 12758 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 12759 pass = 9; 12760 goto send; 12761 } 12762 /* 12763 * If our state indicates that FIN should be sent and we have not 12764 * yet done so, then we need to send. 12765 */ 12766 if ((flags & TH_FIN) && 12767 (tp->snd_nxt == tp->snd_una)) { 12768 pass = 11; 12769 goto send; 12770 } 12771 /* 12772 * No reason to send a segment, just return. 
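 * Before we do, the code below classifies *why* nothing is
 * being sent (rwnd, cwnd, PRR, persists or truly application
 * limited) so the goodput measurement and pacing logic can
 * treat app-limited periods correctly.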
12773 */ 12774 just_return: 12775 SOCKBUF_UNLOCK(sb); 12776 just_return_nolock: 12777 { 12778 int app_limited = CTF_JR_SENT_DATA; 12779 12780 if (tot_len_this_send > 0) { 12781 /* Make sure snd_nxt is up to max */ 12782 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 12783 tp->snd_nxt = tp->snd_max; 12784 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 12785 } else { 12786 int end_window = 0; 12787 uint32_t seq = tp->gput_ack; 12788 12789 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12790 if (rsm) { 12791 /* 12792 * Mark the last sent that we just-returned (hinting 12793 * that delayed ack may play a role in any rtt measurement). 12794 */ 12795 rsm->r_just_ret = 1; 12796 } 12797 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12798 rack->r_ctl.rc_agg_delayed = 0; 12799 rack->r_early = 0; 12800 rack->r_late = 0; 12801 rack->r_ctl.rc_agg_early = 0; 12802 if ((ctf_outstanding(tp) + 12803 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 12804 minseg)) >= tp->snd_wnd) { 12805 /* We are limited by the rwnd */ 12806 app_limited = CTF_JR_RWND_LIMITED; 12807 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12808 /* We are limited by whats available -- app limited */ 12809 app_limited = CTF_JR_APP_LIMITED; 12810 } else if ((idle == 0) && 12811 ((tp->t_flags & TF_NODELAY) == 0) && 12812 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12813 (len < segsiz)) { 12814 /* 12815 * No delay is not on and the 12816 * user is sending less than 1MSS. This 12817 * brings out SWS avoidance so we 12818 * don't send. Another app-limited case. 12819 */ 12820 app_limited = CTF_JR_APP_LIMITED; 12821 } else if (tp->t_flags & TF_NOPUSH) { 12822 /* 12823 * The user has requested no push of 12824 * the last segment and we are 12825 * at the last segment. Another app 12826 * limited case. 12827 */ 12828 app_limited = CTF_JR_APP_LIMITED; 12829 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 12830 /* Its the cwnd */ 12831 app_limited = CTF_JR_CWND_LIMITED; 12832 } else if (rack->rc_in_persist == 1) { 12833 /* We are in persists */ 12834 app_limited = CTF_JR_PERSISTS; 12835 } else if (IN_RECOVERY(tp->t_flags) && 12836 (rack->rack_no_prr == 0) && 12837 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12838 app_limited = CTF_JR_PRR; 12839 } else { 12840 /* Now why here are we not sending? */ 12841 #ifdef NOW 12842 #ifdef INVARIANTS 12843 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 12844 #endif 12845 #endif 12846 app_limited = CTF_JR_ASSESSING; 12847 } 12848 /* 12849 * App limited in some fashion, for our pacing GP 12850 * measurements we don't want any gap (even cwnd). 12851 * Close down the measurement window. 12852 */ 12853 if (rack_cwnd_block_ends_measure && 12854 ((app_limited == CTF_JR_CWND_LIMITED) || 12855 (app_limited == CTF_JR_PRR))) { 12856 /* 12857 * The reason we are not sending is 12858 * the cwnd (or prr). We have been configured 12859 * to end the measurement window in 12860 * this case. 12861 */ 12862 end_window = 1; 12863 } else if (app_limited == CTF_JR_PERSISTS) { 12864 /* 12865 * We never end the measurement window 12866 * in persists, though in theory we 12867 * should be only entering after everything 12868 * is acknowledged (so we will probably 12869 * never come here). 12870 */ 12871 end_window = 0; 12872 } else if (rack_rwnd_block_ends_measure && 12873 (app_limited == CTF_JR_RWND_LIMITED)) { 12874 /* 12875 * We are rwnd limited and have been 12876 * configured to end the measurement 12877 * window in this case. 
12878 				 */
12879 				end_window = 1;
12880 			} else if (app_limited == CTF_JR_APP_LIMITED) {
12881 				/*
12882 				 * A true application limited period, we have
12883 				 * run out of data.
12884 				 */
12885 				end_window = 1;
12886 			} else if (app_limited == CTF_JR_ASSESSING) {
12887 				/*
12888 				 * In the assessing case we hit the end of
12889 				 * the if/else and had no known reason.
12890 				 * This will panic us under INVARIANTS.
12891 				 *
12892 				 * If we get this out in logs we need to
12893 				 * investigate which reason we missed.
12894 				 */
12895 				end_window = 1;
12896 			}
12897 			if (end_window) {
12898 				uint8_t log = 0;
12899 
12900 				if ((tp->t_flags & TF_GPUTINPROG) &&
12901 				    SEQ_GT(tp->gput_ack, tp->snd_max)) {
12902 					/* Mark the last packet as app limited */
12903 					tp->gput_ack = tp->snd_max;
12904 					log = 1;
12905 				}
12906 				rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
12907 				if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
12908 					if (rack->r_ctl.rc_app_limited_cnt == 0)
12909 						rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
12910 					else {
12911 						/*
12912 						 * Go out to the end app limited and mark
12913 						 * this new one as next and move the end_appl up
12914 						 * to this guy.
12915 						 */
12916 						if (rack->r_ctl.rc_end_appl)
12917 							rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
12918 						rack->r_ctl.rc_end_appl = rsm;
12919 					}
12920 					rsm->r_flags |= RACK_APP_LIMITED;
12921 					rack->r_ctl.rc_app_limited_cnt++;
12922 				}
12923 				if (log)
12924 					rack_log_pacing_delay_calc(rack,
12925 					    rack->r_ctl.rc_app_limited_cnt, seq,
12926 					    tp->gput_ack, 0, 0, 4, __LINE__, NULL);
12927 			}
12928 		}
12929 		if (slot) {
12930 			/* set the rack tcb into the slot N */
12931 			counter_u64_add(rack_paced_segments, 1);
12932 		} else if (tot_len_this_send) {
12933 			counter_u64_add(rack_unpaced_segments, 1);
12934 		}
12935 		/* Check if we need to go into persists or not */
12936 		if ((rack->rc_in_persist == 0) &&
12937 		    (tp->snd_max == tp->snd_una) &&
12938 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
12939 		    sbavail(sb) &&
12940 		    (sbavail(sb) > tp->snd_wnd) &&
12941 		    (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
12942 			/* Yes, let's make sure to move to persist before timer-start */
12943 			rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
12944 		}
12945 		rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack);
12946 		rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use);
12947 	}
12948 #ifdef NETFLIX_SHARED_CWND
12949 	if ((sbavail(sb) == 0) &&
12950 	    rack->r_ctl.rc_scw) {
12951 		tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
12952 		rack->rack_scwnd_is_idle = 1;
12953 	}
12954 #endif
12955 	return (0);
12956 
12957 send:
12958 	if ((flags & TH_FIN) &&
12959 	    sbavail(sb)) {
12960 		/*
12961 		 * We do not transmit a FIN
12962 		 * with data outstanding. We
12963 		 * need to make it so all data
12964 		 * is acked first.
12965 		 */
12966 		flags &= ~TH_FIN;
12967 	}
12968 	/* Enforce stack imposed max seg size if we have one */
12969 	if (rack->r_ctl.rc_pace_max_segs &&
12970 	    (len > rack->r_ctl.rc_pace_max_segs)) {
12971 		mark = 1;
12972 		len = rack->r_ctl.rc_pace_max_segs;
12973 	}
12974 	SOCKBUF_LOCK_ASSERT(sb);
12975 	if (len > 0) {
12976 		if (len >= segsiz)
12977 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
12978 		else
12979 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
12980 	}
12981 	/*
12982 	 * Before ESTABLISHED, force sending of initial options unless TCP
12983 	 * set not to do any options. NOTE: we assume that the IP/TCP header
12984 	 * plus TCP options always fit in a single mbuf, leaving room for a
12985 	 * maximum link header, i.e.
max_linkhdr + sizeof (struct tcpiphdr) 12986 * + optlen <= MCLBYTES 12987 */ 12988 optlen = 0; 12989 #ifdef INET6 12990 if (isipv6) 12991 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12992 else 12993 #endif 12994 hdrlen = sizeof(struct tcpiphdr); 12995 12996 /* 12997 * Compute options for segment. We only have to care about SYN and 12998 * established connection segments. Options for SYN-ACK segments 12999 * are handled in TCP syncache. 13000 */ 13001 to.to_flags = 0; 13002 if ((tp->t_flags & TF_NOOPT) == 0) { 13003 /* Maximum segment size. */ 13004 if (flags & TH_SYN) { 13005 tp->snd_nxt = tp->iss; 13006 to.to_mss = tcp_mssopt(&inp->inp_inc); 13007 #ifdef NETFLIX_TCPOUDP 13008 if (tp->t_port) 13009 to.to_mss -= V_tcp_udp_tunneling_overhead; 13010 #endif 13011 to.to_flags |= TOF_MSS; 13012 13013 /* 13014 * On SYN or SYN|ACK transmits on TFO connections, 13015 * only include the TFO option if it is not a 13016 * retransmit, as the presence of the TFO option may 13017 * have caused the original SYN or SYN|ACK to have 13018 * been dropped by a middlebox. 13019 */ 13020 if (IS_FASTOPEN(tp->t_flags) && 13021 (tp->t_rxtshift == 0)) { 13022 if (tp->t_state == TCPS_SYN_RECEIVED) { 13023 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 13024 to.to_tfo_cookie = 13025 (u_int8_t *)&tp->t_tfo_cookie.server; 13026 to.to_flags |= TOF_FASTOPEN; 13027 wanted_cookie = 1; 13028 } else if (tp->t_state == TCPS_SYN_SENT) { 13029 to.to_tfo_len = 13030 tp->t_tfo_client_cookie_len; 13031 to.to_tfo_cookie = 13032 tp->t_tfo_cookie.client; 13033 to.to_flags |= TOF_FASTOPEN; 13034 wanted_cookie = 1; 13035 /* 13036 * If we wind up having more data to 13037 * send with the SYN than can fit in 13038 * one segment, don't send any more 13039 * until the SYN|ACK comes back from 13040 * the other end. 13041 */ 13042 sendalot = 0; 13043 } 13044 } 13045 } 13046 /* Window scaling. */ 13047 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 13048 to.to_wscale = tp->request_r_scale; 13049 to.to_flags |= TOF_SCALE; 13050 } 13051 /* Timestamps. */ 13052 if ((tp->t_flags & TF_RCVD_TSTMP) || 13053 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 13054 to.to_tsval = cts + tp->ts_offset; 13055 to.to_tsecr = tp->ts_recent; 13056 to.to_flags |= TOF_TS; 13057 } 13058 /* Set receive buffer autosizing timestamp. */ 13059 if (tp->rfbuf_ts == 0 && 13060 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 13061 tp->rfbuf_ts = tcp_ts_getticks(); 13062 /* Selective ACK's. */ 13063 if (flags & TH_SYN) 13064 to.to_flags |= TOF_SACKPERM; 13065 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 13066 tp->rcv_numsacks > 0) { 13067 to.to_flags |= TOF_SACK; 13068 to.to_nsacks = tp->rcv_numsacks; 13069 to.to_sacks = (u_char *)tp->sackblks; 13070 } 13071 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13072 /* TCP-MD5 (RFC2385). */ 13073 if (tp->t_flags & TF_SIGNATURE) 13074 to.to_flags |= TOF_SIGNATURE; 13075 #endif /* TCP_SIGNATURE */ 13076 13077 /* Processing the options. */ 13078 hdrlen += optlen = tcp_addoptions(&to, opt); 13079 /* 13080 * If we wanted a TFO option to be added, but it was unable 13081 * to fit, ensure no data is sent. 13082 */ 13083 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 13084 !(to.to_flags & TOF_FASTOPEN)) 13085 len = 0; 13086 } 13087 #ifdef NETFLIX_TCPOUDP 13088 if (tp->t_port) { 13089 if (V_tcp_udp_tunneling_port == 0) { 13090 /* The port was removed?? 
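 * UDP tunneling has been switched off while this connection
 * still has t_port set; without a tunneling port we cannot
 * build the encapsulation, so fail the transmit.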
*/ 13091 SOCKBUF_UNLOCK(&so->so_snd); 13092 return (EHOSTUNREACH); 13093 } 13094 hdrlen += sizeof(struct udphdr); 13095 } 13096 #endif 13097 #ifdef INET6 13098 if (isipv6) 13099 ipoptlen = ip6_optlen(tp->t_inpcb); 13100 else 13101 #endif 13102 if (tp->t_inpcb->inp_options) 13103 ipoptlen = tp->t_inpcb->inp_options->m_len - 13104 offsetof(struct ipoption, ipopt_list); 13105 else 13106 ipoptlen = 0; 13107 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13108 ipoptlen += ipsec_optlen; 13109 #endif 13110 13111 /* 13112 * Adjust data length if insertion of options will bump the packet 13113 * length beyond the t_maxseg length. Clear the FIN bit because we 13114 * cut off the tail of the segment. 13115 */ 13116 if (len + optlen + ipoptlen > tp->t_maxseg) { 13117 if (tso) { 13118 uint32_t if_hw_tsomax; 13119 uint32_t moff; 13120 int32_t max_len; 13121 13122 /* extract TSO information */ 13123 if_hw_tsomax = tp->t_tsomax; 13124 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13125 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13126 KASSERT(ipoptlen == 0, 13127 ("%s: TSO can't do IP options", __func__)); 13128 13129 /* 13130 * Check if we should limit by maximum payload 13131 * length: 13132 */ 13133 if (if_hw_tsomax != 0) { 13134 /* compute maximum TSO length */ 13135 max_len = (if_hw_tsomax - hdrlen - 13136 max_linkhdr); 13137 if (max_len <= 0) { 13138 len = 0; 13139 } else if (len > max_len) { 13140 sendalot = 1; 13141 len = max_len; 13142 mark = 2; 13143 } 13144 } 13145 /* 13146 * Prevent the last segment from being fractional 13147 * unless the send sockbuf can be emptied: 13148 */ 13149 max_len = (tp->t_maxseg - optlen); 13150 if ((sb_offset + len) < sbavail(sb)) { 13151 moff = len % (u_int)max_len; 13152 if (moff != 0) { 13153 mark = 3; 13154 len -= moff; 13155 } 13156 } 13157 /* 13158 * In case there are too many small fragments don't 13159 * use TSO: 13160 */ 13161 if (len <= segsiz) { 13162 mark = 4; 13163 tso = 0; 13164 } 13165 /* 13166 * Send the FIN in a separate segment after the bulk 13167 * sending is done. We don't trust the TSO 13168 * implementations to clear the FIN flag on all but 13169 * the last segment. 13170 */ 13171 if (tp->t_flags & TF_NEEDFIN) { 13172 sendalot = 4; 13173 } 13174 } else { 13175 mark = 5; 13176 if (optlen + ipoptlen >= tp->t_maxseg) { 13177 /* 13178 * Since we don't have enough space to put 13179 * the IP header chain and the TCP header in 13180 * one packet as required by RFC 7112, don't 13181 * send it. Also ensure that at least one 13182 * byte of the payload can be put into the 13183 * TCP segment. 13184 */ 13185 SOCKBUF_UNLOCK(&so->so_snd); 13186 error = EMSGSIZE; 13187 sack_rxmit = 0; 13188 goto out; 13189 } 13190 len = tp->t_maxseg - optlen - ipoptlen; 13191 sendalot = 5; 13192 } 13193 } else { 13194 tso = 0; 13195 mark = 6; 13196 } 13197 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13198 ("%s: len > IP_MAXPACKET", __func__)); 13199 #ifdef DIAGNOSTIC 13200 #ifdef INET6 13201 if (max_linkhdr + hdrlen > MCLBYTES) 13202 #else 13203 if (max_linkhdr + hdrlen > MHLEN) 13204 #endif 13205 panic("tcphdr too big"); 13206 #endif 13207 13208 /* 13209 * This KASSERT is here to catch edge cases at a well defined place. 13210 * Before, those had triggered (random) panic conditions further 13211 * down. 13212 */ 13213 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13214 if ((len == 0) && 13215 (flags & TH_FIN) && 13216 (sbused(sb))) { 13217 /* 13218 * We have outstanding data, don't send a fin by itself!. 
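 * (The FIN will go out on a later call, once the socket
 * buffer has drained.)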
13219 */ 13220 goto just_return; 13221 } 13222 /* 13223 * Grab a header mbuf, attaching a copy of data to be transmitted, 13224 * and initialize the header from the template for sends on this 13225 * connection. 13226 */ 13227 if (len) { 13228 uint32_t max_val; 13229 uint32_t moff; 13230 13231 if (rack->r_ctl.rc_pace_max_segs) 13232 max_val = rack->r_ctl.rc_pace_max_segs; 13233 else if (rack->rc_user_set_max_segs) 13234 max_val = rack->rc_user_set_max_segs * segsiz; 13235 else 13236 max_val = len; 13237 /* 13238 * We allow a limit on sending with hptsi. 13239 */ 13240 if (len > max_val) { 13241 mark = 7; 13242 len = max_val; 13243 } 13244 #ifdef INET6 13245 if (MHLEN < hdrlen + max_linkhdr) 13246 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13247 else 13248 #endif 13249 m = m_gethdr(M_NOWAIT, MT_DATA); 13250 13251 if (m == NULL) { 13252 SOCKBUF_UNLOCK(sb); 13253 error = ENOBUFS; 13254 sack_rxmit = 0; 13255 goto out; 13256 } 13257 m->m_data += max_linkhdr; 13258 m->m_len = hdrlen; 13259 13260 /* 13261 * Start the m_copy functions from the closest mbuf to the 13262 * sb_offset in the socket buffer chain. 13263 */ 13264 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13265 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13266 m_copydata(mb, moff, (int)len, 13267 mtod(m, caddr_t)+hdrlen); 13268 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13269 sbsndptr_adv(sb, mb, len); 13270 m->m_len += len; 13271 } else { 13272 struct sockbuf *msb; 13273 13274 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13275 msb = NULL; 13276 else 13277 msb = sb; 13278 m->m_next = tcp_m_copym( 13279 mb, moff, &len, 13280 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 13281 ((rsm == NULL) ? hw_tls : 0) 13282 #ifdef NETFLIX_COPY_ARGS 13283 , &filled_all 13284 #endif 13285 ); 13286 if (len <= (tp->t_maxseg - optlen)) { 13287 /* 13288 * Must have ran out of mbufs for the copy 13289 * shorten it to no longer need tso. Lets 13290 * not put on sendalot since we are low on 13291 * mbufs. 13292 */ 13293 tso = 0; 13294 } 13295 if (m->m_next == NULL) { 13296 SOCKBUF_UNLOCK(sb); 13297 (void)m_free(m); 13298 error = ENOBUFS; 13299 sack_rxmit = 0; 13300 goto out; 13301 } 13302 } 13303 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 13304 if (rsm && (rsm->r_flags & RACK_TLP)) { 13305 /* 13306 * TLP should not count in retran count, but 13307 * in its own bin 13308 */ 13309 counter_u64_add(rack_tlp_retran, 1); 13310 counter_u64_add(rack_tlp_retran_bytes, len); 13311 } else { 13312 tp->t_sndrexmitpack++; 13313 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 13314 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 13315 } 13316 #ifdef STATS 13317 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 13318 len); 13319 #endif 13320 } else { 13321 KMOD_TCPSTAT_INC(tcps_sndpack); 13322 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 13323 #ifdef STATS 13324 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 13325 len); 13326 #endif 13327 } 13328 /* 13329 * If we're sending everything we've got, set PUSH. (This 13330 * will keep happy those implementations which only give 13331 * data to the user when a buffer fills or a PUSH comes in.) 
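 * That is, PUSH is set whenever this transmission carries the
 * last byte currently queued in the socket buffer and the
 * segment is not a SYN.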
13332 */ 13333 if (sb_offset + len == sbused(sb) && 13334 sbused(sb) && 13335 !(flags & TH_SYN)) 13336 flags |= TH_PUSH; 13337 13338 SOCKBUF_UNLOCK(sb); 13339 } else { 13340 SOCKBUF_UNLOCK(sb); 13341 if (tp->t_flags & TF_ACKNOW) 13342 KMOD_TCPSTAT_INC(tcps_sndacks); 13343 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13344 KMOD_TCPSTAT_INC(tcps_sndctrl); 13345 else 13346 KMOD_TCPSTAT_INC(tcps_sndwinup); 13347 13348 m = m_gethdr(M_NOWAIT, MT_DATA); 13349 if (m == NULL) { 13350 error = ENOBUFS; 13351 sack_rxmit = 0; 13352 goto out; 13353 } 13354 #ifdef INET6 13355 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13356 MHLEN >= hdrlen) { 13357 M_ALIGN(m, hdrlen); 13358 } else 13359 #endif 13360 m->m_data += max_linkhdr; 13361 m->m_len = hdrlen; 13362 } 13363 SOCKBUF_UNLOCK_ASSERT(sb); 13364 m->m_pkthdr.rcvif = (struct ifnet *)0; 13365 #ifdef MAC 13366 mac_inpcb_create_mbuf(inp, m); 13367 #endif 13368 #ifdef INET6 13369 if (isipv6) { 13370 ip6 = mtod(m, struct ip6_hdr *); 13371 #ifdef NETFLIX_TCPOUDP 13372 if (tp->t_port) { 13373 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 13374 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13375 udp->uh_dport = tp->t_port; 13376 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13377 udp->uh_ulen = htons(ulen); 13378 th = (struct tcphdr *)(udp + 1); 13379 } else 13380 #endif 13381 th = (struct tcphdr *)(ip6 + 1); 13382 tcpip_fillheaders(inp, 13383 #ifdef NETFLIX_TCPOUDP 13384 tp->t_port, 13385 #endif 13386 ip6, th); 13387 } else 13388 #endif /* INET6 */ 13389 { 13390 ip = mtod(m, struct ip *); 13391 #ifdef TCPDEBUG 13392 ipov = (struct ipovly *)ip; 13393 #endif 13394 #ifdef NETFLIX_TCPOUDP 13395 if (tp->t_port) { 13396 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 13397 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13398 udp->uh_dport = tp->t_port; 13399 ulen = hdrlen + len - sizeof(struct ip); 13400 udp->uh_ulen = htons(ulen); 13401 th = (struct tcphdr *)(udp + 1); 13402 } else 13403 #endif 13404 th = (struct tcphdr *)(ip + 1); 13405 tcpip_fillheaders(inp, 13406 #ifdef NETFLIX_TCPOUDP 13407 tp->t_port, 13408 #endif 13409 ip, th); 13410 } 13411 /* 13412 * Fill in fields, remembering maximum advertised window for use in 13413 * delaying messages about window sizes. If resending a FIN, be sure 13414 * not to use a new sequence number. 13415 */ 13416 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 13417 tp->snd_nxt == tp->snd_max) 13418 tp->snd_nxt--; 13419 /* 13420 * If we are starting a connection, send ECN setup SYN packet. If we 13421 * are on a retransmit, we may resend those bits a number of times 13422 * as per RFC 3168. 13423 */ 13424 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 13425 if (tp->t_rxtshift >= 1) { 13426 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 13427 flags |= TH_ECE | TH_CWR; 13428 } else 13429 flags |= TH_ECE | TH_CWR; 13430 } 13431 /* Handle parallel SYN for ECN */ 13432 if ((tp->t_state == TCPS_SYN_RECEIVED) && 13433 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 13434 flags |= TH_ECE; 13435 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13436 } 13437 if (tp->t_state == TCPS_ESTABLISHED && 13438 (tp->t_flags2 & TF2_ECN_PERMIT)) { 13439 /* 13440 * If the peer has ECN, mark data packets with ECN capable 13441 * transmission (ECT). Ignore pure ack packets, 13442 * retransmissions. 
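 * Only new data (snd_nxt at or beyond snd_max and not a SACK
 * retransmission) is marked ECT(0) below; CWR, when owed, is
 * likewise echoed only on such new-data segments.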
13443 */ 13444 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 13445 (sack_rxmit == 0)) { 13446 #ifdef INET6 13447 if (isipv6) 13448 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 13449 else 13450 #endif 13451 ip->ip_tos |= IPTOS_ECN_ECT0; 13452 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13453 /* 13454 * Reply with proper ECN notifications. 13455 * Only set CWR on new data segments. 13456 */ 13457 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 13458 flags |= TH_CWR; 13459 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 13460 } 13461 } 13462 if (tp->t_flags2 & TF2_ECN_SND_ECE) 13463 flags |= TH_ECE; 13464 } 13465 /* 13466 * If we are doing retransmissions, then snd_nxt will not reflect 13467 * the first unsent octet. For ACK only packets, we do not want the 13468 * sequence number of the retransmitted packet, we want the sequence 13469 * number of the next unsent octet. So, if there is no data (and no 13470 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13471 * ti_seq. But if we are in persist state, snd_max might reflect 13472 * one byte beyond the right edge of the window, so use snd_nxt in 13473 * that case, since we know we aren't doing a retransmission. 13474 * (retransmit and persist are mutually exclusive...) 13475 */ 13476 if (sack_rxmit == 0) { 13477 if (len || (flags & (TH_SYN | TH_FIN)) || 13478 rack->rc_in_persist) { 13479 th->th_seq = htonl(tp->snd_nxt); 13480 rack_seq = tp->snd_nxt; 13481 } else if (flags & TH_RST) { 13482 /* 13483 * For a Reset send the last cum ack in sequence 13484 * (this like any other choice may still generate a 13485 * challenge ack, if a ack-update packet is in 13486 * flight). 13487 */ 13488 th->th_seq = htonl(tp->snd_una); 13489 rack_seq = tp->snd_una; 13490 } else { 13491 th->th_seq = htonl(tp->snd_max); 13492 rack_seq = tp->snd_max; 13493 } 13494 } else { 13495 th->th_seq = htonl(rsm->r_start); 13496 rack_seq = rsm->r_start; 13497 } 13498 th->th_ack = htonl(tp->rcv_nxt); 13499 if (optlen) { 13500 bcopy(opt, th + 1, optlen); 13501 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13502 } 13503 th->th_flags = flags; 13504 /* 13505 * Calculate receive window. Don't shrink window, but avoid silly 13506 * window syndrome. 13507 * If a RST segment is sent, advertise a window of zero. 13508 */ 13509 if (flags & TH_RST) { 13510 recwin = 0; 13511 } else { 13512 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 13513 recwin < (long)segsiz) 13514 recwin = 0; 13515 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13516 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 13517 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 13518 } 13519 13520 /* 13521 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13522 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13523 * handled in syncache. 13524 */ 13525 if (flags & TH_SYN) 13526 th->th_win = htons((u_short) 13527 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13528 else { 13529 /* Avoid shrinking window with window scaling. */ 13530 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13531 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13532 } 13533 /* 13534 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13535 * window. This may cause the remote transmitter to stall. This 13536 * flag tells soreceive() to disable delayed acknowledgements when 13537 * draining the buffer. This can occur if the receiver is 13538 * attempting to read more data than can be buffered prior to 13539 * transmitting on the connection. 
13540 */ 13541 if (th->th_win == 0) { 13542 tp->t_sndzerowin++; 13543 tp->t_flags |= TF_RXWIN0SENT; 13544 } else 13545 tp->t_flags &= ~TF_RXWIN0SENT; 13546 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 13547 13548 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13549 if (to.to_flags & TOF_SIGNATURE) { 13550 /* 13551 * Calculate MD5 signature and put it into the place 13552 * determined before. 13553 * NOTE: since TCP options buffer doesn't point into 13554 * mbuf's data, calculate offset and use it. 13555 */ 13556 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13557 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13558 /* 13559 * Do not send segment if the calculation of MD5 13560 * digest has failed. 13561 */ 13562 goto out; 13563 } 13564 } 13565 #endif 13566 13567 /* 13568 * Put TCP length in extended header, and then checksum extended 13569 * header and data. 13570 */ 13571 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13572 #ifdef INET6 13573 if (isipv6) { 13574 /* 13575 * ip6_plen is not need to be filled now, and will be filled 13576 * in ip6_output. 13577 */ 13578 if (tp->t_port) { 13579 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13580 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13581 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13582 th->th_sum = htons(0); 13583 UDPSTAT_INC(udps_opackets); 13584 } else { 13585 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13586 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13587 th->th_sum = in6_cksum_pseudo(ip6, 13588 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 13589 0); 13590 } 13591 } 13592 #endif 13593 #if defined(INET6) && defined(INET) 13594 else 13595 #endif 13596 #ifdef INET 13597 { 13598 if (tp->t_port) { 13599 m->m_pkthdr.csum_flags = CSUM_UDP; 13600 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13601 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13602 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13603 th->th_sum = htons(0); 13604 UDPSTAT_INC(udps_opackets); 13605 } else { 13606 m->m_pkthdr.csum_flags = CSUM_TCP; 13607 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13608 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13609 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13610 IPPROTO_TCP + len + optlen)); 13611 } 13612 /* IP version must be set here for ipv4/ipv6 checking later */ 13613 KASSERT(ip->ip_v == IPVERSION, 13614 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13615 } 13616 #endif 13617 /* 13618 * Enable TSO and specify the size of the segments. The TCP pseudo 13619 * header checksum is always provided. XXX: Fixme: This is currently 13620 * not the case for IPv6. 13621 */ 13622 if (tso) { 13623 KASSERT(len > tp->t_maxseg - optlen, 13624 ("%s: len <= tso_segsz", __func__)); 13625 m->m_pkthdr.csum_flags |= CSUM_TSO; 13626 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 13627 } 13628 KASSERT(len + hdrlen == m_length(m, NULL), 13629 ("%s: mbuf chain different than expected: %d + %u != %u", 13630 __func__, len, hdrlen, m_length(m, NULL))); 13631 13632 #ifdef TCP_HHOOK 13633 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 13634 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13635 #endif 13636 #ifdef TCPDEBUG 13637 /* 13638 * Trace. 
13639 */ 13640 if (so->so_options & SO_DEBUG) { 13641 u_short save = 0; 13642 13643 #ifdef INET6 13644 if (!isipv6) 13645 #endif 13646 { 13647 save = ipov->ih_len; 13648 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13649 * (th->th_off << 2) */ ); 13650 } 13651 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13652 #ifdef INET6 13653 if (!isipv6) 13654 #endif 13655 ipov->ih_len = save; 13656 } 13657 #endif /* TCPDEBUG */ 13658 13659 /* We're getting ready to send; log now. */ 13660 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13661 union tcp_log_stackspecific log; 13662 struct timeval tv; 13663 13664 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13665 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13666 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13667 if (rack->rack_no_prr) 13668 log.u_bbr.flex1 = 0; 13669 else 13670 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13671 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 13672 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 13673 log.u_bbr.flex4 = orig_len; 13674 if (filled_all) 13675 log.u_bbr.flex5 = 0x80000000; 13676 else 13677 log.u_bbr.flex5 = 0; 13678 /* Save off the early/late values */ 13679 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 13680 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 13681 log.u_bbr.bw_inuse = rack_get_bw(rack); 13682 if (rsm || sack_rxmit) { 13683 if (doing_tlp) 13684 log.u_bbr.flex8 = 2; 13685 else 13686 log.u_bbr.flex8 = 1; 13687 } else { 13688 log.u_bbr.flex8 = 0; 13689 } 13690 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 13691 log.u_bbr.flex7 = mark; 13692 log.u_bbr.pkts_out = tp->t_maxseg; 13693 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13694 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13695 log.u_bbr.lt_epoch = cwnd_to_use; 13696 log.u_bbr.delivered = sendalot; 13697 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13698 len, &log, false, NULL, NULL, 0, &tv); 13699 } else 13700 lgb = NULL; 13701 13702 /* 13703 * Fill in IP length and desired time to live and send to IP level. 13704 * There should be a better way to handle ttl and tos; we could keep 13705 * them in the template, but need a way to checksum without them. 13706 */ 13707 /* 13708 * m->m_pkthdr.len should have been set before cksum calcuration, 13709 * because in6_cksum() need it. 13710 */ 13711 #ifdef INET6 13712 if (isipv6) { 13713 /* 13714 * we separately set hoplimit for every segment, since the 13715 * user might want to change the value via setsockopt. Also, 13716 * desired default hop limit might be changed via Neighbor 13717 * Discovery. 13718 */ 13719 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13720 13721 /* 13722 * Set the packet size here for the benefit of DTrace 13723 * probes. ip6_output() will set it properly; it's supposed 13724 * to include the option header lengths as well. 13725 */ 13726 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13727 13728 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 13729 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13730 else 13731 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13732 13733 if (tp->t_state == TCPS_SYN_SENT) 13734 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13735 13736 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13737 /* TODO: IPv6 IP6TOS_ECT bit on */ 13738 error = ip6_output(m, inp->in6p_outputopts, 13739 &inp->inp_route6, 13740 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13741 NULL, NULL, inp); 13742 13743 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13744 mtu = inp->inp_route6.ro_nh->nh_mtu; 13745 } 13746 #endif /* INET6 */ 13747 #if defined(INET) && defined(INET6) 13748 else 13749 #endif 13750 #ifdef INET 13751 { 13752 ip->ip_len = htons(m->m_pkthdr.len); 13753 #ifdef INET6 13754 if (inp->inp_vflag & INP_IPV6PROTO) 13755 ip->ip_ttl = in6_selecthlim(inp, NULL); 13756 #endif /* INET6 */ 13757 /* 13758 * If we do path MTU discovery, then we set DF on every 13759 * packet. This might not be the best thing to do according 13760 * to RFC3390 Section 2. However the tcp hostcache migitates 13761 * the problem so it affects only the first tcp connection 13762 * with a host. 13763 * 13764 * NB: Don't set DF on small MTU/MSS to have a safe 13765 * fallback. 13766 */ 13767 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13768 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13769 if (tp->t_port == 0 || len < V_tcp_minmss) { 13770 ip->ip_off |= htons(IP_DF); 13771 } 13772 } else { 13773 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13774 } 13775 13776 if (tp->t_state == TCPS_SYN_SENT) 13777 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13778 13779 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13780 13781 error = ip_output(m, inp->inp_options, &inp->inp_route, 13782 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13783 inp); 13784 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13785 mtu = inp->inp_route.ro_nh->nh_mtu; 13786 } 13787 #endif /* INET */ 13788 13789 out: 13790 if (lgb) { 13791 lgb->tlb_errno = error; 13792 lgb = NULL; 13793 } 13794 /* 13795 * In transmit state, time the transmission and arrange for the 13796 * retransmit. In persist state, just set snd_max. 13797 */ 13798 if (error == 0) { 13799 rack->forced_ack = 0; /* If we send something zap the FA flag */ 13800 if (rsm && (doing_tlp == 0)) { 13801 /* Set we retransmitted */ 13802 rack->rc_gp_saw_rec = 1; 13803 } else { 13804 if (cwnd_to_use > tp->snd_ssthresh) { 13805 /* Set we sent in CA */ 13806 rack->rc_gp_saw_ca = 1; 13807 } else { 13808 /* Set we sent in SS */ 13809 rack->rc_gp_saw_ss = 1; 13810 } 13811 } 13812 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13813 (tp->t_flags & TF_SACK_PERMIT) && 13814 tp->rcv_numsacks > 0) 13815 tcp_clean_dsack_blocks(tp); 13816 tot_len_this_send += len; 13817 if (len == 0) 13818 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 13819 else if (len == 1) { 13820 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 13821 } else if (len > 1) { 13822 int idx; 13823 13824 idx = (len / segsiz) + 3; 13825 if (idx >= TCP_MSS_ACCT_ATIMER) 13826 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 13827 else 13828 counter_u64_add(rack_out_size[idx], 1); 13829 } 13830 } 13831 if (rack->rack_no_prr == 0) { 13832 if (sub_from_prr && (error == 0)) { 13833 if (rack->r_ctl.rc_prr_sndcnt >= len) 13834 rack->r_ctl.rc_prr_sndcnt -= len; 13835 else 13836 rack->r_ctl.rc_prr_sndcnt = 0; 13837 } 13838 } 13839 sub_from_prr = 0; 13840 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 13841 pass, rsm, us_cts); 13842 if ((error == 0) && 13843 (len > 0) && 13844 (tp->snd_una == tp->snd_max)) 13845 rack->r_ctl.rc_tlp_rxt_last_time = cts; 13846 /* Now are we in persists? 
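	 * (Bookkeeping sketch for the code below: outside of persist we
	 *  advance snd_nxt over the SYN/FIN bits and the data just sent and
	 *  pull snd_max up behind it, arming the RTT timer and, when
	 *  appropriate, a goodput measurement; in persist mode only snd_max
	 *  is advanced, since there is no usable window to move snd_nxt
	 *  into.)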
*/ 13847 if (rack->rc_in_persist == 0) { 13848 tcp_seq startseq = tp->snd_nxt; 13849 13850 /* Track our lost count */ 13851 if (rsm && (doing_tlp == 0)) 13852 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 13853 /* 13854 * Advance snd_nxt over sequence space of this segment. 13855 */ 13856 if (error) 13857 /* We don't log or do anything with errors */ 13858 goto nomore; 13859 if (doing_tlp == 0) { 13860 if (rsm == NULL) { 13861 /* 13862 * Not a retransmission of some 13863 * sort, new data is going out so 13864 * clear our TLP count and flag. 13865 */ 13866 rack->rc_tlp_in_progress = 0; 13867 rack->r_ctl.rc_tlp_cnt_out = 0; 13868 } 13869 } else { 13870 /* 13871 * We have just sent a TLP, mark that it is true 13872 * and make sure our in progress is set so we 13873 * continue to check the count. 13874 */ 13875 rack->rc_tlp_in_progress = 1; 13876 rack->r_ctl.rc_tlp_cnt_out++; 13877 } 13878 if (flags & (TH_SYN | TH_FIN)) { 13879 if (flags & TH_SYN) 13880 tp->snd_nxt++; 13881 if (flags & TH_FIN) { 13882 tp->snd_nxt++; 13883 tp->t_flags |= TF_SENTFIN; 13884 } 13885 } 13886 /* In the ENOBUFS case we do *not* update snd_max */ 13887 if (sack_rxmit) 13888 goto nomore; 13889 13890 tp->snd_nxt += len; 13891 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 13892 if (tp->snd_una == tp->snd_max) { 13893 /* 13894 * Update the time we just added data since 13895 * none was outstanding. 13896 */ 13897 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13898 tp->t_acktime = ticks; 13899 } 13900 tp->snd_max = tp->snd_nxt; 13901 /* 13902 * Time this transmission if not a retransmission and 13903 * not currently timing anything. 13904 * This is only relevant in case of switching back to 13905 * the base stack. 13906 */ 13907 if (tp->t_rtttime == 0) { 13908 tp->t_rtttime = ticks; 13909 tp->t_rtseq = startseq; 13910 KMOD_TCPSTAT_INC(tcps_segstimed); 13911 } 13912 if (len && 13913 ((tp->t_flags & TF_GPUTINPROG) == 0)) 13914 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 13915 } 13916 } else { 13917 /* 13918 * Persist case, update snd_max but since we are in persist 13919 * mode (no window) we do not update snd_nxt. 13920 */ 13921 int32_t xlen = len; 13922 13923 if (error) 13924 goto nomore; 13925 13926 if (flags & TH_SYN) 13927 ++xlen; 13928 if (flags & TH_FIN) { 13929 ++xlen; 13930 tp->t_flags |= TF_SENTFIN; 13931 } 13932 /* In the ENOBUFS case we do *not* update snd_max */ 13933 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 13934 if (tp->snd_una == tp->snd_max) { 13935 /* 13936 * Update the time we just added data since 13937 * none was outstanding. 13938 */ 13939 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 13940 tp->t_acktime = ticks; 13941 } 13942 tp->snd_max = tp->snd_nxt + len; 13943 } 13944 } 13945 nomore: 13946 if (error) { 13947 rack->r_ctl.rc_agg_delayed = 0; 13948 rack->r_early = 0; 13949 rack->r_late = 0; 13950 rack->r_ctl.rc_agg_early = 0; 13951 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 13952 /* 13953 * Failures do not advance the seq counter above. For the 13954 * case of ENOBUFS we will fall out and retry in 1ms with 13955 * the hpts. Everything else will just have to retransmit 13956 * with the timer. 13957 * 13958 * In any case, we do not want to loop around for another 13959 * send without a good reason. 
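	 *
	 * (Sketch of the ENOBUFS backoff below: if no pacing slot was
	 *  already chosen, the retry slot is (1 + rc_enobuf) milliseconds,
	 *  capped at half the measured rack RTT, with a 10ms floor applied
	 *  last; rc_enobuf counts successive ENOBUFS returns up to 126.)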
13960 	 */
13961 		sendalot = 0;
13962 		switch (error) {
13963 		case EPERM:
13964 			tp->t_softerror = error;
13965 			return (error);
13966 		case ENOBUFS:
13967 			if (slot == 0) {
13968 				/*
13969 				 * Pace us right away to retry in some
13970 				 * time.
13971 				 */
13972 				slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
13973 				if (rack->rc_enobuf < 126)
13974 					rack->rc_enobuf++;
13975 				if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) {
13976 					slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC;
13977 				}
13978 				if (slot < (10 * HPTS_USEC_IN_MSEC))
13979 					slot = 10 * HPTS_USEC_IN_MSEC;
13980 			}
13981 			counter_u64_add(rack_saw_enobuf, 1);
13982 			error = 0;
13983 			goto enobufs;
13984 		case EMSGSIZE:
13985 			/*
13986 			 * For some reason the interface we used initially
13987 			 * to send segments changed to another or lowered
13988 			 * its MTU. If TSO was active we either got an
13989 			 * interface without TSO capabilities or TSO was
13990 			 * turned off. If we obtained mtu from ip_output()
13991 			 * then update it and try again.
13992 			 */
13993 			if (tso)
13994 				tp->t_flags &= ~TF_TSO;
13995 			if (mtu != 0) {
13996 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
13997 				goto again;
13998 			}
13999 			slot = 10 * HPTS_USEC_IN_MSEC;
14000 			rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
14001 			return (error);
14002 		case ENETUNREACH:
14003 			counter_u64_add(rack_saw_enetunreach, 1);
14004 		case EHOSTDOWN:
14005 		case EHOSTUNREACH:
14006 		case ENETDOWN:
14007 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
14008 				tp->t_softerror = error;
14009 			}
14010 			/* FALLTHROUGH */
14011 		default:
14012 			slot = 10 * HPTS_USEC_IN_MSEC;
14013 			rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
14014 			return (error);
14015 		}
14016 	} else {
14017 		rack->rc_enobuf = 0;
14018 	}
14019 	KMOD_TCPSTAT_INC(tcps_sndtotal);
14020 
14021 	/*
14022 	 * Data sent (as far as we can tell). If this advertises a larger
14023 	 * window than any other segment, then remember the size of the
14024 	 * advertised window. Any pending ACK has now been sent.
14025 	 */
14026 	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
14027 		tp->rcv_adv = tp->rcv_nxt + recwin;
14028 	tp->last_ack_sent = tp->rcv_nxt;
14029 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
14030 enobufs:
14031 	/* Assure when we leave that snd_nxt will point to top */
14032 	if (SEQ_GT(tp->snd_max, tp->snd_nxt))
14033 		tp->snd_nxt = tp->snd_max;
14034 	if (sendalot) {
14035 		/* Do we need to turn off sendalot? */
14036 		if (rack->r_ctl.rc_pace_max_segs &&
14037 		    (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
14038 			/* We hit our max. */
14039 			sendalot = 0;
14040 		} else if ((rack->rc_user_set_max_segs) &&
14041 		    (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
14042 			/* We hit the user defined max */
14043 			sendalot = 0;
14044 		}
14045 	}
14046 	if ((error == 0) && (flags & TH_FIN))
14047 		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
14048 	if (flags & TH_RST) {
14049 		/*
14050 		 * We don't send again after sending a RST.
14051 		 */
14052 		slot = 0;
14053 		sendalot = 0;
14054 		if (error == 0)
14055 			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
14056 	} else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
14057 		/*
14058 		 * Get our pacing rate; if an error
14059 		 * occurred in sending (ENOBUFS) we would
14060 		 * hit the else if with slot preset. Other
14061 		 * errors return.
14062 		 */
14063 		slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
14064 	}
14065 	if (rsm &&
14066 	    rack->use_rack_rr) {
14067 		/* It's a retransmit and we use the rack cheat?
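		 * If so, and no pacing delay was chosen (or pacing is
		 * disabled, or rrr_conf is 1), fall back to delaying the
		 * next send by rc_min_to milliseconds, the old 1ms-style
		 * rack spacing.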
*/ 14068 if ((slot == 0) || 14069 (rack->rc_always_pace == 0) || 14070 (rack->r_rr_config == 1)) { 14071 /* 14072 * We have no pacing set or we 14073 * are using old-style rack or 14074 * we are overriden to use the old 1ms pacing. 14075 */ 14076 slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; 14077 } 14078 } 14079 if (slot) { 14080 /* set the rack tcb into the slot N */ 14081 counter_u64_add(rack_paced_segments, 1); 14082 } else if (sendalot) { 14083 if (len) 14084 counter_u64_add(rack_unpaced_segments, 1); 14085 sack_rxmit = 0; 14086 goto again; 14087 } else if (len) { 14088 counter_u64_add(rack_unpaced_segments, 1); 14089 } 14090 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 14091 return (error); 14092 } 14093 14094 static void 14095 rack_update_seg(struct tcp_rack *rack) 14096 { 14097 uint32_t orig_val; 14098 14099 orig_val = rack->r_ctl.rc_pace_max_segs; 14100 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 14101 if (orig_val != rack->r_ctl.rc_pace_max_segs) 14102 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 14103 } 14104 14105 /* 14106 * rack_ctloutput() must drop the inpcb lock before performing copyin on 14107 * socket option arguments. When it re-acquires the lock after the copy, it 14108 * has to revalidate that the connection is still valid for the socket 14109 * option. 14110 */ 14111 static int 14112 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 14113 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14114 { 14115 struct epoch_tracker et; 14116 uint64_t val; 14117 int32_t error = 0, optval; 14118 uint16_t ca, ss; 14119 14120 14121 switch (sopt->sopt_name) { 14122 case TCP_RACK_PROP_RATE: /* URL:prop_rate */ 14123 case TCP_RACK_PROP : /* URL:prop */ 14124 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 14125 case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ 14126 case TCP_RACK_PACE_REDUCE: /* Not used */ 14127 /* Pacing related ones */ 14128 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 14129 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 14130 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 14131 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 14132 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 14133 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 14134 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 14135 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 14136 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 14137 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 14138 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 14139 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 14140 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 14141 /* End pacing related */ 14142 case TCP_DELACK: 14143 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 14144 case TCP_RACK_MIN_TO: /* URL:min_to */ 14145 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 14146 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 14147 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 14148 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 14149 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 14150 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 14151 case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ 14152 case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ 14153 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 14154 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 14155 case TCP_RACK_DO_DETECTION: /* URL:detect */ 14156 case TCP_NO_PRR: /* URL:noprr */ 14157 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 14158 case TCP_DATA_AFTER_CLOSE: 14159 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr 
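				  (as with every option in this list, the
				   value is copied in below and the request
				   is then handled in the second switch;
				   unrecognized options are passed on to
				   tcp_default_ctloutput())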
*/ 14160 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 14161 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 14162 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 14163 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 14164 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 14165 case TCP_RACK_PROFILE: /* URL:profile */ 14166 break; 14167 default: 14168 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14169 break; 14170 } 14171 INP_WUNLOCK(inp); 14172 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14173 if (error) 14174 return (error); 14175 INP_WLOCK(inp); 14176 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 14177 INP_WUNLOCK(inp); 14178 return (ECONNRESET); 14179 } 14180 tp = intotcpcb(inp); 14181 rack = (struct tcp_rack *)tp->t_fb_ptr; 14182 switch (sopt->sopt_name) { 14183 case TCP_RACK_PROFILE: 14184 RACK_OPTS_INC(tcp_profile); 14185 if (optval == 1) { 14186 /* pace_always=1 */ 14187 rack->rc_always_pace = 1; 14188 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14189 /* scwnd=1 */ 14190 rack->rack_enable_scwnd = 1; 14191 /* dynamic=100 */ 14192 rack->rc_gp_dyn_mul = 1; 14193 rack->r_ctl.rack_per_of_gp_ca = 100; 14194 /* rrr_conf=3 */ 14195 rack->r_rr_config = 3; 14196 /* npush=2 */ 14197 rack->r_ctl.rc_no_push_at_mrtt = 2; 14198 /* fillcw=1 */ 14199 rack->rc_pace_to_cwnd = 1; 14200 rack->rc_pace_fill_if_rttin_range = 0; 14201 rack->rtt_limit_mul = 0; 14202 /* noprr=1 */ 14203 rack->rack_no_prr = 1; 14204 /* lscwnd=1 */ 14205 rack->r_limit_scw = 1; 14206 } else if (optval == 2) { 14207 /* pace_always=1 */ 14208 rack->rc_always_pace = 1; 14209 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14210 /* scwnd=1 */ 14211 rack->rack_enable_scwnd = 1; 14212 /* dynamic=100 */ 14213 rack->rc_gp_dyn_mul = 1; 14214 rack->r_ctl.rack_per_of_gp_ca = 100; 14215 /* rrr_conf=3 */ 14216 rack->r_rr_config = 3; 14217 /* npush=2 */ 14218 rack->r_ctl.rc_no_push_at_mrtt = 2; 14219 /* fillcw=1 */ 14220 rack->rc_pace_to_cwnd = 1; 14221 rack->rc_pace_fill_if_rttin_range = 0; 14222 rack->rtt_limit_mul = 0; 14223 /* noprr=1 */ 14224 rack->rack_no_prr = 1; 14225 /* lscwnd=0 */ 14226 rack->r_limit_scw = 0; 14227 } 14228 break; 14229 case TCP_SHARED_CWND_TIME_LIMIT: 14230 RACK_OPTS_INC(tcp_lscwnd); 14231 if (optval) 14232 rack->r_limit_scw = 1; 14233 else 14234 rack->r_limit_scw = 0; 14235 break; 14236 case TCP_RACK_PACE_TO_FILL: 14237 RACK_OPTS_INC(tcp_fillcw); 14238 if (optval == 0) 14239 rack->rc_pace_to_cwnd = 0; 14240 else 14241 rack->rc_pace_to_cwnd = 1; 14242 if ((optval >= rack_gp_rtt_maxmul) && 14243 rack_gp_rtt_maxmul && 14244 (optval < 0xf)) { 14245 rack->rc_pace_fill_if_rttin_range = 1; 14246 rack->rtt_limit_mul = optval; 14247 } else { 14248 rack->rc_pace_fill_if_rttin_range = 0; 14249 rack->rtt_limit_mul = 0; 14250 } 14251 break; 14252 case TCP_RACK_NO_PUSH_AT_MAX: 14253 RACK_OPTS_INC(tcp_npush); 14254 if (optval == 0) 14255 rack->r_ctl.rc_no_push_at_mrtt = 0; 14256 else if (optval < 0xff) 14257 rack->r_ctl.rc_no_push_at_mrtt = optval; 14258 else 14259 error = EINVAL; 14260 break; 14261 case TCP_SHARED_CWND_ENABLE: 14262 RACK_OPTS_INC(tcp_rack_scwnd); 14263 if (optval == 0) 14264 rack->rack_enable_scwnd = 0; 14265 else 14266 rack->rack_enable_scwnd = 1; 14267 break; 14268 case TCP_RACK_MBUF_QUEUE: 14269 /* Now do we use the LRO mbuf-queue feature */ 14270 RACK_OPTS_INC(tcp_rack_mbufq); 14271 if (optval) 14272 rack->r_mbuf_queue = 1; 14273 else 14274 rack->r_mbuf_queue = 0; 14275 if (rack->r_mbuf_queue || rack->rc_always_pace) 14276 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14277 else 
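			/*
			 * (Neither mbuf queueing nor always-on pacing is
			 *  enabled, so clear the LRO mbuf-queue capability
			 *  on the inpcb.)
			 */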
14278 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14279 break; 14280 case TCP_RACK_NONRXT_CFG_RATE: 14281 RACK_OPTS_INC(tcp_rack_cfg_rate); 14282 if (optval == 0) 14283 rack->rack_rec_nonrxt_use_cr = 0; 14284 else 14285 rack->rack_rec_nonrxt_use_cr = 1; 14286 break; 14287 case TCP_NO_PRR: 14288 RACK_OPTS_INC(tcp_rack_noprr); 14289 if (optval == 0) 14290 rack->rack_no_prr = 0; 14291 else 14292 rack->rack_no_prr = 1; 14293 break; 14294 case TCP_TIMELY_DYN_ADJ: 14295 RACK_OPTS_INC(tcp_timely_dyn); 14296 if (optval == 0) 14297 rack->rc_gp_dyn_mul = 0; 14298 else { 14299 rack->rc_gp_dyn_mul = 1; 14300 if (optval >= 100) { 14301 /* 14302 * If the user sets something 100 or more 14303 * its the gp_ca value. 14304 */ 14305 rack->r_ctl.rack_per_of_gp_ca = optval; 14306 } 14307 } 14308 break; 14309 case TCP_RACK_DO_DETECTION: 14310 RACK_OPTS_INC(tcp_rack_do_detection); 14311 if (optval == 0) 14312 rack->do_detection = 0; 14313 else 14314 rack->do_detection = 1; 14315 break; 14316 case TCP_RACK_PROP_RATE: 14317 if ((optval <= 0) || (optval >= 100)) { 14318 error = EINVAL; 14319 break; 14320 } 14321 RACK_OPTS_INC(tcp_rack_prop_rate); 14322 rack->r_ctl.rc_prop_rate = optval; 14323 break; 14324 case TCP_RACK_TLP_USE: 14325 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 14326 error = EINVAL; 14327 break; 14328 } 14329 RACK_OPTS_INC(tcp_tlp_use); 14330 rack->rack_tlp_threshold_use = optval; 14331 break; 14332 case TCP_RACK_PROP: 14333 /* RACK proportional rate reduction (bool) */ 14334 RACK_OPTS_INC(tcp_rack_prop); 14335 rack->r_ctl.rc_prop_reduce = optval; 14336 break; 14337 case TCP_RACK_TLP_REDUCE: 14338 /* RACK TLP cwnd reduction (bool) */ 14339 RACK_OPTS_INC(tcp_rack_tlp_reduce); 14340 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 14341 break; 14342 case TCP_RACK_EARLY_RECOV: 14343 /* Should recovery happen early (bool) */ 14344 RACK_OPTS_INC(tcp_rack_early_recov); 14345 rack->r_ctl.rc_early_recovery = optval; 14346 break; 14347 14348 /* Pacing related ones */ 14349 case TCP_RACK_PACE_ALWAYS: 14350 /* 14351 * zero is old rack method, 1 is new 14352 * method using a pacing rate. 14353 */ 14354 RACK_OPTS_INC(tcp_rack_pace_always); 14355 if (optval > 0) 14356 rack->rc_always_pace = 1; 14357 else 14358 rack->rc_always_pace = 0; 14359 if (rack->r_mbuf_queue || rack->rc_always_pace) 14360 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14361 else 14362 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14363 /* A rate may be set irate or other, if so set seg size */ 14364 rack_update_seg(rack); 14365 break; 14366 case TCP_BBR_RACK_INIT_RATE: 14367 RACK_OPTS_INC(tcp_initial_rate); 14368 val = optval; 14369 /* Change from kbits per second to bytes per second */ 14370 val *= 1000; 14371 val /= 8; 14372 rack->r_ctl.init_rate = val; 14373 if (rack->rc_init_win != rack_default_init_window) { 14374 uint32_t win, snt; 14375 14376 /* 14377 * Options don't always get applied 14378 * in the order you think. So in order 14379 * to assure we update a cwnd we need 14380 * to check and see if we are still 14381 * where we should raise the cwnd. 
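	 * (Concretely, the code below recomputes the initial window and
	 *  only raises snd_cwnd when less than a full initial window has
	 *  been sent past the ISS and the current cwnd is still below it.)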
14382 */ 14383 win = rc_init_window(rack); 14384 if (SEQ_GT(tp->snd_max, tp->iss)) 14385 snt = tp->snd_max - tp->iss; 14386 else 14387 snt = 0; 14388 if ((snt < win) && 14389 (tp->snd_cwnd < win)) 14390 tp->snd_cwnd = win; 14391 } 14392 if (rack->rc_always_pace) 14393 rack_update_seg(rack); 14394 break; 14395 case TCP_BBR_IWINTSO: 14396 RACK_OPTS_INC(tcp_initial_win); 14397 if (optval && (optval <= 0xff)) { 14398 uint32_t win, snt; 14399 14400 rack->rc_init_win = optval; 14401 win = rc_init_window(rack); 14402 if (SEQ_GT(tp->snd_max, tp->iss)) 14403 snt = tp->snd_max - tp->iss; 14404 else 14405 snt = 0; 14406 if ((snt < win) && 14407 (tp->t_srtt | 14408 #ifdef NETFLIX_PEAKRATE 14409 tp->t_maxpeakrate | 14410 #endif 14411 rack->r_ctl.init_rate)) { 14412 /* 14413 * We are not past the initial window 14414 * and we have some bases for pacing, 14415 * so we need to possibly adjust up 14416 * the cwnd. Note even if we don't set 14417 * the cwnd, its still ok to raise the rc_init_win 14418 * which can be used coming out of idle when we 14419 * would have a rate. 14420 */ 14421 if (tp->snd_cwnd < win) 14422 tp->snd_cwnd = win; 14423 } 14424 if (rack->rc_always_pace) 14425 rack_update_seg(rack); 14426 } else 14427 error = EINVAL; 14428 break; 14429 case TCP_RACK_FORCE_MSEG: 14430 RACK_OPTS_INC(tcp_rack_force_max_seg); 14431 if (optval) 14432 rack->rc_force_max_seg = 1; 14433 else 14434 rack->rc_force_max_seg = 0; 14435 break; 14436 case TCP_RACK_PACE_MAX_SEG: 14437 /* Max segments size in a pace in bytes */ 14438 RACK_OPTS_INC(tcp_rack_max_seg); 14439 rack->rc_user_set_max_segs = optval; 14440 rack_set_pace_segments(tp, rack, __LINE__); 14441 break; 14442 case TCP_RACK_PACE_RATE_REC: 14443 /* Set the fixed pacing rate in Bytes per second ca */ 14444 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 14445 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14446 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14447 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14448 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14449 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14450 rack->use_fixed_rate = 1; 14451 rack_log_pacing_delay_calc(rack, 14452 rack->r_ctl.rc_fixed_pacing_rate_ss, 14453 rack->r_ctl.rc_fixed_pacing_rate_ca, 14454 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14455 __LINE__, NULL); 14456 break; 14457 14458 case TCP_RACK_PACE_RATE_SS: 14459 /* Set the fixed pacing rate in Bytes per second ca */ 14460 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 14461 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14462 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14463 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14464 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14465 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14466 rack->use_fixed_rate = 1; 14467 rack_log_pacing_delay_calc(rack, 14468 rack->r_ctl.rc_fixed_pacing_rate_ss, 14469 rack->r_ctl.rc_fixed_pacing_rate_ca, 14470 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14471 __LINE__, NULL); 14472 break; 14473 14474 case TCP_RACK_PACE_RATE_CA: 14475 /* Set the fixed pacing rate in Bytes per second ca */ 14476 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 14477 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14478 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14479 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14480 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14481 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14482 rack->use_fixed_rate = 1; 14483 rack_log_pacing_delay_calc(rack, 14484 rack->r_ctl.rc_fixed_pacing_rate_ss, 14485 rack->r_ctl.rc_fixed_pacing_rate_ca, 14486 
rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14487 __LINE__, NULL); 14488 break; 14489 case TCP_RACK_GP_INCREASE_REC: 14490 RACK_OPTS_INC(tcp_gp_inc_rec); 14491 rack->r_ctl.rack_per_of_gp_rec = optval; 14492 rack_log_pacing_delay_calc(rack, 14493 rack->r_ctl.rack_per_of_gp_ss, 14494 rack->r_ctl.rack_per_of_gp_ca, 14495 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14496 __LINE__, NULL); 14497 break; 14498 case TCP_RACK_GP_INCREASE_CA: 14499 RACK_OPTS_INC(tcp_gp_inc_ca); 14500 ca = optval; 14501 if (ca < 100) { 14502 /* 14503 * We don't allow any reduction 14504 * over the GP b/w. 14505 */ 14506 error = EINVAL; 14507 break; 14508 } 14509 rack->r_ctl.rack_per_of_gp_ca = ca; 14510 rack_log_pacing_delay_calc(rack, 14511 rack->r_ctl.rack_per_of_gp_ss, 14512 rack->r_ctl.rack_per_of_gp_ca, 14513 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14514 __LINE__, NULL); 14515 break; 14516 case TCP_RACK_GP_INCREASE_SS: 14517 RACK_OPTS_INC(tcp_gp_inc_ss); 14518 ss = optval; 14519 if (ss < 100) { 14520 /* 14521 * We don't allow any reduction 14522 * over the GP b/w. 14523 */ 14524 error = EINVAL; 14525 break; 14526 } 14527 rack->r_ctl.rack_per_of_gp_ss = ss; 14528 rack_log_pacing_delay_calc(rack, 14529 rack->r_ctl.rack_per_of_gp_ss, 14530 rack->r_ctl.rack_per_of_gp_ca, 14531 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14532 __LINE__, NULL); 14533 break; 14534 case TCP_RACK_RR_CONF: 14535 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 14536 if (optval && optval <= 3) 14537 rack->r_rr_config = optval; 14538 else 14539 rack->r_rr_config = 0; 14540 break; 14541 case TCP_BBR_HDWR_PACE: 14542 RACK_OPTS_INC(tcp_hdwr_pacing); 14543 if (optval){ 14544 if (rack->rack_hdrw_pacing == 0) { 14545 rack->rack_hdw_pace_ena = 1; 14546 rack->rack_attempt_hdwr_pace = 0; 14547 } else 14548 error = EALREADY; 14549 } else { 14550 rack->rack_hdw_pace_ena = 0; 14551 #ifdef RATELIMIT 14552 if (rack->rack_hdrw_pacing) { 14553 rack->rack_hdrw_pacing = 0; 14554 in_pcbdetach_txrtlmt(rack->rc_inp); 14555 } 14556 #endif 14557 } 14558 break; 14559 /* End Pacing related ones */ 14560 case TCP_RACK_PRR_SENDALOT: 14561 /* Allow PRR to send more than one seg */ 14562 RACK_OPTS_INC(tcp_rack_prr_sendalot); 14563 rack->r_ctl.rc_prr_sendalot = optval; 14564 break; 14565 case TCP_RACK_MIN_TO: 14566 /* Minimum time between rack t-o's in ms */ 14567 RACK_OPTS_INC(tcp_rack_min_to); 14568 rack->r_ctl.rc_min_to = optval; 14569 break; 14570 case TCP_RACK_EARLY_SEG: 14571 /* If early recovery max segments */ 14572 RACK_OPTS_INC(tcp_rack_early_seg); 14573 rack->r_ctl.rc_early_recovery_segs = optval; 14574 break; 14575 case TCP_RACK_REORD_THRESH: 14576 /* RACK reorder threshold (shift amount) */ 14577 RACK_OPTS_INC(tcp_rack_reord_thresh); 14578 if ((optval > 0) && (optval < 31)) 14579 rack->r_ctl.rc_reorder_shift = optval; 14580 else 14581 error = EINVAL; 14582 break; 14583 case TCP_RACK_REORD_FADE: 14584 /* Does reordering fade after ms time */ 14585 RACK_OPTS_INC(tcp_rack_reord_fade); 14586 rack->r_ctl.rc_reorder_fade = optval; 14587 break; 14588 case TCP_RACK_TLP_THRESH: 14589 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14590 RACK_OPTS_INC(tcp_rack_tlp_thresh); 14591 if (optval) 14592 rack->r_ctl.rc_tlp_threshold = optval; 14593 else 14594 error = EINVAL; 14595 break; 14596 case TCP_BBR_USE_RACK_RR: 14597 RACK_OPTS_INC(tcp_rack_rr); 14598 if (optval) 14599 rack->use_rack_rr = 1; 14600 else 14601 rack->use_rack_rr = 0; 14602 break; 14603 case TCP_RACK_PKT_DELAY: 14604 /* RACK added ms i.e. 
rack-rtt + reord + N */
14605 		RACK_OPTS_INC(tcp_rack_pkt_delay);
14606 		rack->r_ctl.rc_pkt_delay = optval;
14607 		break;
14608 	case TCP_RACK_TLP_INC_VAR:
14609 		/* Does TLP include rtt variance in t-o */
14610 		error = EINVAL;
14611 		break;
14612 	case TCP_RACK_IDLE_REDUCE_HIGH:
14613 		error = EINVAL;
14614 		break;
14615 	case TCP_DELACK:
14616 		if (optval == 0)
14617 			tp->t_delayed_ack = 0;
14618 		else
14619 			tp->t_delayed_ack = 1;
14620 		if (tp->t_flags & TF_DELACK) {
14621 			tp->t_flags &= ~TF_DELACK;
14622 			tp->t_flags |= TF_ACKNOW;
14623 			NET_EPOCH_ENTER(et);
14624 			rack_output(tp);
14625 			NET_EPOCH_EXIT(et);
14626 		}
14627 		break;
14628 
14629 	case TCP_BBR_RACK_RTT_USE:
14630 		if ((optval != USE_RTT_HIGH) &&
14631 		    (optval != USE_RTT_LOW) &&
14632 		    (optval != USE_RTT_AVG))
14633 			error = EINVAL;
14634 		else
14635 			rack->r_ctl.rc_rate_sample_method = optval;
14636 		break;
14637 	case TCP_DATA_AFTER_CLOSE:
14638 		if (optval)
14639 			rack->rc_allow_data_af_clo = 1;
14640 		else
14641 			rack->rc_allow_data_af_clo = 0;
14642 		break;
14643 	case TCP_RACK_PACE_REDUCE:
14644 		/* sysctl only now */
14645 		error = EINVAL;
14646 		break;
14647 	default:
14648 		return (tcp_default_ctloutput(so, sopt, inp, tp));
14649 		break;
14650 	}
14651 #ifdef NETFLIX_STATS
14652 	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
14653 #endif
14654 	INP_WUNLOCK(inp);
14655 	return (error);
14656 }
14657 
14658 static int
14659 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
14660     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
14661 {
14662 	int32_t error, optval;
14663 	uint64_t val;
14664 	/*
14665 	 * Because all our options are either boolean or an int, we can just
14666 	 * pull everything into optval and then unlock and copy. If we ever
14667 	 * add an option that is not an int, then this will have quite an
14668 	 * impact on this routine.
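	 *
	 * (A sketch of the intended userland usage, with a hypothetical
	 *  descriptor fd already attached to this stack:
	 *
	 *	int val;
	 *	socklen_t len = sizeof(val);
	 *	getsockopt(fd, IPPROTO_TCP, TCP_RACK_MIN_TO, &val, &len);
	 *
	 *  the value is read into optval while the inpcb lock is held, the
	 *  lock is dropped, and the single int is copied out with
	 *  sooptcopyout().)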
14669 */ 14670 error = 0; 14671 switch (sopt->sopt_name) { 14672 case TCP_RACK_PROFILE: 14673 /* You cannot retrieve a profile, its write only */ 14674 error = EINVAL; 14675 break; 14676 case TCP_RACK_PACE_TO_FILL: 14677 optval = rack->rc_pace_to_cwnd; 14678 break; 14679 case TCP_RACK_NO_PUSH_AT_MAX: 14680 optval = rack->r_ctl.rc_no_push_at_mrtt; 14681 break; 14682 case TCP_SHARED_CWND_ENABLE: 14683 optval = rack->rack_enable_scwnd; 14684 break; 14685 case TCP_RACK_NONRXT_CFG_RATE: 14686 optval = rack->rack_rec_nonrxt_use_cr; 14687 break; 14688 case TCP_NO_PRR: 14689 optval = rack->rack_no_prr; 14690 break; 14691 case TCP_RACK_DO_DETECTION: 14692 optval = rack->do_detection; 14693 break; 14694 case TCP_RACK_MBUF_QUEUE: 14695 /* Now do we use the LRO mbuf-queue feature */ 14696 optval = rack->r_mbuf_queue; 14697 break; 14698 case TCP_TIMELY_DYN_ADJ: 14699 optval = rack->rc_gp_dyn_mul; 14700 break; 14701 case TCP_BBR_IWINTSO: 14702 optval = rack->rc_init_win; 14703 break; 14704 case TCP_RACK_PROP_RATE: 14705 optval = rack->r_ctl.rc_prop_rate; 14706 break; 14707 case TCP_RACK_PROP: 14708 /* RACK proportional rate reduction (bool) */ 14709 optval = rack->r_ctl.rc_prop_reduce; 14710 break; 14711 case TCP_RACK_TLP_REDUCE: 14712 /* RACK TLP cwnd reduction (bool) */ 14713 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 14714 break; 14715 case TCP_RACK_EARLY_RECOV: 14716 /* Should recovery happen early (bool) */ 14717 optval = rack->r_ctl.rc_early_recovery; 14718 break; 14719 case TCP_RACK_PACE_REDUCE: 14720 /* RACK Hptsi reduction factor (divisor) */ 14721 error = EINVAL; 14722 break; 14723 case TCP_BBR_RACK_INIT_RATE: 14724 val = rack->r_ctl.init_rate; 14725 /* convert to kbits per sec */ 14726 val *= 8; 14727 val /= 1000; 14728 optval = (uint32_t)val; 14729 break; 14730 case TCP_RACK_FORCE_MSEG: 14731 optval = rack->rc_force_max_seg; 14732 break; 14733 case TCP_RACK_PACE_MAX_SEG: 14734 /* Max segments in a pace */ 14735 optval = rack->rc_user_set_max_segs; 14736 break; 14737 case TCP_RACK_PACE_ALWAYS: 14738 /* Use the always pace method */ 14739 optval = rack->rc_always_pace; 14740 break; 14741 case TCP_RACK_PRR_SENDALOT: 14742 /* Allow PRR to send more than one seg */ 14743 optval = rack->r_ctl.rc_prr_sendalot; 14744 break; 14745 case TCP_RACK_MIN_TO: 14746 /* Minimum time between rack t-o's in ms */ 14747 optval = rack->r_ctl.rc_min_to; 14748 break; 14749 case TCP_RACK_EARLY_SEG: 14750 /* If early recovery max segments */ 14751 optval = rack->r_ctl.rc_early_recovery_segs; 14752 break; 14753 case TCP_RACK_REORD_THRESH: 14754 /* RACK reorder threshold (shift amount) */ 14755 optval = rack->r_ctl.rc_reorder_shift; 14756 break; 14757 case TCP_RACK_REORD_FADE: 14758 /* Does reordering fade after ms time */ 14759 optval = rack->r_ctl.rc_reorder_fade; 14760 break; 14761 case TCP_BBR_USE_RACK_RR: 14762 /* Do we use the rack cheat for rxt */ 14763 optval = rack->use_rack_rr; 14764 break; 14765 case TCP_RACK_RR_CONF: 14766 optval = rack->r_rr_config; 14767 break; 14768 case TCP_BBR_HDWR_PACE: 14769 optval = rack->rack_hdw_pace_ena; 14770 break; 14771 case TCP_RACK_TLP_THRESH: 14772 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14773 optval = rack->r_ctl.rc_tlp_threshold; 14774 break; 14775 case TCP_RACK_PKT_DELAY: 14776 /* RACK added ms i.e. 
rack-rtt + reord + N */
14777 		optval = rack->r_ctl.rc_pkt_delay;
14778 		break;
14779 	case TCP_RACK_TLP_USE:
14780 		optval = rack->rack_tlp_threshold_use;
14781 		break;
14782 	case TCP_RACK_TLP_INC_VAR:
14783 		/* Does TLP include rtt variance in t-o */
14784 		error = EINVAL;
14785 		break;
14786 	case TCP_RACK_IDLE_REDUCE_HIGH:
14787 		error = EINVAL;
14788 		break;
14789 	case TCP_RACK_PACE_RATE_CA:
14790 		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
14791 		break;
14792 	case TCP_RACK_PACE_RATE_SS:
14793 		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
14794 		break;
14795 	case TCP_RACK_PACE_RATE_REC:
14796 		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
14797 		break;
14798 	case TCP_RACK_GP_INCREASE_SS:
14799 		optval = rack->r_ctl.rack_per_of_gp_ss;
14800 		break;
14801 	case TCP_RACK_GP_INCREASE_CA:
14802 		optval = rack->r_ctl.rack_per_of_gp_ca;
14803 		break;
14804 	case TCP_BBR_RACK_RTT_USE:
14805 		optval = rack->r_ctl.rc_rate_sample_method;
14806 		break;
14807 	case TCP_DELACK:
14808 		optval = tp->t_delayed_ack;
14809 		break;
14810 	case TCP_DATA_AFTER_CLOSE:
14811 		optval = rack->rc_allow_data_af_clo;
14812 		break;
14813 	case TCP_SHARED_CWND_TIME_LIMIT:
14814 		optval = rack->r_limit_scw;
14815 		break;
14816 	default:
14817 		return (tcp_default_ctloutput(so, sopt, inp, tp));
14818 		break;
14819 	}
14820 	INP_WUNLOCK(inp);
14821 	if (error == 0) {
14822 		error = sooptcopyout(sopt, &optval, sizeof optval);
14823 	}
14824 	return (error);
14825 }
14826 
14827 static int
14828 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
14829 {
14830 	int32_t error = EINVAL;
14831 	struct tcp_rack *rack;
14832 
14833 	rack = (struct tcp_rack *)tp->t_fb_ptr;
14834 	if (rack == NULL) {
14835 		/* Huh? */
14836 		goto out;
14837 	}
14838 	if (sopt->sopt_dir == SOPT_SET) {
14839 		return (rack_set_sockopt(so, sopt, inp, tp, rack));
14840 	} else if (sopt->sopt_dir == SOPT_GET) {
14841 		return (rack_get_sockopt(so, sopt, inp, tp, rack));
14842 	}
14843 out:
14844 	INP_WUNLOCK(inp);
14845 	return (error);
14846 }
14847 
14848 static int
14849 rack_pru_options(struct tcpcb *tp, int flags)
14850 {
14851 	if (flags & PRUS_OOB)
14852 		return (EOPNOTSUPP);
14853 	return (0);
14854 }
14855 
14856 static struct tcp_function_block __tcp_rack = {
14857 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
14858 	.tfb_tcp_output = rack_output,
14859 	.tfb_do_queued_segments = ctf_do_queued_segments,
14860 	.tfb_do_segment_nounlock = rack_do_segment_nounlock,
14861 	.tfb_tcp_do_segment = rack_do_segment,
14862 	.tfb_tcp_ctloutput = rack_ctloutput,
14863 	.tfb_tcp_fb_init = rack_init,
14864 	.tfb_tcp_fb_fini = rack_fini,
14865 	.tfb_tcp_timer_stop_all = rack_stopall,
14866 	.tfb_tcp_timer_activate = rack_timer_activate,
14867 	.tfb_tcp_timer_active = rack_timer_active,
14868 	.tfb_tcp_timer_stop = rack_timer_stop,
14869 	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
14870 	.tfb_tcp_handoff_ok = rack_handoff_ok,
14871 	.tfb_pru_options = rack_pru_options,
14872 };
14873 
14874 static const char *rack_stack_names[] = {
14875 	__XSTRING(STACKNAME),
14876 #ifdef STACKALIAS
14877 	__XSTRING(STACKALIAS),
14878 #endif
14879 };
14880 
14881 static int
14882 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
14883 {
14884 	memset(mem, 0, size);
14885 	return (0);
14886 }
14887 
14888 static void
14889 rack_dtor(void *mem, int32_t size, void *arg)
14890 {
14891 
14892 }
14893 
14894 static bool rack_mod_inited = false;
14895 
14896 static int
14897 tcp_addrack(module_t mod, int32_t type, void *data)
14898 {
14899 	int32_t err = 0;
14900 	int num_stacks;
14901 
14902 	switch (type) {
14903 	case
MOD_LOAD: 14904 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 14905 sizeof(struct rack_sendmap), 14906 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 14907 14908 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 14909 sizeof(struct tcp_rack), 14910 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 14911 14912 sysctl_ctx_init(&rack_sysctl_ctx); 14913 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 14914 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 14915 OID_AUTO, 14916 #ifdef STACKALIAS 14917 __XSTRING(STACKALIAS), 14918 #else 14919 __XSTRING(STACKNAME), 14920 #endif 14921 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 14922 ""); 14923 if (rack_sysctl_root == NULL) { 14924 printf("Failed to add sysctl node\n"); 14925 err = EFAULT; 14926 goto free_uma; 14927 } 14928 rack_init_sysctls(); 14929 num_stacks = nitems(rack_stack_names); 14930 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 14931 rack_stack_names, &num_stacks); 14932 if (err) { 14933 printf("Failed to register %s stack name for " 14934 "%s module\n", rack_stack_names[num_stacks], 14935 __XSTRING(MODNAME)); 14936 sysctl_ctx_free(&rack_sysctl_ctx); 14937 free_uma: 14938 uma_zdestroy(rack_zone); 14939 uma_zdestroy(rack_pcb_zone); 14940 rack_counter_destroy(); 14941 printf("Failed to register rack module -- err:%d\n", err); 14942 return (err); 14943 } 14944 tcp_lro_reg_mbufq(); 14945 rack_mod_inited = true; 14946 break; 14947 case MOD_QUIESCE: 14948 err = deregister_tcp_functions(&__tcp_rack, true, false); 14949 break; 14950 case MOD_UNLOAD: 14951 err = deregister_tcp_functions(&__tcp_rack, false, true); 14952 if (err == EBUSY) 14953 break; 14954 if (rack_mod_inited) { 14955 uma_zdestroy(rack_zone); 14956 uma_zdestroy(rack_pcb_zone); 14957 sysctl_ctx_free(&rack_sysctl_ctx); 14958 rack_counter_destroy(); 14959 rack_mod_inited = false; 14960 } 14961 tcp_lro_dereg_mbufq(); 14962 err = 0; 14963 break; 14964 default: 14965 return (EOPNOTSUPP); 14966 } 14967 return (err); 14968 } 14969 14970 static moduledata_t tcp_rack = { 14971 .name = __XSTRING(MODNAME), 14972 .evhand = tcp_addrack, 14973 .priv = 0 14974 }; 14975 14976 MODULE_VERSION(MODNAME, 1); 14977 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 14978 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 14979
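/*
 * (Module lifecycle, summarized from the handler above: MOD_LOAD creates
 *  the sendmap and pcb UMA zones, attaches the stack's sysctl tree,
 *  registers the function block under STACKNAME/STACKALIAS and enables
 *  LRO mbuf queueing; MOD_QUIESCE and MOD_UNLOAD deregister the stack
 *  and, once no connections still use it, tear the same state down.)
 */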