/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES /* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h> /* required for icmp_var.h */
#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) which
 *   stops using the number of dup acks and instead
 *   uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling that state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 */
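/*
 * Illustrative sketch only (the identifiers here are generic, not this
 * stack's own): the time-based loss inference described above amounts to
 * a check of the form
 *
 *     if (now - segment_send_time > smoothed_rtt + reorder_window)
 *             mark the segment lost and schedule a retransmission;
 *
 * rather than the classic three-duplicate-ACK trigger.
 */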
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS time betweens */
static int32_t rack_hw_rate_caps = 1; /* 1; */
static int32_t rack_hw_rate_min = 0; /* 1500000;*/
static int32_t rack_hw_rate_to_low = 0; /* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000; /* Minimum rack timeout in microseconds */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3; /* Bit field: bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000; /* 250ms in usecs */
static int32_t rack_persist_max = 2000000; /* 2 seconds in usecs */
static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0; /* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20; /* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */

/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
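/*
 * That figure is simply the RTO doubling across the 12 retransmissions:
 * 30 ms * (1 + 2 + 4 + ... + 2048) = 30 ms * 4095 = 122.85 seconds.
 */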
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000; /* 10ms */
static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8; /* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250; /* 250% slow-start */
static uint16_t rack_per_of_gp_ca = 200; /* 200% congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200; /* 200% of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2; /* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to b/w */
/* i.e. the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */
static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */
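/*
 * For example, with the defaults above a new goodput/bandwidth sample that
 * lies within +2% / -4% of the previous estimate is treated as "no change";
 * only samples outside that band drive the multiplier up or down.
 */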
static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20; /* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2; /* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */
static int32_t rack_timely_max_push_drop = 3; /* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
static uint32_t rack_trace_point_config = 0;
static uint32_t rack_trace_point_bb_mode = 4;
static int32_t rack_trace_point_count = 0;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_to_tot;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_collapsed_win_seen;
counter_u64_t rack_collapsed_win_rxt;
counter_u64_t rack_collapsed_win_rxt_bytes;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;

counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + slop;					\
	if ((u_long)(tv) < (u_long)(tvmin))			\
		(tv) = (tvmin);					\
	if ((u_long)(tv) > (u_long)(tvmax))			\
		(tv) = (tvmax);					\
} while (0)

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp,
    uint32_t type, uint32_t ack, int);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct inpcb *inp, struct sockopt *sopt);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);

static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
    struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line, uint8_t quality);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
    struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);

static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);

int32_t rack_clear_counter = 0;

static inline void
rack_trace_point(struct tcp_rack *rack, int num)
{
	if (((rack_trace_point_config == num) ||
	     (rack_trace_point_config == 0xffffffff)) &&
	    (rack_trace_point_bb_mode != 0) &&
	    (rack_trace_point_count > 0) &&
	    (rack->rc_tp->t_logstate == 0)) {
		int res;

		res = atomic_fetchadd_int(&rack_trace_point_count, -1);
		if (res > 0) {
			rack->rc_tp->t_logstate = rack_trace_point_bb_mode;
		} else {
			/* Lost a race, assure it's zero now */
			rack_trace_point_count = 0;
		}
	}
}

static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct newreno old, *ptr;
	struct tcpcb *tp;
	int error;

	if (rack->rc_pacing_cc_set)
		return;

	tp = rack->rc_tp;
	if (tp->t_cc == NULL) {
		/* Tcb is leaving */
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno, we can't play games with beta! */
		goto out;
	}
	ptr = ((struct newreno *)tp->t_ccv.cc_data);
	if (CC_ALGO(tp)->ctl_output == NULL) {
		/* Huh, why does new_reno no longer have a set function? */
		goto out;
	}
	if (ptr == NULL) {
		/* Just the default values */
		old.beta = V_newreno_beta;
		old.beta_ecn = V_newreno_beta_ecn;
		old.newreno_flags = 0;
	} else {
		old.beta = ptr->beta;
		old.beta_ecn = ptr->beta_ecn;
		old.newreno_flags = ptr->newreno_flags;
	}
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta.beta;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		goto out;
	}
	/*
	 * Hack alert: we need to set in our newreno_flags
	 * so that Abe behavior is also applied.
	 */
	((struct newreno *)tp->t_ccv.cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		goto out;
	}
	/* Save off the original values for restoral */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->t_ccv.cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		if (ptr) {
			log.u_bbr.flex1 = ptr->beta;
			log.u_bbr.flex2 = ptr->beta_ecn;
			log.u_bbr.flex3 = ptr->newreno_flags;
		}
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 3;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	struct newreno old, *ptr;
	struct tcpcb *tp;

	if (rack->rc_pacing_cc_set == 0)
		return;
	tp = rack->rc_tp;
	rack->rc_pacing_cc_set = 0;
	if (tp->t_cc == NULL)
		/* Tcb is leaving */
		return;
	if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno, nothing to do! */
		return;
	}
	ptr = ((struct newreno *)tp->t_ccv.cc_data);
	if (ptr == NULL) {
		/*
		 * This happens at rack_fini() if the
		 * cc module gets freed on us. In that
		 * case we lose our "new" settings but
		 * that's ok, since the tcb is going away anyway.
		 */
		return;
	}
	/* Grab out our set values */
	memcpy(&old, ptr, sizeof(struct newreno));
	/* Copy back in the original values */
	memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
	/* Now save back the values we had set in (for when pacing is restored) */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->t_ccv.cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 4;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
	/* Keep in mind that t_maxpeakrate is in B/s. */
	uint64_t peak;

	peak = uqmax((tp->t_maxseg * 2),
	    (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
	tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
}
#endif

static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enobuf_hw);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_persists_sends);
		counter_u64_zero(rack_persists_acks);
		counter_u64_zero(rack_persists_loss);
		counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
		counter_u64_zero(rack_extended_rfo);
		counter_u64_zero(rack_hw_pace_init_fail);
		counter_u64_zero(rack_hw_pace_lost);
		counter_u64_zero(rack_non_fto_send);
		counter_u64_zero(rack_nfto_resend);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		counter_u64_zero(rack_multi_single_eq);
		counter_u64_zero(rack_proc_non_comp_ack);
		counter_u64_zero(rack_sack_attacks_detected);
		counter_u64_zero(rack_sack_attacks_reversed);
		counter_u64_zero(rack_sack_used_next_merge);
		counter_u64_zero(rack_sack_used_prev_merge);
		counter_u64_zero(rack_sack_splits);
		counter_u64_zero(rack_sack_skipped_acked);
		counter_u64_zero(rack_ack_total);
		counter_u64_zero(rack_express_sack);
		counter_u64_zero(rack_sack_total);
		counter_u64_zero(rack_move_none);
		counter_u64_zero(rack_move_some);
		counter_u64_zero(rack_try_scwnd);
		counter_u64_zero(rack_collapsed_win);
		counter_u64_zero(rack_collapsed_win_rxt);
		counter_u64_zero(rack_collapsed_win_seen);
		counter_u64_zero(rack_collapsed_win_rxt_bytes);
	}
	rack_clear_counter = 0;
	return (0);
}

static void
rack_init_sysctls(void)
{
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_attack;
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_features;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;
	struct sysctl_oid *rack_hw_pacing;
	struct sysctl_oid *rack_tracepoint;

	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "sack_attack",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Sack Attack Counters and Controls");
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low ");
	/* Probe rtt related controls */
	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "ProbeRTT related Controls");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
	    &rack_atexit_prtt_hbp, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
	    &rack_atexit_prtt, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
	    &rack_per_of_gp_probertt, 60,
	    "What percentage of goodput do we pace at in probertt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
	    &rack_per_of_gp_probertt_reduce, 10,
	    "What percentage of goodput do we reduce every gp_srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
	    &rack_per_of_gp_lowthresh, 40,
	    "What percentage of goodput do we allow the multiplier to fall to");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "time_between", CTLFLAG_RW,
	    &rack_time_between_probertt, 96000000,
	    "How many useconds between the lowest rtt falling must pass before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
of fraction)"); 905 SYSCTL_ADD_U32(&rack_sysctl_ctx, 906 SYSCTL_CHILDREN(rack_probertt), 907 OID_AUTO, "length_mul", CTLFLAG_RW, 908 &rack_probertt_gpsrtt_cnt_mul, 0, 909 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 910 SYSCTL_ADD_U32(&rack_sysctl_ctx, 911 SYSCTL_CHILDREN(rack_probertt), 912 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 913 &rack_min_probertt_hold, 200000, 914 "What is the minimum time we hold probertt at target"); 915 SYSCTL_ADD_U32(&rack_sysctl_ctx, 916 SYSCTL_CHILDREN(rack_probertt), 917 OID_AUTO, "filter_life", CTLFLAG_RW, 918 &rack_probertt_filter_life, 10000000, 919 "What is the time for the filters life in useconds"); 920 SYSCTL_ADD_U32(&rack_sysctl_ctx, 921 SYSCTL_CHILDREN(rack_probertt), 922 OID_AUTO, "lower_within", CTLFLAG_RW, 923 &rack_probertt_lower_within, 10, 924 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 925 SYSCTL_ADD_U32(&rack_sysctl_ctx, 926 SYSCTL_CHILDREN(rack_probertt), 927 OID_AUTO, "must_move", CTLFLAG_RW, 928 &rack_min_rtt_movement, 250, 929 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 930 SYSCTL_ADD_U32(&rack_sysctl_ctx, 931 SYSCTL_CHILDREN(rack_probertt), 932 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 933 &rack_probertt_clear_is, 1, 934 "Do we clear I/S counts on exiting probe-rtt"); 935 SYSCTL_ADD_S32(&rack_sysctl_ctx, 936 SYSCTL_CHILDREN(rack_probertt), 937 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 938 &rack_max_drain_hbp, 1, 939 "How many extra drain gpsrtt's do we get in highly buffered paths"); 940 SYSCTL_ADD_S32(&rack_sysctl_ctx, 941 SYSCTL_CHILDREN(rack_probertt), 942 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 943 &rack_hbp_thresh, 3, 944 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 945 946 rack_tracepoint = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 947 SYSCTL_CHILDREN(rack_sysctl_root), 948 OID_AUTO, 949 "tp", 950 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 951 "Rack tracepoint facility"); 952 SYSCTL_ADD_U32(&rack_sysctl_ctx, 953 SYSCTL_CHILDREN(rack_tracepoint), 954 OID_AUTO, "number", CTLFLAG_RW, 955 &rack_trace_point_config, 0, 956 "What is the trace point number to activate (0=none, 0xffffffff = all)?"); 957 SYSCTL_ADD_U32(&rack_sysctl_ctx, 958 SYSCTL_CHILDREN(rack_tracepoint), 959 OID_AUTO, "bbmode", CTLFLAG_RW, 960 &rack_trace_point_bb_mode, 4, 961 "What is BB logging mode that is activated?"); 962 SYSCTL_ADD_S32(&rack_sysctl_ctx, 963 SYSCTL_CHILDREN(rack_tracepoint), 964 OID_AUTO, "count", CTLFLAG_RW, 965 &rack_trace_point_count, 0, 966 "How many connections will have BB logging turned on that hit the tracepoint?"); 967 /* Pacing related sysctls */ 968 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 969 SYSCTL_CHILDREN(rack_sysctl_root), 970 OID_AUTO, 971 "pacing", 972 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 973 "Pacing related Controls"); 974 SYSCTL_ADD_S32(&rack_sysctl_ctx, 975 SYSCTL_CHILDREN(rack_pacing), 976 OID_AUTO, "max_pace_over", CTLFLAG_RW, 977 &rack_max_per_above, 30, 978 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 979 SYSCTL_ADD_S32(&rack_sysctl_ctx, 980 SYSCTL_CHILDREN(rack_pacing), 981 OID_AUTO, "pace_to_one", CTLFLAG_RW, 982 &rack_pace_one_seg, 0, 983 "Do we allow low b/w pacing of 1MSS instead of two"); 984 SYSCTL_ADD_S32(&rack_sysctl_ctx, 985 SYSCTL_CHILDREN(rack_pacing), 986 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 987 &rack_limit_time_with_srtt, 0, 988 "Do we limit pacing time based on srtt"); 989 SYSCTL_ADD_S32(&rack_sysctl_ctx, 990 
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ss", CTLFLAG_RW,
	    &rack_per_of_gp_ss, 250,
	    "If non zero, what percentage of goodput to pace at in slow start");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ca", CTLFLAG_RW,
	    &rack_per_of_gp_ca, 150,
	    "If non zero, what percentage of goodput to pace at in congestion avoidance");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_rec", CTLFLAG_RW,
	    &rack_per_of_gp_rec, 200,
	    "If non zero, what percentage of goodput to pace at in recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_max_seg", CTLFLAG_RW,
	    &rack_hptsi_segments, 40,
	    "What size is the max for TSO segments in pacing and burst mitigation");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "burst_reduces", CTLFLAG_RW,
	    &rack_slot_reduction, 4,
	    "When doing only burst mitigation what is the reduce divisor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "use_pacing", CTLFLAG_RW,
	    &rack_pace_every_seg, 0,
	    "If set we use pacing, if clear we use only the original burst mitigation");
	SYSCTL_ADD_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_bw_rate_cap, 0,
	    "If set we apply this value to the absolute rate cap used by pacing");
	SYSCTL_ADD_U8(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
	    &rack_req_measurements, 1,
	    "If doing dynamic pacing, how many measurements must be in before we start pacing?");
	/* Hardware pacing */
	rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "hdwr_pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Hardware Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rwnd_factor", CTLFLAG_RW,
	    &rack_hw_rwnd_factor, 2,
	    "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
	    &rack_enobuf_hw_boost_mult, 2,
	    "By how many time_betweens should we boost the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
	    &rack_enobuf_hw_max, 2,
	    "What is the max boost of the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
	    &rack_enobuf_hw_min, 2,
	    "What is the min boost of the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "enable", CTLFLAG_RW,
	    &rack_enable_hw_pacing, 0,
	    "Should RACK attempt to use hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_hw_rate_caps, 1,
	    "Does the highest hardware pacing rate cap the rate we will send at?");
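	/*
	 * Usage note: the OIDs registered in this function hang off
	 * rack_sysctl_root, so assuming the stack registers its root under
	 * net.inet.tcp (e.g. as net.inet.tcp.rack), the knobs above can be
	 * inspected or tuned at run time with sysctl(8), for example:
	 *
	 *	sysctl net.inet.tcp.rack.hdwr_pacing.enable=1
	 *	sysctl net.inet.tcp.rack.hdwr_pacing.rate_cap
	 */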
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_min", CTLFLAG_RW,
	    &rack_hw_rate_min, 0,
	    "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_to_low", CTLFLAG_RW,
	    &rack_hw_rate_to_low, 0,
	    "If we fall below this rate, dis-engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "up_only", CTLFLAG_RW,
	    &rack_hw_up_only, 1,
	    "Do we allow hw pacing to lower the rate selected?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
	    &rack_hw_pace_extra_slots, 2,
	    "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
	rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timely",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Timely RTT Controls");
	/* Timely based GP dynamics */
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upper", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_up, 2,
	    "Rack timely upper range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lower", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_down, 4,
	    "Rack timely lower range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
	    &rack_gp_rtt_maxmul, 3,
	    "Rack timely multiplier of lowest rtt for rtt_max");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_div", CTLFLAG_RW,
	    &rack_gp_rtt_mindiv, 4,
	    "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
	    &rack_gp_rtt_minmul, 1,
	    "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "decrease", CTLFLAG_RW,
	    &rack_gp_decrease_per, 20,
	    "Rack timely decrease percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "increase", CTLFLAG_RW,
	    &rack_gp_increase_per, 2,
	    "Rack timely increase percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lowerbound", CTLFLAG_RW,
	    &rack_per_lower_bound, 50,
	    "Rack timely lowest percentage we allow GP multiplier to fall to");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundss", CTLFLAG_RW,
	    &rack_per_upper_bound_ss, 0,
	    "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundca", CTLFLAG_RW,
	    &rack_per_upper_bound_ca, 0,
	    "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dynamicgp", CTLFLAG_RW,
	    &rack_do_dyn_mul, 0,
	    "Rack timely do we enable dynamic timely goodput by default");
goodput by default"); 1153 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1154 SYSCTL_CHILDREN(rack_timely), 1155 OID_AUTO, "no_rec_red", CTLFLAG_RW, 1156 &rack_gp_no_rec_chg, 1, 1157 "Rack timely do we prohibit the recovery multiplier from being lowered"); 1158 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1159 SYSCTL_CHILDREN(rack_timely), 1160 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 1161 &rack_timely_dec_clear, 6, 1162 "Rack timely what threshold do we count to before another boost during b/w decent"); 1163 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1164 SYSCTL_CHILDREN(rack_timely), 1165 OID_AUTO, "max_push_rise", CTLFLAG_RW, 1166 &rack_timely_max_push_rise, 3, 1167 "Rack timely how many times do we push up with b/w increase"); 1168 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1169 SYSCTL_CHILDREN(rack_timely), 1170 OID_AUTO, "max_push_drop", CTLFLAG_RW, 1171 &rack_timely_max_push_drop, 3, 1172 "Rack timely how many times do we push back on b/w decent"); 1173 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1174 SYSCTL_CHILDREN(rack_timely), 1175 OID_AUTO, "min_segs", CTLFLAG_RW, 1176 &rack_timely_min_segs, 4, 1177 "Rack timely when setting the cwnd what is the min num segments"); 1178 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1179 SYSCTL_CHILDREN(rack_timely), 1180 OID_AUTO, "noback_max", CTLFLAG_RW, 1181 &rack_use_max_for_nobackoff, 0, 1182 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 1183 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1184 SYSCTL_CHILDREN(rack_timely), 1185 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 1186 &rack_timely_int_timely_only, 0, 1187 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 1188 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1189 SYSCTL_CHILDREN(rack_timely), 1190 OID_AUTO, "nonstop", CTLFLAG_RW, 1191 &rack_timely_no_stopping, 0, 1192 "Rack timely don't stop increase"); 1193 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1194 SYSCTL_CHILDREN(rack_timely), 1195 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1196 &rack_down_raise_thresh, 100, 1197 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1198 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1199 SYSCTL_CHILDREN(rack_timely), 1200 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1201 &rack_req_segs, 1, 1202 "Bottom dragging if not these many segments outstanding and room"); 1203 1204 /* TLP and Rack related parameters */ 1205 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1206 SYSCTL_CHILDREN(rack_sysctl_root), 1207 OID_AUTO, 1208 "tlp", 1209 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1210 "TLP and Rack related Controls"); 1211 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1212 SYSCTL_CHILDREN(rack_tlp), 1213 OID_AUTO, "use_rrr", CTLFLAG_RW, 1214 &use_rack_rr, 1, 1215 "Do we use Rack Rapid Recovery"); 1216 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1217 SYSCTL_CHILDREN(rack_tlp), 1218 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1219 &rack_max_abc_post_recovery, 2, 1220 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1221 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1222 SYSCTL_CHILDREN(rack_tlp), 1223 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1224 &rack_non_rxt_use_cr, 0, 1225 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1226 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1227 SYSCTL_CHILDREN(rack_tlp), 1228 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1229 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1230 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1231 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1232 SYSCTL_CHILDREN(rack_tlp), 1233 OID_AUTO, "limit", CTLFLAG_RW, 1234 &rack_tlp_limit, 2, 1235 "How many TLP's 
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_greater", CTLFLAG_RW,
	    &rack_tlp_use_greater, 1,
	    "Should we use the rack_rtt time if it's greater than srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpminto", CTLFLAG_RW,
	    &rack_tlp_min, 10000,
	    "TLP minimum timeout per the specification (in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "send_oldest", CTLFLAG_RW,
	    &rack_always_send_oldest, 0,
	    "Should we always send the oldest TLP and RACK-TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
	    &rack_limited_retran, 0,
	    "How many times can a rack timeout drive out sends");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
	    &rack_lower_cwnd_at_tlp, 0,
	    "When a TLP completes a retran should we enter recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
	    &rack_reorder_thresh, 2,
	    "What factor for rack will be added when seeing reordering (shift right)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
	    &rack_tlp_thresh, 1,
	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
	    &rack_reorder_fade, 60000000,
	    "Does reorder detection fade, if so how many microseconds (0 means never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "pktdelay", CTLFLAG_RW,
	    &rack_pkt_delay, 1000,
	    "Extra RACK time (in microseconds) besides reordering thresh");

	/* Timer related controls */
	rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timers",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Timer related controls");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmin", CTLFLAG_RW,
	    &rack_persist_min, 250000,
	    "What is the minimum time in microseconds between persists");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmax", CTLFLAG_RW,
	    &rack_persist_max, 2000000,
	    "What is the largest delay in microseconds between persists");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
	    &rack_delayed_ack_time, 40000,
	    "Delayed ack time (40ms in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minrto", CTLFLAG_RW,
	    &rack_rto_min, 30000,
	    "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "maxrto", CTLFLAG_RW,
	    &rack_rto_max, 4000000,
	    "Maximum RTO in microseconds -- should be at least as large as min_rto");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minto", CTLFLAG_RW,
	    &rack_min_to, 1000,
	    "Minimum rack timeout in microseconds");
	/* Measure controls */
	rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "measure",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Measure related controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "wma_divisor", CTLFLAG_RW,
	    &rack_wma_divisor, 8,
	    "When doing b/w calculation what is the divisor for the WMA");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "end_cwnd", CTLFLAG_RW,
	    &rack_cwnd_block_ends_measure, 0,
	    "Does a cwnd just-return end the measurement window (app limited)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "end_rwnd", CTLFLAG_RW,
	    &rack_rwnd_block_ends_measure, 0,
	    "Does an rwnd just-return end the measurement window (app limited -- not persists)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "min_target", CTLFLAG_RW,
	    &rack_def_data_window, 20,
	    "What is the minimum target window (in mss) for GP measurements");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "goal_bdp", CTLFLAG_RW,
	    &rack_goal_bdp, 2,
	    "What is the goal BDP to measure");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "min_srtts", CTLFLAG_RW,
	    &rack_min_srtts, 1,
	    "What is the minimum number of srtt's a measurement must span");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "min_measure_tim", CTLFLAG_RW,
	    &rack_min_measure_usec, 0,
	    "What is the minimum time for a measurement, if 0, this is off");
	/* Features */
	rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "features",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Feature controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_features),
	    OID_AUTO, "cmpack", CTLFLAG_RW,
	    &rack_use_cmp_acks, 1,
	    "Should RACK have LRO send compressed acks");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_features),
	    OID_AUTO, "fsb", CTLFLAG_RW,
	    &rack_use_fsb, 1,
	    "Should RACK use the fast send block?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_features),
	    OID_AUTO, "rfo", CTLFLAG_RW,
	    &rack_use_rfo, 1,
	    "Should RACK use rack_fast_output()?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_features),
	    OID_AUTO, "rsmrfo", CTLFLAG_RW,
	    &rack_use_rsm_rfo, 1,
	    "Should RACK use rack_fast_rsm_output()?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_features),
	    OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
	    &rack_enable_mqueue_for_nonpaced, 0,
	    "Should RACK use mbuf queuing for non-paced connections");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_features),
	    OID_AUTO, "hystartplusplus", CTLFLAG_RW,
	    &rack_do_hystart, 0,
	    "Should RACK enable HyStart++ on connections?");
	/* Misc rack controls */
	rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "misc",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Misc related controls");
#ifdef TCP_ACCOUNTING
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_misc),
	    OID_AUTO, "tcp_acct", CTLFLAG_RW,
	    &rack_tcp_accounting, 0,
	    "Should we turn on TCP accounting for all rack sessions?");
#endif
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_misc),
	    OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
	    &rack_apply_rtt_with_reduced_conf, 0,
	    "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
&rack_apply_rtt_with_reduced_conf, 0, 1416 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1417 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1418 SYSCTL_CHILDREN(rack_misc), 1419 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1420 &rack_dsack_std_based, 3, 1421 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1422 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1423 SYSCTL_CHILDREN(rack_misc), 1424 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1425 &rack_prr_addbackmax, 2, 1426 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1427 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1428 SYSCTL_CHILDREN(rack_misc), 1429 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1430 &rack_stats_gets_ms_rtt, 1, 1431 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1432 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1433 SYSCTL_CHILDREN(rack_misc), 1434 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1435 &rack_client_low_buf, 0, 1436 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1437 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1438 SYSCTL_CHILDREN(rack_misc), 1439 OID_AUTO, "defprofile", CTLFLAG_RW, 1440 &rack_def_profile, 0, 1441 "Should RACK use a default profile (0=no, num == profile num)?"); 1442 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1443 SYSCTL_CHILDREN(rack_misc), 1444 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1445 &rack_enable_shared_cwnd, 1, 1446 "Should RACK try to use the shared cwnd on connections where allowed"); 1447 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1448 SYSCTL_CHILDREN(rack_misc), 1449 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1450 &rack_limits_scwnd, 1, 1451 "Should RACK place low end time limits on the shared cwnd feature"); 1452 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1453 SYSCTL_CHILDREN(rack_misc), 1454 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1455 &rack_use_imac_dack, 0, 1456 "Should RACK try to emulate iMac delayed ack"); 1457 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1458 SYSCTL_CHILDREN(rack_misc), 1459 OID_AUTO, "no_prr", CTLFLAG_RW, 1460 &rack_disable_prr, 0, 1461 "Should RACK not use prr and only pace (must have pacing on)"); 1462 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1463 SYSCTL_CHILDREN(rack_misc), 1464 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1465 &rack_verbose_logging, 0, 1466 "Should RACK black box logging be verbose"); 1467 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1468 SYSCTL_CHILDREN(rack_misc), 1469 OID_AUTO, "data_after_close", CTLFLAG_RW, 1470 &rack_ignore_data_after_close, 1, 1471 "Do we hold off sending a RST until all pending data is ack'd"); 1472 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1473 SYSCTL_CHILDREN(rack_misc), 1474 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1475 &rack_sack_not_required, 1, 1476 "Do we allow rack to run on connections not supporting SACK"); 1477 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1478 SYSCTL_CHILDREN(rack_misc), 1479 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1480 &rack_send_a_lot_in_prr, 1, 1481 "Send a lot in prr"); 1482 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1483 SYSCTL_CHILDREN(rack_misc), 1484 OID_AUTO, "autoscale", CTLFLAG_RW, 1485 &rack_autosndbuf_inc, 20, 1486 "What percentage should rack scale up its snd buffer by?"); 1487 /* Sack Attacker detection stuff */ 1488 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1489 SYSCTL_CHILDREN(rack_attack), 1490 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1491 &rack_highest_sack_thresh_seen, 0, 1492 "Highest sack to ack ratio seen"); 1493 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1494 SYSCTL_CHILDREN(rack_attack), 1495 OID_AUTO, 
"detect_highmoveratio", CTLFLAG_RW, 1496 &rack_highest_move_thresh_seen, 0, 1497 "Highest move to non-move ratio seen"); 1498 rack_ack_total = counter_u64_alloc(M_WAITOK); 1499 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1500 SYSCTL_CHILDREN(rack_attack), 1501 OID_AUTO, "acktotal", CTLFLAG_RD, 1502 &rack_ack_total, 1503 "Total number of Ack's"); 1504 rack_express_sack = counter_u64_alloc(M_WAITOK); 1505 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1506 SYSCTL_CHILDREN(rack_attack), 1507 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1508 &rack_express_sack, 1509 "Total expresss number of Sack's"); 1510 rack_sack_total = counter_u64_alloc(M_WAITOK); 1511 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1512 SYSCTL_CHILDREN(rack_attack), 1513 OID_AUTO, "sacktotal", CTLFLAG_RD, 1514 &rack_sack_total, 1515 "Total number of SACKs"); 1516 rack_move_none = counter_u64_alloc(M_WAITOK); 1517 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1518 SYSCTL_CHILDREN(rack_attack), 1519 OID_AUTO, "move_none", CTLFLAG_RD, 1520 &rack_move_none, 1521 "Total number of SACK index reuse of positions under threshold"); 1522 rack_move_some = counter_u64_alloc(M_WAITOK); 1523 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1524 SYSCTL_CHILDREN(rack_attack), 1525 OID_AUTO, "move_some", CTLFLAG_RD, 1526 &rack_move_some, 1527 "Total number of SACK index reuse of positions over threshold"); 1528 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1529 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1530 SYSCTL_CHILDREN(rack_attack), 1531 OID_AUTO, "attacks", CTLFLAG_RD, 1532 &rack_sack_attacks_detected, 1533 "Total number of SACK attackers that had sack disabled"); 1534 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1535 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1536 SYSCTL_CHILDREN(rack_attack), 1537 OID_AUTO, "reversed", CTLFLAG_RD, 1538 &rack_sack_attacks_reversed, 1539 "Total number of SACK attackers that were later determined false positive"); 1540 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1541 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1542 SYSCTL_CHILDREN(rack_attack), 1543 OID_AUTO, "nextmerge", CTLFLAG_RD, 1544 &rack_sack_used_next_merge, 1545 "Total number of times we used the next merge"); 1546 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1547 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1548 SYSCTL_CHILDREN(rack_attack), 1549 OID_AUTO, "prevmerge", CTLFLAG_RD, 1550 &rack_sack_used_prev_merge, 1551 "Total number of times we used the prev merge"); 1552 /* Counters */ 1553 rack_fto_send = counter_u64_alloc(M_WAITOK); 1554 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1555 SYSCTL_CHILDREN(rack_counters), 1556 OID_AUTO, "fto_send", CTLFLAG_RD, 1557 &rack_fto_send, "Total number of rack_fast_output sends"); 1558 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1559 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1560 SYSCTL_CHILDREN(rack_counters), 1561 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1562 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1563 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1564 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1565 SYSCTL_CHILDREN(rack_counters), 1566 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1567 &rack_nfto_resend, "Total number of rack_output retransmissions"); 1568 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1569 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1570 SYSCTL_CHILDREN(rack_counters), 1571 OID_AUTO, "nfto_send", CTLFLAG_RD, 1572 &rack_non_fto_send, "Total number of rack_output first sends"); 1573 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1574 
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "rfo_extended", CTLFLAG_RD,
	    &rack_extended_rfo, "Total number of times we extended rfo");

	rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
	    &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
	rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);

	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "hwpace_lost", CTLFLAG_RD,
	    &rack_hw_pace_lost, "Total number of times we lost our hw pacing rate after it was set up");
	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
	    &rack_tlp_tot,
	    "Total number of tail loss probe expirations");
	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_new", CTLFLAG_RD,
	    &rack_tlp_newdata,
	    "Total number of tail loss probes sending new data");
	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
	    &rack_tlp_retran,
	    "Total number of tail loss probes sending retransmitted data");
	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
	    &rack_tlp_retran_bytes,
	    "Total bytes of retransmitted data sent by tail loss probes");
	rack_to_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
	    &rack_to_tot,
	    "Total number of times the rack timeout expired");
	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
	    &rack_saw_enobuf,
	    "Total number of times a send returned enobuf for non-hdwr paced connections");
	rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
	    &rack_saw_enobuf_hw,
	    "Total number of times a send returned enobuf for hdwr paced connections");
	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
	    &rack_saw_enetunreach,
	    "Total number of times a send received an enetunreachable");
	rack_hot_alloc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "alloc_hot", CTLFLAG_RD,
	    &rack_hot_alloc,
	    "Total allocations from the top of our list");
	rack_to_alloc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "allocs", CTLFLAG_RD,
	    &rack_to_alloc,
	    "Total allocations of tracking structures");
	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "allochard",
CTLFLAG_RD, 1654 &rack_to_alloc_hard, 1655 "Total allocations done with sleeping the hard way"); 1656 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1657 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1658 SYSCTL_CHILDREN(rack_counters), 1659 OID_AUTO, "allocemerg", CTLFLAG_RD, 1660 &rack_to_alloc_emerg, 1661 "Total allocations done from emergency cache"); 1662 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1663 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1664 SYSCTL_CHILDREN(rack_counters), 1665 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1666 &rack_to_alloc_limited, 1667 "Total allocations dropped due to limit"); 1668 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1669 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1670 SYSCTL_CHILDREN(rack_counters), 1671 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1672 &rack_alloc_limited_conns, 1673 "Connections with allocations dropped due to limit"); 1674 rack_split_limited = counter_u64_alloc(M_WAITOK); 1675 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1676 SYSCTL_CHILDREN(rack_counters), 1677 OID_AUTO, "split_limited", CTLFLAG_RD, 1678 &rack_split_limited, 1679 "Split allocations dropped due to limit"); 1680 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1681 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1682 SYSCTL_CHILDREN(rack_counters), 1683 OID_AUTO, "persist_sends", CTLFLAG_RD, 1684 &rack_persists_sends, 1685 "Number of times we sent a persist probe"); 1686 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1687 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1688 SYSCTL_CHILDREN(rack_counters), 1689 OID_AUTO, "persist_acks", CTLFLAG_RD, 1690 &rack_persists_acks, 1691 "Number of times a persist probe was acked"); 1692 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1693 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1694 SYSCTL_CHILDREN(rack_counters), 1695 OID_AUTO, "persist_loss", CTLFLAG_RD, 1696 &rack_persists_loss, 1697 "Number of times we detected a lost persist probe (no ack)"); 1698 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1699 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1700 SYSCTL_CHILDREN(rack_counters), 1701 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1702 &rack_persists_lost_ends, 1703 "Number of lost persist probe (no ack) that the run ended with a PERSIST abort"); 1704 #ifdef INVARIANTS 1705 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1706 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1707 SYSCTL_CHILDREN(rack_counters), 1708 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1709 &rack_adjust_map_bw, 1710 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1711 #endif 1712 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1713 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1714 SYSCTL_CHILDREN(rack_counters), 1715 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1716 &rack_multi_single_eq, 1717 "Number of compressed acks total represented"); 1718 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1719 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1720 SYSCTL_CHILDREN(rack_counters), 1721 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1722 &rack_proc_non_comp_ack, 1723 "Number of non compresseds acks that we processed"); 1724 1725 1726 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1727 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1728 SYSCTL_CHILDREN(rack_counters), 1729 OID_AUTO, "sack_long", CTLFLAG_RD, 1730 &rack_sack_proc_all, 1731 "Total times we had to walk whole list for sack processing"); 1732 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1733 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1734 SYSCTL_CHILDREN(rack_counters), 
1735 OID_AUTO, "sack_restart", CTLFLAG_RD, 1736 &rack_sack_proc_restart, 1737 "Total times we had to walk whole list due to a restart"); 1738 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1739 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1740 SYSCTL_CHILDREN(rack_counters), 1741 OID_AUTO, "sack_short", CTLFLAG_RD, 1742 &rack_sack_proc_short, 1743 "Total times we took shortcut for sack processing"); 1744 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1745 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1746 SYSCTL_CHILDREN(rack_attack), 1747 OID_AUTO, "skipacked", CTLFLAG_RD, 1748 &rack_sack_skipped_acked, 1749 "Total number of times we skipped previously sacked"); 1750 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1751 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1752 SYSCTL_CHILDREN(rack_attack), 1753 OID_AUTO, "ofsplit", CTLFLAG_RD, 1754 &rack_sack_splits, 1755 "Total number of times we did the old fashion tree split"); 1756 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1757 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1758 SYSCTL_CHILDREN(rack_counters), 1759 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1760 &rack_input_idle_reduces, 1761 "Total number of idle reductions on input"); 1762 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK); 1763 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1764 SYSCTL_CHILDREN(rack_counters), 1765 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD, 1766 &rack_collapsed_win_seen, 1767 "Total number of collapsed window events seen (where our window shrinks)"); 1768 1769 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1770 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1771 SYSCTL_CHILDREN(rack_counters), 1772 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1773 &rack_collapsed_win, 1774 "Total number of collapsed window events where we mark packets"); 1775 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK); 1776 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1777 SYSCTL_CHILDREN(rack_counters), 1778 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD, 1779 &rack_collapsed_win_rxt, 1780 "Total number of packets that were retransmitted"); 1781 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK); 1782 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1783 SYSCTL_CHILDREN(rack_counters), 1784 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD, 1785 &rack_collapsed_win_rxt_bytes, 1786 "Total number of bytes that were retransmitted"); 1787 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1788 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1789 SYSCTL_CHILDREN(rack_counters), 1790 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1791 &rack_try_scwnd, 1792 "Total number of scwnd attempts"); 1793 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1794 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1795 OID_AUTO, "outsize", CTLFLAG_RD, 1796 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1797 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1798 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1799 OID_AUTO, "opts", CTLFLAG_RD, 1800 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1801 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1802 SYSCTL_CHILDREN(rack_sysctl_root), 1803 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1804 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1805 } 1806 1807 static __inline int 1808 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1809 { 1810 if (SEQ_GEQ(b->r_start, a->r_start) && 1811 SEQ_LT(b->r_start, a->r_end)) { 1812 /* 1813 * The entry b is within the 
1814 * block a. i.e.: 1815 * a -- |-------------| 1816 * b -- |----| 1817 * <or> 1818 * b -- |------| 1819 * <or> 1820 * b -- |-----------| 1821 */ 1822 return (0); 1823 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1824 /* 1825 * b falls as either the next 1826 * sequence block after a so a 1827 * is said to be smaller than b. 1828 * i.e: 1829 * a -- |------| 1830 * b -- |--------| 1831 * or 1832 * b -- |-----| 1833 */ 1834 return (1); 1835 } 1836 /* 1837 * Whats left is where a is 1838 * larger than b. i.e: 1839 * a -- |-------| 1840 * b -- |---| 1841 * or even possibly 1842 * b -- |--------------| 1843 */ 1844 return (-1); 1845 } 1846 1847 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1848 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1849 1850 static uint32_t 1851 rc_init_window(struct tcp_rack *rack) 1852 { 1853 uint32_t win; 1854 1855 if (rack->rc_init_win == 0) { 1856 /* 1857 * Nothing set by the user, use the system stack 1858 * default. 1859 */ 1860 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1861 } 1862 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1863 return (win); 1864 } 1865 1866 static uint64_t 1867 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1868 { 1869 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1870 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1871 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1872 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1873 else 1874 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1875 } 1876 1877 static uint64_t 1878 rack_get_bw(struct tcp_rack *rack) 1879 { 1880 if (rack->use_fixed_rate) { 1881 /* Return the fixed pacing rate */ 1882 return (rack_get_fixed_pacing_bw(rack)); 1883 } 1884 if (rack->r_ctl.gp_bw == 0) { 1885 /* 1886 * We have yet no b/w measurement, 1887 * if we have a user set initial bw 1888 * return it. If we don't have that and 1889 * we have an srtt, use the tcp IW (10) to 1890 * calculate a fictional b/w over the SRTT 1891 * which is more or less a guess. Note 1892 * we don't use our IW from rack on purpose 1893 * so if we have like IW=30, we are not 1894 * calculating a "huge" b/w. 1895 */ 1896 uint64_t bw, srtt; 1897 if (rack->r_ctl.init_rate) 1898 return (rack->r_ctl.init_rate); 1899 1900 /* Has the user set a max peak rate? */ 1901 #ifdef NETFLIX_PEAKRATE 1902 if (rack->rc_tp->t_maxpeakrate) 1903 return (rack->rc_tp->t_maxpeakrate); 1904 #endif 1905 /* Ok lets come up with the IW guess, if we have a srtt */ 1906 if (rack->rc_tp->t_srtt == 0) { 1907 /* 1908 * Go with old pacing method 1909 * i.e. burst mitigation only. 1910 */ 1911 return (0); 1912 } 1913 /* Ok lets get the initial TCP win (not racks) */ 1914 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1915 srtt = (uint64_t)rack->rc_tp->t_srtt; 1916 bw *= (uint64_t)USECS_IN_SECOND; 1917 bw /= srtt; 1918 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 1919 bw = rack->r_ctl.bw_rate_cap; 1920 return (bw); 1921 } else { 1922 uint64_t bw; 1923 1924 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 1925 /* Averaging is done, we can return the value */ 1926 bw = rack->r_ctl.gp_bw; 1927 } else { 1928 /* Still doing initial average must calculate */ 1929 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements; 1930 } 1931 #ifdef NETFLIX_PEAKRATE 1932 if ((rack->rc_tp->t_maxpeakrate) && 1933 (bw > rack->rc_tp->t_maxpeakrate)) { 1934 /* The user has set a peak rate to pace at 1935 * don't allow us to pace faster than that. 
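			 *
			 * (Illustrative numbers only: if the averaged gp_bw would
			 * have come out at 5,000,000 bytes/sec but t_maxpeakrate is
			 * 2,500,000 bytes/sec, 2,500,000 is returned right here and
			 * the bw_rate_cap check further below is never reached.)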
1936 */ 1937 return (rack->rc_tp->t_maxpeakrate); 1938 } 1939 #endif 1940 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 1941 bw = rack->r_ctl.bw_rate_cap; 1942 return (bw); 1943 } 1944 } 1945 1946 static uint16_t 1947 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1948 { 1949 if (rack->use_fixed_rate) { 1950 return (100); 1951 } else if (rack->in_probe_rtt && (rsm == NULL)) 1952 return (rack->r_ctl.rack_per_of_gp_probertt); 1953 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 1954 rack->r_ctl.rack_per_of_gp_rec)) { 1955 if (rsm) { 1956 /* a retransmission always use the recovery rate */ 1957 return (rack->r_ctl.rack_per_of_gp_rec); 1958 } else if (rack->rack_rec_nonrxt_use_cr) { 1959 /* Directed to use the configured rate */ 1960 goto configured_rate; 1961 } else if (rack->rack_no_prr && 1962 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1963 /* No PRR, lets just use the b/w estimate only */ 1964 return (100); 1965 } else { 1966 /* 1967 * Here we may have a non-retransmit but we 1968 * have no overrides, so just use the recovery 1969 * rate (prr is in effect). 1970 */ 1971 return (rack->r_ctl.rack_per_of_gp_rec); 1972 } 1973 } 1974 configured_rate: 1975 /* For the configured rate we look at our cwnd vs the ssthresh */ 1976 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1977 return (rack->r_ctl.rack_per_of_gp_ss); 1978 else 1979 return (rack->r_ctl.rack_per_of_gp_ca); 1980 } 1981 1982 static void 1983 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 1984 { 1985 /* 1986 * Types of logs (mod value) 1987 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 1988 * 2 = a dsack round begins, persist is reset to 16. 1989 * 3 = a dsack round ends 1990 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 1991 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 1992 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 
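	 *
	 * In every case the mod value itself lands in flex8 below, so a
	 * post-processing tool reading these black box records can tell
	 * the event types apart from that one field.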
1993 */ 1994 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1995 union tcp_log_stackspecific log; 1996 struct timeval tv; 1997 1998 memset(&log, 0, sizeof(log)); 1999 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2000 log.u_bbr.flex1 <<= 1; 2001 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2002 log.u_bbr.flex1 <<= 1; 2003 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2004 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2005 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2006 log.u_bbr.flex4 = flex4; 2007 log.u_bbr.flex5 = flex5; 2008 log.u_bbr.flex6 = flex6; 2009 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2010 log.u_bbr.flex8 = mod; 2011 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2012 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2013 &rack->rc_inp->inp_socket->so_rcv, 2014 &rack->rc_inp->inp_socket->so_snd, 2015 RACK_DSACK_HANDLING, 0, 2016 0, &log, false, &tv); 2017 } 2018 } 2019 2020 static void 2021 rack_log_hdwr_pacing(struct tcp_rack *rack, 2022 uint64_t rate, uint64_t hw_rate, int line, 2023 int error, uint16_t mod) 2024 { 2025 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2026 union tcp_log_stackspecific log; 2027 struct timeval tv; 2028 const struct ifnet *ifp; 2029 2030 memset(&log, 0, sizeof(log)); 2031 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2032 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2033 if (rack->r_ctl.crte) { 2034 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2035 } else if (rack->rc_inp->inp_route.ro_nh && 2036 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2037 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2038 } else 2039 ifp = NULL; 2040 if (ifp) { 2041 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2042 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2043 } 2044 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2045 log.u_bbr.bw_inuse = rate; 2046 log.u_bbr.flex5 = line; 2047 log.u_bbr.flex6 = error; 2048 log.u_bbr.flex7 = mod; 2049 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2050 log.u_bbr.flex8 = rack->use_fixed_rate; 2051 log.u_bbr.flex8 <<= 1; 2052 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2053 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2054 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2055 if (rack->r_ctl.crte) 2056 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2057 else 2058 log.u_bbr.cur_del_rate = 0; 2059 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2060 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2061 &rack->rc_inp->inp_socket->so_rcv, 2062 &rack->rc_inp->inp_socket->so_snd, 2063 BBR_LOG_HDWR_PACE, 0, 2064 0, &log, false, &tv); 2065 } 2066 } 2067 2068 static uint64_t 2069 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2070 { 2071 /* 2072 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 
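	 *
	 * A worked example with made-up numbers: if rack_get_output_gain()
	 * returns 150 (i.e. 150%) and the passed-in b/w estimate is
	 * 1,000,000 bytes/sec, the code below computes
	 * bw_est = 1,000,000 * 150 / 100 = 1,500,000 bytes/sec, then floors
	 * it at RACK_MIN_BW and possibly caps it against the hardware
	 * rate table.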
	 */
	uint64_t bw_est, high_rate;
	uint64_t gain;

	gain = (uint64_t)rack_get_output_gain(rack, rsm);
	bw_est = bw * gain;
	bw_est /= (uint64_t)100;
	/* Never fall below the minimum (def 64kbps) */
	if (bw_est < RACK_MIN_BW)
		bw_est = RACK_MIN_BW;
	if (rack->r_rack_hw_rate_caps) {
		/* Rate caps are in place */
		if (rack->r_ctl.crte != NULL) {
			/* We have a hdwr rate already */
			high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
			if (bw_est >= high_rate) {
				/* We are capping bw at the highest rate table entry */
				rack_log_hdwr_pacing(rack,
				    bw_est, high_rate, __LINE__,
				    0, 3);
				bw_est = high_rate;
				if (capped)
					*capped = 1;
			}
		} else if ((rack->rack_hdrw_pacing == 0) &&
			   (rack->rack_hdw_pace_ena) &&
			   (rack->rack_attempt_hdwr_pace == 0) &&
			   (rack->rc_inp->inp_route.ro_nh != NULL) &&
			   (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
			/*
			 * Special case: we have not yet attempted hardware
			 * pacing, but we may, and when we do we need to find
			 * out if we are above the highest rate. We need to know
			 * the maxbw for the interface in question (if it supports
			 * ratelimiting). We get back a 0 if the interface is not
			 * found in the RL lists.
			 */
			high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
			if (high_rate) {
				/* Yep, we have a rate; is it above this rate? */
				if (bw_est > high_rate) {
					bw_est = high_rate;
					if (capped)
						*capped = 1;
				}
			}
		}
	}
	return (bw_est);
}

static void
rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		if ((mod != 1) && (rack_verbose_logging == 0)) {
			/*
			 * We get 3 values currently for mod
			 * 1 - We are retransmitting and this tells the reason.
			 * 2 - We are clearing a dup-ack count.
			 * 3 - We are incrementing a dup-ack count.
			 *
			 * The clear/increment are only logged
			 * if you have BBverbose on.
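			 *
			 * (Verbose logging is the rack "misc.bb_verbose" sysctl,
			 * i.e. the rack_verbose_logging knob registered with the
			 * other rack sysctls above.)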
2139 */ 2140 return; 2141 } 2142 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2143 log.u_bbr.flex1 = tsused; 2144 log.u_bbr.flex2 = thresh; 2145 log.u_bbr.flex3 = rsm->r_flags; 2146 log.u_bbr.flex4 = rsm->r_dupack; 2147 log.u_bbr.flex5 = rsm->r_start; 2148 log.u_bbr.flex6 = rsm->r_end; 2149 log.u_bbr.flex8 = mod; 2150 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2151 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2152 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2153 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2154 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2155 log.u_bbr.pacing_gain = rack->r_must_retran; 2156 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2157 &rack->rc_inp->inp_socket->so_rcv, 2158 &rack->rc_inp->inp_socket->so_snd, 2159 BBR_LOG_SETTINGS_CHG, 0, 2160 0, &log, false, &tv); 2161 } 2162 } 2163 2164 static void 2165 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2166 { 2167 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2168 union tcp_log_stackspecific log; 2169 struct timeval tv; 2170 2171 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2172 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2173 log.u_bbr.flex2 = to; 2174 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2175 log.u_bbr.flex4 = slot; 2176 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 2177 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2178 log.u_bbr.flex7 = rack->rc_in_persist; 2179 log.u_bbr.flex8 = which; 2180 if (rack->rack_no_prr) 2181 log.u_bbr.pkts_out = 0; 2182 else 2183 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2184 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2185 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2186 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2187 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2188 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2189 log.u_bbr.pacing_gain = rack->r_must_retran; 2190 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 2191 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2192 log.u_bbr.lost = rack_rto_min; 2193 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2194 &rack->rc_inp->inp_socket->so_rcv, 2195 &rack->rc_inp->inp_socket->so_snd, 2196 BBR_LOG_TIMERSTAR, 0, 2197 0, &log, false, &tv); 2198 } 2199 } 2200 2201 static void 2202 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2203 { 2204 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2205 union tcp_log_stackspecific log; 2206 struct timeval tv; 2207 2208 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2209 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2210 log.u_bbr.flex8 = to_num; 2211 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2212 log.u_bbr.flex2 = rack->rc_rack_rtt; 2213 if (rsm == NULL) 2214 log.u_bbr.flex3 = 0; 2215 else 2216 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2217 if (rack->rack_no_prr) 2218 log.u_bbr.flex5 = 0; 2219 else 2220 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2221 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2222 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2223 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2224 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2225 log.u_bbr.pacing_gain = rack->r_must_retran; 2226 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2227 &rack->rc_inp->inp_socket->so_rcv, 2228 &rack->rc_inp->inp_socket->so_snd, 2229 BBR_LOG_RTO, 0, 2230 0, &log, false, &tv); 2231 } 2232 } 2233 2234 static void 2235 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2236 struct rack_sendmap *prev, 2237 struct rack_sendmap *rsm, 2238 
struct rack_sendmap *next, 2239 int flag, uint32_t th_ack, int line) 2240 { 2241 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2242 union tcp_log_stackspecific log; 2243 struct timeval tv; 2244 2245 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2246 log.u_bbr.flex8 = flag; 2247 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2248 log.u_bbr.cur_del_rate = (uint64_t)prev; 2249 log.u_bbr.delRate = (uint64_t)rsm; 2250 log.u_bbr.rttProp = (uint64_t)next; 2251 log.u_bbr.flex7 = 0; 2252 if (prev) { 2253 log.u_bbr.flex1 = prev->r_start; 2254 log.u_bbr.flex2 = prev->r_end; 2255 log.u_bbr.flex7 |= 0x4; 2256 } 2257 if (rsm) { 2258 log.u_bbr.flex3 = rsm->r_start; 2259 log.u_bbr.flex4 = rsm->r_end; 2260 log.u_bbr.flex7 |= 0x2; 2261 } 2262 if (next) { 2263 log.u_bbr.flex5 = next->r_start; 2264 log.u_bbr.flex6 = next->r_end; 2265 log.u_bbr.flex7 |= 0x1; 2266 } 2267 log.u_bbr.applimited = line; 2268 log.u_bbr.pkts_out = th_ack; 2269 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2270 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2271 if (rack->rack_no_prr) 2272 log.u_bbr.lost = 0; 2273 else 2274 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2275 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2276 &rack->rc_inp->inp_socket->so_rcv, 2277 &rack->rc_inp->inp_socket->so_snd, 2278 TCP_LOG_MAPCHG, 0, 2279 0, &log, false, &tv); 2280 } 2281 } 2282 2283 static void 2284 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2285 struct rack_sendmap *rsm, int conf) 2286 { 2287 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2288 union tcp_log_stackspecific log; 2289 struct timeval tv; 2290 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2291 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2292 log.u_bbr.flex1 = t; 2293 log.u_bbr.flex2 = len; 2294 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2295 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2296 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2297 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2298 log.u_bbr.flex7 = conf; 2299 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2300 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2301 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2302 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2303 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2304 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2305 if (rsm) { 2306 log.u_bbr.pkt_epoch = rsm->r_start; 2307 log.u_bbr.lost = rsm->r_end; 2308 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2309 /* We loose any upper of the 24 bits */ 2310 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2311 } else { 2312 /* Its a SYN */ 2313 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2314 log.u_bbr.lost = 0; 2315 log.u_bbr.cwnd_gain = 0; 2316 log.u_bbr.pacing_gain = 0; 2317 } 2318 /* Write out general bits of interest rrs here */ 2319 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2320 log.u_bbr.use_lt_bw <<= 1; 2321 log.u_bbr.use_lt_bw |= rack->forced_ack; 2322 log.u_bbr.use_lt_bw <<= 1; 2323 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2324 log.u_bbr.use_lt_bw <<= 1; 2325 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2326 log.u_bbr.use_lt_bw <<= 1; 2327 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2328 log.u_bbr.use_lt_bw <<= 1; 2329 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2330 log.u_bbr.use_lt_bw <<= 1; 2331 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2332 log.u_bbr.use_lt_bw <<= 1; 2333 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2334 log.u_bbr.applimited = 
rack->r_ctl.rc_target_probertt_flight; 2335 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2336 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2337 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2338 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2339 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2340 log.u_bbr.bw_inuse <<= 32; 2341 if (rsm) 2342 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2343 TCP_LOG_EVENTP(tp, NULL, 2344 &rack->rc_inp->inp_socket->so_rcv, 2345 &rack->rc_inp->inp_socket->so_snd, 2346 BBR_LOG_BBRRTT, 0, 2347 0, &log, false, &tv); 2348 2349 2350 } 2351 } 2352 2353 static void 2354 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2355 { 2356 /* 2357 * Log the rtt sample we are 2358 * applying to the srtt algorithm in 2359 * useconds. 2360 */ 2361 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2362 union tcp_log_stackspecific log; 2363 struct timeval tv; 2364 2365 /* Convert our ms to a microsecond */ 2366 memset(&log, 0, sizeof(log)); 2367 log.u_bbr.flex1 = rtt; 2368 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2369 log.u_bbr.flex3 = rack->r_ctl.sack_count; 2370 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2371 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 2372 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2373 log.u_bbr.flex7 = 1; 2374 log.u_bbr.flex8 = rack->sack_attack_disable; 2375 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2376 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2377 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2378 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2379 log.u_bbr.pacing_gain = rack->r_must_retran; 2380 /* 2381 * We capture in delRate the upper 32 bits as 2382 * the confidence level we had declared, and the 2383 * lower 32 bits as the actual RTT using the arrival 2384 * timestamp. 
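		 *
		 * A consumer of these records can therefore recover the pair as
		 *   confidence = delRate >> 32;
		 *   rtt_us     = delRate & 0xffffffff;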
2385 */ 2386 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2387 log.u_bbr.delRate <<= 32; 2388 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2389 /* Lets capture all the things that make up t_rtxcur */ 2390 log.u_bbr.applimited = rack_rto_min; 2391 log.u_bbr.epoch = rack_rto_max; 2392 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2393 log.u_bbr.lost = rack_rto_min; 2394 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2395 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2396 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2397 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2398 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2399 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2400 &rack->rc_inp->inp_socket->so_rcv, 2401 &rack->rc_inp->inp_socket->so_snd, 2402 TCP_LOG_RTT, 0, 2403 0, &log, false, &tv); 2404 } 2405 } 2406 2407 static void 2408 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2409 { 2410 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2411 union tcp_log_stackspecific log; 2412 struct timeval tv; 2413 2414 /* Convert our ms to a microsecond */ 2415 memset(&log, 0, sizeof(log)); 2416 log.u_bbr.flex1 = rtt; 2417 log.u_bbr.flex2 = send_time; 2418 log.u_bbr.flex3 = ack_time; 2419 log.u_bbr.flex4 = where; 2420 log.u_bbr.flex7 = 2; 2421 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2422 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2423 &rack->rc_inp->inp_socket->so_rcv, 2424 &rack->rc_inp->inp_socket->so_snd, 2425 TCP_LOG_RTT, 0, 2426 0, &log, false, &tv); 2427 } 2428 } 2429 2430 2431 2432 static inline void 2433 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2434 { 2435 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2436 union tcp_log_stackspecific log; 2437 struct timeval tv; 2438 2439 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2440 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2441 log.u_bbr.flex1 = line; 2442 log.u_bbr.flex2 = tick; 2443 log.u_bbr.flex3 = tp->t_maxunacktime; 2444 log.u_bbr.flex4 = tp->t_acktime; 2445 log.u_bbr.flex8 = event; 2446 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2447 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2448 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2449 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2450 log.u_bbr.pacing_gain = rack->r_must_retran; 2451 TCP_LOG_EVENTP(tp, NULL, 2452 &rack->rc_inp->inp_socket->so_rcv, 2453 &rack->rc_inp->inp_socket->so_snd, 2454 BBR_LOG_PROGRESS, 0, 2455 0, &log, false, &tv); 2456 } 2457 } 2458 2459 static void 2460 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 2461 { 2462 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2463 union tcp_log_stackspecific log; 2464 2465 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2466 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2467 log.u_bbr.flex1 = slot; 2468 if (rack->rack_no_prr) 2469 log.u_bbr.flex2 = 0; 2470 else 2471 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 2472 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 2473 log.u_bbr.flex8 = rack->rc_in_persist; 2474 log.u_bbr.timeStamp = cts; 2475 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2476 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2477 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2478 log.u_bbr.pacing_gain = rack->r_must_retran; 2479 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2480 &rack->rc_inp->inp_socket->so_rcv, 
2481 &rack->rc_inp->inp_socket->so_snd, 2482 BBR_LOG_BBRSND, 0, 2483 0, &log, false, tv); 2484 } 2485 } 2486 2487 static void 2488 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 2489 { 2490 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2491 union tcp_log_stackspecific log; 2492 struct timeval tv; 2493 2494 memset(&log, 0, sizeof(log)); 2495 log.u_bbr.flex1 = did_out; 2496 log.u_bbr.flex2 = nxt_pkt; 2497 log.u_bbr.flex3 = way_out; 2498 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2499 if (rack->rack_no_prr) 2500 log.u_bbr.flex5 = 0; 2501 else 2502 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2503 log.u_bbr.flex6 = nsegs; 2504 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 2505 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 2506 log.u_bbr.flex7 <<= 1; 2507 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 2508 log.u_bbr.flex7 <<= 1; 2509 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 2510 log.u_bbr.flex8 = rack->rc_in_persist; 2511 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2512 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2513 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2514 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2515 log.u_bbr.use_lt_bw <<= 1; 2516 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2517 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2518 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2519 log.u_bbr.pacing_gain = rack->r_must_retran; 2520 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2521 &rack->rc_inp->inp_socket->so_rcv, 2522 &rack->rc_inp->inp_socket->so_snd, 2523 BBR_LOG_DOSEG_DONE, 0, 2524 0, &log, false, &tv); 2525 } 2526 } 2527 2528 static void 2529 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 2530 { 2531 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2532 union tcp_log_stackspecific log; 2533 struct timeval tv; 2534 2535 memset(&log, 0, sizeof(log)); 2536 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 2537 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 2538 log.u_bbr.flex4 = arg1; 2539 log.u_bbr.flex5 = arg2; 2540 log.u_bbr.flex6 = arg3; 2541 log.u_bbr.flex8 = frm; 2542 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2543 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2544 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2545 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 2546 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2547 log.u_bbr.pacing_gain = rack->r_must_retran; 2548 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, 2549 &tptosocket(tp)->so_snd, 2550 TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); 2551 } 2552 } 2553 2554 static void 2555 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 2556 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 2557 { 2558 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2559 union tcp_log_stackspecific log; 2560 struct timeval tv; 2561 2562 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2563 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2564 log.u_bbr.flex1 = slot; 2565 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 2566 log.u_bbr.flex4 = reason; 2567 if (rack->rack_no_prr) 2568 log.u_bbr.flex5 = 0; 2569 else 2570 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2571 log.u_bbr.flex7 = hpts_calling; 2572 log.u_bbr.flex8 = rack->rc_in_persist; 2573 log.u_bbr.lt_epoch = cwnd_to_use; 2574 log.u_bbr.timeStamp = 
tcp_get_usecs(&tv); 2575 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2576 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2577 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2578 log.u_bbr.pacing_gain = rack->r_must_retran; 2579 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 2580 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2581 &rack->rc_inp->inp_socket->so_rcv, 2582 &rack->rc_inp->inp_socket->so_snd, 2583 BBR_LOG_JUSTRET, 0, 2584 tlen, &log, false, &tv); 2585 } 2586 } 2587 2588 static void 2589 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 2590 struct timeval *tv, uint32_t flags_on_entry) 2591 { 2592 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2593 union tcp_log_stackspecific log; 2594 2595 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2596 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2597 log.u_bbr.flex1 = line; 2598 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 2599 log.u_bbr.flex3 = flags_on_entry; 2600 log.u_bbr.flex4 = us_cts; 2601 if (rack->rack_no_prr) 2602 log.u_bbr.flex5 = 0; 2603 else 2604 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2605 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2606 log.u_bbr.flex7 = hpts_removed; 2607 log.u_bbr.flex8 = 1; 2608 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 2609 log.u_bbr.timeStamp = us_cts; 2610 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2611 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2612 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2613 log.u_bbr.pacing_gain = rack->r_must_retran; 2614 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2615 &rack->rc_inp->inp_socket->so_rcv, 2616 &rack->rc_inp->inp_socket->so_snd, 2617 BBR_LOG_TIMERCANC, 0, 2618 0, &log, false, tv); 2619 } 2620 } 2621 2622 static void 2623 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2624 uint32_t flex1, uint32_t flex2, 2625 uint32_t flex3, uint32_t flex4, 2626 uint32_t flex5, uint32_t flex6, 2627 uint16_t flex7, uint8_t mod) 2628 { 2629 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2630 union tcp_log_stackspecific log; 2631 struct timeval tv; 2632 2633 if (mod == 1) { 2634 /* No you can't use 1, its for the real to cancel */ 2635 return; 2636 } 2637 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2638 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2639 log.u_bbr.flex1 = flex1; 2640 log.u_bbr.flex2 = flex2; 2641 log.u_bbr.flex3 = flex3; 2642 log.u_bbr.flex4 = flex4; 2643 log.u_bbr.flex5 = flex5; 2644 log.u_bbr.flex6 = flex6; 2645 log.u_bbr.flex7 = flex7; 2646 log.u_bbr.flex8 = mod; 2647 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2648 &rack->rc_inp->inp_socket->so_rcv, 2649 &rack->rc_inp->inp_socket->so_snd, 2650 BBR_LOG_TIMERCANC, 0, 2651 0, &log, false, &tv); 2652 } 2653 } 2654 2655 static void 2656 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2657 { 2658 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2659 union tcp_log_stackspecific log; 2660 struct timeval tv; 2661 2662 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2663 log.u_bbr.flex1 = timers; 2664 log.u_bbr.flex2 = ret; 2665 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2666 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2667 log.u_bbr.flex5 = cts; 2668 if (rack->rack_no_prr) 2669 log.u_bbr.flex6 = 0; 2670 else 2671 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2672 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2673 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2674 log.u_bbr.pacing_gain = rack->r_must_retran; 2675 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2676 
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2677 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2678 &rack->rc_inp->inp_socket->so_rcv, 2679 &rack->rc_inp->inp_socket->so_snd, 2680 BBR_LOG_TO_PROCESS, 0, 2681 0, &log, false, &tv); 2682 } 2683 } 2684 2685 static void 2686 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) 2687 { 2688 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2689 union tcp_log_stackspecific log; 2690 struct timeval tv; 2691 2692 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2693 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2694 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2695 if (rack->rack_no_prr) 2696 log.u_bbr.flex3 = 0; 2697 else 2698 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2699 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2700 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2701 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2702 log.u_bbr.flex7 = line; 2703 log.u_bbr.flex8 = frm; 2704 log.u_bbr.pkts_out = orig_cwnd; 2705 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2706 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2707 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2708 log.u_bbr.use_lt_bw <<= 1; 2709 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2710 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2711 &rack->rc_inp->inp_socket->so_rcv, 2712 &rack->rc_inp->inp_socket->so_snd, 2713 BBR_LOG_BBRUPD, 0, 2714 0, &log, false, &tv); 2715 } 2716 } 2717 2718 #ifdef NETFLIX_EXP_DETECTION 2719 static void 2720 rack_log_sad(struct tcp_rack *rack, int event) 2721 { 2722 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2723 union tcp_log_stackspecific log; 2724 struct timeval tv; 2725 2726 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2727 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2728 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2729 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2730 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2731 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2732 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2733 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2734 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2735 log.u_bbr.lt_epoch |= rack->do_detection; 2736 log.u_bbr.applimited = tcp_map_minimum; 2737 log.u_bbr.flex7 = rack->sack_attack_disable; 2738 log.u_bbr.flex8 = event; 2739 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2740 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2741 log.u_bbr.delivered = tcp_sad_decay_val; 2742 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2743 &rack->rc_inp->inp_socket->so_rcv, 2744 &rack->rc_inp->inp_socket->so_snd, 2745 TCP_SAD_DETECTION, 0, 2746 0, &log, false, &tv); 2747 } 2748 } 2749 #endif 2750 2751 static void 2752 rack_counter_destroy(void) 2753 { 2754 counter_u64_free(rack_fto_send); 2755 counter_u64_free(rack_fto_rsm_send); 2756 counter_u64_free(rack_nfto_resend); 2757 counter_u64_free(rack_hw_pace_init_fail); 2758 counter_u64_free(rack_hw_pace_lost); 2759 counter_u64_free(rack_non_fto_send); 2760 counter_u64_free(rack_extended_rfo); 2761 counter_u64_free(rack_ack_total); 2762 counter_u64_free(rack_express_sack); 2763 counter_u64_free(rack_sack_total); 2764 counter_u64_free(rack_move_none); 2765 counter_u64_free(rack_move_some); 2766 counter_u64_free(rack_sack_attacks_detected); 2767 counter_u64_free(rack_sack_attacks_reversed); 2768 counter_u64_free(rack_sack_used_next_merge); 2769 counter_u64_free(rack_sack_used_prev_merge); 2770 counter_u64_free(rack_tlp_tot); 2771 counter_u64_free(rack_tlp_newdata); 2772 counter_u64_free(rack_tlp_retran); 2773 
counter_u64_free(rack_tlp_retran_bytes); 2774 counter_u64_free(rack_to_tot); 2775 counter_u64_free(rack_saw_enobuf); 2776 counter_u64_free(rack_saw_enobuf_hw); 2777 counter_u64_free(rack_saw_enetunreach); 2778 counter_u64_free(rack_hot_alloc); 2779 counter_u64_free(rack_to_alloc); 2780 counter_u64_free(rack_to_alloc_hard); 2781 counter_u64_free(rack_to_alloc_emerg); 2782 counter_u64_free(rack_to_alloc_limited); 2783 counter_u64_free(rack_alloc_limited_conns); 2784 counter_u64_free(rack_split_limited); 2785 counter_u64_free(rack_multi_single_eq); 2786 counter_u64_free(rack_proc_non_comp_ack); 2787 counter_u64_free(rack_sack_proc_all); 2788 counter_u64_free(rack_sack_proc_restart); 2789 counter_u64_free(rack_sack_proc_short); 2790 counter_u64_free(rack_sack_skipped_acked); 2791 counter_u64_free(rack_sack_splits); 2792 counter_u64_free(rack_input_idle_reduces); 2793 counter_u64_free(rack_collapsed_win); 2794 counter_u64_free(rack_collapsed_win_rxt); 2795 counter_u64_free(rack_collapsed_win_rxt_bytes); 2796 counter_u64_free(rack_collapsed_win_seen); 2797 counter_u64_free(rack_try_scwnd); 2798 counter_u64_free(rack_persists_sends); 2799 counter_u64_free(rack_persists_acks); 2800 counter_u64_free(rack_persists_loss); 2801 counter_u64_free(rack_persists_lost_ends); 2802 #ifdef INVARIANTS 2803 counter_u64_free(rack_adjust_map_bw); 2804 #endif 2805 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2806 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2807 } 2808 2809 static struct rack_sendmap * 2810 rack_alloc(struct tcp_rack *rack) 2811 { 2812 struct rack_sendmap *rsm; 2813 2814 /* 2815 * First get the top of the list it in 2816 * theory is the "hottest" rsm we have, 2817 * possibly just freed by ack processing. 2818 */ 2819 if (rack->rc_free_cnt > rack_free_cache) { 2820 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2821 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2822 counter_u64_add(rack_hot_alloc, 1); 2823 rack->rc_free_cnt--; 2824 return (rsm); 2825 } 2826 /* 2827 * Once we get under our free cache we probably 2828 * no longer have a "hot" one available. Lets 2829 * get one from UMA. 2830 */ 2831 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2832 if (rsm) { 2833 rack->r_ctl.rc_num_maps_alloced++; 2834 counter_u64_add(rack_to_alloc, 1); 2835 return (rsm); 2836 } 2837 /* 2838 * Dig in to our aux rsm's (the last two) since 2839 * UMA failed to get us one. 
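	 *
	 * (These are entries rack_free() pushed back onto rc_free; rack_alloc()
	 * and rack_free_trim() normally leave rack_free_cache of them in
	 * reserve precisely so this allocation-failure path still has
	 * something to hand out.)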
2840 */ 2841 if (rack->rc_free_cnt) { 2842 counter_u64_add(rack_to_alloc_emerg, 1); 2843 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2844 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2845 rack->rc_free_cnt--; 2846 return (rsm); 2847 } 2848 return (NULL); 2849 } 2850 2851 static struct rack_sendmap * 2852 rack_alloc_full_limit(struct tcp_rack *rack) 2853 { 2854 if ((V_tcp_map_entries_limit > 0) && 2855 (rack->do_detection == 0) && 2856 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2857 counter_u64_add(rack_to_alloc_limited, 1); 2858 if (!rack->alloc_limit_reported) { 2859 rack->alloc_limit_reported = 1; 2860 counter_u64_add(rack_alloc_limited_conns, 1); 2861 } 2862 return (NULL); 2863 } 2864 return (rack_alloc(rack)); 2865 } 2866 2867 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2868 static struct rack_sendmap * 2869 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2870 { 2871 struct rack_sendmap *rsm; 2872 2873 if (limit_type) { 2874 /* currently there is only one limit type */ 2875 if (V_tcp_map_split_limit > 0 && 2876 (rack->do_detection == 0) && 2877 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2878 counter_u64_add(rack_split_limited, 1); 2879 if (!rack->alloc_limit_reported) { 2880 rack->alloc_limit_reported = 1; 2881 counter_u64_add(rack_alloc_limited_conns, 1); 2882 } 2883 return (NULL); 2884 } 2885 } 2886 2887 /* allocate and mark in the limit type, if set */ 2888 rsm = rack_alloc(rack); 2889 if (rsm != NULL && limit_type) { 2890 rsm->r_limit_type = limit_type; 2891 rack->r_ctl.rc_num_split_allocs++; 2892 } 2893 return (rsm); 2894 } 2895 2896 static void 2897 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2898 { 2899 if (rsm->r_flags & RACK_APP_LIMITED) { 2900 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2901 rack->r_ctl.rc_app_limited_cnt--; 2902 } 2903 } 2904 if (rsm->r_limit_type) { 2905 /* currently there is only one limit type */ 2906 rack->r_ctl.rc_num_split_allocs--; 2907 } 2908 if (rsm == rack->r_ctl.rc_first_appl) { 2909 if (rack->r_ctl.rc_app_limited_cnt == 0) 2910 rack->r_ctl.rc_first_appl = NULL; 2911 else { 2912 /* Follow the next one out */ 2913 struct rack_sendmap fe; 2914 2915 fe.r_start = rsm->r_nseq_appl; 2916 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2917 } 2918 } 2919 if (rsm == rack->r_ctl.rc_resend) 2920 rack->r_ctl.rc_resend = NULL; 2921 if (rsm == rack->r_ctl.rc_end_appl) 2922 rack->r_ctl.rc_end_appl = NULL; 2923 if (rack->r_ctl.rc_tlpsend == rsm) 2924 rack->r_ctl.rc_tlpsend = NULL; 2925 if (rack->r_ctl.rc_sacklast == rsm) 2926 rack->r_ctl.rc_sacklast = NULL; 2927 memset(rsm, 0, sizeof(struct rack_sendmap)); 2928 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 2929 rack->rc_free_cnt++; 2930 } 2931 2932 static void 2933 rack_free_trim(struct tcp_rack *rack) 2934 { 2935 struct rack_sendmap *rsm; 2936 2937 /* 2938 * Free up all the tail entries until 2939 * we get our list down to the limit. 
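	 *
	 * (The limit is rack_free_cache; entries above it go back to UMA,
	 * taken from the cold tail of the list so the recently freed "hot"
	 * entries at the head stay available for rack_alloc().)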
	 */
	while (rack->rc_free_cnt > rack_free_cache) {
		rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
		rack->rc_free_cnt--;
		uma_zfree(rack_zone, rsm);
	}
}


static uint32_t
rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
{
	uint64_t srtt, bw, len, tim;
	uint32_t segsiz, def_len, minl;

	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	def_len = rack_def_data_window * segsiz;
	if (rack->rc_gp_filled == 0) {
		/*
		 * We have no measurement (IW is in flight?) so
		 * we can only guess using our data_window sysctl
		 * value (usually 20MSS).
		 */
		return (def_len);
	}
	/*
	 * Now we have a number of factors to consider.
	 *
	 * 1) We have a desired BDP which is usually
	 *    at least 2.
	 * 2) We have a minimum number of rtt's, usually 1 SRTT,
	 *    but we allow it to be more.
	 * 3) We want to make sure a measurement lasts N useconds (if
	 *    we have set rack_min_measure_usec).
	 *
	 * We handle the first concern here by trying to create a data
	 * window of max(rack_def_data_window, DesiredBDP). The
	 * second concern we handle in not letting the measurement
	 * window end normally until at least the required SRTT's
	 * have gone by which is done further below in
	 * rack_enough_for_measurement(). Finally the third concern
	 * we also handle here by calculating how long that time
	 * would take at the current BW and then return the
	 * max of our first calculation and that length. Note
	 * that if rack_min_measure_usec is 0, we don't deal
	 * with concern 3. Also for both concern 1 and 3 an
	 * application limited period could end the measurement
	 * earlier.
	 *
	 * So let's calculate the BDP with the "known" b/w using
	 * the SRTT as our rtt and then multiply it by the
	 * goal.
	 */
	bw = rack_get_bw(rack);
	srtt = (uint64_t)tp->t_srtt;
	len = bw * srtt;
	len /= (uint64_t)HPTS_USEC_IN_SEC;
	len *= max(1, rack_goal_bdp);
	/* Now we need to round up to the nearest MSS */
	len = roundup(len, segsiz);
	if (rack_min_measure_usec) {
		/* Now calculate our min length for this b/w */
		tim = rack_min_measure_usec;
		minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
		if (minl == 0)
			minl = 1;
		minl = roundup(minl, segsiz);
		if (len < minl)
			len = minl;
	}
	/*
	 * Now if we have a very small window we want
	 * to attempt to get the window that is
	 * as small as possible. This happens on
	 * low b/w connections and we don't want to
	 * span huge numbers of rtt's between measurements.
	 *
	 * We basically include 2 over our "MIN window" so
	 * that the measurement can be shortened (possibly) by
	 * an ack'ed packet.
	 */
	if (len < def_len)
		return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
	else
		return (max((uint32_t)len, def_len));

}

static int
rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
{
	uint32_t tim, srtts, segsiz;

	/*
	 * Has enough time passed for the GP measurement to be valid?
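	 *
	 * A sketch of the checks made below: we answer "yes" (and set *quality)
	 * when everything outstanding has been acked, or the ack has reached
	 * the first app-limited send point, or at least rack_min_srtts worth
	 * of the current goodput SRTT has elapsed since the measurement began.
	 * We answer "no" while too few bytes or too little time has accumulated.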
3036 */ 3037 if ((tp->snd_max == tp->snd_una) || 3038 (th_ack == tp->snd_max)){ 3039 /* All is acked */ 3040 *quality = RACK_QUALITY_ALLACKED; 3041 return (1); 3042 } 3043 if (SEQ_LT(th_ack, tp->gput_seq)) { 3044 /* Not enough bytes yet */ 3045 return (0); 3046 } 3047 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3048 if (SEQ_LT(th_ack, tp->gput_ack) && 3049 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3050 /* Not enough bytes yet */ 3051 return (0); 3052 } 3053 if (rack->r_ctl.rc_first_appl && 3054 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3055 /* 3056 * We are up to the app limited send point 3057 * we have to measure irrespective of the time.. 3058 */ 3059 *quality = RACK_QUALITY_APPLIMITED; 3060 return (1); 3061 } 3062 /* Now what about time? */ 3063 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3064 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3065 if (tim >= srtts) { 3066 *quality = RACK_QUALITY_HIGH; 3067 return (1); 3068 } 3069 /* Nope not even a full SRTT has passed */ 3070 return (0); 3071 } 3072 3073 static void 3074 rack_log_timely(struct tcp_rack *rack, 3075 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3076 uint64_t up_bnd, int line, uint8_t method) 3077 { 3078 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3079 union tcp_log_stackspecific log; 3080 struct timeval tv; 3081 3082 memset(&log, 0, sizeof(log)); 3083 log.u_bbr.flex1 = logged; 3084 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3085 log.u_bbr.flex2 <<= 4; 3086 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3087 log.u_bbr.flex2 <<= 4; 3088 log.u_bbr.flex2 |= rack->rc_gp_incr; 3089 log.u_bbr.flex2 <<= 4; 3090 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3091 log.u_bbr.flex3 = rack->rc_gp_incr; 3092 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3093 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3094 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3095 log.u_bbr.flex7 = rack->rc_gp_bwred; 3096 log.u_bbr.flex8 = method; 3097 log.u_bbr.cur_del_rate = cur_bw; 3098 log.u_bbr.delRate = low_bnd; 3099 log.u_bbr.bw_inuse = up_bnd; 3100 log.u_bbr.rttProp = rack_get_bw(rack); 3101 log.u_bbr.pkt_epoch = line; 3102 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3103 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3104 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3105 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3106 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3107 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3108 log.u_bbr.cwnd_gain <<= 1; 3109 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3110 log.u_bbr.cwnd_gain <<= 1; 3111 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3112 log.u_bbr.cwnd_gain <<= 1; 3113 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3114 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3115 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3116 &rack->rc_inp->inp_socket->so_rcv, 3117 &rack->rc_inp->inp_socket->so_snd, 3118 TCP_TIMELY_WORK, 0, 3119 0, &log, false, &tv); 3120 } 3121 } 3122 3123 static int 3124 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3125 { 3126 /* 3127 * Before we increase we need to know if 3128 * the estimate just made was less than 3129 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3130 * 3131 * If we already are pacing at a fast enough 3132 * rate to push us faster there is no sense of 3133 * increasing. 3134 * 3135 * We first caculate our actual pacing rate (ss or ca multiplier 3136 * times our cur_bw). 
3137 * 3138 * Then we take the last measured rate and multipy by our 3139 * maximum pacing overage to give us a max allowable rate. 3140 * 3141 * If our act_rate is smaller than our max_allowable rate 3142 * then we should increase. Else we should hold steady. 3143 * 3144 */ 3145 uint64_t act_rate, max_allow_rate; 3146 3147 if (rack_timely_no_stopping) 3148 return (1); 3149 3150 if ((cur_bw == 0) || (last_bw_est == 0)) { 3151 /* 3152 * Initial startup case or 3153 * everything is acked case. 3154 */ 3155 rack_log_timely(rack, mult, cur_bw, 0, 0, 3156 __LINE__, 9); 3157 return (1); 3158 } 3159 if (mult <= 100) { 3160 /* 3161 * We can always pace at or slightly above our rate. 3162 */ 3163 rack_log_timely(rack, mult, cur_bw, 0, 0, 3164 __LINE__, 9); 3165 return (1); 3166 } 3167 act_rate = cur_bw * (uint64_t)mult; 3168 act_rate /= 100; 3169 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3170 max_allow_rate /= 100; 3171 if (act_rate < max_allow_rate) { 3172 /* 3173 * Here the rate we are actually pacing at 3174 * is smaller than 10% above our last measurement. 3175 * This means we are pacing below what we would 3176 * like to try to achieve (plus some wiggle room). 3177 */ 3178 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3179 __LINE__, 9); 3180 return (1); 3181 } else { 3182 /* 3183 * Here we are already pacing at least rack_max_per_above(10%) 3184 * what we are getting back. This indicates most likely 3185 * that we are being limited (cwnd/rwnd/app) and can't 3186 * get any more b/w. There is no sense of trying to 3187 * raise up the pacing rate its not speeding us up 3188 * and we already are pacing faster than we are getting. 3189 */ 3190 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3191 __LINE__, 8); 3192 return (0); 3193 } 3194 } 3195 3196 static void 3197 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3198 { 3199 /* 3200 * When we drag bottom, we want to assure 3201 * that no multiplier is below 1.0, if so 3202 * we want to restore it to at least that. 3203 */ 3204 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3205 /* This is unlikely we usually do not touch recovery */ 3206 rack->r_ctl.rack_per_of_gp_rec = 100; 3207 } 3208 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3209 rack->r_ctl.rack_per_of_gp_ca = 100; 3210 } 3211 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3212 rack->r_ctl.rack_per_of_gp_ss = 100; 3213 } 3214 } 3215 3216 static void 3217 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3218 { 3219 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3220 rack->r_ctl.rack_per_of_gp_ca = 100; 3221 } 3222 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3223 rack->r_ctl.rack_per_of_gp_ss = 100; 3224 } 3225 } 3226 3227 static void 3228 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3229 { 3230 int32_t calc, logged, plus; 3231 3232 logged = 0; 3233 3234 if (override) { 3235 /* 3236 * override is passed when we are 3237 * loosing b/w and making one last 3238 * gasp at trying to not loose out 3239 * to a new-reno flow. 3240 */ 3241 goto extra_boost; 3242 } 3243 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3244 if (rack->rc_gp_incr && 3245 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3246 /* 3247 * Reset and get 5 strokes more before the boost. Note 3248 * that the count is 0 based so we have to add one. 
3249 */ 3250 extra_boost: 3251 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3252 rack->rc_gp_timely_inc_cnt = 0; 3253 } else 3254 plus = (uint32_t)rack_gp_increase_per; 3255 /* Must be at least 1% increase for true timely increases */ 3256 if ((plus < 1) && 3257 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3258 plus = 1; 3259 if (rack->rc_gp_saw_rec && 3260 (rack->rc_gp_no_rec_chg == 0) && 3261 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3262 rack->r_ctl.rack_per_of_gp_rec)) { 3263 /* We have been in recovery ding it too */ 3264 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3265 if (calc > 0xffff) 3266 calc = 0xffff; 3267 logged |= 1; 3268 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3269 if (rack_per_upper_bound_ss && 3270 (rack->rc_dragged_bottom == 0) && 3271 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 3272 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 3273 } 3274 if (rack->rc_gp_saw_ca && 3275 (rack->rc_gp_saw_ss == 0) && 3276 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3277 rack->r_ctl.rack_per_of_gp_ca)) { 3278 /* In CA */ 3279 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3280 if (calc > 0xffff) 3281 calc = 0xffff; 3282 logged |= 2; 3283 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3284 if (rack_per_upper_bound_ca && 3285 (rack->rc_dragged_bottom == 0) && 3286 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 3287 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 3288 } 3289 if (rack->rc_gp_saw_ss && 3290 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3291 rack->r_ctl.rack_per_of_gp_ss)) { 3292 /* In SS */ 3293 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3294 if (calc > 0xffff) 3295 calc = 0xffff; 3296 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3297 if (rack_per_upper_bound_ss && 3298 (rack->rc_dragged_bottom == 0) && 3299 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 3300 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 3301 logged |= 4; 3302 } 3303 if (logged && 3304 (rack->rc_gp_incr == 0)){ 3305 /* Go into increment mode */ 3306 rack->rc_gp_incr = 1; 3307 rack->rc_gp_timely_inc_cnt = 0; 3308 } 3309 if (rack->rc_gp_incr && 3310 logged && 3311 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3312 rack->rc_gp_timely_inc_cnt++; 3313 } 3314 rack_log_timely(rack, logged, plus, 0, 0, 3315 __LINE__, 1); 3316 } 3317 3318 static uint32_t 3319 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3320 { 3321 /* 3322 * norm_grad = rtt_diff / minrtt; 3323 * new_per = curper * (1 - B * norm_grad) 3324 * 3325 * B = rack_gp_decrease_per (default 10%) 3326 * rtt_dif = input var current rtt-diff 3327 * curper = input var current percentage 3328 * minrtt = from rack filter 3329 * 3330 */ 3331 uint64_t perf; 3332 3333 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3334 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3335 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3336 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3337 (uint64_t)1000000)) / 3338 (uint64_t)1000000); 3339 if (perf > curper) { 3340 /* TSNH */ 3341 perf = curper - 1; 3342 } 3343 return ((uint32_t)perf); 3344 } 3345 3346 static uint32_t 3347 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3348 { 3349 /* 3350 * highrttthresh 3351 * result = curper * (1 - (B * ( 1 - ------ )) 3352 * gp_srtt 3353 * 3354 * B = rack_gp_decrease_per (default 10%) 3355 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3356 */ 3357 uint64_t perf; 3358 uint32_t 
highrttthresh; 3359 3360 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3361 3362 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3363 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3364 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3365 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3366 return (perf); 3367 } 3368 3369 static void 3370 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3371 { 3372 uint64_t logvar, logvar2, logvar3; 3373 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3374 3375 if (rack->rc_gp_incr) { 3376 /* Turn off increment counting */ 3377 rack->rc_gp_incr = 0; 3378 rack->rc_gp_timely_inc_cnt = 0; 3379 } 3380 ss_red = ca_red = rec_red = 0; 3381 logged = 0; 3382 /* Calculate the reduction value */ 3383 if (rtt_diff < 0) { 3384 rtt_diff *= -1; 3385 } 3386 /* Must be at least 1% reduction */ 3387 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3388 /* We have been in recovery ding it too */ 3389 if (timely_says == 2) { 3390 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3391 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3392 if (alt < new_per) 3393 val = alt; 3394 else 3395 val = new_per; 3396 } else 3397 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3398 if (rack->r_ctl.rack_per_of_gp_rec > val) { 3399 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 3400 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 3401 } else { 3402 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3403 rec_red = 0; 3404 } 3405 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 3406 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3407 logged |= 1; 3408 } 3409 if (rack->rc_gp_saw_ss) { 3410 /* Sent in SS */ 3411 if (timely_says == 2) { 3412 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 3413 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3414 if (alt < new_per) 3415 val = alt; 3416 else 3417 val = new_per; 3418 } else 3419 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3420 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 3421 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 3422 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 3423 } else { 3424 ss_red = new_per; 3425 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3426 logvar = new_per; 3427 logvar <<= 32; 3428 logvar |= alt; 3429 logvar2 = (uint32_t)rtt; 3430 logvar2 <<= 32; 3431 logvar2 |= (uint32_t)rtt_diff; 3432 logvar3 = rack_gp_rtt_maxmul; 3433 logvar3 <<= 32; 3434 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3435 rack_log_timely(rack, timely_says, 3436 logvar2, logvar3, 3437 logvar, __LINE__, 10); 3438 } 3439 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 3440 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3441 logged |= 4; 3442 } else if (rack->rc_gp_saw_ca) { 3443 /* Sent in CA */ 3444 if (timely_says == 2) { 3445 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 3446 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3447 if (alt < new_per) 3448 val = alt; 3449 else 3450 val = new_per; 3451 } else 3452 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3453 if (rack->r_ctl.rack_per_of_gp_ca > val) { 3454 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 3455 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 3456 } else { 3457 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3458 ca_red = 0; 3459 logvar = new_per; 3460 logvar <<= 32; 3461 logvar |= alt; 3462 logvar2 = (uint32_t)rtt; 3463 logvar2 <<= 32; 3464 logvar2 |= (uint32_t)rtt_diff; 3465 logvar3 = rack_gp_rtt_maxmul; 3466 logvar3 <<= 32; 3467 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3468 rack_log_timely(rack, timely_says, 3469 logvar2, logvar3, 3470 logvar, __LINE__, 10); 3471 } 3472 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 3473 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3474 logged |= 2; 3475 } 3476 if (rack->rc_gp_timely_dec_cnt < 0x7) { 3477 rack->rc_gp_timely_dec_cnt++; 3478 if (rack_timely_dec_clear && 3479 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 3480 rack->rc_gp_timely_dec_cnt = 0; 3481 } 3482 logvar = ss_red; 3483 logvar <<= 32; 3484 logvar |= ca_red; 3485 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 3486 __LINE__, 2); 3487 } 3488 3489 static void 3490 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 3491 uint32_t rtt, uint32_t line, uint8_t reas) 3492 { 3493 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3494 union tcp_log_stackspecific log; 3495 struct timeval tv; 3496 3497 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3498 log.u_bbr.flex1 = line; 3499 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 3500 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 3501 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3502 log.u_bbr.flex5 = rtt; 3503 log.u_bbr.flex6 = rack->rc_highly_buffered; 3504 log.u_bbr.flex6 <<= 1; 3505 log.u_bbr.flex6 |= rack->forced_ack; 3506 log.u_bbr.flex6 <<= 1; 3507 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 3508 log.u_bbr.flex6 <<= 1; 3509 log.u_bbr.flex6 |= rack->in_probe_rtt; 3510 log.u_bbr.flex6 <<= 1; 3511 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 3512 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 3513 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 3514 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 3515 log.u_bbr.flex8 = reas; 3516 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3517 log.u_bbr.delRate = rack_get_bw(rack); 3518 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 3519 log.u_bbr.cur_del_rate <<= 32; 3520 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 3521 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 3522 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3523 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3524 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3525 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3526 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 3527 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 3528 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3529 log.u_bbr.rttProp = us_cts; 3530 log.u_bbr.rttProp <<= 32; 3531 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 3532 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3533 &rack->rc_inp->inp_socket->so_rcv, 3534 &rack->rc_inp->inp_socket->so_snd, 3535 BBR_LOG_RTT_SHRINKS, 0, 3536 0, &log, false, &rack->r_ctl.act_rcv_time); 3537 } 3538 } 3539 3540 static void 3541 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 3542 { 3543 uint64_t bwdp; 3544 3545 bwdp = rack_get_bw(rack); 3546 bwdp *= (uint64_t)rtt; 3547 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 3548 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 3549 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 3550 /* 3551 * A window protocol must be able to have 4 packets 3552 * outstanding as the floor in order to function 3553 * (especially considering delayed ack :D). 3554 */ 3555 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 3556 } 3557 } 3558 3559 static void 3560 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 3561 { 3562 /** 3563 * ProbeRTT is a bit different in rack_pacing than in 3564 * BBR. It is like BBR in that it uses the lowering of 3565 * the RTT as a signal that we saw something new and 3566 * counts from there for how long between. But it is 3567 * different in that its quite simple. It does not 3568 * play with the cwnd and wait until we get down 3569 * to N segments outstanding and hold that for 3570 * 200ms. Instead it just sets the pacing reduction 3571 * rate to a set percentage (70 by default) and hold 3572 * that for a number of recent GP Srtt's. 3573 */ 3574 uint32_t segsiz; 3575 3576 if (rack->rc_gp_dyn_mul == 0) 3577 return; 3578 3579 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 3580 /* We are idle */ 3581 return; 3582 } 3583 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3584 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3585 /* 3586 * Stop the goodput now, the idea here is 3587 * that future measurements with in_probe_rtt 3588 * won't register if they are not greater so 3589 * we want to get what info (if any) is available 3590 * now. 3591 */ 3592 rack_do_goodput_measurement(rack->rc_tp, rack, 3593 rack->rc_tp->snd_una, __LINE__, 3594 RACK_QUALITY_PROBERTT); 3595 } 3596 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3597 rack->r_ctl.rc_time_probertt_entered = us_cts; 3598 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3599 rack->r_ctl.rc_pace_min_segs); 3600 rack->in_probe_rtt = 1; 3601 rack->measure_saw_probe_rtt = 1; 3602 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3603 rack->r_ctl.rc_time_probertt_starts = 0; 3604 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 3605 if (rack_probertt_use_min_rtt_entry) 3606 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3607 else 3608 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 3609 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3610 __LINE__, RACK_RTTS_ENTERPROBE); 3611 } 3612 3613 static void 3614 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 3615 { 3616 struct rack_sendmap *rsm; 3617 uint32_t segsiz; 3618 3619 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3620 rack->r_ctl.rc_pace_min_segs); 3621 rack->in_probe_rtt = 0; 3622 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3623 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3624 /* 3625 * Stop the goodput now, the idea here is 3626 * that future measurements with in_probe_rtt 3627 * won't register if they are not greater so 3628 * we want to get what info (if any) is available 3629 * now. 3630 */ 3631 rack_do_goodput_measurement(rack->rc_tp, rack, 3632 rack->rc_tp->snd_una, __LINE__, 3633 RACK_QUALITY_PROBERTT); 3634 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 3635 /* 3636 * We don't have enough data to make a measurement. 3637 * So lets just stop and start here after exiting 3638 * probe-rtt. We probably are not interested in 3639 * the results anyway. 3640 */ 3641 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 3642 } 3643 /* 3644 * Measurements through the current snd_max are going 3645 * to be limited by the slower pacing rate. 
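 * (during probe-rtt we pace at the reduced rack_per_of_gp_probertt
 * rate, so goodput sampled over this data would under-report what
 * the path can actually carry).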
3646 * 3647 * We need to mark these as app-limited so we 3648 * don't collapse the b/w. 3649 */ 3650 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3651 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 3652 if (rack->r_ctl.rc_app_limited_cnt == 0) 3653 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 3654 else { 3655 /* 3656 * Go out to the end app limited and mark 3657 * this new one as next and move the end_appl up 3658 * to this guy. 3659 */ 3660 if (rack->r_ctl.rc_end_appl) 3661 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3662 rack->r_ctl.rc_end_appl = rsm; 3663 } 3664 rsm->r_flags |= RACK_APP_LIMITED; 3665 rack->r_ctl.rc_app_limited_cnt++; 3666 } 3667 /* 3668 * Now, we need to examine our pacing rate multipliers. 3669 * If its under 100%, we need to kick it back up to 3670 * 100%. We also don't let it be over our "max" above 3671 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3672 * Note setting clamp_atexit_prtt to 0 has the effect 3673 * of setting CA/SS to 100% always at exit (which is 3674 * the default behavior). 3675 */ 3676 if (rack_probertt_clear_is) { 3677 rack->rc_gp_incr = 0; 3678 rack->rc_gp_bwred = 0; 3679 rack->rc_gp_timely_inc_cnt = 0; 3680 rack->rc_gp_timely_dec_cnt = 0; 3681 } 3682 /* Do we do any clamping at exit? */ 3683 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3684 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3685 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3686 } 3687 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3688 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3689 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3690 } 3691 /* 3692 * Lets set rtt_diff to 0, so that we will get a "boost" 3693 * after exiting. 3694 */ 3695 rack->r_ctl.rc_rtt_diff = 0; 3696 3697 /* Clear all flags so we start fresh */ 3698 rack->rc_tp->t_bytes_acked = 0; 3699 rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 3700 /* 3701 * If configured to, set the cwnd and ssthresh to 3702 * our targets. 3703 */ 3704 if (rack_probe_rtt_sets_cwnd) { 3705 uint64_t ebdp; 3706 uint32_t setto; 3707 3708 /* Set ssthresh so we get into CA once we hit our target */ 3709 if (rack_probertt_use_min_rtt_exit == 1) { 3710 /* Set to min rtt */ 3711 rack_set_prtt_target(rack, segsiz, 3712 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3713 } else if (rack_probertt_use_min_rtt_exit == 2) { 3714 /* Set to current gp rtt */ 3715 rack_set_prtt_target(rack, segsiz, 3716 rack->r_ctl.rc_gp_srtt); 3717 } else if (rack_probertt_use_min_rtt_exit == 3) { 3718 /* Set to entry gp rtt */ 3719 rack_set_prtt_target(rack, segsiz, 3720 rack->r_ctl.rc_entry_gp_rtt); 3721 } else { 3722 uint64_t sum; 3723 uint32_t setval; 3724 3725 sum = rack->r_ctl.rc_entry_gp_rtt; 3726 sum *= 10; 3727 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3728 if (sum >= 20) { 3729 /* 3730 * A highly buffered path needs 3731 * cwnd space for timely to work. 3732 * Lets set things up as if 3733 * we are heading back here again. 3734 */ 3735 setval = rack->r_ctl.rc_entry_gp_rtt; 3736 } else if (sum >= 15) { 3737 /* 3738 * Lets take the smaller of the 3739 * two since we are just somewhat 3740 * buffered. 3741 */ 3742 setval = rack->r_ctl.rc_gp_srtt; 3743 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3744 setval = rack->r_ctl.rc_entry_gp_rtt; 3745 } else { 3746 /* 3747 * Here we are not highly buffered 3748 * and should pick the min we can to 3749 * keep from causing loss. 
3750 */ 3751 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3752 } 3753 rack_set_prtt_target(rack, segsiz, 3754 setval); 3755 } 3756 if (rack_probe_rtt_sets_cwnd > 1) { 3757 /* There is a percentage here to boost */ 3758 ebdp = rack->r_ctl.rc_target_probertt_flight; 3759 ebdp *= rack_probe_rtt_sets_cwnd; 3760 ebdp /= 100; 3761 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3762 } else 3763 setto = rack->r_ctl.rc_target_probertt_flight; 3764 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3765 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3766 /* Enforce a min */ 3767 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3768 } 3769 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3770 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3771 } 3772 rack_log_rtt_shrinks(rack, us_cts, 3773 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3774 __LINE__, RACK_RTTS_EXITPROBE); 3775 /* Clear times last so log has all the info */ 3776 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3777 rack->r_ctl.rc_time_probertt_entered = us_cts; 3778 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3779 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3780 } 3781 3782 static void 3783 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3784 { 3785 /* Check in on probe-rtt */ 3786 if (rack->rc_gp_filled == 0) { 3787 /* We do not do p-rtt unless we have gp measurements */ 3788 return; 3789 } 3790 if (rack->in_probe_rtt) { 3791 uint64_t no_overflow; 3792 uint32_t endtime, must_stay; 3793 3794 if (rack->r_ctl.rc_went_idle_time && 3795 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3796 /* 3797 * We went idle during prtt, just exit now. 3798 */ 3799 rack_exit_probertt(rack, us_cts); 3800 } else if (rack_probe_rtt_safety_val && 3801 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3802 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3803 /* 3804 * Probe RTT safety value triggered! 3805 */ 3806 rack_log_rtt_shrinks(rack, us_cts, 3807 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3808 __LINE__, RACK_RTTS_SAFETY); 3809 rack_exit_probertt(rack, us_cts); 3810 } 3811 /* Calculate the max we will wait */ 3812 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3813 if (rack->rc_highly_buffered) 3814 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3815 /* Calculate the min we must wait */ 3816 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3817 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3818 TSTMP_LT(us_cts, endtime)) { 3819 uint32_t calc; 3820 /* Do we lower more? 
*/ 3821 no_exit: 3822 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3823 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3824 else 3825 calc = 0; 3826 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3827 if (calc) { 3828 /* Maybe */ 3829 calc *= rack_per_of_gp_probertt_reduce; 3830 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3831 /* Limit it too */ 3832 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3833 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3834 } 3835 /* We must reach target or the time set */ 3836 return; 3837 } 3838 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3839 if ((TSTMP_LT(us_cts, must_stay) && 3840 rack->rc_highly_buffered) || 3841 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3842 rack->r_ctl.rc_target_probertt_flight)) { 3843 /* We are not past the must_stay time */ 3844 goto no_exit; 3845 } 3846 rack_log_rtt_shrinks(rack, us_cts, 3847 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3848 __LINE__, RACK_RTTS_REACHTARGET); 3849 rack->r_ctl.rc_time_probertt_starts = us_cts; 3850 if (rack->r_ctl.rc_time_probertt_starts == 0) 3851 rack->r_ctl.rc_time_probertt_starts = 1; 3852 /* Restore back to our rate we want to pace at in prtt */ 3853 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3854 } 3855 /* 3856 * Setup our end time, some number of gp_srtts plus 200ms. 3857 */ 3858 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3859 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3860 if (rack_probertt_gpsrtt_cnt_div) 3861 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3862 else 3863 endtime = 0; 3864 endtime += rack_min_probertt_hold; 3865 endtime += rack->r_ctl.rc_time_probertt_starts; 3866 if (TSTMP_GEQ(us_cts, endtime)) { 3867 /* yes, exit probertt */ 3868 rack_exit_probertt(rack, us_cts); 3869 } 3870 3871 } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3872 /* Go into probertt, its been too long since we went lower */ 3873 rack_enter_probertt(rack, us_cts); 3874 } 3875 } 3876 3877 static void 3878 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3879 uint32_t rtt, int32_t rtt_diff) 3880 { 3881 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3882 uint32_t losses; 3883 3884 if ((rack->rc_gp_dyn_mul == 0) || 3885 (rack->use_fixed_rate) || 3886 (rack->in_probe_rtt) || 3887 (rack->rc_always_pace == 0)) { 3888 /* No dynamic GP multiplier in play */ 3889 return; 3890 } 3891 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3892 cur_bw = rack_get_bw(rack); 3893 /* Calculate our up and down range */ 3894 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3895 up_bnd /= 100; 3896 up_bnd += rack->r_ctl.last_gp_comp_bw; 3897 3898 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3899 subfr /= 100; 3900 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3901 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3902 /* 3903 * This is the case where our RTT is above 3904 * the max target and we have been configured 3905 * to just do timely no bonus up stuff in that case. 3906 * 3907 * There are two configurations, set to 1, and we 3908 * just do timely if we are over our max. If its 3909 * set above 1 then we slam the multipliers down 3910 * to 100 and then decrement per timely. 
3911 */ 3912 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3913 __LINE__, 3); 3914 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3915 rack_validate_multipliers_at_or_below_100(rack); 3916 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3917 } else if ((last_bw_est < low_bnd) && !losses) { 3918 /* 3919 * We are decreasing this is a bit complicated this 3920 * means we are loosing ground. This could be 3921 * because another flow entered and we are competing 3922 * for b/w with it. This will push the RTT up which 3923 * makes timely unusable unless we want to get shoved 3924 * into a corner and just be backed off (the age 3925 * old problem with delay based CC). 3926 * 3927 * On the other hand if it was a route change we 3928 * would like to stay somewhat contained and not 3929 * blow out the buffers. 3930 */ 3931 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3932 __LINE__, 3); 3933 rack->r_ctl.last_gp_comp_bw = cur_bw; 3934 if (rack->rc_gp_bwred == 0) { 3935 /* Go into reduction counting */ 3936 rack->rc_gp_bwred = 1; 3937 rack->rc_gp_timely_dec_cnt = 0; 3938 } 3939 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3940 (timely_says == 0)) { 3941 /* 3942 * Push another time with a faster pacing 3943 * to try to gain back (we include override to 3944 * get a full raise factor). 3945 */ 3946 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3947 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3948 (timely_says == 0) || 3949 (rack_down_raise_thresh == 0)) { 3950 /* 3951 * Do an override up in b/w if we were 3952 * below the threshold or if the threshold 3953 * is zero we always do the raise. 3954 */ 3955 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3956 } else { 3957 /* Log it stays the same */ 3958 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3959 __LINE__, 11); 3960 } 3961 rack->rc_gp_timely_dec_cnt++; 3962 /* We are not incrementing really no-count */ 3963 rack->rc_gp_incr = 0; 3964 rack->rc_gp_timely_inc_cnt = 0; 3965 } else { 3966 /* 3967 * Lets just use the RTT 3968 * information and give up 3969 * pushing. 3970 */ 3971 goto use_timely; 3972 } 3973 } else if ((timely_says != 2) && 3974 !losses && 3975 (last_bw_est > up_bnd)) { 3976 /* 3977 * We are increasing b/w lets keep going, updating 3978 * our b/w and ignoring any timely input, unless 3979 * of course we are at our max raise (if there is one). 3980 */ 3981 3982 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3983 __LINE__, 3); 3984 rack->r_ctl.last_gp_comp_bw = cur_bw; 3985 if (rack->rc_gp_saw_ss && 3986 rack_per_upper_bound_ss && 3987 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3988 /* 3989 * In cases where we can't go higher 3990 * we should just use timely. 3991 */ 3992 goto use_timely; 3993 } 3994 if (rack->rc_gp_saw_ca && 3995 rack_per_upper_bound_ca && 3996 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3997 /* 3998 * In cases where we can't go higher 3999 * we should just use timely. 
4000 */ 4001 goto use_timely; 4002 } 4003 rack->rc_gp_bwred = 0; 4004 rack->rc_gp_timely_dec_cnt = 0; 4005 /* You get a set number of pushes if timely is trying to reduce */ 4006 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4007 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4008 } else { 4009 /* Log it stays the same */ 4010 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4011 __LINE__, 12); 4012 } 4013 return; 4014 } else { 4015 /* 4016 * We are staying between the lower and upper range bounds 4017 * so use timely to decide. 4018 */ 4019 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4020 __LINE__, 3); 4021 use_timely: 4022 if (timely_says) { 4023 rack->rc_gp_incr = 0; 4024 rack->rc_gp_timely_inc_cnt = 0; 4025 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4026 !losses && 4027 (last_bw_est < low_bnd)) { 4028 /* We are loosing ground */ 4029 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4030 rack->rc_gp_timely_dec_cnt++; 4031 /* We are not incrementing really no-count */ 4032 rack->rc_gp_incr = 0; 4033 rack->rc_gp_timely_inc_cnt = 0; 4034 } else 4035 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4036 } else { 4037 rack->rc_gp_bwred = 0; 4038 rack->rc_gp_timely_dec_cnt = 0; 4039 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4040 } 4041 } 4042 } 4043 4044 static int32_t 4045 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4046 { 4047 int32_t timely_says; 4048 uint64_t log_mult, log_rtt_a_diff; 4049 4050 log_rtt_a_diff = rtt; 4051 log_rtt_a_diff <<= 32; 4052 log_rtt_a_diff |= (uint32_t)rtt_diff; 4053 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4054 rack_gp_rtt_maxmul)) { 4055 /* Reduce the b/w multiplier */ 4056 timely_says = 2; 4057 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4058 log_mult <<= 32; 4059 log_mult |= prev_rtt; 4060 rack_log_timely(rack, timely_says, log_mult, 4061 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4062 log_rtt_a_diff, __LINE__, 4); 4063 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4064 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4065 max(rack_gp_rtt_mindiv , 1)))) { 4066 /* Increase the b/w multiplier */ 4067 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4068 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4069 max(rack_gp_rtt_mindiv , 1)); 4070 log_mult <<= 32; 4071 log_mult |= prev_rtt; 4072 timely_says = 0; 4073 rack_log_timely(rack, timely_says, log_mult , 4074 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4075 log_rtt_a_diff, __LINE__, 5); 4076 } else { 4077 /* 4078 * Use a gradient to find it the timely gradient 4079 * is: 4080 * grad = rc_rtt_diff / min_rtt; 4081 * 4082 * anything below or equal to 0 will be 4083 * a increase indication. Anything above 4084 * zero is a decrease. Note we take care 4085 * of the actual gradient calculation 4086 * in the reduction (its not needed for 4087 * increase). 
4088 */ 4089 log_mult = prev_rtt; 4090 if (rtt_diff <= 0) { 4091 /* 4092 * Rttdiff is less than zero, increase the 4093 * b/w multiplier (its 0 or negative) 4094 */ 4095 timely_says = 0; 4096 rack_log_timely(rack, timely_says, log_mult, 4097 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4098 } else { 4099 /* Reduce the b/w multiplier */ 4100 timely_says = 1; 4101 rack_log_timely(rack, timely_says, log_mult, 4102 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4103 } 4104 } 4105 return (timely_says); 4106 } 4107 4108 static void 4109 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4110 tcp_seq th_ack, int line, uint8_t quality) 4111 { 4112 uint64_t tim, bytes_ps, ltim, stim, utim; 4113 uint32_t segsiz, bytes, reqbytes, us_cts; 4114 int32_t gput, new_rtt_diff, timely_says; 4115 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4116 int did_add = 0; 4117 4118 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4119 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4120 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4121 tim = us_cts - tp->gput_ts; 4122 else 4123 tim = 0; 4124 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4125 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4126 else 4127 stim = 0; 4128 /* 4129 * Use the larger of the send time or ack time. This prevents us 4130 * from being influenced by ack artifacts to come up with too 4131 * high of measurement. Note that since we are spanning over many more 4132 * bytes in most of our measurements hopefully that is less likely to 4133 * occur. 4134 */ 4135 if (tim > stim) 4136 utim = max(tim, 1); 4137 else 4138 utim = max(stim, 1); 4139 /* Lets get a msec time ltim too for the old stuff */ 4140 ltim = max(1, (utim / HPTS_USEC_IN_MSEC)); 4141 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 4142 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4143 if ((tim == 0) && (stim == 0)) { 4144 /* 4145 * Invalid measurement time, maybe 4146 * all on one ack/one send? 4147 */ 4148 bytes = 0; 4149 bytes_ps = 0; 4150 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4151 0, 0, 0, 10, __LINE__, NULL, quality); 4152 goto skip_measurement; 4153 } 4154 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4155 /* We never made a us_rtt measurement? */ 4156 bytes = 0; 4157 bytes_ps = 0; 4158 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4159 0, 0, 0, 10, __LINE__, NULL, quality); 4160 goto skip_measurement; 4161 } 4162 /* 4163 * Calculate the maximum possible b/w this connection 4164 * could have. We base our calculation on the lowest 4165 * rtt we have seen during the measurement and the 4166 * largest rwnd the client has given us in that time. This 4167 * forms a BDP that is the maximum that we could ever 4168 * get to the client. Anything larger is not valid. 4169 * 4170 * I originally had code here that rejected measurements 4171 * where the time was less than 1/2 the latest us_rtt. 4172 * But after thinking on that I realized its wrong since 4173 * say you had a 150Mbps or even 1Gbps link, and you 4174 * were a long way away.. example I am in Europe (100ms rtt) 4175 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4176 * bytes my time would be 1.2ms, and yet my rtt would say 4177 * the measurement was invalid the time was < 50ms. The 4178 * same thing is true for 150Mb (8ms of time). 
4179 * 4180 * A better way I realized is to look at what the maximum 4181 * the connection could possibly do. This is gated on 4182 * the lowest RTT we have seen and the highest rwnd. 4183 * We should in theory never exceed that, if we are 4184 * then something on the path is storing up packets 4185 * and then feeding them all at once to our endpoint 4186 * messing up our measurement. 4187 */ 4188 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4189 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4190 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4191 if (SEQ_LT(th_ack, tp->gput_seq)) { 4192 /* No measurement can be made */ 4193 bytes = 0; 4194 bytes_ps = 0; 4195 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4196 0, 0, 0, 10, __LINE__, NULL, quality); 4197 goto skip_measurement; 4198 } else 4199 bytes = (th_ack - tp->gput_seq); 4200 bytes_ps = (uint64_t)bytes; 4201 /* 4202 * Don't measure a b/w for pacing unless we have gotten at least 4203 * an initial windows worth of data in this measurement interval. 4204 * 4205 * Small numbers of bytes get badly influenced by delayed ack and 4206 * other artifacts. Note we take the initial window or our 4207 * defined minimum GP (defaulting to 10 which hopefully is the 4208 * IW). 4209 */ 4210 if (rack->rc_gp_filled == 0) { 4211 /* 4212 * The initial estimate is special. We 4213 * have blasted out an IW worth of packets 4214 * without a real valid ack ts results. We 4215 * then setup the app_limited_needs_set flag, 4216 * this should get the first ack in (probably 2 4217 * MSS worth) to be recorded as the timestamp. 4218 * We thus allow a smaller number of bytes i.e. 4219 * IW - 2MSS. 4220 */ 4221 reqbytes -= (2 * segsiz); 4222 /* Also lets fill previous for our first measurement to be neutral */ 4223 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4224 } 4225 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4226 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4227 rack->r_ctl.rc_app_limited_cnt, 4228 0, 0, 10, __LINE__, NULL, quality); 4229 goto skip_measurement; 4230 } 4231 /* 4232 * We now need to calculate the Timely like status so 4233 * we can update (possibly) the b/w multipliers. 4234 */ 4235 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4236 if (rack->rc_gp_filled == 0) { 4237 /* No previous reading */ 4238 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4239 } else { 4240 if (rack->measure_saw_probe_rtt == 0) { 4241 /* 4242 * We don't want a probertt to be counted 4243 * since it will be negative incorrectly. We 4244 * expect to be reducing the RTT when we 4245 * pace at a slower rate. 4246 */ 4247 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4248 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4249 } 4250 } 4251 timely_says = rack_make_timely_judgement(rack, 4252 rack->r_ctl.rc_gp_srtt, 4253 rack->r_ctl.rc_rtt_diff, 4254 rack->r_ctl.rc_prev_gp_srtt 4255 ); 4256 bytes_ps *= HPTS_USEC_IN_SEC; 4257 bytes_ps /= utim; 4258 if (bytes_ps > rack->r_ctl.last_max_bw) { 4259 /* 4260 * Something is on path playing 4261 * since this b/w is not possible based 4262 * on our BDP (highest rwnd and lowest rtt 4263 * we saw in the measurement window). 4264 * 4265 * Another option here would be to 4266 * instead skip the measurement. 
4267 */ 4268 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 4269 bytes_ps, rack->r_ctl.last_max_bw, 0, 4270 11, __LINE__, NULL, quality); 4271 bytes_ps = rack->r_ctl.last_max_bw; 4272 } 4273 /* We store gp for b/w in bytes per second */ 4274 if (rack->rc_gp_filled == 0) { 4275 /* Initial measurement */ 4276 if (bytes_ps) { 4277 rack->r_ctl.gp_bw = bytes_ps; 4278 rack->rc_gp_filled = 1; 4279 rack->r_ctl.num_measurements = 1; 4280 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 4281 } else { 4282 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4283 rack->r_ctl.rc_app_limited_cnt, 4284 0, 0, 10, __LINE__, NULL, quality); 4285 } 4286 if (tcp_in_hpts(rack->rc_inp) && 4287 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 4288 /* 4289 * Ok we can't trust the pacer in this case 4290 * where we transition from un-paced to paced. 4291 * Or for that matter when the burst mitigation 4292 * was making a wild guess and got it wrong. 4293 * Stop the pacer and clear up all the aggregate 4294 * delays etc. 4295 */ 4296 tcp_hpts_remove(rack->rc_inp); 4297 rack->r_ctl.rc_hpts_flags = 0; 4298 rack->r_ctl.rc_last_output_to = 0; 4299 } 4300 did_add = 2; 4301 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 4302 /* Still a small number run an average */ 4303 rack->r_ctl.gp_bw += bytes_ps; 4304 addpart = rack->r_ctl.num_measurements; 4305 rack->r_ctl.num_measurements++; 4306 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 4307 /* We have collected enough to move forward */ 4308 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 4309 } 4310 did_add = 3; 4311 } else { 4312 /* 4313 * We want to take 1/wma of the goodput and add in to 7/8th 4314 * of the old value weighted by the srtt. So if your measurement 4315 * period is say 2 SRTT's long you would get 1/4 as the 4316 * value, if it was like 1/2 SRTT then you would get 1/16th. 4317 * 4318 * But we must be careful not to take too much i.e. if the 4319 * srtt is say 20ms and the measurement is taken over 4320 * 400ms our weight would be 400/20 i.e. 20. On the 4321 * other hand if we get a measurement over 1ms with a 4322 * 10ms rtt we only want to take a much smaller portion. 4323 */ 4324 if (rack->r_ctl.num_measurements < 0xff) { 4325 rack->r_ctl.num_measurements++; 4326 } 4327 srtt = (uint64_t)tp->t_srtt; 4328 if (srtt == 0) { 4329 /* 4330 * Strange why did t_srtt go back to zero? 4331 */ 4332 if (rack->r_ctl.rc_rack_min_rtt) 4333 srtt = rack->r_ctl.rc_rack_min_rtt; 4334 else 4335 srtt = HPTS_USEC_IN_MSEC; 4336 } 4337 /* 4338 * XXXrrs: Note for reviewers, in playing with 4339 * dynamic pacing I discovered this GP calculation 4340 * as done originally leads to some undesired results. 4341 * Basically you can get longer measurements contributing 4342 * too much to the WMA. Thus I changed it if you are doing 4343 * dynamic adjustments to only do the aportioned adjustment 4344 * if we have a very small (time wise) measurement. Longer 4345 * measurements just get there weight (defaulting to 1/8) 4346 * add to the WMA. We may want to think about changing 4347 * this to always do that for both sides i.e. dynamic 4348 * and non-dynamic... but considering lots of folks 4349 * were playing with this I did not want to change the 4350 * calculation per.se. without your thoughts.. Lawerence? 4351 * Peter?? 
4352 */ 4353 if (rack->rc_gp_dyn_mul == 0) { 4354 subpart = rack->r_ctl.gp_bw * utim; 4355 subpart /= (srtt * 8); 4356 if (subpart < (rack->r_ctl.gp_bw / 2)) { 4357 /* 4358 * The b/w update takes no more 4359 * away then 1/2 our running total 4360 * so factor it in. 4361 */ 4362 addpart = bytes_ps * utim; 4363 addpart /= (srtt * 8); 4364 } else { 4365 /* 4366 * Don't allow a single measurement 4367 * to account for more than 1/2 of the 4368 * WMA. This could happen on a retransmission 4369 * where utim becomes huge compared to 4370 * srtt (multiple retransmissions when using 4371 * the sending rate which factors in all the 4372 * transmissions from the first one). 4373 */ 4374 subpart = rack->r_ctl.gp_bw / 2; 4375 addpart = bytes_ps / 2; 4376 } 4377 resid_bw = rack->r_ctl.gp_bw - subpart; 4378 rack->r_ctl.gp_bw = resid_bw + addpart; 4379 did_add = 1; 4380 } else { 4381 if ((utim / srtt) <= 1) { 4382 /* 4383 * The b/w update was over a small period 4384 * of time. The idea here is to prevent a small 4385 * measurement time period from counting 4386 * too much. So we scale it based on the 4387 * time so it attributes less than 1/rack_wma_divisor 4388 * of its measurement. 4389 */ 4390 subpart = rack->r_ctl.gp_bw * utim; 4391 subpart /= (srtt * rack_wma_divisor); 4392 addpart = bytes_ps * utim; 4393 addpart /= (srtt * rack_wma_divisor); 4394 } else { 4395 /* 4396 * The scaled measurement was long 4397 * enough so lets just add in the 4398 * portion of the measurement i.e. 1/rack_wma_divisor 4399 */ 4400 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 4401 addpart = bytes_ps / rack_wma_divisor; 4402 } 4403 if ((rack->measure_saw_probe_rtt == 0) || 4404 (bytes_ps > rack->r_ctl.gp_bw)) { 4405 /* 4406 * For probe-rtt we only add it in 4407 * if its larger, all others we just 4408 * add in. 4409 */ 4410 did_add = 1; 4411 resid_bw = rack->r_ctl.gp_bw - subpart; 4412 rack->r_ctl.gp_bw = resid_bw + addpart; 4413 } 4414 } 4415 } 4416 if ((rack->gp_ready == 0) && 4417 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 4418 /* We have enough measurements now */ 4419 rack->gp_ready = 1; 4420 rack_set_cc_pacing(rack); 4421 if (rack->defer_options) 4422 rack_apply_deferred_options(rack); 4423 } 4424 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 4425 rack_get_bw(rack), 22, did_add, NULL, quality); 4426 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 4427 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 4428 rack_update_multiplier(rack, timely_says, bytes_ps, 4429 rack->r_ctl.rc_gp_srtt, 4430 rack->r_ctl.rc_rtt_diff); 4431 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 4432 rack_get_bw(rack), 3, line, NULL, quality); 4433 /* reset the gp srtt and setup the new prev */ 4434 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4435 /* Record the lost count for the next measurement */ 4436 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 4437 /* 4438 * We restart our diffs based on the gpsrtt in the 4439 * measurement window. 4440 */ 4441 rack->rc_gp_rtt_set = 0; 4442 rack->rc_gp_saw_rec = 0; 4443 rack->rc_gp_saw_ca = 0; 4444 rack->rc_gp_saw_ss = 0; 4445 rack->rc_dragged_bottom = 0; 4446 skip_measurement: 4447 4448 #ifdef STATS 4449 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 4450 gput); 4451 /* 4452 * XXXLAS: This is a temporary hack, and should be 4453 * chained off VOI_TCP_GPUT when stats(9) grows an 4454 * API to deal with chained VOIs. 
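 * (VOI_TCP_GPUT_ND below records the percent change of this goodput
 * sample relative to the previous one).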
4455 */ 4456 if (tp->t_stats_gput_prev > 0) 4457 stats_voi_update_abs_s32(tp->t_stats, 4458 VOI_TCP_GPUT_ND, 4459 ((gput - tp->t_stats_gput_prev) * 100) / 4460 tp->t_stats_gput_prev); 4461 #endif 4462 tp->t_flags &= ~TF_GPUTINPROG; 4463 tp->t_stats_gput_prev = gput; 4464 /* 4465 * Now are we app limited now and there is space from where we 4466 * were to where we want to go? 4467 * 4468 * We don't do the other case i.e. non-applimited here since 4469 * the next send will trigger us picking up the missing data. 4470 */ 4471 if (rack->r_ctl.rc_first_appl && 4472 TCPS_HAVEESTABLISHED(tp->t_state) && 4473 rack->r_ctl.rc_app_limited_cnt && 4474 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 4475 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 4476 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 4477 /* 4478 * Yep there is enough outstanding to make a measurement here. 4479 */ 4480 struct rack_sendmap *rsm, fe; 4481 4482 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 4483 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 4484 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4485 rack->app_limited_needs_set = 0; 4486 tp->gput_seq = th_ack; 4487 if (rack->in_probe_rtt) 4488 rack->measure_saw_probe_rtt = 1; 4489 else if ((rack->measure_saw_probe_rtt) && 4490 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 4491 rack->measure_saw_probe_rtt = 0; 4492 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 4493 /* There is a full window to gain info from */ 4494 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 4495 } else { 4496 /* We can only measure up to the applimited point */ 4497 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 4498 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 4499 /* 4500 * We don't have enough to make a measurement. 4501 */ 4502 tp->t_flags &= ~TF_GPUTINPROG; 4503 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 4504 0, 0, 0, 6, __LINE__, NULL, quality); 4505 return; 4506 } 4507 } 4508 if (tp->t_state >= TCPS_FIN_WAIT_1) { 4509 /* 4510 * We will get no more data into the SB 4511 * this means we need to have the data available 4512 * before we start a measurement. 4513 */ 4514 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) { 4515 /* Nope not enough data. */ 4516 return; 4517 } 4518 } 4519 tp->t_flags |= TF_GPUTINPROG; 4520 /* 4521 * Now we need to find the timestamp of the send at tp->gput_seq 4522 * for the send based measurement. 4523 */ 4524 fe.r_start = tp->gput_seq; 4525 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 4526 if (rsm) { 4527 /* Ok send-based limit is set */ 4528 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 4529 /* 4530 * Move back to include the earlier part 4531 * so our ack time lines up right (this may 4532 * make an overlapping measurement but thats 4533 * ok). 4534 */ 4535 tp->gput_seq = rsm->r_start; 4536 } 4537 if (rsm->r_flags & RACK_ACKED) 4538 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 4539 else 4540 rack->app_limited_needs_set = 1; 4541 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 4542 } else { 4543 /* 4544 * If we don't find the rsm due to some 4545 * send-limit set the current time, which 4546 * basically disables the send-limit. 
4547 */ 4548 struct timeval tv; 4549 4550 microuptime(&tv); 4551 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 4552 } 4553 rack_log_pacing_delay_calc(rack, 4554 tp->gput_seq, 4555 tp->gput_ack, 4556 (uint64_t)rsm, 4557 tp->gput_ts, 4558 rack->r_ctl.rc_app_limited_cnt, 4559 9, 4560 __LINE__, NULL, quality); 4561 } 4562 } 4563 4564 /* 4565 * CC wrapper hook functions 4566 */ 4567 static void 4568 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 4569 uint16_t type, int32_t recovery) 4570 { 4571 uint32_t prior_cwnd, acked; 4572 struct tcp_log_buffer *lgb = NULL; 4573 uint8_t labc_to_use, quality; 4574 4575 INP_WLOCK_ASSERT(tptoinpcb(tp)); 4576 tp->t_ccv.nsegs = nsegs; 4577 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); 4578 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 4579 uint32_t max; 4580 4581 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 4582 if (tp->t_ccv.bytes_this_ack > max) { 4583 tp->t_ccv.bytes_this_ack = max; 4584 } 4585 } 4586 #ifdef STATS 4587 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 4588 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 4589 #endif 4590 quality = RACK_QUALITY_NONE; 4591 if ((tp->t_flags & TF_GPUTINPROG) && 4592 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 4593 /* Measure the Goodput */ 4594 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 4595 #ifdef NETFLIX_PEAKRATE 4596 if ((type == CC_ACK) && 4597 (tp->t_maxpeakrate)) { 4598 /* 4599 * We update t_peakrate_thr. This gives us roughly 4600 * one update per round trip time. Note 4601 * it will only be used if pace_always is off i.e 4602 * we don't do this for paced flows. 4603 */ 4604 rack_update_peakrate_thr(tp); 4605 } 4606 #endif 4607 } 4608 /* Which way our we limited, if not cwnd limited no advance in CA */ 4609 if (tp->snd_cwnd <= tp->snd_wnd) 4610 tp->t_ccv.flags |= CCF_CWND_LIMITED; 4611 else 4612 tp->t_ccv.flags &= ~CCF_CWND_LIMITED; 4613 if (tp->snd_cwnd > tp->snd_ssthresh) { 4614 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack, 4615 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 4616 /* For the setting of a window past use the actual scwnd we are using */ 4617 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 4618 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 4619 tp->t_ccv.flags |= CCF_ABC_SENTAWND; 4620 } 4621 } else { 4622 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 4623 tp->t_bytes_acked = 0; 4624 } 4625 prior_cwnd = tp->snd_cwnd; 4626 if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 4627 (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf))) 4628 labc_to_use = rack->rc_labc; 4629 else 4630 labc_to_use = rack_max_abc_post_recovery; 4631 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4632 union tcp_log_stackspecific log; 4633 struct timeval tv; 4634 4635 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4636 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4637 log.u_bbr.flex1 = th_ack; 4638 log.u_bbr.flex2 = tp->t_ccv.flags; 4639 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 4640 log.u_bbr.flex4 = tp->t_ccv.nsegs; 4641 log.u_bbr.flex5 = labc_to_use; 4642 log.u_bbr.flex6 = prior_cwnd; 4643 log.u_bbr.flex7 = V_tcp_do_newsack; 4644 log.u_bbr.flex8 = 1; 4645 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4646 0, &log, false, NULL, NULL, 0, &tv); 4647 } 4648 if (CC_ALGO(tp)->ack_received != NULL) { 4649 /* XXXLAS: Find a way to live without this */ 4650 tp->t_ccv.curack = th_ack; 
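		/*
		 * Pass the CC module the ABC limit (labc_to_use) chosen
		 * above and flag it as a local override via
		 * CCF_USE_LOCAL_ABC.
		 */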
4651 tp->t_ccv.labc = labc_to_use;
4652 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC;
4653 CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
4654 }
4655 if (lgb) {
4656 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
4657 }
4658 if (rack->r_must_retran) {
4659 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
4660 /*
4661 * We now are beyond the rxt point so lets disable
4662 * the flag.
4663 */
4664 rack->r_ctl.rc_out_at_rto = 0;
4665 rack->r_must_retran = 0;
4666 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
4667 /*
4668 * Only decrement the rc_out_at_rto if the cwnd advances
4669 * by at least a whole segment. Otherwise next time the peer
4670 * acks, we won't be able to send. This generally happens
4671 * when we are in Congestion Avoidance.
4672 */
4673 if (acked <= rack->r_ctl.rc_out_at_rto){
4674 rack->r_ctl.rc_out_at_rto -= acked;
4675 } else {
4676 rack->r_ctl.rc_out_at_rto = 0;
4677 }
4678 }
4679 }
4680 #ifdef STATS
4681 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
4682 #endif
4683 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
4684 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
4685 }
4686 #ifdef NETFLIX_PEAKRATE
4687 /* we enforce max peak rate if it is set and we are not pacing */
4688 if ((rack->rc_always_pace == 0) &&
4689 tp->t_peakrate_thr &&
4690 (tp->snd_cwnd > tp->t_peakrate_thr)) {
4691 tp->snd_cwnd = tp->t_peakrate_thr;
4692 }
4693 #endif
4694 }
4695
4696 static void
4697 tcp_rack_partialack(struct tcpcb *tp)
4698 {
4699 struct tcp_rack *rack;
4700
4701 rack = (struct tcp_rack *)tp->t_fb_ptr;
4702 INP_WLOCK_ASSERT(tptoinpcb(tp));
4703 /*
4704 * If we are doing PRR and have enough
4705 * room to send, <or> we are pacing and prr
4706 * is disabled, we will want to see if we
4707 * can send data (by setting r_wanted_output to
4708 * true).
4709 */
4710 if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
4711 rack->rack_no_prr)
4712 rack->r_wanted_output = 1;
4713 }
4714
4715 static void
4716 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
4717 {
4718 struct tcp_rack *rack;
4719 uint32_t orig_cwnd;
4720
4721 orig_cwnd = tp->snd_cwnd;
4722 INP_WLOCK_ASSERT(tptoinpcb(tp));
4723 rack = (struct tcp_rack *)tp->t_fb_ptr;
4724 /* only alert CC if we alerted when we entered */
4725 if (CC_ALGO(tp)->post_recovery != NULL) {
4726 tp->t_ccv.curack = th_ack;
4727 CC_ALGO(tp)->post_recovery(&tp->t_ccv);
4728 if (tp->snd_cwnd < tp->snd_ssthresh) {
4729 /*
4730 * Rack has burst control and pacing
4731 * so let's not set this any lower than
4732 * snd_ssthresh per RFC-6582 (option 2).
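*
* Illustrative note (not in the original source): RFC 6582's "option 2"
* sets cwnd to ssthresh when recovery completes and accepts that this
* may produce a burst; the pacing and burst control in this stack
* spread that burst out. So if the CC module's post_recovery() pulled
* cwnd below ssthresh (say to 3/4 of it), the line below raises it
* back to ssthresh.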
4733 */ 4734 tp->snd_cwnd = tp->snd_ssthresh; 4735 } 4736 } 4737 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4738 union tcp_log_stackspecific log; 4739 struct timeval tv; 4740 4741 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4742 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4743 log.u_bbr.flex1 = th_ack; 4744 log.u_bbr.flex2 = tp->t_ccv.flags; 4745 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 4746 log.u_bbr.flex4 = tp->t_ccv.nsegs; 4747 log.u_bbr.flex5 = V_tcp_abc_l_var; 4748 log.u_bbr.flex6 = orig_cwnd; 4749 log.u_bbr.flex7 = V_tcp_do_newsack; 4750 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 4751 log.u_bbr.flex8 = 2; 4752 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4753 0, &log, false, NULL, NULL, 0, &tv); 4754 } 4755 if ((rack->rack_no_prr == 0) && 4756 (rack->no_prr_addback == 0) && 4757 (rack->r_ctl.rc_prr_sndcnt > 0)) { 4758 /* 4759 * Suck the next prr cnt back into cwnd, but 4760 * only do that if we are not application limited. 4761 */ 4762 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) { 4763 /* 4764 * We are allowed to add back to the cwnd the amount we did 4765 * not get out if: 4766 * a) no_prr_addback is off. 4767 * b) we are not app limited 4768 * c) we are doing prr 4769 * <and> 4770 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 4771 */ 4772 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 4773 rack->r_ctl.rc_prr_sndcnt); 4774 } 4775 rack->r_ctl.rc_prr_sndcnt = 0; 4776 rack_log_to_prr(rack, 1, 0, __LINE__); 4777 } 4778 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 4779 tp->snd_recover = tp->snd_una; 4780 if (rack->r_ctl.dsack_persist) { 4781 rack->r_ctl.dsack_persist--; 4782 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 4783 rack->r_ctl.num_dsack = 0; 4784 } 4785 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 4786 } 4787 EXIT_RECOVERY(tp->t_flags); 4788 } 4789 4790 static void 4791 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) 4792 { 4793 struct tcp_rack *rack; 4794 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 4795 4796 INP_WLOCK_ASSERT(tptoinpcb(tp)); 4797 #ifdef STATS 4798 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 4799 #endif 4800 if (IN_RECOVERY(tp->t_flags) == 0) { 4801 in_rec_at_entry = 0; 4802 ssthresh_enter = tp->snd_ssthresh; 4803 cwnd_enter = tp->snd_cwnd; 4804 } else 4805 in_rec_at_entry = 1; 4806 rack = (struct tcp_rack *)tp->t_fb_ptr; 4807 switch (type) { 4808 case CC_NDUPACK: 4809 tp->t_flags &= ~TF_WASFRECOVERY; 4810 tp->t_flags &= ~TF_WASCRECOVERY; 4811 if (!IN_FASTRECOVERY(tp->t_flags)) { 4812 rack->r_ctl.rc_prr_delivered = 0; 4813 rack->r_ctl.rc_prr_out = 0; 4814 if (rack->rack_no_prr == 0) { 4815 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4816 rack_log_to_prr(rack, 2, in_rec_at_entry, line); 4817 } 4818 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4819 tp->snd_recover = tp->snd_max; 4820 if (tp->t_flags2 & TF2_ECN_PERMIT) 4821 tp->t_flags2 |= TF2_ECN_SND_CWR; 4822 } 4823 break; 4824 case CC_ECN: 4825 if (!IN_CONGRECOVERY(tp->t_flags) || 4826 /* 4827 * Allow ECN reaction on ACK to CWR, if 4828 * that data segment was also CE marked. 
4829 */ 4830 SEQ_GEQ(ack, tp->snd_recover)) { 4831 EXIT_CONGRECOVERY(tp->t_flags); 4832 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4833 tp->snd_recover = tp->snd_max + 1; 4834 if (tp->t_flags2 & TF2_ECN_PERMIT) 4835 tp->t_flags2 |= TF2_ECN_SND_CWR; 4836 } 4837 break; 4838 case CC_RTO: 4839 tp->t_dupacks = 0; 4840 tp->t_bytes_acked = 0; 4841 EXIT_RECOVERY(tp->t_flags); 4842 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4843 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4844 orig_cwnd = tp->snd_cwnd; 4845 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4846 rack_log_to_prr(rack, 16, orig_cwnd, line); 4847 if (tp->t_flags2 & TF2_ECN_PERMIT) 4848 tp->t_flags2 |= TF2_ECN_SND_CWR; 4849 break; 4850 case CC_RTO_ERR: 4851 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4852 /* RTO was unnecessary, so reset everything. */ 4853 tp->snd_cwnd = tp->snd_cwnd_prev; 4854 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4855 tp->snd_recover = tp->snd_recover_prev; 4856 if (tp->t_flags & TF_WASFRECOVERY) { 4857 ENTER_FASTRECOVERY(tp->t_flags); 4858 tp->t_flags &= ~TF_WASFRECOVERY; 4859 } 4860 if (tp->t_flags & TF_WASCRECOVERY) { 4861 ENTER_CONGRECOVERY(tp->t_flags); 4862 tp->t_flags &= ~TF_WASCRECOVERY; 4863 } 4864 tp->snd_nxt = tp->snd_max; 4865 tp->t_badrxtwin = 0; 4866 break; 4867 } 4868 if ((CC_ALGO(tp)->cong_signal != NULL) && 4869 (type != CC_RTO)){ 4870 tp->t_ccv.curack = ack; 4871 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); 4872 } 4873 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 4874 rack_log_to_prr(rack, 15, cwnd_enter, line); 4875 rack->r_ctl.dsack_byte_cnt = 0; 4876 rack->r_ctl.retran_during_recovery = 0; 4877 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 4878 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 4879 rack->r_ent_rec_ns = 1; 4880 } 4881 } 4882 4883 static inline void 4884 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4885 { 4886 uint32_t i_cwnd; 4887 4888 INP_WLOCK_ASSERT(tptoinpcb(tp)); 4889 4890 #ifdef NETFLIX_STATS 4891 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4892 if (tp->t_state == TCPS_ESTABLISHED) 4893 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4894 #endif 4895 if (CC_ALGO(tp)->after_idle != NULL) 4896 CC_ALGO(tp)->after_idle(&tp->t_ccv); 4897 4898 if (tp->snd_cwnd == 1) 4899 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4900 else 4901 i_cwnd = rc_init_window(rack); 4902 4903 /* 4904 * Being idle is no different than the initial window. If the cc 4905 * clamps it down below the initial window raise it to the initial 4906 * window. 4907 */ 4908 if (tp->snd_cwnd < i_cwnd) { 4909 tp->snd_cwnd = i_cwnd; 4910 } 4911 } 4912 4913 /* 4914 * Indicate whether this ack should be delayed. We can delay the ack if 4915 * following conditions are met: 4916 * - There is no delayed ack timer in progress. 4917 * - Our last ack wasn't a 0-sized window. We never want to delay 4918 * the ack that opens up a 0-sized window. 4919 * - LRO wasn't used for this segment. We make sure by checking that the 4920 * segment size is not larger than the MSS. 4921 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4922 * connection. 4923 */ 4924 #define DELAY_ACK(tp, tlen) \ 4925 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4926 ((tp->t_flags & TF_DELACK) == 0) && \ 4927 (tlen <= tp->t_maxseg) && \ 4928 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4929 4930 static struct rack_sendmap * 4931 rack_find_lowest_rsm(struct tcp_rack *rack) 4932 { 4933 struct rack_sendmap *rsm; 4934 4935 /* 4936 * Walk the time-order transmitted list looking for an rsm that is 4937 * not acked. 
This will be the one that was sent the longest time
4938 * ago that is still outstanding.
4939 */
4940 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
4941 if (rsm->r_flags & RACK_ACKED) {
4942 continue;
4943 }
4944 goto finish;
4945 }
4946 finish:
4947 return (rsm);
4948 }
4949
4950 static struct rack_sendmap *
4951 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
4952 {
4953 struct rack_sendmap *prsm;
4954
4955 /*
4956 * Walk the sequence-ordered list backward until we arrive at
4957 * the highest seq not acked. In theory, when this is called it
4958 * should be the last segment (but it was not, hence the walk).
4959 */
4960 prsm = rsm;
4961 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
4962 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
4963 continue;
4964 }
4965 return (prsm);
4966 }
4967 return (NULL);
4968 }
4969
4970 static uint32_t
4971 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
4972 {
4973 int32_t lro;
4974 uint32_t thresh;
4975
4976 /*
4977 * lro is the flag we use to determine if we have seen reordering.
4978 * If it gets set, we have seen reordering. The reorder logic
4979 * works in one of two ways:
4980 *
4981 * If reorder-fade is configured, then we track the last time we saw
4982 * re-ordering occur. If we reach the point where enough time has
4983 * passed, we no longer consider reordering to be occurring.
4984 *
4985 * Or, if reorder-fade is 0, then once we see reordering we consider
4986 * the connection to always be subject to reordering and just set lro
4987 * to 1.
4988 *
4989 * In the end, if lro is non-zero we add the extra time for
4990 * reordering in.
4991 */
4992 if (srtt == 0)
4993 srtt = 1;
4994 if (rack->r_ctl.rc_reorder_ts) {
4995 if (rack->r_ctl.rc_reorder_fade) {
4996 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
4997 lro = cts - rack->r_ctl.rc_reorder_ts;
4998 if (lro == 0) {
4999 /*
5000 * No time has passed since the last
5001 * reorder, mark it as reordering.
5002 */
5003 lro = 1;
5004 }
5005 } else {
5006 /* Negative time? */
5007 lro = 0;
5008 }
5009 if (lro > rack->r_ctl.rc_reorder_fade) {
5010 /* Turn off reordering seen too */
5011 rack->r_ctl.rc_reorder_ts = 0;
5012 lro = 0;
5013 }
5014 } else {
5015 /* Reordering does not fade */
5016 lro = 1;
5017 }
5018 } else {
5019 lro = 0;
5020 }
5021 if (rack->rc_rack_tmr_std_based == 0) {
5022 thresh = srtt + rack->r_ctl.rc_pkt_delay;
5023 } else {
5024 /* Standards based pkt-delay is 1/4 srtt */
5025 thresh = srtt + (srtt >> 2);
5026 }
5027 if (lro && (rack->rc_rack_tmr_std_based == 0)) {
5028 /* It must be set, if not you get 1/4 rtt */
5029 if (rack->r_ctl.rc_reorder_shift)
5030 thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
5031 else
5032 thresh += (srtt >> 2);
5033 }
5034 if (rack->rc_rack_use_dsack &&
5035 lro &&
5036 (rack->r_ctl.num_dsack > 0)) {
5037 /*
5038 * We only increase the reordering window if we
5039 * have seen reordering <and> we have a DSACK count.
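*
* Illustrative example (not in the original source): with srtt = 40000
* usec, rc_pkt_delay = 1000 usec, the non standards-based timer,
* reordering currently seen (lro != 0), rc_reorder_shift = 3 and one
* DSACK on record, the threshold becomes
* 40000 + 1000 + (40000 >> 3) + 1 * (40000 >> 2) = 56000 usec,
* before the clamps below to at most 2 * srtt and rack_rto_max.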
5040 */ 5041 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 5042 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh); 5043 } 5044 /* SRTT * 2 is the ceiling */ 5045 if (thresh > (srtt * 2)) { 5046 thresh = srtt * 2; 5047 } 5048 /* And we don't want it above the RTO max either */ 5049 if (thresh > rack_rto_max) { 5050 thresh = rack_rto_max; 5051 } 5052 rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh); 5053 return (thresh); 5054 } 5055 5056 static uint32_t 5057 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 5058 struct rack_sendmap *rsm, uint32_t srtt) 5059 { 5060 struct rack_sendmap *prsm; 5061 uint32_t thresh, len; 5062 int segsiz; 5063 5064 if (srtt == 0) 5065 srtt = 1; 5066 if (rack->r_ctl.rc_tlp_threshold) 5067 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 5068 else 5069 thresh = (srtt * 2); 5070 5071 /* Get the previous sent packet, if any */ 5072 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5073 len = rsm->r_end - rsm->r_start; 5074 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 5075 /* Exactly like the ID */ 5076 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 5077 uint32_t alt_thresh; 5078 /* 5079 * Compensate for delayed-ack with the d-ack time. 5080 */ 5081 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5082 if (alt_thresh > thresh) 5083 thresh = alt_thresh; 5084 } 5085 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 5086 /* 2.1 behavior */ 5087 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 5088 if (prsm && (len <= segsiz)) { 5089 /* 5090 * Two packets outstanding, thresh should be (2*srtt) + 5091 * possible inter-packet delay (if any). 5092 */ 5093 uint32_t inter_gap = 0; 5094 int idx, nidx; 5095 5096 idx = rsm->r_rtr_cnt - 1; 5097 nidx = prsm->r_rtr_cnt - 1; 5098 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 5099 /* Yes it was sent later (or at the same time) */ 5100 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 5101 } 5102 thresh += inter_gap; 5103 } else if (len <= segsiz) { 5104 /* 5105 * Possibly compensate for delayed-ack. 5106 */ 5107 uint32_t alt_thresh; 5108 5109 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5110 if (alt_thresh > thresh) 5111 thresh = alt_thresh; 5112 } 5113 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 5114 /* 2.2 behavior */ 5115 if (len <= segsiz) { 5116 uint32_t alt_thresh; 5117 /* 5118 * Compensate for delayed-ack with the d-ack time. 5119 */ 5120 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5121 if (alt_thresh > thresh) 5122 thresh = alt_thresh; 5123 } 5124 } 5125 /* Not above an RTO */ 5126 if (thresh > tp->t_rxtcur) { 5127 thresh = tp->t_rxtcur; 5128 } 5129 /* Not above a RTO max */ 5130 if (thresh > rack_rto_max) { 5131 thresh = rack_rto_max; 5132 } 5133 /* Apply user supplied min TLP */ 5134 if (thresh < rack_tlp_min) { 5135 thresh = rack_tlp_min; 5136 } 5137 return (thresh); 5138 } 5139 5140 static uint32_t 5141 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 5142 { 5143 /* 5144 * We want the rack_rtt which is the 5145 * last rtt we measured. However if that 5146 * does not exist we fallback to the srtt (which 5147 * we probably will never do) and then as a last 5148 * resort we use RACK_INITIAL_RTO if no srtt is 5149 * yet set. 
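*
* Illustrative note (not in the original source): this value feeds the
* RACK threshold computed by rack_calc_thresh_rack() above when
* rack_check_recovery_mode() below decides whether the oldest
* outstanding segment is overdue. E.g. a last measured rack_rtt of
* 30000 usec is used directly; only a connection with no RACK
* measurement falls back to t_srtt or, failing that, RACK_INITIAL_RTO.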
5150 */ 5151 if (rack->rc_rack_rtt) 5152 return (rack->rc_rack_rtt); 5153 else if (tp->t_srtt == 0) 5154 return (RACK_INITIAL_RTO); 5155 return (tp->t_srtt); 5156 } 5157 5158 static struct rack_sendmap * 5159 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 5160 { 5161 /* 5162 * Check to see that we don't need to fall into recovery. We will 5163 * need to do so if our oldest transmit is past the time we should 5164 * have had an ack. 5165 */ 5166 struct tcp_rack *rack; 5167 struct rack_sendmap *rsm; 5168 int32_t idx; 5169 uint32_t srtt, thresh; 5170 5171 rack = (struct tcp_rack *)tp->t_fb_ptr; 5172 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 5173 return (NULL); 5174 } 5175 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5176 if (rsm == NULL) 5177 return (NULL); 5178 5179 5180 if (rsm->r_flags & RACK_ACKED) { 5181 rsm = rack_find_lowest_rsm(rack); 5182 if (rsm == NULL) 5183 return (NULL); 5184 } 5185 idx = rsm->r_rtr_cnt - 1; 5186 srtt = rack_grab_rtt(tp, rack); 5187 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 5188 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 5189 return (NULL); 5190 } 5191 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 5192 return (NULL); 5193 } 5194 /* Ok if we reach here we are over-due and this guy can be sent */ 5195 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 5196 return (rsm); 5197 } 5198 5199 static uint32_t 5200 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 5201 { 5202 int32_t t; 5203 int32_t tt; 5204 uint32_t ret_val; 5205 5206 t = (tp->t_srtt + (tp->t_rttvar << 2)); 5207 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 5208 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 5209 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 5210 ret_val = (uint32_t)tt; 5211 return (ret_val); 5212 } 5213 5214 static uint32_t 5215 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 5216 { 5217 /* 5218 * Start the FR timer, we do this based on getting the first one in 5219 * the rc_tmap. Note that if its NULL we must stop the timer. in all 5220 * events we need to stop the running timer (if its running) before 5221 * starting the new one. 5222 */ 5223 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 5224 uint32_t srtt_cur; 5225 int32_t idx; 5226 int32_t is_tlp_timer = 0; 5227 struct rack_sendmap *rsm; 5228 5229 if (rack->t_timers_stopped) { 5230 /* All timers have been stopped none are to run */ 5231 return (0); 5232 } 5233 if (rack->rc_in_persist) { 5234 /* We can't start any timer in persists */ 5235 return (rack_get_persists_timer_val(tp, rack)); 5236 } 5237 rack->rc_on_min_to = 0; 5238 if ((tp->t_state < TCPS_ESTABLISHED) || 5239 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 5240 goto activate_rxt; 5241 } 5242 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5243 if ((rsm == NULL) || sup_rack) { 5244 /* Nothing on the send map or no rack */ 5245 activate_rxt: 5246 time_since_sent = 0; 5247 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5248 if (rsm) { 5249 /* 5250 * Should we discount the RTX timer any? 5251 * 5252 * We want to discount it the smallest amount. 5253 * If a timer (Rack/TLP or RXT) has gone off more 5254 * recently thats the discount we want to use (now - timer time). 5255 * If the retransmit of the oldest packet was more recent then 5256 * we want to use that (now - oldest-packet-last_transmit_time). 
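*
* Illustrative example (not in the original source): if t_rxtcur is
* 200000 usec and the more recent of the two timestamps above was
* 60000 usec ago, the RXT timer is armed for 200000 - 60000 = 140000
* usec; if the elapsed time is at least t_rxtcur we instead fall back
* to rc_min_to.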
5257 * 5258 */ 5259 idx = rsm->r_rtr_cnt - 1; 5260 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 5261 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5262 else 5263 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5264 if (TSTMP_GT(cts, tstmp_touse)) 5265 time_since_sent = cts - tstmp_touse; 5266 } 5267 if (SEQ_LT(tp->snd_una, tp->snd_max) || 5268 sbavail(&tptosocket(tp)->so_snd)) { 5269 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 5270 to = tp->t_rxtcur; 5271 if (to > time_since_sent) 5272 to -= time_since_sent; 5273 else 5274 to = rack->r_ctl.rc_min_to; 5275 if (to == 0) 5276 to = 1; 5277 /* Special case for KEEPINIT */ 5278 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 5279 (TP_KEEPINIT(tp) != 0) && 5280 rsm) { 5281 /* 5282 * We have to put a ceiling on the rxt timer 5283 * of the keep-init timeout. 5284 */ 5285 uint32_t max_time, red; 5286 5287 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 5288 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 5289 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 5290 if (red < max_time) 5291 max_time -= red; 5292 else 5293 max_time = 1; 5294 } 5295 /* Reduce timeout to the keep value if needed */ 5296 if (max_time < to) 5297 to = max_time; 5298 } 5299 return (to); 5300 } 5301 return (0); 5302 } 5303 if (rsm->r_flags & RACK_ACKED) { 5304 rsm = rack_find_lowest_rsm(rack); 5305 if (rsm == NULL) { 5306 /* No lowest? */ 5307 goto activate_rxt; 5308 } 5309 } 5310 if (rack->sack_attack_disable) { 5311 /* 5312 * We don't want to do 5313 * any TLP's if you are an attacker. 5314 * Though if you are doing what 5315 * is expected you may still have 5316 * SACK-PASSED marks. 5317 */ 5318 goto activate_rxt; 5319 } 5320 /* Convert from ms to usecs */ 5321 if ((rsm->r_flags & RACK_SACK_PASSED) || 5322 (rsm->r_flags & RACK_RWND_COLLAPSED) || 5323 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 5324 if ((tp->t_flags & TF_SENTFIN) && 5325 ((tp->snd_max - tp->snd_una) == 1) && 5326 (rsm->r_flags & RACK_HAS_FIN)) { 5327 /* 5328 * We don't start a rack timer if all we have is a 5329 * FIN outstanding. 5330 */ 5331 goto activate_rxt; 5332 } 5333 if ((rack->use_rack_rr == 0) && 5334 (IN_FASTRECOVERY(tp->t_flags)) && 5335 (rack->rack_no_prr == 0) && 5336 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5337 /* 5338 * We are not cheating, in recovery and 5339 * not enough ack's to yet get our next 5340 * retransmission out. 5341 * 5342 * Note that classified attackers do not 5343 * get to use the rack-cheat. 5344 */ 5345 goto activate_tlp; 5346 } 5347 srtt = rack_grab_rtt(tp, rack); 5348 thresh = rack_calc_thresh_rack(rack, srtt, cts); 5349 idx = rsm->r_rtr_cnt - 1; 5350 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 5351 if (SEQ_GEQ(exp, cts)) { 5352 to = exp - cts; 5353 if (to < rack->r_ctl.rc_min_to) { 5354 to = rack->r_ctl.rc_min_to; 5355 if (rack->r_rr_config == 3) 5356 rack->rc_on_min_to = 1; 5357 } 5358 } else { 5359 to = rack->r_ctl.rc_min_to; 5360 if (rack->r_rr_config == 3) 5361 rack->rc_on_min_to = 1; 5362 } 5363 } else { 5364 /* Ok we need to do a TLP not RACK */ 5365 activate_tlp: 5366 if ((rack->rc_tlp_in_progress != 0) && 5367 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 5368 /* 5369 * The previous send was a TLP and we have sent 5370 * N TLP's without sending new data. 5371 */ 5372 goto activate_rxt; 5373 } 5374 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 5375 if (rsm == NULL) { 5376 /* We found no rsm to TLP with. 
*/ 5377 goto activate_rxt; 5378 } 5379 if (rsm->r_flags & RACK_HAS_FIN) { 5380 /* If its a FIN we dont do TLP */ 5381 rsm = NULL; 5382 goto activate_rxt; 5383 } 5384 idx = rsm->r_rtr_cnt - 1; 5385 time_since_sent = 0; 5386 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 5387 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5388 else 5389 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5390 if (TSTMP_GT(cts, tstmp_touse)) 5391 time_since_sent = cts - tstmp_touse; 5392 is_tlp_timer = 1; 5393 if (tp->t_srtt) { 5394 if ((rack->rc_srtt_measure_made == 0) && 5395 (tp->t_srtt == 1)) { 5396 /* 5397 * If another stack as run and set srtt to 1, 5398 * then the srtt was 0, so lets use the initial. 5399 */ 5400 srtt = RACK_INITIAL_RTO; 5401 } else { 5402 srtt_cur = tp->t_srtt; 5403 srtt = srtt_cur; 5404 } 5405 } else 5406 srtt = RACK_INITIAL_RTO; 5407 /* 5408 * If the SRTT is not keeping up and the 5409 * rack RTT has spiked we want to use 5410 * the last RTT not the smoothed one. 5411 */ 5412 if (rack_tlp_use_greater && 5413 tp->t_srtt && 5414 (srtt < rack_grab_rtt(tp, rack))) { 5415 srtt = rack_grab_rtt(tp, rack); 5416 } 5417 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 5418 if (thresh > time_since_sent) { 5419 to = thresh - time_since_sent; 5420 } else { 5421 to = rack->r_ctl.rc_min_to; 5422 rack_log_alt_to_to_cancel(rack, 5423 thresh, /* flex1 */ 5424 time_since_sent, /* flex2 */ 5425 tstmp_touse, /* flex3 */ 5426 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 5427 (uint32_t)rsm->r_tim_lastsent[idx], 5428 srtt, 5429 idx, 99); 5430 } 5431 if (to < rack_tlp_min) { 5432 to = rack_tlp_min; 5433 } 5434 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 5435 /* 5436 * If the TLP time works out to larger than the max 5437 * RTO lets not do TLP.. just RTO. 5438 */ 5439 goto activate_rxt; 5440 } 5441 } 5442 if (is_tlp_timer == 0) { 5443 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 5444 } else { 5445 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 5446 } 5447 if (to == 0) 5448 to = 1; 5449 return (to); 5450 } 5451 5452 static void 5453 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5454 { 5455 if (rack->rc_in_persist == 0) { 5456 if (tp->t_flags & TF_GPUTINPROG) { 5457 /* 5458 * Stop the goodput now, the calling of the 5459 * measurement function clears the flag. 
5460 */ 5461 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 5462 RACK_QUALITY_PERSIST); 5463 } 5464 #ifdef NETFLIX_SHARED_CWND 5465 if (rack->r_ctl.rc_scw) { 5466 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5467 rack->rack_scwnd_is_idle = 1; 5468 } 5469 #endif 5470 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 5471 if (rack->r_ctl.rc_went_idle_time == 0) 5472 rack->r_ctl.rc_went_idle_time = 1; 5473 rack_timer_cancel(tp, rack, cts, __LINE__); 5474 rack->r_ctl.persist_lost_ends = 0; 5475 rack->probe_not_answered = 0; 5476 rack->forced_ack = 0; 5477 tp->t_rxtshift = 0; 5478 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5479 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5480 rack->rc_in_persist = 1; 5481 } 5482 } 5483 5484 static void 5485 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5486 { 5487 if (tcp_in_hpts(rack->rc_inp)) { 5488 tcp_hpts_remove(rack->rc_inp); 5489 rack->r_ctl.rc_hpts_flags = 0; 5490 } 5491 #ifdef NETFLIX_SHARED_CWND 5492 if (rack->r_ctl.rc_scw) { 5493 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5494 rack->rack_scwnd_is_idle = 0; 5495 } 5496 #endif 5497 if (rack->rc_gp_dyn_mul && 5498 (rack->use_fixed_rate == 0) && 5499 (rack->rc_always_pace)) { 5500 /* 5501 * Do we count this as if a probe-rtt just 5502 * finished? 5503 */ 5504 uint32_t time_idle, idle_min; 5505 5506 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 5507 idle_min = rack_min_probertt_hold; 5508 if (rack_probertt_gpsrtt_cnt_div) { 5509 uint64_t extra; 5510 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 5511 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 5512 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 5513 idle_min += (uint32_t)extra; 5514 } 5515 if (time_idle >= idle_min) { 5516 /* Yes, we count it as a probe-rtt. 
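*
* Illustrative example (not in the original source): with, say,
* rack_min_probertt_hold = 200000 usec, rc_gp_srtt = 40000 usec,
* rack_probertt_gpsrtt_cnt_mul = 5 and rack_probertt_gpsrtt_cnt_div = 1,
* idle_min = 200000 + (40000 * 5) / 1 = 400000 usec, so any idle
* stretch of at least that length is credited as a probe-rtt.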
*/ 5517 uint32_t us_cts; 5518 5519 us_cts = tcp_get_usecs(NULL); 5520 if (rack->in_probe_rtt == 0) { 5521 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 5522 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 5523 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 5524 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 5525 } else { 5526 rack_exit_probertt(rack, us_cts); 5527 } 5528 } 5529 } 5530 rack->rc_in_persist = 0; 5531 rack->r_ctl.rc_went_idle_time = 0; 5532 tp->t_rxtshift = 0; 5533 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5534 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5535 rack->r_ctl.rc_agg_delayed = 0; 5536 rack->r_early = 0; 5537 rack->r_late = 0; 5538 rack->r_ctl.rc_agg_early = 0; 5539 } 5540 5541 static void 5542 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 5543 struct hpts_diag *diag, struct timeval *tv) 5544 { 5545 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5546 union tcp_log_stackspecific log; 5547 5548 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5549 log.u_bbr.flex1 = diag->p_nxt_slot; 5550 log.u_bbr.flex2 = diag->p_cur_slot; 5551 log.u_bbr.flex3 = diag->slot_req; 5552 log.u_bbr.flex4 = diag->inp_hptsslot; 5553 log.u_bbr.flex5 = diag->slot_remaining; 5554 log.u_bbr.flex6 = diag->need_new_to; 5555 log.u_bbr.flex7 = diag->p_hpts_active; 5556 log.u_bbr.flex8 = diag->p_on_min_sleep; 5557 /* Hijack other fields as needed */ 5558 log.u_bbr.epoch = diag->have_slept; 5559 log.u_bbr.lt_epoch = diag->yet_to_sleep; 5560 log.u_bbr.pkts_out = diag->co_ret; 5561 log.u_bbr.applimited = diag->hpts_sleep_time; 5562 log.u_bbr.delivered = diag->p_prev_slot; 5563 log.u_bbr.inflight = diag->p_runningslot; 5564 log.u_bbr.bw_inuse = diag->wheel_slot; 5565 log.u_bbr.rttProp = diag->wheel_cts; 5566 log.u_bbr.timeStamp = cts; 5567 log.u_bbr.delRate = diag->maxslots; 5568 log.u_bbr.cur_del_rate = diag->p_curtick; 5569 log.u_bbr.cur_del_rate <<= 32; 5570 log.u_bbr.cur_del_rate |= diag->p_lasttick; 5571 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5572 &rack->rc_inp->inp_socket->so_rcv, 5573 &rack->rc_inp->inp_socket->so_snd, 5574 BBR_LOG_HPTSDIAG, 0, 5575 0, &log, false, tv); 5576 } 5577 5578 } 5579 5580 static void 5581 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 5582 { 5583 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5584 union tcp_log_stackspecific log; 5585 struct timeval tv; 5586 5587 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5588 log.u_bbr.flex1 = sb->sb_flags; 5589 log.u_bbr.flex2 = len; 5590 log.u_bbr.flex3 = sb->sb_state; 5591 log.u_bbr.flex8 = type; 5592 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5593 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5594 &rack->rc_inp->inp_socket->so_rcv, 5595 &rack->rc_inp->inp_socket->so_snd, 5596 TCP_LOG_SB_WAKE, 0, 5597 len, &log, false, &tv); 5598 } 5599 } 5600 5601 static void 5602 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 5603 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 5604 { 5605 struct hpts_diag diag; 5606 struct inpcb *inp = tptoinpcb(tp); 5607 struct timeval tv; 5608 uint32_t delayed_ack = 0; 5609 uint32_t hpts_timeout; 5610 uint32_t entry_slot = slot; 5611 uint8_t stopped; 5612 uint32_t left = 0; 5613 uint32_t us_cts; 5614 5615 if ((tp->t_state == TCPS_CLOSED) || 5616 (tp->t_state == TCPS_LISTEN)) { 5617 return; 5618 } 5619 if (tcp_in_hpts(inp)) { 5620 /* Already on the pacer */ 5621 return; 5622 } 5623 stopped = 
rack->rc_tmr_stopped; 5624 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 5625 left = rack->r_ctl.rc_timer_exp - cts; 5626 } 5627 rack->r_ctl.rc_timer_exp = 0; 5628 rack->r_ctl.rc_hpts_flags = 0; 5629 us_cts = tcp_get_usecs(&tv); 5630 /* Now early/late accounting */ 5631 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 5632 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 5633 /* 5634 * We have a early carry over set, 5635 * we can always add more time so we 5636 * can always make this compensation. 5637 * 5638 * Note if ack's are allowed to wake us do not 5639 * penalize the next timer for being awoke 5640 * by an ack aka the rc_agg_early (non-paced mode). 5641 */ 5642 slot += rack->r_ctl.rc_agg_early; 5643 rack->r_early = 0; 5644 rack->r_ctl.rc_agg_early = 0; 5645 } 5646 if (rack->r_late) { 5647 /* 5648 * This is harder, we can 5649 * compensate some but it 5650 * really depends on what 5651 * the current pacing time is. 5652 */ 5653 if (rack->r_ctl.rc_agg_delayed >= slot) { 5654 /* 5655 * We can't compensate for it all. 5656 * And we have to have some time 5657 * on the clock. We always have a min 5658 * 10 slots (10 x 10 i.e. 100 usecs). 5659 */ 5660 if (slot <= HPTS_TICKS_PER_SLOT) { 5661 /* We gain delay */ 5662 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); 5663 slot = HPTS_TICKS_PER_SLOT; 5664 } else { 5665 /* We take off some */ 5666 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); 5667 slot = HPTS_TICKS_PER_SLOT; 5668 } 5669 } else { 5670 slot -= rack->r_ctl.rc_agg_delayed; 5671 rack->r_ctl.rc_agg_delayed = 0; 5672 /* Make sure we have 100 useconds at minimum */ 5673 if (slot < HPTS_TICKS_PER_SLOT) { 5674 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; 5675 slot = HPTS_TICKS_PER_SLOT; 5676 } 5677 if (rack->r_ctl.rc_agg_delayed == 0) 5678 rack->r_late = 0; 5679 } 5680 } 5681 if (slot) { 5682 /* We are pacing too */ 5683 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 5684 } 5685 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 5686 #ifdef NETFLIX_EXP_DETECTION 5687 if (rack->sack_attack_disable && 5688 (slot < tcp_sad_pacing_interval)) { 5689 /* 5690 * We have a potential attacker on 5691 * the line. We have possibly some 5692 * (or now) pacing time set. We want to 5693 * slow down the processing of sacks by some 5694 * amount (if it is an attacker). Set the default 5695 * slot for attackers in place (unless the orginal 5696 * interval is longer). Its stored in 5697 * micro-seconds, so lets convert to msecs. 5698 */ 5699 slot = tcp_sad_pacing_interval; 5700 } 5701 #endif 5702 if (tp->t_flags & TF_DELACK) { 5703 delayed_ack = TICKS_2_USEC(tcp_delacktime); 5704 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 5705 } 5706 if (delayed_ack && ((hpts_timeout == 0) || 5707 (delayed_ack < hpts_timeout))) 5708 hpts_timeout = delayed_ack; 5709 else 5710 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5711 /* 5712 * If no timers are going to run and we will fall off the hptsi 5713 * wheel, we resort to a keep-alive timer if its configured. 5714 */ 5715 if ((hpts_timeout == 0) && 5716 (slot == 0)) { 5717 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5718 (tp->t_state <= TCPS_CLOSING)) { 5719 /* 5720 * Ok we have no timer (persists, rack, tlp, rxt or 5721 * del-ack), we don't have segments being paced. So 5722 * all that is left is the keepalive timer. 
5723 */ 5724 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 5725 /* Get the established keep-alive time */ 5726 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 5727 } else { 5728 /* 5729 * Get the initial setup keep-alive time, 5730 * note that this is probably not going to 5731 * happen, since rack will be running a rxt timer 5732 * if a SYN of some sort is outstanding. It is 5733 * actually handled in rack_timeout_rxt(). 5734 */ 5735 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 5736 } 5737 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 5738 if (rack->in_probe_rtt) { 5739 /* 5740 * We want to instead not wake up a long time from 5741 * now but to wake up about the time we would 5742 * exit probe-rtt and initiate a keep-alive ack. 5743 * This will get us out of probe-rtt and update 5744 * our min-rtt. 5745 */ 5746 hpts_timeout = rack_min_probertt_hold; 5747 } 5748 } 5749 } 5750 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 5751 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 5752 /* 5753 * RACK, TLP, persists and RXT timers all are restartable 5754 * based on actions input .. i.e we received a packet (ack 5755 * or sack) and that changes things (rw, or snd_una etc). 5756 * Thus we can restart them with a new value. For 5757 * keep-alive, delayed_ack we keep track of what was left 5758 * and restart the timer with a smaller value. 5759 */ 5760 if (left < hpts_timeout) 5761 hpts_timeout = left; 5762 } 5763 if (hpts_timeout) { 5764 /* 5765 * Hack alert for now we can't time-out over 2,147,483 5766 * seconds (a bit more than 596 hours), which is probably ok 5767 * :). 5768 */ 5769 if (hpts_timeout > 0x7ffffffe) 5770 hpts_timeout = 0x7ffffffe; 5771 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 5772 } 5773 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 5774 if ((rack->gp_ready == 0) && 5775 (rack->use_fixed_rate == 0) && 5776 (hpts_timeout < slot) && 5777 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 5778 /* 5779 * We have no good estimate yet for the 5780 * old clunky burst mitigation or the 5781 * real pacing. And the tlp or rxt is smaller 5782 * than the pacing calculation. Lets not 5783 * pace that long since we know the calculation 5784 * so far is not accurate. 5785 */ 5786 slot = hpts_timeout; 5787 } 5788 /** 5789 * Turn off all the flags for queuing by default. The 5790 * flags have important meanings to what happens when 5791 * LRO interacts with the transport. Most likely (by default now) 5792 * mbuf_queueing and ack compression are on. So the transport 5793 * has a couple of flags that control what happens (if those 5794 * are not on then these flags won't have any effect since it 5795 * won't go through the queuing LRO path). 5796 * 5797 * INP_MBUF_QUEUE_READY - This flags says that I am busy 5798 * pacing output, so don't disturb. But 5799 * it also means LRO can wake me if there 5800 * is a SACK arrival. 5801 * 5802 * INP_DONT_SACK_QUEUE - This flag is used in conjunction 5803 * with the above flag (QUEUE_READY) and 5804 * when present it says don't even wake me 5805 * if a SACK arrives. 5806 * 5807 * The idea behind these flags is that if we are pacing we 5808 * set the MBUF_QUEUE_READY and only get woken up if 5809 * a SACK arrives (which could change things) or if 5810 * our pacing timer expires. If, however, we have a rack 5811 * timer running, then we don't even want a sack to wake 5812 * us since the rack timer has to expire before we can send. 
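*
* Summary (illustrative, not in the original source):
*
* pacing timer set -> INP_MBUF_QUEUE_READY
* pacing + rack timer (r_rr_config != 3) -> INP_MBUF_QUEUE_READY |
* INP_DONT_SACK_QUEUE
* rc_ack_can_sendout_data set -> both flags cleared again
* no pacing timer (slot == 0) -> neither flag set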
5813 * 5814 * Other cases should usually have none of the flags set 5815 * so LRO can call into us. 5816 */ 5817 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5818 if (slot) { 5819 rack->r_ctl.rc_last_output_to = us_cts + slot; 5820 /* 5821 * A pacing timer (slot) is being set, in 5822 * such a case we cannot send (we are blocked by 5823 * the timer). So lets tell LRO that it should not 5824 * wake us unless there is a SACK. Note this only 5825 * will be effective if mbuf queueing is on or 5826 * compressed acks are being processed. 5827 */ 5828 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5829 /* 5830 * But wait if we have a Rack timer running 5831 * even a SACK should not disturb us (with 5832 * the exception of r_rr_config 3). 5833 */ 5834 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 5835 (rack->r_rr_config != 3)) 5836 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5837 if (rack->rc_ack_can_sendout_data) { 5838 /* 5839 * Ahh but wait, this is that special case 5840 * where the pacing timer can be disturbed 5841 * backout the changes (used for non-paced 5842 * burst limiting). 5843 */ 5844 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5845 } 5846 if ((rack->use_rack_rr) && 5847 (rack->r_rr_config < 2) && 5848 ((hpts_timeout) && (hpts_timeout < slot))) { 5849 /* 5850 * Arrange for the hpts to kick back in after the 5851 * t-o if the t-o does not cause a send. 5852 */ 5853 (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout), 5854 __LINE__, &diag); 5855 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5856 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5857 } else { 5858 (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot), 5859 __LINE__, &diag); 5860 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5861 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 5862 } 5863 } else if (hpts_timeout) { 5864 /* 5865 * With respect to inp_flags2 here, lets let any new acks wake 5866 * us up here. Since we are not pacing (no pacing timer), output 5867 * can happen so we should let it. If its a Rack timer, then any inbound 5868 * packet probably won't change the sending (we will be blocked) 5869 * but it may change the prr stats so letting it in (the set defaults 5870 * at the start of this block) are good enough. 5871 */ 5872 (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout), 5873 __LINE__, &diag); 5874 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5875 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5876 } else { 5877 /* No timer starting */ 5878 #ifdef INVARIANTS 5879 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 5880 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 5881 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 5882 } 5883 #endif 5884 } 5885 rack->rc_tmr_stopped = 0; 5886 if (slot) 5887 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 5888 } 5889 5890 /* 5891 * RACK Timer, here we simply do logging and house keeping. 5892 * the normal rack_output() function will call the 5893 * appropriate thing to check if we need to do a RACK retransmit. 5894 * We return 1, saying don't proceed with rack_output only 5895 * when all timers have been stopped (destroyed PCB?). 5896 */ 5897 static int 5898 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5899 { 5900 /* 5901 * This timer simply provides an internal trigger to send out data. 5902 * The check_recovery_mode call will see if there are needed 5903 * retransmissions, if so we will enter fast-recovery. 
The output 5904 * call may or may not do the same thing depending on sysctl 5905 * settings. 5906 */ 5907 struct rack_sendmap *rsm; 5908 5909 counter_u64_add(rack_to_tot, 1); 5910 if (rack->r_state && (rack->r_state != tp->t_state)) 5911 rack_set_state(tp, rack); 5912 rack->rc_on_min_to = 0; 5913 rsm = rack_check_recovery_mode(tp, cts); 5914 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5915 if (rsm) { 5916 rack->r_ctl.rc_resend = rsm; 5917 rack->r_timer_override = 1; 5918 if (rack->use_rack_rr) { 5919 /* 5920 * Don't accumulate extra pacing delay 5921 * we are allowing the rack timer to 5922 * over-ride pacing i.e. rrr takes precedence 5923 * if the pacing interval is longer than the rrr 5924 * time (in other words we get the min pacing 5925 * time versus rrr pacing time). 5926 */ 5927 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5928 } 5929 } 5930 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5931 if (rsm == NULL) { 5932 /* restart a timer and return 1 */ 5933 rack_start_hpts_timer(rack, tp, cts, 5934 0, 0, 0); 5935 return (1); 5936 } 5937 return (0); 5938 } 5939 5940 static void 5941 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 5942 { 5943 if (rsm->m->m_len > rsm->orig_m_len) { 5944 /* 5945 * Mbuf grew, caused by sbcompress, our offset does 5946 * not change. 5947 */ 5948 rsm->orig_m_len = rsm->m->m_len; 5949 } else if (rsm->m->m_len < rsm->orig_m_len) { 5950 /* 5951 * Mbuf shrank, trimmed off the top by an ack, our 5952 * offset changes. 5953 */ 5954 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 5955 rsm->orig_m_len = rsm->m->m_len; 5956 } 5957 } 5958 5959 static void 5960 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 5961 { 5962 struct mbuf *m; 5963 uint32_t soff; 5964 5965 if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) { 5966 /* Fix up the orig_m_len and possibly the mbuf offset */ 5967 rack_adjust_orig_mlen(src_rsm); 5968 } 5969 m = src_rsm->m; 5970 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 5971 while (soff >= m->m_len) { 5972 /* Move out past this mbuf */ 5973 soff -= m->m_len; 5974 m = m->m_next; 5975 KASSERT((m != NULL), 5976 ("rsm:%p nrsm:%p hit at soff:%u null m", 5977 src_rsm, rsm, soff)); 5978 } 5979 rsm->m = m; 5980 rsm->soff = soff; 5981 rsm->orig_m_len = m->m_len; 5982 } 5983 5984 static __inline void 5985 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5986 struct rack_sendmap *rsm, uint32_t start) 5987 { 5988 int idx; 5989 5990 nrsm->r_start = start; 5991 nrsm->r_end = rsm->r_end; 5992 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5993 nrsm->r_flags = rsm->r_flags; 5994 nrsm->r_dupack = rsm->r_dupack; 5995 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 5996 nrsm->r_rtr_bytes = 0; 5997 nrsm->r_fas = rsm->r_fas; 5998 rsm->r_end = nrsm->r_start; 5999 nrsm->r_just_ret = rsm->r_just_ret; 6000 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 6001 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 6002 } 6003 /* Now if we have SYN flag we keep it on the left edge */ 6004 if (nrsm->r_flags & RACK_HAS_SYN) 6005 nrsm->r_flags &= ~RACK_HAS_SYN; 6006 /* Now if we have a FIN flag we keep it on the right edge */ 6007 if (rsm->r_flags & RACK_HAS_FIN) 6008 rsm->r_flags &= ~RACK_HAS_FIN; 6009 /* Push bit must go to the right edge as well */ 6010 if (rsm->r_flags & RACK_HAD_PUSH) 6011 rsm->r_flags &= ~RACK_HAD_PUSH; 6012 /* Clone over the state of the hw_tls flag */ 6013 nrsm->r_hw_tls = rsm->r_hw_tls; 6014 /* 6015 * Now we need to find nrsm's new location in the mbuf chain 6016 * we basically calculate a 
new offset, which is soff + 6017 * how much is left in original rsm. Then we walk out the mbuf 6018 * chain to find the righ position, it may be the same mbuf 6019 * or maybe not. 6020 */ 6021 KASSERT(((rsm->m != NULL) || 6022 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 6023 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 6024 if (rsm->m) 6025 rack_setup_offset_for_rsm(rsm, nrsm); 6026 } 6027 6028 static struct rack_sendmap * 6029 rack_merge_rsm(struct tcp_rack *rack, 6030 struct rack_sendmap *l_rsm, 6031 struct rack_sendmap *r_rsm) 6032 { 6033 /* 6034 * We are merging two ack'd RSM's, 6035 * the l_rsm is on the left (lower seq 6036 * values) and the r_rsm is on the right 6037 * (higher seq value). The simplest way 6038 * to merge these is to move the right 6039 * one into the left. I don't think there 6040 * is any reason we need to try to find 6041 * the oldest (or last oldest retransmitted). 6042 */ 6043 #ifdef INVARIANTS 6044 struct rack_sendmap *rm; 6045 #endif 6046 rack_log_map_chg(rack->rc_tp, rack, NULL, 6047 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 6048 l_rsm->r_end = r_rsm->r_end; 6049 if (l_rsm->r_dupack < r_rsm->r_dupack) 6050 l_rsm->r_dupack = r_rsm->r_dupack; 6051 if (r_rsm->r_rtr_bytes) 6052 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 6053 if (r_rsm->r_in_tmap) { 6054 /* This really should not happen */ 6055 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 6056 r_rsm->r_in_tmap = 0; 6057 } 6058 6059 /* Now the flags */ 6060 if (r_rsm->r_flags & RACK_HAS_FIN) 6061 l_rsm->r_flags |= RACK_HAS_FIN; 6062 if (r_rsm->r_flags & RACK_TLP) 6063 l_rsm->r_flags |= RACK_TLP; 6064 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 6065 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 6066 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 6067 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 6068 /* 6069 * If both are app-limited then let the 6070 * free lower the count. If right is app 6071 * limited and left is not, transfer. 6072 */ 6073 l_rsm->r_flags |= RACK_APP_LIMITED; 6074 r_rsm->r_flags &= ~RACK_APP_LIMITED; 6075 if (r_rsm == rack->r_ctl.rc_first_appl) 6076 rack->r_ctl.rc_first_appl = l_rsm; 6077 } 6078 #ifndef INVARIANTS 6079 (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 6080 #else 6081 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 6082 if (rm != r_rsm) { 6083 panic("removing head in rack:%p rsm:%p rm:%p", 6084 rack, r_rsm, rm); 6085 } 6086 #endif 6087 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 6088 /* Transfer the split limit to the map we free */ 6089 r_rsm->r_limit_type = l_rsm->r_limit_type; 6090 l_rsm->r_limit_type = 0; 6091 } 6092 rack_free(rack, r_rsm); 6093 return (l_rsm); 6094 } 6095 6096 /* 6097 * TLP Timer, here we simply setup what segment we want to 6098 * have the TLP expire on, the normal rack_output() will then 6099 * send it out. 6100 * 6101 * We return 1, saying don't proceed with rack_output only 6102 * when all timers have been stopped (destroyed PCB?). 6103 */ 6104 static int 6105 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 6106 { 6107 /* 6108 * Tail Loss Probe. 
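*
* Outline (illustrative, not in the original source): if at least one
* full segment of new data fits in the send window we probe with new
* data; otherwise we fall through to need_retran and (re)send the
* highest outstanding segment, first splitting it if it is larger
* than one segment.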
6109 */ 6110 struct rack_sendmap *rsm = NULL; 6111 #ifdef INVARIANTS 6112 struct rack_sendmap *insret; 6113 #endif 6114 struct socket *so = tptosocket(tp); 6115 uint32_t amm; 6116 uint32_t out, avail; 6117 int collapsed_win = 0; 6118 6119 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 6120 /* Its not time yet */ 6121 return (0); 6122 } 6123 if (ctf_progress_timeout_check(tp, true)) { 6124 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6125 return (-ETIMEDOUT); /* tcp_drop() */ 6126 } 6127 /* 6128 * A TLP timer has expired. We have been idle for 2 rtts. So we now 6129 * need to figure out how to force a full MSS segment out. 6130 */ 6131 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 6132 rack->r_ctl.retran_during_recovery = 0; 6133 rack->r_ctl.dsack_byte_cnt = 0; 6134 counter_u64_add(rack_tlp_tot, 1); 6135 if (rack->r_state && (rack->r_state != tp->t_state)) 6136 rack_set_state(tp, rack); 6137 avail = sbavail(&so->so_snd); 6138 out = tp->snd_max - tp->snd_una; 6139 if ((out > tp->snd_wnd) || rack->rc_has_collapsed) { 6140 /* special case, we need a retransmission */ 6141 collapsed_win = 1; 6142 goto need_retran; 6143 } 6144 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) { 6145 rack->r_ctl.dsack_persist--; 6146 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 6147 rack->r_ctl.num_dsack = 0; 6148 } 6149 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 6150 } 6151 if ((tp->t_flags & TF_GPUTINPROG) && 6152 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 6153 /* 6154 * If this is the second in a row 6155 * TLP and we are doing a measurement 6156 * its time to abandon the measurement. 6157 * Something is likely broken on 6158 * the clients network and measuring a 6159 * broken network does us no good. 6160 */ 6161 tp->t_flags &= ~TF_GPUTINPROG; 6162 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6163 rack->r_ctl.rc_gp_srtt /*flex1*/, 6164 tp->gput_seq, 6165 0, 0, 18, __LINE__, NULL, 0); 6166 } 6167 /* 6168 * Check our send oldest always settings, and if 6169 * there is an oldest to send jump to the need_retran. 6170 */ 6171 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 6172 goto need_retran; 6173 6174 if (avail > out) { 6175 /* New data is available */ 6176 amm = avail - out; 6177 if (amm > ctf_fixed_maxseg(tp)) { 6178 amm = ctf_fixed_maxseg(tp); 6179 if ((amm + out) > tp->snd_wnd) { 6180 /* We are rwnd limited */ 6181 goto need_retran; 6182 } 6183 } else if (amm < ctf_fixed_maxseg(tp)) { 6184 /* not enough to fill a MTU */ 6185 goto need_retran; 6186 } 6187 if (IN_FASTRECOVERY(tp->t_flags)) { 6188 /* Unlikely */ 6189 if (rack->rack_no_prr == 0) { 6190 if (out + amm <= tp->snd_wnd) { 6191 rack->r_ctl.rc_prr_sndcnt = amm; 6192 rack->r_ctl.rc_tlp_new_data = amm; 6193 rack_log_to_prr(rack, 4, 0, __LINE__); 6194 } 6195 } else 6196 goto need_retran; 6197 } else { 6198 /* Set the send-new override */ 6199 if (out + amm <= tp->snd_wnd) 6200 rack->r_ctl.rc_tlp_new_data = amm; 6201 else 6202 goto need_retran; 6203 } 6204 rack->r_ctl.rc_tlpsend = NULL; 6205 counter_u64_add(rack_tlp_newdata, 1); 6206 goto send; 6207 } 6208 need_retran: 6209 /* 6210 * Ok we need to arrange the last un-acked segment to be re-sent, or 6211 * optionally the first un-acked segment. 
6212 */ 6213 if (collapsed_win == 0) { 6214 if (rack_always_send_oldest) 6215 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6216 else { 6217 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6218 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 6219 rsm = rack_find_high_nonack(rack, rsm); 6220 } 6221 } 6222 if (rsm == NULL) { 6223 #ifdef TCP_BLACKBOX 6224 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6225 #endif 6226 goto out; 6227 } 6228 } else { 6229 /* 6230 * We must find the last segment 6231 * that was acceptable by the client. 6232 */ 6233 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6234 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 6235 /* Found one */ 6236 break; 6237 } 6238 } 6239 if (rsm == NULL) { 6240 /* None? if so send the first */ 6241 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6242 if (rsm == NULL) { 6243 #ifdef TCP_BLACKBOX 6244 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6245 #endif 6246 goto out; 6247 } 6248 } 6249 } 6250 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 6251 /* 6252 * We need to split this the last segment in two. 6253 */ 6254 struct rack_sendmap *nrsm; 6255 6256 nrsm = rack_alloc_full_limit(rack); 6257 if (nrsm == NULL) { 6258 /* 6259 * No memory to split, we will just exit and punt 6260 * off to the RXT timer. 6261 */ 6262 goto out; 6263 } 6264 rack_clone_rsm(rack, nrsm, rsm, 6265 (rsm->r_end - ctf_fixed_maxseg(tp))); 6266 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 6267 #ifndef INVARIANTS 6268 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6269 #else 6270 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6271 if (insret != NULL) { 6272 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6273 nrsm, insret, rack, rsm); 6274 } 6275 #endif 6276 if (rsm->r_in_tmap) { 6277 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6278 nrsm->r_in_tmap = 1; 6279 } 6280 rsm = nrsm; 6281 } 6282 rack->r_ctl.rc_tlpsend = rsm; 6283 send: 6284 /* Make sure output path knows we are doing a TLP */ 6285 *doing_tlp = 1; 6286 rack->r_timer_override = 1; 6287 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6288 return (0); 6289 out: 6290 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6291 return (0); 6292 } 6293 6294 /* 6295 * Delayed ack Timer, here we simply need to setup the 6296 * ACK_NOW flag and remove the DELACK flag. From there 6297 * the output routine will send the ack out. 6298 * 6299 * We only return 1, saying don't proceed, if all timers 6300 * are stopped (destroyed PCB?). 6301 */ 6302 static int 6303 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6304 { 6305 6306 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 6307 tp->t_flags &= ~TF_DELACK; 6308 tp->t_flags |= TF_ACKNOW; 6309 KMOD_TCPSTAT_INC(tcps_delack); 6310 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6311 return (0); 6312 } 6313 6314 /* 6315 * Persists timer, here we simply send the 6316 * same thing as a keepalive will. 6317 * the one byte send. 6318 * 6319 * We only return 1, saying don't proceed, if all timers 6320 * are stopped (destroyed PCB?). 
6321 */ 6322 static int 6323 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6324 { 6325 struct tcptemp *t_template; 6326 int32_t retval = 1; 6327 6328 if (rack->rc_in_persist == 0) 6329 return (0); 6330 if (ctf_progress_timeout_check(tp, false)) { 6331 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6332 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6333 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 6334 return (-ETIMEDOUT); /* tcp_drop() */ 6335 } 6336 /* 6337 * Persistence timer into zero window. Force a byte to be output, if 6338 * possible. 6339 */ 6340 KMOD_TCPSTAT_INC(tcps_persisttimeo); 6341 /* 6342 * Hack: if the peer is dead/unreachable, we do not time out if the 6343 * window is closed. After a full backoff, drop the connection if 6344 * the idle time (no responses to probes) reaches the maximum 6345 * backoff that we would use if retransmitting. 6346 */ 6347 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 6348 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 6349 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 6350 KMOD_TCPSTAT_INC(tcps_persistdrop); 6351 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6352 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 6353 retval = -ETIMEDOUT; /* tcp_drop() */ 6354 goto out; 6355 } 6356 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 6357 tp->snd_una == tp->snd_max) 6358 rack_exit_persist(tp, rack, cts); 6359 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 6360 /* 6361 * If the user has closed the socket then drop a persisting 6362 * connection after a much reduced timeout. 6363 */ 6364 if (tp->t_state > TCPS_CLOSE_WAIT && 6365 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 6366 KMOD_TCPSTAT_INC(tcps_persistdrop); 6367 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6368 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 6369 retval = -ETIMEDOUT; /* tcp_drop() */ 6370 goto out; 6371 } 6372 t_template = tcpip_maketemplate(rack->rc_inp); 6373 if (t_template) { 6374 /* only set it if we were answered */ 6375 if (rack->forced_ack == 0) { 6376 rack->forced_ack = 1; 6377 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6378 } else { 6379 rack->probe_not_answered = 1; 6380 counter_u64_add(rack_persists_loss, 1); 6381 rack->r_ctl.persist_lost_ends++; 6382 } 6383 counter_u64_add(rack_persists_sends, 1); 6384 tcp_respond(tp, t_template->tt_ipgen, 6385 &t_template->tt_t, (struct mbuf *)NULL, 6386 tp->rcv_nxt, tp->snd_una - 1, 0); 6387 /* This sends an ack */ 6388 if (tp->t_flags & TF_DELACK) 6389 tp->t_flags &= ~TF_DELACK; 6390 free(t_template, M_TEMP); 6391 } 6392 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 6393 tp->t_rxtshift++; 6394 out: 6395 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 6396 rack_start_hpts_timer(rack, tp, cts, 6397 0, 0, 0); 6398 return (retval); 6399 } 6400 6401 /* 6402 * If a keepalive goes off, we had no other timers 6403 * happening. We always return 1 here since this 6404 * routine either drops the connection or sends 6405 * out a segment with respond. 6406 */ 6407 static int 6408 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6409 { 6410 struct tcptemp *t_template; 6411 struct inpcb *inp = tptoinpcb(tp); 6412 6413 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 6414 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 6415 /* 6416 * Keep-alive timer went off; send something or drop connection if 6417 * idle for too long. 
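*
* Illustrative example (not in the original source): with
* TP_KEEPIDLE(tp) at 7200 seconds and TP_MAXIDLE(tp) at 600 seconds,
* an established but silent connection is dropped once
* ticks - t_rcvtime covers roughly two hours and ten minutes;
* otherwise a zero-length probe using sequence number snd_una - 1 is
* sent to force an ACK or RST from the peer.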
6418 */ 6419 KMOD_TCPSTAT_INC(tcps_keeptimeo); 6420 if (tp->t_state < TCPS_ESTABLISHED) 6421 goto dropit; 6422 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6423 tp->t_state <= TCPS_CLOSING) { 6424 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 6425 goto dropit; 6426 /* 6427 * Send a packet designed to force a response if the peer is 6428 * up and reachable: either an ACK if the connection is 6429 * still alive, or an RST if the peer has closed the 6430 * connection due to timeout or reboot. Using sequence 6431 * number tp->snd_una-1 causes the transmitted zero-length 6432 * segment to lie outside the receive window; by the 6433 * protocol spec, this requires the correspondent TCP to 6434 * respond. 6435 */ 6436 KMOD_TCPSTAT_INC(tcps_keepprobe); 6437 t_template = tcpip_maketemplate(inp); 6438 if (t_template) { 6439 if (rack->forced_ack == 0) { 6440 rack->forced_ack = 1; 6441 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6442 } else { 6443 rack->probe_not_answered = 1; 6444 } 6445 tcp_respond(tp, t_template->tt_ipgen, 6446 &t_template->tt_t, (struct mbuf *)NULL, 6447 tp->rcv_nxt, tp->snd_una - 1, 0); 6448 free(t_template, M_TEMP); 6449 } 6450 } 6451 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 6452 return (1); 6453 dropit: 6454 KMOD_TCPSTAT_INC(tcps_keepdrops); 6455 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6456 return (-ETIMEDOUT); /* tcp_drop() */ 6457 } 6458 6459 /* 6460 * Retransmit helper function, clear up all the ack 6461 * flags and take care of important book keeping. 6462 */ 6463 static void 6464 rack_remxt_tmr(struct tcpcb *tp) 6465 { 6466 /* 6467 * The retransmit timer went off, all sack'd blocks must be 6468 * un-acked. 6469 */ 6470 struct rack_sendmap *rsm, *trsm = NULL; 6471 struct tcp_rack *rack; 6472 6473 rack = (struct tcp_rack *)tp->t_fb_ptr; 6474 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 6475 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 6476 if (rack->r_state && (rack->r_state != tp->t_state)) 6477 rack_set_state(tp, rack); 6478 /* 6479 * Ideally we would like to be able to 6480 * mark SACK-PASS on anything not acked here. 6481 * 6482 * However, if we do that we would burst out 6483 * all that data 1ms apart. This would be unwise, 6484 * so for now we will just let the normal rxt timer 6485 * and tlp timer take care of it. 6486 * 6487 * Also we really need to stick them back in sequence 6488 * order. This way we send in the proper order and any 6489 * sacks that come floating in will "re-ack" the data. 6490 * To do this we zap the tmap with an INIT and then 6491 * walk through and place every rsm in the RB tree 6492 * back in its seq ordered place. 
6493 */ 6494 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6495 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6496 rsm->r_dupack = 0; 6497 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6498 /* We must re-add it back to the tlist */ 6499 if (trsm == NULL) { 6500 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6501 } else { 6502 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 6503 } 6504 rsm->r_in_tmap = 1; 6505 trsm = rsm; 6506 if (rsm->r_flags & RACK_ACKED) 6507 rsm->r_flags |= RACK_WAS_ACKED; 6508 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); 6509 rsm->r_flags |= RACK_MUST_RXT; 6510 } 6511 /* Clear the count (we just un-acked them) */ 6512 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 6513 rack->r_ctl.rc_sacked = 0; 6514 rack->r_ctl.rc_sacklast = NULL; 6515 rack->r_ctl.rc_agg_delayed = 0; 6516 rack->r_early = 0; 6517 rack->r_ctl.rc_agg_early = 0; 6518 rack->r_late = 0; 6519 /* Clear the tlp rtx mark */ 6520 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6521 if (rack->r_ctl.rc_resend != NULL) 6522 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 6523 rack->r_ctl.rc_prr_sndcnt = 0; 6524 rack_log_to_prr(rack, 6, 0, __LINE__); 6525 rack->r_timer_override = 1; 6526 if ((((tp->t_flags & TF_SACK_PERMIT) == 0) 6527 #ifdef NETFLIX_EXP_DETECTION 6528 || (rack->sack_attack_disable != 0) 6529 #endif 6530 ) && ((tp->t_flags & TF_SENTFIN) == 0)) { 6531 /* 6532 * For non-sack customers new data 6533 * needs to go out as retransmits until 6534 * we retransmit up to snd_max. 6535 */ 6536 rack->r_must_retran = 1; 6537 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 6538 rack->r_ctl.rc_sacked); 6539 } 6540 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 6541 } 6542 6543 static void 6544 rack_convert_rtts(struct tcpcb *tp) 6545 { 6546 if (tp->t_srtt > 1) { 6547 uint32_t val, frac; 6548 6549 val = tp->t_srtt >> TCP_RTT_SHIFT; 6550 frac = tp->t_srtt & 0x1f; 6551 tp->t_srtt = TICKS_2_USEC(val); 6552 /* 6553 * frac is the fractional part of the srtt (if any) 6554 * but its in ticks and every bit represents 6555 * 1/32nd of a hz. 6556 */ 6557 if (frac) { 6558 if (hz == 1000) { 6559 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6560 } else { 6561 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6562 } 6563 tp->t_srtt += frac; 6564 } 6565 } 6566 if (tp->t_rttvar) { 6567 uint32_t val, frac; 6568 6569 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; 6570 frac = tp->t_rttvar & 0x1f; 6571 tp->t_rttvar = TICKS_2_USEC(val); 6572 /* 6573 * frac is the fractional part of the srtt (if any) 6574 * but its in ticks and every bit represents 6575 * 1/32nd of a hz. 
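 *
 * As a worked example of the conversion below, assuming hz == 1000:
 * a frac of 5 represents 5/32nds of a 1 msec tick and converts to
 * 5 * HPTS_USEC_IN_MSEC / TCP_RTT_SCALE = 5 * 1000 / 32 = 156 usec,
 * which is then added on top of the whole-tick part.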
6576 */ 6577 if (frac) { 6578 if (hz == 1000) { 6579 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6580 } else { 6581 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6582 } 6583 tp->t_rttvar += frac; 6584 } 6585 } 6586 tp->t_rxtcur = RACK_REXMTVAL(tp); 6587 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6588 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 6589 } 6590 if (tp->t_rxtcur > rack_rto_max) { 6591 tp->t_rxtcur = rack_rto_max; 6592 } 6593 } 6594 6595 static void 6596 rack_cc_conn_init(struct tcpcb *tp) 6597 { 6598 struct tcp_rack *rack; 6599 uint32_t srtt; 6600 6601 rack = (struct tcp_rack *)tp->t_fb_ptr; 6602 srtt = tp->t_srtt; 6603 cc_conn_init(tp); 6604 /* 6605 * Now convert to rack's internal format, 6606 * if required. 6607 */ 6608 if ((srtt == 0) && (tp->t_srtt != 0)) 6609 rack_convert_rtts(tp); 6610 /* 6611 * We want a chance to stay in slowstart as 6612 * we create a connection. TCP spec says that 6613 * initially ssthresh is infinite. For our 6614 * purposes that is the snd_wnd. 6615 */ 6616 if (tp->snd_ssthresh < tp->snd_wnd) { 6617 tp->snd_ssthresh = tp->snd_wnd; 6618 } 6619 /* 6620 * We also want to assure a IW worth of 6621 * data can get inflight. 6622 */ 6623 if (rc_init_window(rack) < tp->snd_cwnd) 6624 tp->snd_cwnd = rc_init_window(rack); 6625 } 6626 6627 /* 6628 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 6629 * we will setup to retransmit the lowest seq number outstanding. 6630 */ 6631 static int 6632 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6633 { 6634 struct inpcb *inp = tptoinpcb(tp); 6635 int32_t rexmt; 6636 int32_t retval = 0; 6637 bool isipv6; 6638 6639 if ((tp->t_flags & TF_GPUTINPROG) && 6640 (tp->t_rxtshift)) { 6641 /* 6642 * We have had a second timeout 6643 * measurements on successive rxt's are not profitable. 6644 * It is unlikely to be of any use (the network is 6645 * broken or the client went away). 6646 */ 6647 tp->t_flags &= ~TF_GPUTINPROG; 6648 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6649 rack->r_ctl.rc_gp_srtt /*flex1*/, 6650 tp->gput_seq, 6651 0, 0, 18, __LINE__, NULL, 0); 6652 } 6653 if (ctf_progress_timeout_check(tp, false)) { 6654 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6655 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6656 return (-ETIMEDOUT); /* tcp_drop() */ 6657 } 6658 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 6659 rack->r_ctl.retran_during_recovery = 0; 6660 rack->rc_ack_required = 1; 6661 rack->r_ctl.dsack_byte_cnt = 0; 6662 if (IN_FASTRECOVERY(tp->t_flags)) 6663 tp->t_flags |= TF_WASFRECOVERY; 6664 else 6665 tp->t_flags &= ~TF_WASFRECOVERY; 6666 if (IN_CONGRECOVERY(tp->t_flags)) 6667 tp->t_flags |= TF_WASCRECOVERY; 6668 else 6669 tp->t_flags &= ~TF_WASCRECOVERY; 6670 if (TCPS_HAVEESTABLISHED(tp->t_state) && 6671 (tp->snd_una == tp->snd_max)) { 6672 /* Nothing outstanding .. nothing to do */ 6673 return (0); 6674 } 6675 if (rack->r_ctl.dsack_persist) { 6676 rack->r_ctl.dsack_persist--; 6677 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 6678 rack->r_ctl.num_dsack = 0; 6679 } 6680 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 6681 } 6682 /* 6683 * Rack can only run one timer at a time, so we cannot 6684 * run a KEEPINIT (gating SYN sending) and a retransmit 6685 * timer for the SYN. 
So if we are in a front state and 6686 * have a KEEPINIT timer we need to check the first transmit 6687 * against now to see if we have exceeded the KEEPINIT time 6688 * (if one is set). 6689 */ 6690 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6691 (TP_KEEPINIT(tp) != 0)) { 6692 struct rack_sendmap *rsm; 6693 6694 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6695 if (rsm) { 6696 /* Ok we have something outstanding to test keepinit with */ 6697 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 6698 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 6699 /* We have exceeded the KEEPINIT time */ 6700 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6701 goto drop_it; 6702 } 6703 } 6704 } 6705 /* 6706 * Retransmission timer went off. Message has not been acked within 6707 * retransmit interval. Back off to a longer retransmit interval 6708 * and retransmit one segment. 6709 */ 6710 rack_remxt_tmr(tp); 6711 if ((rack->r_ctl.rc_resend == NULL) || 6712 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 6713 /* 6714 * If the rwnd collapsed on 6715 * the one we are retransmitting 6716 * it does not count against the 6717 * rxt count. 6718 */ 6719 tp->t_rxtshift++; 6720 } 6721 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 6722 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6723 drop_it: 6724 tp->t_rxtshift = TCP_MAXRXTSHIFT; 6725 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 6726 /* XXXGL: previously t_softerror was casted to uint16_t */ 6727 MPASS(tp->t_softerror >= 0); 6728 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 6729 goto out; /* tcp_drop() */ 6730 } 6731 if (tp->t_state == TCPS_SYN_SENT) { 6732 /* 6733 * If the SYN was retransmitted, indicate CWND to be limited 6734 * to 1 segment in cc_conn_init(). 6735 */ 6736 tp->snd_cwnd = 1; 6737 } else if (tp->t_rxtshift == 1) { 6738 /* 6739 * first retransmit; record ssthresh and cwnd so they can be 6740 * recovered if this turns out to be a "bad" retransmit. A 6741 * retransmit is considered "bad" if an ACK for this segment 6742 * is received within RTT/2 interval; the assumption here is 6743 * that the ACK was already in flight. See "On Estimating 6744 * End-to-End Network Path Properties" by Allman and Paxson 6745 * for more details. 6746 */ 6747 tp->snd_cwnd_prev = tp->snd_cwnd; 6748 tp->snd_ssthresh_prev = tp->snd_ssthresh; 6749 tp->snd_recover_prev = tp->snd_recover; 6750 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 6751 tp->t_flags |= TF_PREVVALID; 6752 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 6753 tp->t_flags &= ~TF_PREVVALID; 6754 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 6755 if ((tp->t_state == TCPS_SYN_SENT) || 6756 (tp->t_state == TCPS_SYN_RECEIVED)) 6757 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 6758 else 6759 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 6760 6761 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 6762 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 6763 /* 6764 * We enter the path for PLMTUD if connection is established or, if 6765 * connection is FIN_WAIT_1 status, reason for the last is that if 6766 * amount of data we send is very small, we could send it in couple 6767 * of packets and process straight to FIN. In that case we won't 6768 * catch ESTABLISHED state. 6769 */ 6770 #ifdef INET6 6771 isipv6 = (inp->inp_vflag & INP_IPV6) ? 
true : false; 6772 #else 6773 isipv6 = false; 6774 #endif 6775 if (((V_tcp_pmtud_blackhole_detect == 1) || 6776 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 6777 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 6778 ((tp->t_state == TCPS_ESTABLISHED) || 6779 (tp->t_state == TCPS_FIN_WAIT_1))) { 6780 /* 6781 * Idea here is that at each stage of mtu probe (usually, 6782 * 1448 -> 1188 -> 524) should be given 2 chances to recover 6783 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 6784 * should take care of that. 6785 */ 6786 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 6787 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 6788 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 6789 tp->t_rxtshift % 2 == 0)) { 6790 /* 6791 * Enter Path MTU Black-hole Detection mechanism: - 6792 * Disable Path MTU Discovery (IP "DF" bit). - 6793 * Reduce MTU to lower value than what we negotiated 6794 * with peer. 6795 */ 6796 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 6797 /* Record that we may have found a black hole. */ 6798 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 6799 /* Keep track of previous MSS. */ 6800 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 6801 } 6802 6803 /* 6804 * Reduce the MSS to blackhole value or to the 6805 * default in an attempt to retransmit. 6806 */ 6807 #ifdef INET6 6808 if (isipv6 && 6809 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 6810 /* Use the sysctl tuneable blackhole MSS. */ 6811 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 6812 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 6813 } else if (isipv6) { 6814 /* Use the default MSS. */ 6815 tp->t_maxseg = V_tcp_v6mssdflt; 6816 /* 6817 * Disable Path MTU Discovery when we switch 6818 * to minmss. 6819 */ 6820 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 6821 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 6822 } 6823 #endif 6824 #if defined(INET6) && defined(INET) 6825 else 6826 #endif 6827 #ifdef INET 6828 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 6829 /* Use the sysctl tuneable blackhole MSS. */ 6830 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 6831 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 6832 } else { 6833 /* Use the default MSS. */ 6834 tp->t_maxseg = V_tcp_mssdflt; 6835 /* 6836 * Disable Path MTU Discovery when we switch 6837 * to minmss. 6838 */ 6839 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 6840 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 6841 } 6842 #endif 6843 } else { 6844 /* 6845 * If further retransmissions are still unsuccessful 6846 * with a lowered MTU, maybe this isn't a blackhole 6847 * and we restore the previous MSS and blackhole 6848 * detection flags. The limit '6' is determined by 6849 * giving each probe stage (1448, 1188, 524) 2 6850 * chances to recover. 6851 */ 6852 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 6853 (tp->t_rxtshift >= 6)) { 6854 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 6855 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 6856 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 6857 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 6858 } 6859 } 6860 } 6861 /* 6862 * Disable RFC1323 and SACK if we haven't got any response to 6863 * our third SYN to work-around some broken terminal servers 6864 * (most of which have hopefully been retired) that have bad VJ 6865 * header compression code which trashes TCP segments containing 6866 * unknown-to-them TCP options. 
6867 */ 6868 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 6869 (tp->t_rxtshift == 3)) 6870 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 6871 /* 6872 * If we backed off this far, our srtt estimate is probably bogus. 6873 * Clobber it so we'll take the next rtt measurement as our srtt; 6874 * move the current srtt into rttvar to keep the current retransmit 6875 * times until then. 6876 */ 6877 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 6878 #ifdef INET6 6879 if ((inp->inp_vflag & INP_IPV6) != 0) 6880 in6_losing(inp); 6881 else 6882 #endif 6883 in_losing(inp); 6884 tp->t_rttvar += tp->t_srtt; 6885 tp->t_srtt = 0; 6886 } 6887 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 6888 tp->snd_recover = tp->snd_max; 6889 tp->t_flags |= TF_ACKNOW; 6890 tp->t_rtttime = 0; 6891 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__); 6892 out: 6893 return (retval); 6894 } 6895 6896 static int 6897 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 6898 { 6899 int32_t ret = 0; 6900 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 6901 6902 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 6903 (tp->t_flags & TF_GPUTINPROG)) { 6904 /* 6905 * We have a goodput in progress 6906 * and we have entered a late state. 6907 * Do we have enough data in the sb 6908 * to handle the GPUT request? 6909 */ 6910 uint32_t bytes; 6911 6912 bytes = tp->gput_ack - tp->gput_seq; 6913 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 6914 bytes += tp->gput_seq - tp->snd_una; 6915 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 6916 /* 6917 * There are not enough bytes in the socket 6918 * buffer that have been sent to cover this 6919 * measurement. Cancel it. 6920 */ 6921 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6922 rack->r_ctl.rc_gp_srtt /*flex1*/, 6923 tp->gput_seq, 6924 0, 0, 18, __LINE__, NULL, 0); 6925 tp->t_flags &= ~TF_GPUTINPROG; 6926 } 6927 } 6928 if (timers == 0) { 6929 return (0); 6930 } 6931 if (tp->t_state == TCPS_LISTEN) { 6932 /* no timers on listen sockets */ 6933 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 6934 return (0); 6935 return (1); 6936 } 6937 if ((timers & PACE_TMR_RACK) && 6938 rack->rc_on_min_to) { 6939 /* 6940 * For the rack timer when we 6941 * are on a min-timeout (which means rrr_conf = 3) 6942 * we don't want to check the timer. It may 6943 * be going off for a pace and thats ok we 6944 * want to send the retransmit (if its ready). 6945 * 6946 * If its on a normal rack timer (non-min) then 6947 * we will check if its expired. 6948 */ 6949 goto skip_time_check; 6950 } 6951 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 6952 uint32_t left; 6953 6954 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 6955 ret = -1; 6956 rack_log_to_processing(rack, cts, ret, 0); 6957 return (0); 6958 } 6959 if (hpts_calling == 0) { 6960 /* 6961 * A user send or queued mbuf (sack) has called us? We 6962 * return 0 and let the pacing guards 6963 * deal with it if they should or 6964 * should not cause a send. 6965 */ 6966 ret = -2; 6967 rack_log_to_processing(rack, cts, ret, 0); 6968 return (0); 6969 } 6970 /* 6971 * Ok our timer went off early and we are not paced false 6972 * alarm, go back to sleep. 
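 *
 * We compute the time remaining until rc_timer_exp, re-insert
 * ourselves into the hpts wheel for that remainder and report that a
 * timer is still pending (return 1), rather than running the timer
 * handler early.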
6973 */ 6974 ret = -3; 6975 left = rack->r_ctl.rc_timer_exp - cts; 6976 tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left)); 6977 rack_log_to_processing(rack, cts, ret, left); 6978 return (1); 6979 } 6980 skip_time_check: 6981 rack->rc_tmr_stopped = 0; 6982 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 6983 if (timers & PACE_TMR_DELACK) { 6984 ret = rack_timeout_delack(tp, rack, cts); 6985 } else if (timers & PACE_TMR_RACK) { 6986 rack->r_ctl.rc_tlp_rxt_last_time = cts; 6987 rack->r_fast_output = 0; 6988 ret = rack_timeout_rack(tp, rack, cts); 6989 } else if (timers & PACE_TMR_TLP) { 6990 rack->r_ctl.rc_tlp_rxt_last_time = cts; 6991 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 6992 } else if (timers & PACE_TMR_RXT) { 6993 rack->r_ctl.rc_tlp_rxt_last_time = cts; 6994 rack->r_fast_output = 0; 6995 ret = rack_timeout_rxt(tp, rack, cts); 6996 } else if (timers & PACE_TMR_PERSIT) { 6997 ret = rack_timeout_persist(tp, rack, cts); 6998 } else if (timers & PACE_TMR_KEEP) { 6999 ret = rack_timeout_keepalive(tp, rack, cts); 7000 } 7001 rack_log_to_processing(rack, cts, ret, timers); 7002 return (ret); 7003 } 7004 7005 static void 7006 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 7007 { 7008 struct timeval tv; 7009 uint32_t us_cts, flags_on_entry; 7010 uint8_t hpts_removed = 0; 7011 7012 flags_on_entry = rack->r_ctl.rc_hpts_flags; 7013 us_cts = tcp_get_usecs(&tv); 7014 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 7015 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 7016 ((tp->snd_max - tp->snd_una) == 0))) { 7017 tcp_hpts_remove(rack->rc_inp); 7018 hpts_removed = 1; 7019 /* If we were not delayed cancel out the flag. */ 7020 if ((tp->snd_max - tp->snd_una) == 0) 7021 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7022 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7023 } 7024 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7025 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 7026 if (tcp_in_hpts(rack->rc_inp) && 7027 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 7028 /* 7029 * Canceling timer's when we have no output being 7030 * paced. We also must remove ourselves from the 7031 * hpts. 7032 */ 7033 tcp_hpts_remove(rack->rc_inp); 7034 hpts_removed = 1; 7035 } 7036 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 7037 } 7038 if (hpts_removed == 0) 7039 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7040 } 7041 7042 static int 7043 rack_stopall(struct tcpcb *tp) 7044 { 7045 struct tcp_rack *rack; 7046 rack = (struct tcp_rack *)tp->t_fb_ptr; 7047 rack->t_timers_stopped = 1; 7048 return (0); 7049 } 7050 7051 static void 7052 rack_stop_all_timers(struct tcpcb *tp) 7053 { 7054 struct tcp_rack *rack; 7055 7056 /* 7057 * Assure no timers are running. 
7058 */ 7059 if (tcp_timer_active(tp, TT_PERSIST)) { 7060 /* We enter in persists, set the flag appropriately */ 7061 rack = (struct tcp_rack *)tp->t_fb_ptr; 7062 rack->rc_in_persist = 1; 7063 } 7064 } 7065 7066 static void 7067 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 7068 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag) 7069 { 7070 int32_t idx; 7071 7072 rsm->r_rtr_cnt++; 7073 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7074 rsm->r_dupack = 0; 7075 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 7076 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 7077 rsm->r_flags |= RACK_OVERMAX; 7078 } 7079 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 7080 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 7081 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 7082 } 7083 idx = rsm->r_rtr_cnt - 1; 7084 rsm->r_tim_lastsent[idx] = ts; 7085 /* 7086 * Here we don't add in the len of send, since its already 7087 * in snduna <->snd_max. 7088 */ 7089 rsm->r_fas = ctf_flight_size(rack->rc_tp, 7090 rack->r_ctl.rc_sacked); 7091 if (rsm->r_flags & RACK_ACKED) { 7092 /* Problably MTU discovery messing with us */ 7093 rsm->r_flags &= ~RACK_ACKED; 7094 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7095 } 7096 if (rsm->r_in_tmap) { 7097 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7098 rsm->r_in_tmap = 0; 7099 } 7100 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7101 rsm->r_in_tmap = 1; 7102 /* Take off the must retransmit flag, if its on */ 7103 if (rsm->r_flags & RACK_MUST_RXT) { 7104 if (rack->r_must_retran) 7105 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 7106 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 7107 /* 7108 * We have retransmitted all we need. Clear 7109 * any must retransmit flags. 7110 */ 7111 rack->r_must_retran = 0; 7112 rack->r_ctl.rc_out_at_rto = 0; 7113 } 7114 rsm->r_flags &= ~RACK_MUST_RXT; 7115 } 7116 if (rsm->r_flags & RACK_SACK_PASSED) { 7117 /* We have retransmitted due to the SACK pass */ 7118 rsm->r_flags &= ~RACK_SACK_PASSED; 7119 rsm->r_flags |= RACK_WAS_SACKPASS; 7120 } 7121 } 7122 7123 static uint32_t 7124 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 7125 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag) 7126 { 7127 /* 7128 * We (re-)transmitted starting at rsm->r_start for some length 7129 * (possibly less than r_end. 7130 */ 7131 struct rack_sendmap *nrsm; 7132 #ifdef INVARIANTS 7133 struct rack_sendmap *insret; 7134 #endif 7135 uint32_t c_end; 7136 int32_t len; 7137 7138 len = *lenp; 7139 c_end = rsm->r_start + len; 7140 if (SEQ_GEQ(c_end, rsm->r_end)) { 7141 /* 7142 * We retransmitted the whole piece or more than the whole 7143 * slopping into the next rsm. 7144 */ 7145 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7146 if (c_end == rsm->r_end) { 7147 *lenp = 0; 7148 return (0); 7149 } else { 7150 int32_t act_len; 7151 7152 /* Hangs over the end return whats left */ 7153 act_len = rsm->r_end - rsm->r_start; 7154 *lenp = (len - act_len); 7155 return (rsm->r_end); 7156 } 7157 /* We don't get out of this block. */ 7158 } 7159 /* 7160 * Here we retransmitted less than the whole thing which means we 7161 * have to split this into what was transmitted and what was not. 7162 */ 7163 nrsm = rack_alloc_full_limit(rack); 7164 if (nrsm == NULL) { 7165 /* 7166 * We can't get memory, so lets not proceed. 7167 */ 7168 *lenp = 0; 7169 return (0); 7170 } 7171 /* 7172 * So here we are going to take the original rsm and make it what we 7173 * retransmitted. 
nrsm will be the tail portion we did not 7174 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 7175 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 7176 * 1, 6 and the new piece will be 6, 11. 7177 */ 7178 rack_clone_rsm(rack, nrsm, rsm, c_end); 7179 nrsm->r_dupack = 0; 7180 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7181 #ifndef INVARIANTS 7182 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7183 #else 7184 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7185 if (insret != NULL) { 7186 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7187 nrsm, insret, rack, rsm); 7188 } 7189 #endif 7190 if (rsm->r_in_tmap) { 7191 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7192 nrsm->r_in_tmap = 1; 7193 } 7194 rsm->r_flags &= (~RACK_HAS_FIN); 7195 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7196 /* Log a split of rsm into rsm and nrsm */ 7197 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7198 *lenp = 0; 7199 return (0); 7200 } 7201 7202 static void 7203 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 7204 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, 7205 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls) 7206 { 7207 struct tcp_rack *rack; 7208 struct rack_sendmap *rsm, *nrsm, fe; 7209 #ifdef INVARIANTS 7210 struct rack_sendmap *insret; 7211 #endif 7212 register uint32_t snd_max, snd_una; 7213 7214 /* 7215 * Add to the RACK log of packets in flight or retransmitted. If 7216 * there is a TS option we will use the TS echoed, if not we will 7217 * grab a TS. 7218 * 7219 * Retransmissions will increment the count and move the ts to its 7220 * proper place. Note that if options do not include TS's then we 7221 * won't be able to effectively use the ACK for an RTT on a retran. 7222 * 7223 * Notes about r_start and r_end. Lets consider a send starting at 7224 * sequence 1 for 10 bytes. In such an example the r_start would be 7225 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 7226 * This means that r_end is actually the first sequence for the next 7227 * slot (11). 7228 * 7229 */ 7230 /* 7231 * If err is set what do we do XXXrrs? should we not add the thing? 7232 * -- i.e. return if err != 0 or should we pretend we sent it? -- 7233 * i.e. proceed with add ** do this for now. 7234 */ 7235 INP_WLOCK_ASSERT(tptoinpcb(tp)); 7236 if (err) 7237 /* 7238 * We don't log errors -- we could but snd_max does not 7239 * advance in this case either. 7240 */ 7241 return; 7242 7243 if (th_flags & TH_RST) { 7244 /* 7245 * We don't log resets and we return immediately from 7246 * sending 7247 */ 7248 return; 7249 } 7250 rack = (struct tcp_rack *)tp->t_fb_ptr; 7251 snd_una = tp->snd_una; 7252 snd_max = tp->snd_max; 7253 if (th_flags & (TH_SYN | TH_FIN)) { 7254 /* 7255 * The call to rack_log_output is made before bumping 7256 * snd_max. This means we can record one extra byte on a SYN 7257 * or FIN if seq_out is adding more on and a FIN is present 7258 * (and we are not resending). 7259 */ 7260 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 7261 len++; 7262 if (th_flags & TH_FIN) 7263 len++; 7264 if (SEQ_LT(snd_max, tp->snd_nxt)) { 7265 /* 7266 * The add/update as not been done for the FIN/SYN 7267 * yet. 7268 */ 7269 snd_max = tp->snd_nxt; 7270 } 7271 } 7272 if (SEQ_LEQ((seq_out + len), snd_una)) { 7273 /* Are sending an old segment to induce an ack (keep-alive)? 
*/ 7274 return; 7275 } 7276 if (SEQ_LT(seq_out, snd_una)) { 7277 /* huh? should we panic? */ 7278 uint32_t end; 7279 7280 end = seq_out + len; 7281 seq_out = snd_una; 7282 if (SEQ_GEQ(end, seq_out)) 7283 len = end - seq_out; 7284 else 7285 len = 0; 7286 } 7287 if (len == 0) { 7288 /* We don't log zero window probes */ 7289 return; 7290 } 7291 if (IN_FASTRECOVERY(tp->t_flags)) { 7292 rack->r_ctl.rc_prr_out += len; 7293 } 7294 /* First question is it a retransmission or new? */ 7295 if (seq_out == snd_max) { 7296 /* Its new */ 7297 again: 7298 rsm = rack_alloc(rack); 7299 if (rsm == NULL) { 7300 /* 7301 * Hmm out of memory and the tcb got destroyed while 7302 * we tried to wait. 7303 */ 7304 return; 7305 } 7306 if (th_flags & TH_FIN) { 7307 rsm->r_flags = RACK_HAS_FIN|add_flag; 7308 } else { 7309 rsm->r_flags = add_flag; 7310 } 7311 if (hw_tls) 7312 rsm->r_hw_tls = 1; 7313 rsm->r_tim_lastsent[0] = cts; 7314 rsm->r_rtr_cnt = 1; 7315 rsm->r_rtr_bytes = 0; 7316 if (th_flags & TH_SYN) { 7317 /* The data space is one beyond snd_una */ 7318 rsm->r_flags |= RACK_HAS_SYN; 7319 } 7320 rsm->r_start = seq_out; 7321 rsm->r_end = rsm->r_start + len; 7322 rsm->r_dupack = 0; 7323 /* 7324 * save off the mbuf location that 7325 * sndmbuf_noadv returned (which is 7326 * where we started copying from).. 7327 */ 7328 rsm->m = s_mb; 7329 rsm->soff = s_moff; 7330 /* 7331 * Here we do add in the len of send, since its not yet 7332 * reflected in in snduna <->snd_max 7333 */ 7334 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 7335 rack->r_ctl.rc_sacked) + 7336 (rsm->r_end - rsm->r_start)); 7337 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 7338 if (rsm->m) { 7339 if (rsm->m->m_len <= rsm->soff) { 7340 /* 7341 * XXXrrs Question, will this happen? 7342 * 7343 * If sbsndptr is set at the correct place 7344 * then s_moff should always be somewhere 7345 * within rsm->m. But if the sbsndptr was 7346 * off then that won't be true. If it occurs 7347 * we need to walkout to the correct location. 7348 */ 7349 struct mbuf *lm; 7350 7351 lm = rsm->m; 7352 while (lm->m_len <= rsm->soff) { 7353 rsm->soff -= lm->m_len; 7354 lm = lm->m_next; 7355 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 7356 __func__, rack, s_moff, s_mb, rsm->soff)); 7357 } 7358 rsm->m = lm; 7359 } 7360 rsm->orig_m_len = rsm->m->m_len; 7361 } else 7362 rsm->orig_m_len = 0; 7363 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7364 /* Log a new rsm */ 7365 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 7366 #ifndef INVARIANTS 7367 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7368 #else 7369 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7370 if (insret != NULL) { 7371 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7372 nrsm, insret, rack, rsm); 7373 } 7374 #endif 7375 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7376 rsm->r_in_tmap = 1; 7377 /* 7378 * Special case detection, is there just a single 7379 * packet outstanding when we are not in recovery? 7380 * 7381 * If this is true mark it so. 7382 */ 7383 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 7384 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 7385 struct rack_sendmap *prsm; 7386 7387 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7388 if (prsm) 7389 prsm->r_one_out_nr = 1; 7390 } 7391 return; 7392 } 7393 /* 7394 * If we reach here its a retransmission and we need to find it. 
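 *
 * We try three things in order: use the caller supplied hint
 * (hintrsm) if its r_start matches seq_out, otherwise look the
 * sequence up in the RB tree, and if the send starts in the middle
 * of an existing entry we split that entry and let
 * rack_update_entry() consume the rest.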
7395 */ 7396 memset(&fe, 0, sizeof(fe)); 7397 more: 7398 if (hintrsm && (hintrsm->r_start == seq_out)) { 7399 rsm = hintrsm; 7400 hintrsm = NULL; 7401 } else { 7402 /* No hints sorry */ 7403 rsm = NULL; 7404 } 7405 if ((rsm) && (rsm->r_start == seq_out)) { 7406 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7407 if (len == 0) { 7408 return; 7409 } else { 7410 goto more; 7411 } 7412 } 7413 /* Ok it was not the last pointer go through it the hard way. */ 7414 refind: 7415 fe.r_start = seq_out; 7416 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7417 if (rsm) { 7418 if (rsm->r_start == seq_out) { 7419 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7420 if (len == 0) { 7421 return; 7422 } else { 7423 goto refind; 7424 } 7425 } 7426 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 7427 /* Transmitted within this piece */ 7428 /* 7429 * Ok we must split off the front and then let the 7430 * update do the rest 7431 */ 7432 nrsm = rack_alloc_full_limit(rack); 7433 if (nrsm == NULL) { 7434 rack_update_rsm(tp, rack, rsm, cts, add_flag); 7435 return; 7436 } 7437 /* 7438 * copy rsm to nrsm and then trim the front of rsm 7439 * to not include this part. 7440 */ 7441 rack_clone_rsm(rack, nrsm, rsm, seq_out); 7442 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7443 #ifndef INVARIANTS 7444 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7445 #else 7446 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7447 if (insret != NULL) { 7448 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7449 nrsm, insret, rack, rsm); 7450 } 7451 #endif 7452 if (rsm->r_in_tmap) { 7453 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7454 nrsm->r_in_tmap = 1; 7455 } 7456 rsm->r_flags &= (~RACK_HAS_FIN); 7457 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag); 7458 if (len == 0) { 7459 return; 7460 } else if (len > 0) 7461 goto refind; 7462 } 7463 } 7464 /* 7465 * Hmm not found in map did they retransmit both old and on into the 7466 * new? 7467 */ 7468 if (seq_out == tp->snd_max) { 7469 goto again; 7470 } else if (SEQ_LT(seq_out, tp->snd_max)) { 7471 #ifdef INVARIANTS 7472 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 7473 seq_out, len, tp->snd_una, tp->snd_max); 7474 printf("Starting Dump of all rack entries\n"); 7475 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 7476 printf("rsm:%p start:%u end:%u\n", 7477 rsm, rsm->r_start, rsm->r_end); 7478 } 7479 printf("Dump complete\n"); 7480 panic("seq_out not found rack:%p tp:%p", 7481 rack, tp); 7482 #endif 7483 } else { 7484 #ifdef INVARIANTS 7485 /* 7486 * Hmm beyond sndmax? (only if we are using the new rtt-pack 7487 * flag) 7488 */ 7489 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 7490 seq_out, len, tp->snd_max, tp); 7491 #endif 7492 } 7493 } 7494 7495 /* 7496 * Record one of the RTT updates from an ack into 7497 * our sample structure. 
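 *
 * The confidence argument is 0 for samples we do not fully trust
 * (app-limited tails, a just-returned rsm, or a timestamp-matched
 * retransmission), 1 for a clean cum-ack sample and 2 for a SACK'd
 * segment; a confidence of 2 overrides the r_just_ret/r_one_out_nr
 * demotion applied below.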
7498 */ 7499 7500 static void 7501 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 7502 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 7503 { 7504 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7505 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 7506 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 7507 } 7508 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7509 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 7510 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 7511 } 7512 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 7513 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 7514 rack->r_ctl.rc_gp_lowrtt = us_rtt; 7515 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 7516 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 7517 } 7518 if ((confidence == 1) && 7519 ((rsm == NULL) || 7520 (rsm->r_just_ret) || 7521 (rsm->r_one_out_nr && 7522 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 7523 /* 7524 * If the rsm had a just return 7525 * hit it then we can't trust the 7526 * rtt measurement for buffer deterimination 7527 * Note that a confidence of 2, indicates 7528 * SACK'd which overrides the r_just_ret or 7529 * the r_one_out_nr. If it was a CUM-ACK and 7530 * we had only two outstanding, but get an 7531 * ack for only 1. Then that also lowers our 7532 * confidence. 7533 */ 7534 confidence = 0; 7535 } 7536 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7537 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 7538 if (rack->r_ctl.rack_rs.confidence == 0) { 7539 /* 7540 * We take anything with no current confidence 7541 * saved. 7542 */ 7543 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7544 rack->r_ctl.rack_rs.confidence = confidence; 7545 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7546 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 7547 /* 7548 * Once we have a confident number, 7549 * we can update it with a smaller 7550 * value since this confident number 7551 * may include the DSACK time until 7552 * the next segment (the second one) arrived. 7553 */ 7554 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7555 rack->r_ctl.rack_rs.confidence = confidence; 7556 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7557 } 7558 } 7559 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 7560 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 7561 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 7562 rack->r_ctl.rack_rs.rs_rtt_cnt++; 7563 } 7564 7565 /* 7566 * Collect new round-trip time estimate 7567 * and update averages and current timeout. 
7568 */ 7569 static void 7570 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 7571 { 7572 int32_t delta; 7573 int32_t rtt; 7574 7575 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 7576 /* No valid sample */ 7577 return; 7578 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 7579 /* We are to use the lowest RTT seen in a single ack */ 7580 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 7581 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 7582 /* We are to use the highest RTT seen in a single ack */ 7583 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 7584 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 7585 /* We are to use the average RTT seen in a single ack */ 7586 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 7587 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 7588 } else { 7589 #ifdef INVARIANTS 7590 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 7591 #endif 7592 return; 7593 } 7594 if (rtt == 0) 7595 rtt = 1; 7596 if (rack->rc_gp_rtt_set == 0) { 7597 /* 7598 * With no RTT we have to accept 7599 * even one we are not confident of. 7600 */ 7601 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 7602 rack->rc_gp_rtt_set = 1; 7603 } else if (rack->r_ctl.rack_rs.confidence) { 7604 /* update the running gp srtt */ 7605 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 7606 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 7607 } 7608 if (rack->r_ctl.rack_rs.confidence) { 7609 /* 7610 * record the low and high for highly buffered path computation, 7611 * we only do this if we are confident (not a retransmission). 7612 */ 7613 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 7614 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7615 } 7616 if (rack->rc_highly_buffered == 0) { 7617 /* 7618 * Currently once we declare a path has 7619 * highly buffered there is no going 7620 * back, which may be a problem... 7621 */ 7622 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 7623 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 7624 rack->r_ctl.rc_highest_us_rtt, 7625 rack->r_ctl.rc_lowest_us_rtt, 7626 RACK_RTTS_SEEHBP); 7627 rack->rc_highly_buffered = 1; 7628 } 7629 } 7630 } 7631 if ((rack->r_ctl.rack_rs.confidence) || 7632 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 7633 /* 7634 * If we are highly confident of it <or> it was 7635 * never retransmitted we accept it as the last us_rtt. 7636 */ 7637 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7638 /* The lowest rtt can be set if its was not retransmited */ 7639 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 7640 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7641 if (rack->r_ctl.rc_lowest_us_rtt == 0) 7642 rack->r_ctl.rc_lowest_us_rtt = 1; 7643 } 7644 } 7645 rack = (struct tcp_rack *)tp->t_fb_ptr; 7646 if (tp->t_srtt != 0) { 7647 /* 7648 * We keep a simple srtt in microseconds, like our rtt 7649 * measurement. We don't need to do any tricks with shifting 7650 * etc. Instead we just add in 1/8th of the new measurement 7651 * and subtract out 1/8 of the old srtt. We do the same with 7652 * the variance after finding the absolute value of the 7653 * difference between this sample and the current srtt. 
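 *
 * For example, with a current srtt of 8000 usec and a new measurement
 * of 9600 usec: delta = -1600, srtt becomes 8000 - 1000 + 1200 = 8200
 * usec, while rttvar is decayed by 1/8th of itself and bumped by
 * |delta| / 8 = 200 usec.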
7654 */ 7655 delta = tp->t_srtt - rtt; 7656 /* Take off 1/8th of the current sRTT */ 7657 tp->t_srtt -= (tp->t_srtt >> 3); 7658 /* Add in 1/8th of the new RTT just measured */ 7659 tp->t_srtt += (rtt >> 3); 7660 if (tp->t_srtt <= 0) 7661 tp->t_srtt = 1; 7662 /* Now lets make the absolute value of the variance */ 7663 if (delta < 0) 7664 delta = -delta; 7665 /* Subtract out 1/8th */ 7666 tp->t_rttvar -= (tp->t_rttvar >> 3); 7667 /* Add in 1/8th of the new variance we just saw */ 7668 tp->t_rttvar += (delta >> 3); 7669 if (tp->t_rttvar <= 0) 7670 tp->t_rttvar = 1; 7671 } else { 7672 /* 7673 * No rtt measurement yet - use the unsmoothed rtt. Set the 7674 * variance to half the rtt (so our first retransmit happens 7675 * at 3*rtt). 7676 */ 7677 tp->t_srtt = rtt; 7678 tp->t_rttvar = rtt >> 1; 7679 } 7680 rack->rc_srtt_measure_made = 1; 7681 KMOD_TCPSTAT_INC(tcps_rttupdated); 7682 tp->t_rttupdated++; 7683 #ifdef STATS 7684 if (rack_stats_gets_ms_rtt == 0) { 7685 /* Send in the microsecond rtt used for rxt timeout purposes */ 7686 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 7687 } else if (rack_stats_gets_ms_rtt == 1) { 7688 /* Send in the millisecond rtt used for rxt timeout purposes */ 7689 int32_t ms_rtt; 7690 7691 /* Round up */ 7692 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7693 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7694 } else if (rack_stats_gets_ms_rtt == 2) { 7695 /* Send in the millisecond rtt has close to the path RTT as we can get */ 7696 int32_t ms_rtt; 7697 7698 /* Round up */ 7699 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7700 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7701 } else { 7702 /* Send in the microsecond rtt has close to the path RTT as we can get */ 7703 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 7704 } 7705 7706 #endif 7707 /* 7708 * the retransmit should happen at rtt + 4 * rttvar. Because of the 7709 * way we do the smoothing, srtt and rttvar will each average +1/2 7710 * tick of bias. When we compute the retransmit timer, we want 1/2 7711 * tick of rounding and 1 extra tick because of +-1/2 tick 7712 * uncertainty in the firing of the timer. The bias will give us 7713 * exactly the 1.5 tick we need. But, because the bias is 7714 * statistical, we have to test that we don't drop below the minimum 7715 * feasible timer (which is 2 ticks). 7716 */ 7717 tp->t_rxtshift = 0; 7718 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7719 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 7720 rack_log_rtt_sample(rack, rtt); 7721 tp->t_softerror = 0; 7722 } 7723 7724 7725 static void 7726 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 7727 { 7728 /* 7729 * Apply to filter the inbound us-rtt at us_cts. 7730 */ 7731 uint32_t old_rtt; 7732 7733 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 7734 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 7735 us_rtt, us_cts); 7736 if (old_rtt > us_rtt) { 7737 /* We just hit a new lower rtt time */ 7738 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 7739 __LINE__, RACK_RTTS_NEWRTT); 7740 /* 7741 * Only count it if its lower than what we saw within our 7742 * calculated range. 
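 *
 * "Lower" here means the drop exceeds rack_min_rtt_movement; smaller
 * wiggles are treated as noise and do not update the point from which
 * we time our next probe-rtt entry.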
7743 */ 7744 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 7745 if (rack_probertt_lower_within && 7746 rack->rc_gp_dyn_mul && 7747 (rack->use_fixed_rate == 0) && 7748 (rack->rc_always_pace)) { 7749 /* 7750 * We are seeing a new lower rtt very close 7751 * to the time that we would have entered probe-rtt. 7752 * This is probably due to the fact that a peer flow 7753 * has entered probe-rtt. Lets go in now too. 7754 */ 7755 uint32_t val; 7756 7757 val = rack_probertt_lower_within * rack_time_between_probertt; 7758 val /= 100; 7759 if ((rack->in_probe_rtt == 0) && 7760 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 7761 rack_enter_probertt(rack, us_cts); 7762 } 7763 } 7764 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 7765 } 7766 } 7767 } 7768 7769 static int 7770 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 7771 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 7772 { 7773 uint32_t us_rtt; 7774 int32_t i, all; 7775 uint32_t t, len_acked; 7776 7777 if ((rsm->r_flags & RACK_ACKED) || 7778 (rsm->r_flags & RACK_WAS_ACKED)) 7779 /* Already done */ 7780 return (0); 7781 if (rsm->r_no_rtt_allowed) { 7782 /* Not allowed */ 7783 return (0); 7784 } 7785 if (ack_type == CUM_ACKED) { 7786 if (SEQ_GT(th_ack, rsm->r_end)) { 7787 len_acked = rsm->r_end - rsm->r_start; 7788 all = 1; 7789 } else { 7790 len_acked = th_ack - rsm->r_start; 7791 all = 0; 7792 } 7793 } else { 7794 len_acked = rsm->r_end - rsm->r_start; 7795 all = 0; 7796 } 7797 if (rsm->r_rtr_cnt == 1) { 7798 7799 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7800 if ((int)t <= 0) 7801 t = 1; 7802 if (!tp->t_rttlow || tp->t_rttlow > t) 7803 tp->t_rttlow = t; 7804 if (!rack->r_ctl.rc_rack_min_rtt || 7805 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7806 rack->r_ctl.rc_rack_min_rtt = t; 7807 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7808 rack->r_ctl.rc_rack_min_rtt = 1; 7809 } 7810 } 7811 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 7812 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 7813 else 7814 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 7815 if (us_rtt == 0) 7816 us_rtt = 1; 7817 if (CC_ALGO(tp)->rttsample != NULL) { 7818 /* Kick the RTT to the CC */ 7819 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 7820 } 7821 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 7822 if (ack_type == SACKED) { 7823 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 7824 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 7825 } else { 7826 /* 7827 * We need to setup what our confidence 7828 * is in this ack. 7829 * 7830 * If the rsm was app limited and it is 7831 * less than a mss in length (the end 7832 * of the send) then we have a gap. If we 7833 * were app limited but say we were sending 7834 * multiple MSS's then we are more confident 7835 * int it. 7836 * 7837 * When we are not app-limited then we see if 7838 * the rsm is being included in the current 7839 * measurement, we tell this by the app_limited_needs_set 7840 * flag. 7841 * 7842 * Note that being cwnd blocked is not applimited 7843 * as well as the pacing delay between packets which 7844 * are sending only 1 or 2 MSS's also will show up 7845 * in the RTT. 
We probably need to examine this algorithm 7846 * a bit more and enhance it to account for the delay 7847 * between rsm's. We could do that by saving off the 7848 * pacing delay of each rsm (in an rsm) and then 7849 * factoring that in somehow though for now I am 7850 * not sure how :) 7851 */ 7852 int calc_conf = 0; 7853 7854 if (rsm->r_flags & RACK_APP_LIMITED) { 7855 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 7856 calc_conf = 0; 7857 else 7858 calc_conf = 1; 7859 } else if (rack->app_limited_needs_set == 0) { 7860 calc_conf = 1; 7861 } else { 7862 calc_conf = 0; 7863 } 7864 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 7865 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 7866 calc_conf, rsm, rsm->r_rtr_cnt); 7867 } 7868 if ((rsm->r_flags & RACK_TLP) && 7869 (!IN_FASTRECOVERY(tp->t_flags))) { 7870 /* Segment was a TLP and our retrans matched */ 7871 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 7872 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 7873 } 7874 } 7875 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 7876 /* New more recent rack_tmit_time */ 7877 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7878 rack->rc_rack_rtt = t; 7879 } 7880 return (1); 7881 } 7882 /* 7883 * We clear the soft/rxtshift since we got an ack. 7884 * There is no assurance we will call the commit() function 7885 * so we need to clear these to avoid incorrect handling. 7886 */ 7887 tp->t_rxtshift = 0; 7888 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7889 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 7890 tp->t_softerror = 0; 7891 if (to && (to->to_flags & TOF_TS) && 7892 (ack_type == CUM_ACKED) && 7893 (to->to_tsecr) && 7894 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 7895 /* 7896 * Now which timestamp does it match? In this block the ACK 7897 * must be coming from a previous transmission. 7898 */ 7899 for (i = 0; i < rsm->r_rtr_cnt; i++) { 7900 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 7901 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 7902 if ((int)t <= 0) 7903 t = 1; 7904 if (CC_ALGO(tp)->rttsample != NULL) { 7905 /* 7906 * Kick the RTT to the CC, here 7907 * we lie a bit in that we know the 7908 * retransmission is correct even though 7909 * we retransmitted. This is because 7910 * we match the timestamps. 7911 */ 7912 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 7913 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 7914 else 7915 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 7916 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 7917 } 7918 if ((i + 1) < rsm->r_rtr_cnt) { 7919 /* 7920 * The peer ack'd from our previous 7921 * transmission. We have a spurious 7922 * retransmission and thus we dont 7923 * want to update our rack_rtt. 7924 * 7925 * Hmm should there be a CC revert here? 
7926 * 7927 */ 7928 return (0); 7929 } 7930 if (!tp->t_rttlow || tp->t_rttlow > t) 7931 tp->t_rttlow = t; 7932 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7933 rack->r_ctl.rc_rack_min_rtt = t; 7934 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7935 rack->r_ctl.rc_rack_min_rtt = 1; 7936 } 7937 } 7938 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 7939 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 7940 /* New more recent rack_tmit_time */ 7941 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7942 rack->rc_rack_rtt = t; 7943 } 7944 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 7945 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 7946 rsm->r_rtr_cnt); 7947 return (1); 7948 } 7949 } 7950 goto ts_not_found; 7951 } else { 7952 /* 7953 * Ok its a SACK block that we retransmitted. or a windows 7954 * machine without timestamps. We can tell nothing from the 7955 * time-stamp since its not there or the time the peer last 7956 * recieved a segment that moved forward its cum-ack point. 7957 */ 7958 ts_not_found: 7959 i = rsm->r_rtr_cnt - 1; 7960 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 7961 if ((int)t <= 0) 7962 t = 1; 7963 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7964 /* 7965 * We retransmitted and the ack came back in less 7966 * than the smallest rtt we have observed. We most 7967 * likely did an improper retransmit as outlined in 7968 * 6.2 Step 2 point 2 in the rack-draft so we 7969 * don't want to update our rack_rtt. We in 7970 * theory (in future) might want to think about reverting our 7971 * cwnd state but we won't for now. 7972 */ 7973 return (0); 7974 } else if (rack->r_ctl.rc_rack_min_rtt) { 7975 /* 7976 * We retransmitted it and the retransmit did the 7977 * job. 7978 */ 7979 if (!rack->r_ctl.rc_rack_min_rtt || 7980 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7981 rack->r_ctl.rc_rack_min_rtt = t; 7982 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7983 rack->r_ctl.rc_rack_min_rtt = 1; 7984 } 7985 } 7986 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) { 7987 /* New more recent rack_tmit_time */ 7988 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 7989 rack->rc_rack_rtt = t; 7990 } 7991 return (1); 7992 } 7993 } 7994 return (0); 7995 } 7996 7997 /* 7998 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 7999 */ 8000 static void 8001 rack_log_sack_passed(struct tcpcb *tp, 8002 struct tcp_rack *rack, struct rack_sendmap *rsm) 8003 { 8004 struct rack_sendmap *nrsm; 8005 8006 nrsm = rsm; 8007 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 8008 rack_head, r_tnext) { 8009 if (nrsm == rsm) { 8010 /* Skip orginal segment he is acked */ 8011 continue; 8012 } 8013 if (nrsm->r_flags & RACK_ACKED) { 8014 /* 8015 * Skip ack'd segments, though we 8016 * should not see these, since tmap 8017 * should not have ack'd segments. 8018 */ 8019 continue; 8020 } 8021 if (nrsm->r_flags & RACK_RWND_COLLAPSED) { 8022 /* 8023 * If the peer dropped the rwnd on 8024 * these then we don't worry about them. 8025 */ 8026 continue; 8027 } 8028 if (nrsm->r_flags & RACK_SACK_PASSED) { 8029 /* 8030 * We found one that is already marked 8031 * passed, we have been here before and 8032 * so all others below this are marked. 
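 *
 * Stopping at the first entry already marked passed keeps this
 * backwards walk short: entries marked on an earlier pass are not
 * rescanned when later SACKs arrive.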
8033 */ 8034 break; 8035 } 8036 nrsm->r_flags |= RACK_SACK_PASSED; 8037 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 8038 } 8039 } 8040 8041 static void 8042 rack_need_set_test(struct tcpcb *tp, 8043 struct tcp_rack *rack, 8044 struct rack_sendmap *rsm, 8045 tcp_seq th_ack, 8046 int line, 8047 int use_which) 8048 { 8049 8050 if ((tp->t_flags & TF_GPUTINPROG) && 8051 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8052 /* 8053 * We were app limited, and this ack 8054 * butts up or goes beyond the point where we want 8055 * to start our next measurement. We need 8056 * to record the new gput_ts as here and 8057 * possibly update the start sequence. 8058 */ 8059 uint32_t seq, ts; 8060 8061 if (rsm->r_rtr_cnt > 1) { 8062 /* 8063 * This is a retransmit, can we 8064 * really make any assessment at this 8065 * point? We are not really sure of 8066 * the timestamp, is it this or the 8067 * previous transmission? 8068 * 8069 * Lets wait for something better that 8070 * is not retransmitted. 8071 */ 8072 return; 8073 } 8074 seq = tp->gput_seq; 8075 ts = tp->gput_ts; 8076 rack->app_limited_needs_set = 0; 8077 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 8078 /* Do we start at a new end? */ 8079 if ((use_which == RACK_USE_BEG) && 8080 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 8081 /* 8082 * When we get an ACK that just eats 8083 * up some of the rsm, we set RACK_USE_BEG 8084 * since whats at r_start (i.e. th_ack) 8085 * is left unacked and thats where the 8086 * measurement not starts. 8087 */ 8088 tp->gput_seq = rsm->r_start; 8089 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8090 } 8091 if ((use_which == RACK_USE_END) && 8092 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8093 /* 8094 * We use the end when the cumack 8095 * is moving forward and completely 8096 * deleting the rsm passed so basically 8097 * r_end holds th_ack. 8098 * 8099 * For SACK's we also want to use the end 8100 * since this piece just got sacked and 8101 * we want to target anything after that 8102 * in our measurement. 8103 */ 8104 tp->gput_seq = rsm->r_end; 8105 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8106 } 8107 if (use_which == RACK_USE_END_OR_THACK) { 8108 /* 8109 * special case for ack moving forward, 8110 * not a sack, we need to move all the 8111 * way up to where this ack cum-ack moves 8112 * to. 8113 */ 8114 if (SEQ_GT(th_ack, rsm->r_end)) 8115 tp->gput_seq = th_ack; 8116 else 8117 tp->gput_seq = rsm->r_end; 8118 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8119 } 8120 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 8121 /* 8122 * We moved beyond this guy's range, re-calculate 8123 * the new end point. 8124 */ 8125 if (rack->rc_gp_filled == 0) { 8126 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 8127 } else { 8128 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 8129 } 8130 } 8131 /* 8132 * We are moving the goal post, we may be able to clear the 8133 * measure_saw_probe_rtt flag. 
8134 */ 8135 if ((rack->in_probe_rtt == 0) && 8136 (rack->measure_saw_probe_rtt) && 8137 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 8138 rack->measure_saw_probe_rtt = 0; 8139 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 8140 seq, tp->gput_seq, 0, 5, line, NULL, 0); 8141 if (rack->rc_gp_filled && 8142 ((tp->gput_ack - tp->gput_seq) < 8143 max(rc_init_window(rack), (MIN_GP_WIN * 8144 ctf_fixed_maxseg(tp))))) { 8145 uint32_t ideal_amount; 8146 8147 ideal_amount = rack_get_measure_window(tp, rack); 8148 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) { 8149 /* 8150 * There is no sense of continuing this measurement 8151 * because its too small to gain us anything we 8152 * trust. Skip it and that way we can start a new 8153 * measurement quicker. 8154 */ 8155 tp->t_flags &= ~TF_GPUTINPROG; 8156 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 8157 0, 0, 0, 6, __LINE__, NULL, 0); 8158 } else { 8159 /* 8160 * Reset the window further out. 8161 */ 8162 tp->gput_ack = tp->gput_seq + ideal_amount; 8163 } 8164 } 8165 } 8166 } 8167 8168 static inline int 8169 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 8170 { 8171 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 8172 /* Behind our TLP definition or right at */ 8173 return (0); 8174 } 8175 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 8176 /* The start is beyond or right at our end of TLP definition */ 8177 return (0); 8178 } 8179 /* It has to be a sub-part of the original TLP recorded */ 8180 return (1); 8181 } 8182 8183 8184 static uint32_t 8185 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 8186 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 8187 { 8188 uint32_t start, end, changed = 0; 8189 struct rack_sendmap stack_map; 8190 struct rack_sendmap *rsm, *nrsm, fe, *prev, *next; 8191 #ifdef INVARIANTS 8192 struct rack_sendmap *insret; 8193 #endif 8194 int32_t used_ref = 1; 8195 int moved = 0; 8196 8197 start = sack->start; 8198 end = sack->end; 8199 rsm = *prsm; 8200 memset(&fe, 0, sizeof(fe)); 8201 do_rest_ofb: 8202 if ((rsm == NULL) || 8203 (SEQ_LT(end, rsm->r_start)) || 8204 (SEQ_GEQ(start, rsm->r_end)) || 8205 (SEQ_LT(start, rsm->r_start))) { 8206 /* 8207 * We are not in the right spot, 8208 * find the correct spot in the tree. 8209 */ 8210 used_ref = 0; 8211 fe.r_start = start; 8212 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8213 moved++; 8214 } 8215 if (rsm == NULL) { 8216 /* TSNH */ 8217 goto out; 8218 } 8219 /* Ok we have an ACK for some piece of this rsm */ 8220 if (rsm->r_start != start) { 8221 if ((rsm->r_flags & RACK_ACKED) == 0) { 8222 /* 8223 * Before any splitting or hookery is 8224 * done is it a TLP of interest i.e. rxt? 8225 */ 8226 if ((rsm->r_flags & RACK_TLP) && 8227 (rsm->r_rtr_cnt > 1)) { 8228 /* 8229 * We are splitting a rxt TLP, check 8230 * if we need to save off the start/end 8231 */ 8232 if (rack->rc_last_tlp_acked_set && 8233 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8234 /* 8235 * We already turned this on since we are inside 8236 * the previous one was a partially sack now we 8237 * are getting another one (maybe all of it). 8238 * 8239 */ 8240 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8241 /* 8242 * Lets make sure we have all of it though. 
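 * That just means widening the recorded
 * [last_tlp_acked_start, last_tlp_acked_end) range below so
 * it covers this rsm too; a later DSACK that falls inside the
 * range can then be attributed to our TLP rather than to
 * network reordering.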
8243 */ 8244 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8245 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8246 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8247 rack->r_ctl.last_tlp_acked_end); 8248 } 8249 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8250 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8251 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8252 rack->r_ctl.last_tlp_acked_end); 8253 } 8254 } else { 8255 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8256 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8257 rack->rc_last_tlp_past_cumack = 0; 8258 rack->rc_last_tlp_acked_set = 1; 8259 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8260 } 8261 } 8262 /** 8263 * Need to split this in two pieces the before and after, 8264 * the before remains in the map, the after must be 8265 * added. In other words we have: 8266 * rsm |--------------| 8267 * sackblk |-------> 8268 * rsm will become 8269 * rsm |---| 8270 * and nrsm will be the sacked piece 8271 * nrsm |----------| 8272 * 8273 * But before we start down that path lets 8274 * see if the sack spans over on top of 8275 * the next guy and it is already sacked. 8276 * 8277 */ 8278 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8279 if (next && (next->r_flags & RACK_ACKED) && 8280 SEQ_GEQ(end, next->r_start)) { 8281 /** 8282 * So the next one is already acked, and 8283 * we can thus by hookery use our stack_map 8284 * to reflect the piece being sacked and 8285 * then adjust the two tree entries moving 8286 * the start and ends around. So we start like: 8287 * rsm |------------| (not-acked) 8288 * next |-----------| (acked) 8289 * sackblk |--------> 8290 * We want to end like so: 8291 * rsm |------| (not-acked) 8292 * next |-----------------| (acked) 8293 * nrsm |-----| 8294 * Where nrsm is a temporary stack piece we 8295 * use to update all the gizmos. 8296 */ 8297 /* Copy up our fudge block */ 8298 nrsm = &stack_map; 8299 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8300 /* Now adjust our tree blocks */ 8301 rsm->r_end = start; 8302 next->r_start = start; 8303 /* Now we must adjust back where next->m is */ 8304 rack_setup_offset_for_rsm(rsm, next); 8305 8306 /* We don't need to adjust rsm, it did not change */ 8307 /* Clear out the dup ack count of the remainder */ 8308 rsm->r_dupack = 0; 8309 rsm->r_just_ret = 0; 8310 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8311 /* Now lets make sure our fudge block is right */ 8312 nrsm->r_start = start; 8313 /* Now lets update all the stats and such */ 8314 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8315 if (rack->app_limited_needs_set) 8316 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8317 changed += (nrsm->r_end - nrsm->r_start); 8318 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8319 if (nrsm->r_flags & RACK_SACK_PASSED) { 8320 rack->r_ctl.rc_reorder_ts = cts; 8321 } 8322 /* 8323 * Now we want to go up from rsm (the 8324 * one left un-acked) to the next one 8325 * in the tmap. We do this so when 8326 * we walk backwards we include marking 8327 * sack-passed on rsm (The one passed in 8328 * is skipped since it is generally called 8329 * on something sacked before removing it 8330 * from the tmap). 8331 */ 8332 if (rsm->r_in_tmap) { 8333 nrsm = TAILQ_NEXT(rsm, r_tnext); 8334 /* 8335 * Now that we have the next 8336 * one walk backwards from there. 
8337 */ 8338 if (nrsm && nrsm->r_in_tmap) 8339 rack_log_sack_passed(tp, rack, nrsm); 8340 } 8341 /* Now are we done? */ 8342 if (SEQ_LT(end, next->r_end) || 8343 (end == next->r_end)) { 8344 /* Done with block */ 8345 goto out; 8346 } 8347 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 8348 counter_u64_add(rack_sack_used_next_merge, 1); 8349 /* Postion for the next block */ 8350 start = next->r_end; 8351 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 8352 if (rsm == NULL) 8353 goto out; 8354 } else { 8355 /** 8356 * We can't use any hookery here, so we 8357 * need to split the map. We enter like 8358 * so: 8359 * rsm |--------| 8360 * sackblk |-----> 8361 * We will add the new block nrsm and 8362 * that will be the new portion, and then 8363 * fall through after reseting rsm. So we 8364 * split and look like this: 8365 * rsm |----| 8366 * sackblk |-----> 8367 * nrsm |---| 8368 * We then fall through reseting 8369 * rsm to nrsm, so the next block 8370 * picks it up. 8371 */ 8372 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8373 if (nrsm == NULL) { 8374 /* 8375 * failed XXXrrs what can we do but loose the sack 8376 * info? 8377 */ 8378 goto out; 8379 } 8380 counter_u64_add(rack_sack_splits, 1); 8381 rack_clone_rsm(rack, nrsm, rsm, start); 8382 rsm->r_just_ret = 0; 8383 #ifndef INVARIANTS 8384 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8385 #else 8386 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8387 if (insret != NULL) { 8388 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8389 nrsm, insret, rack, rsm); 8390 } 8391 #endif 8392 if (rsm->r_in_tmap) { 8393 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8394 nrsm->r_in_tmap = 1; 8395 } 8396 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 8397 rsm->r_flags &= (~RACK_HAS_FIN); 8398 /* Position us to point to the new nrsm that starts the sack blk */ 8399 rsm = nrsm; 8400 } 8401 } else { 8402 /* Already sacked this piece */ 8403 counter_u64_add(rack_sack_skipped_acked, 1); 8404 moved++; 8405 if (end == rsm->r_end) { 8406 /* Done with block */ 8407 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8408 goto out; 8409 } else if (SEQ_LT(end, rsm->r_end)) { 8410 /* A partial sack to a already sacked block */ 8411 moved++; 8412 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8413 goto out; 8414 } else { 8415 /* 8416 * The end goes beyond this guy 8417 * reposition the start to the 8418 * next block. 8419 */ 8420 start = rsm->r_end; 8421 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8422 if (rsm == NULL) 8423 goto out; 8424 } 8425 } 8426 } 8427 if (SEQ_GEQ(end, rsm->r_end)) { 8428 /** 8429 * The end of this block is either beyond this guy or right 8430 * at this guy. I.e.: 8431 * rsm --- |-----| 8432 * end |-----| 8433 * <or> 8434 * end |---------| 8435 */ 8436 if ((rsm->r_flags & RACK_ACKED) == 0) { 8437 /* 8438 * Is it a TLP of interest? 8439 */ 8440 if ((rsm->r_flags & RACK_TLP) && 8441 (rsm->r_rtr_cnt > 1)) { 8442 /* 8443 * We are splitting a rxt TLP, check 8444 * if we need to save off the start/end 8445 */ 8446 if (rack->rc_last_tlp_acked_set && 8447 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8448 /* 8449 * We already turned this on since we are inside 8450 * the previous one was a partially sack now we 8451 * are getting another one (maybe all of it). 
8452 */ 8453 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8454 /* 8455 * Lets make sure we have all of it though. 8456 */ 8457 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8458 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8459 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8460 rack->r_ctl.last_tlp_acked_end); 8461 } 8462 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8463 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8464 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8465 rack->r_ctl.last_tlp_acked_end); 8466 } 8467 } else { 8468 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8469 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8470 rack->rc_last_tlp_past_cumack = 0; 8471 rack->rc_last_tlp_acked_set = 1; 8472 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8473 } 8474 } 8475 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8476 changed += (rsm->r_end - rsm->r_start); 8477 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8478 if (rsm->r_in_tmap) /* should be true */ 8479 rack_log_sack_passed(tp, rack, rsm); 8480 /* Is Reordering occuring? */ 8481 if (rsm->r_flags & RACK_SACK_PASSED) { 8482 rsm->r_flags &= ~RACK_SACK_PASSED; 8483 rack->r_ctl.rc_reorder_ts = cts; 8484 } 8485 if (rack->app_limited_needs_set) 8486 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8487 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8488 rsm->r_flags |= RACK_ACKED; 8489 if (rsm->r_in_tmap) { 8490 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8491 rsm->r_in_tmap = 0; 8492 } 8493 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 8494 } else { 8495 counter_u64_add(rack_sack_skipped_acked, 1); 8496 moved++; 8497 } 8498 if (end == rsm->r_end) { 8499 /* This block only - done, setup for next */ 8500 goto out; 8501 } 8502 /* 8503 * There is more not coverend by this rsm move on 8504 * to the next block in the RB tree. 8505 */ 8506 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8507 start = rsm->r_end; 8508 rsm = nrsm; 8509 if (rsm == NULL) 8510 goto out; 8511 goto do_rest_ofb; 8512 } 8513 /** 8514 * The end of this sack block is smaller than 8515 * our rsm i.e.: 8516 * rsm --- |-----| 8517 * end |--| 8518 */ 8519 if ((rsm->r_flags & RACK_ACKED) == 0) { 8520 /* 8521 * Is it a TLP of interest? 8522 */ 8523 if ((rsm->r_flags & RACK_TLP) && 8524 (rsm->r_rtr_cnt > 1)) { 8525 /* 8526 * We are splitting a rxt TLP, check 8527 * if we need to save off the start/end 8528 */ 8529 if (rack->rc_last_tlp_acked_set && 8530 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8531 /* 8532 * We already turned this on since we are inside 8533 * the previous one was a partially sack now we 8534 * are getting another one (maybe all of it). 8535 */ 8536 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8537 /* 8538 * Lets make sure we have all of it though. 
8539 */ 8540 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8541 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8542 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8543 rack->r_ctl.last_tlp_acked_end); 8544 } 8545 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8546 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8547 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8548 rack->r_ctl.last_tlp_acked_end); 8549 } 8550 } else { 8551 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8552 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8553 rack->rc_last_tlp_past_cumack = 0; 8554 rack->rc_last_tlp_acked_set = 1; 8555 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8556 } 8557 } 8558 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8559 if (prev && 8560 (prev->r_flags & RACK_ACKED)) { 8561 /** 8562 * Goal, we want the right remainder of rsm to shrink 8563 * in place and span from (rsm->r_start = end) to rsm->r_end. 8564 * We want to expand prev to go all the way 8565 * to prev->r_end <- end. 8566 * so in the tree we have before: 8567 * prev |--------| (acked) 8568 * rsm |-------| (non-acked) 8569 * sackblk |-| 8570 * We churn it so we end up with 8571 * prev |----------| (acked) 8572 * rsm |-----| (non-acked) 8573 * nrsm |-| (temporary) 8574 * 8575 * Note if either prev/rsm is a TLP we don't 8576 * do this. 8577 */ 8578 nrsm = &stack_map; 8579 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8580 prev->r_end = end; 8581 rsm->r_start = end; 8582 /* Now adjust nrsm (stack copy) to be 8583 * the one that is the small 8584 * piece that was "sacked". 8585 */ 8586 nrsm->r_end = end; 8587 rsm->r_dupack = 0; 8588 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8589 /* 8590 * Now that the rsm has had its start moved forward 8591 * lets go ahead and get its new place in the world. 8592 */ 8593 rack_setup_offset_for_rsm(prev, rsm); 8594 /* 8595 * Now nrsm is our new little piece 8596 * that is acked (which was merged 8597 * to prev). Update the rtt and changed 8598 * based on that. Also check for reordering. 8599 */ 8600 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8601 if (rack->app_limited_needs_set) 8602 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8603 changed += (nrsm->r_end - nrsm->r_start); 8604 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8605 if (nrsm->r_flags & RACK_SACK_PASSED) { 8606 rack->r_ctl.rc_reorder_ts = cts; 8607 } 8608 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 8609 rsm = prev; 8610 counter_u64_add(rack_sack_used_prev_merge, 1); 8611 } else { 8612 /** 8613 * This is the case where our previous 8614 * block is not acked either, so we must 8615 * split the block in two. 8616 */ 8617 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8618 if (nrsm == NULL) { 8619 /* failed rrs what can we do but loose the sack info? */ 8620 goto out; 8621 } 8622 if ((rsm->r_flags & RACK_TLP) && 8623 (rsm->r_rtr_cnt > 1)) { 8624 /* 8625 * We are splitting a rxt TLP, check 8626 * if we need to save off the start/end 8627 */ 8628 if (rack->rc_last_tlp_acked_set && 8629 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8630 /* 8631 * We already turned this on since this block is inside 8632 * the previous one was a partially sack now we 8633 * are getting another one (maybe all of it). 8634 */ 8635 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8636 /* 8637 * Lets make sure we have all of it though. 
8638 */ 8639 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8640 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8641 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8642 rack->r_ctl.last_tlp_acked_end); 8643 } 8644 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8645 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8646 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8647 rack->r_ctl.last_tlp_acked_end); 8648 } 8649 } else { 8650 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8651 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8652 rack->rc_last_tlp_acked_set = 1; 8653 rack->rc_last_tlp_past_cumack = 0; 8654 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8655 } 8656 } 8657 /** 8658 * In this case nrsm becomes 8659 * nrsm->r_start = end; 8660 * nrsm->r_end = rsm->r_end; 8661 * which is un-acked. 8662 * <and> 8663 * rsm->r_end = nrsm->r_start; 8664 * i.e. the remaining un-acked 8665 * piece is left on the left 8666 * hand side. 8667 * 8668 * So we start like this 8669 * rsm |----------| (not acked) 8670 * sackblk |---| 8671 * build it so we have 8672 * rsm |---| (acked) 8673 * nrsm |------| (not acked) 8674 */ 8675 counter_u64_add(rack_sack_splits, 1); 8676 rack_clone_rsm(rack, nrsm, rsm, end); 8677 rsm->r_flags &= (~RACK_HAS_FIN); 8678 rsm->r_just_ret = 0; 8679 #ifndef INVARIANTS 8680 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8681 #else 8682 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8683 if (insret != NULL) { 8684 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8685 nrsm, insret, rack, rsm); 8686 } 8687 #endif 8688 if (rsm->r_in_tmap) { 8689 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8690 nrsm->r_in_tmap = 1; 8691 } 8692 nrsm->r_dupack = 0; 8693 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8694 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8695 changed += (rsm->r_end - rsm->r_start); 8696 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8697 if (rsm->r_in_tmap) /* should be true */ 8698 rack_log_sack_passed(tp, rack, rsm); 8699 /* Is Reordering occuring? */ 8700 if (rsm->r_flags & RACK_SACK_PASSED) { 8701 rsm->r_flags &= ~RACK_SACK_PASSED; 8702 rack->r_ctl.rc_reorder_ts = cts; 8703 } 8704 if (rack->app_limited_needs_set) 8705 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8706 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8707 rsm->r_flags |= RACK_ACKED; 8708 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 8709 if (rsm->r_in_tmap) { 8710 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8711 rsm->r_in_tmap = 0; 8712 } 8713 } 8714 } else if (start != end){ 8715 /* 8716 * The block was already acked. 8717 */ 8718 counter_u64_add(rack_sack_skipped_acked, 1); 8719 moved++; 8720 } 8721 out: 8722 if (rsm && 8723 ((rsm->r_flags & RACK_TLP) == 0) && 8724 (rsm->r_flags & RACK_ACKED)) { 8725 /* 8726 * Now can we merge where we worked 8727 * with either the previous or 8728 * next block? 8729 */ 8730 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8731 while (next) { 8732 if (next->r_flags & RACK_TLP) 8733 break; 8734 if (next->r_flags & RACK_ACKED) { 8735 /* yep this and next can be merged */ 8736 rsm = rack_merge_rsm(rack, rsm, next); 8737 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8738 } else 8739 break; 8740 } 8741 /* Now what about the previous? 
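 * Same idea in the other direction: keep folding rsm into an
 * already-acked previous block until we run into one that is
 * either not acked or is a TLP we want to keep separate.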
*/ 8742 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8743 while (prev) { 8744 if (prev->r_flags & RACK_TLP) 8745 break; 8746 if (prev->r_flags & RACK_ACKED) { 8747 /* yep the previous and this can be merged */ 8748 rsm = rack_merge_rsm(rack, prev, rsm); 8749 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8750 } else 8751 break; 8752 } 8753 } 8754 if (used_ref == 0) { 8755 counter_u64_add(rack_sack_proc_all, 1); 8756 } else { 8757 counter_u64_add(rack_sack_proc_short, 1); 8758 } 8759 /* Save off the next one for quick reference. */ 8760 if (rsm) 8761 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8762 else 8763 nrsm = NULL; 8764 *prsm = rack->r_ctl.rc_sacklast = nrsm; 8765 /* Pass back the moved. */ 8766 *moved_two = moved; 8767 return (changed); 8768 } 8769 8770 static void inline 8771 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 8772 { 8773 struct rack_sendmap *tmap; 8774 8775 tmap = NULL; 8776 while (rsm && (rsm->r_flags & RACK_ACKED)) { 8777 /* Its no longer sacked, mark it so */ 8778 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8779 #ifdef INVARIANTS 8780 if (rsm->r_in_tmap) { 8781 panic("rack:%p rsm:%p flags:0x%x in tmap?", 8782 rack, rsm, rsm->r_flags); 8783 } 8784 #endif 8785 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 8786 /* Rebuild it into our tmap */ 8787 if (tmap == NULL) { 8788 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8789 tmap = rsm; 8790 } else { 8791 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 8792 tmap = rsm; 8793 } 8794 tmap->r_in_tmap = 1; 8795 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8796 } 8797 /* 8798 * Now lets possibly clear the sack filter so we start 8799 * recognizing sacks that cover this area. 8800 */ 8801 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 8802 8803 } 8804 8805 static void 8806 rack_do_decay(struct tcp_rack *rack) 8807 { 8808 struct timeval res; 8809 8810 #define timersub(tvp, uvp, vvp) \ 8811 do { \ 8812 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 8813 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 8814 if ((vvp)->tv_usec < 0) { \ 8815 (vvp)->tv_sec--; \ 8816 (vvp)->tv_usec += 1000000; \ 8817 } \ 8818 } while (0) 8819 8820 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 8821 #undef timersub 8822 8823 rack->r_ctl.input_pkt++; 8824 if ((rack->rc_in_persist) || 8825 (res.tv_sec >= 1) || 8826 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 8827 /* 8828 * Check for decay of non-SAD, 8829 * we want all SAD detection metrics to 8830 * decay 1/4 per second (or more) passed. 8831 */ 8832 #ifdef NETFLIX_EXP_DETECTION 8833 uint32_t pkt_delta; 8834 8835 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 8836 #endif 8837 /* Update our saved tracking values */ 8838 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 8839 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 8840 /* Now do we escape without decay? */ 8841 #ifdef NETFLIX_EXP_DETECTION 8842 if (rack->rc_in_persist || 8843 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 8844 (pkt_delta < tcp_sad_low_pps)){ 8845 /* 8846 * We don't decay idle connections 8847 * or ones that have a low input pps. 
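 * When we do fall through, ack_count, sack_count,
 * sack_moved_extra and sack_noextra_move are each scaled down
 * by ctf_decay_count() using tcp_sad_decay_val, so the
 * sack-to-ack and move ratios that rack_do_detection() looks
 * at weight recent behaviour over old history.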
8848 */ 8849 return; 8850 } 8851 /* Decay the counters */ 8852 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 8853 tcp_sad_decay_val); 8854 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 8855 tcp_sad_decay_val); 8856 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 8857 tcp_sad_decay_val); 8858 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 8859 tcp_sad_decay_val); 8860 #endif 8861 } 8862 } 8863 8864 static void 8865 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to) 8866 { 8867 struct rack_sendmap *rsm; 8868 #ifdef INVARIANTS 8869 struct rack_sendmap *rm; 8870 #endif 8871 8872 /* 8873 * The ACK point is advancing to th_ack, we must drop off 8874 * the packets in the rack log and calculate any eligble 8875 * RTT's. 8876 */ 8877 rack->r_wanted_output = 1; 8878 8879 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 8880 if ((rack->rc_last_tlp_acked_set == 1)&& 8881 (rack->rc_last_tlp_past_cumack == 1) && 8882 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 8883 /* 8884 * We have reached the point where our last rack 8885 * tlp retransmit sequence is ahead of the cum-ack. 8886 * This can only happen when the cum-ack moves all 8887 * the way around (its been a full 2^^31+1 bytes 8888 * or more since we sent a retransmitted TLP). Lets 8889 * turn off the valid flag since its not really valid. 8890 * 8891 * Note since sack's also turn on this event we have 8892 * a complication, we have to wait to age it out until 8893 * the cum-ack is by the TLP before checking which is 8894 * what the next else clause does. 8895 */ 8896 rack_log_dsack_event(rack, 9, __LINE__, 8897 rack->r_ctl.last_tlp_acked_start, 8898 rack->r_ctl.last_tlp_acked_end); 8899 rack->rc_last_tlp_acked_set = 0; 8900 rack->rc_last_tlp_past_cumack = 0; 8901 } else if ((rack->rc_last_tlp_acked_set == 1) && 8902 (rack->rc_last_tlp_past_cumack == 0) && 8903 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 8904 /* 8905 * It is safe to start aging TLP's out. 8906 */ 8907 rack->rc_last_tlp_past_cumack = 1; 8908 } 8909 /* We do the same for the tlp send seq as well */ 8910 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 8911 (rack->rc_last_sent_tlp_past_cumack == 1) && 8912 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 8913 rack_log_dsack_event(rack, 9, __LINE__, 8914 rack->r_ctl.last_sent_tlp_seq, 8915 (rack->r_ctl.last_sent_tlp_seq + 8916 rack->r_ctl.last_sent_tlp_len)); 8917 rack->rc_last_sent_tlp_seq_valid = 0; 8918 rack->rc_last_sent_tlp_past_cumack = 0; 8919 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 8920 (rack->rc_last_sent_tlp_past_cumack == 0) && 8921 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 8922 /* 8923 * It is safe to start aging TLP's send. 8924 */ 8925 rack->rc_last_sent_tlp_past_cumack = 1; 8926 } 8927 more: 8928 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 8929 if (rsm == NULL) { 8930 if ((th_ack - 1) == tp->iss) { 8931 /* 8932 * For the SYN incoming case we will not 8933 * have called tcp_output for the sending of 8934 * the SYN, so there will be no map. All 8935 * other cases should probably be a panic. 
8936 */ 8937 return; 8938 } 8939 if (tp->t_flags & TF_SENTFIN) { 8940 /* if we sent a FIN we often will not have map */ 8941 return; 8942 } 8943 #ifdef INVARIANTS 8944 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", 8945 tp, 8946 tp->t_state, th_ack, rack, 8947 tp->snd_una, tp->snd_max, tp->snd_nxt); 8948 #endif 8949 return; 8950 } 8951 if (SEQ_LT(th_ack, rsm->r_start)) { 8952 /* Huh map is missing this */ 8953 #ifdef INVARIANTS 8954 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 8955 rsm->r_start, 8956 th_ack, tp->t_state, rack->r_state); 8957 #endif 8958 return; 8959 } 8960 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 8961 8962 /* Now was it a retransmitted TLP? */ 8963 if ((rsm->r_flags & RACK_TLP) && 8964 (rsm->r_rtr_cnt > 1)) { 8965 /* 8966 * Yes, this rsm was a TLP and retransmitted, remember that 8967 * since if a DSACK comes back on this we don't want 8968 * to think of it as a reordered segment. This may 8969 * get updated again with possibly even other TLPs 8970 * in flight, but thats ok. Only when we don't send 8971 * a retransmitted TLP for 1/2 the sequences space 8972 * will it get turned off (above). 8973 */ 8974 if (rack->rc_last_tlp_acked_set && 8975 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8976 /* 8977 * We already turned this on since the end matches, 8978 * the previous one was a partially ack now we 8979 * are getting another one (maybe all of it). 8980 */ 8981 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8982 /* 8983 * Lets make sure we have all of it though. 8984 */ 8985 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8986 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8987 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8988 rack->r_ctl.last_tlp_acked_end); 8989 } 8990 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8991 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8992 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8993 rack->r_ctl.last_tlp_acked_end); 8994 } 8995 } else { 8996 rack->rc_last_tlp_past_cumack = 1; 8997 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8998 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8999 rack->rc_last_tlp_acked_set = 1; 9000 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9001 } 9002 } 9003 /* Now do we consume the whole thing? */ 9004 if (SEQ_GEQ(th_ack, rsm->r_end)) { 9005 /* Its all consumed. 
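 * (th_ack is at or beyond r_end, so once the accounting below
 * is unwound the entry is freed and, if the cum-ack covered
 * still more data, we loop back to "more" for the next map
 * entry.)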
*/ 9006 uint32_t left; 9007 uint8_t newly_acked; 9008 9009 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 9010 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 9011 rsm->r_rtr_bytes = 0; 9012 /* Record the time of highest cumack sent */ 9013 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9014 #ifndef INVARIANTS 9015 (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 9016 #else 9017 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 9018 if (rm != rsm) { 9019 panic("removing head in rack:%p rsm:%p rm:%p", 9020 rack, rsm, rm); 9021 } 9022 #endif 9023 if (rsm->r_in_tmap) { 9024 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9025 rsm->r_in_tmap = 0; 9026 } 9027 newly_acked = 1; 9028 if (rsm->r_flags & RACK_ACKED) { 9029 /* 9030 * It was acked on the scoreboard -- remove 9031 * it from total 9032 */ 9033 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 9034 newly_acked = 0; 9035 } else if (rsm->r_flags & RACK_SACK_PASSED) { 9036 /* 9037 * There are segments ACKED on the 9038 * scoreboard further up. We are seeing 9039 * reordering. 9040 */ 9041 rsm->r_flags &= ~RACK_SACK_PASSED; 9042 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9043 rsm->r_flags |= RACK_ACKED; 9044 rack->r_ctl.rc_reorder_ts = cts; 9045 if (rack->r_ent_rec_ns) { 9046 /* 9047 * We have sent no more, and we saw an sack 9048 * then ack arrive. 9049 */ 9050 rack->r_might_revert = 1; 9051 } 9052 } 9053 if ((rsm->r_flags & RACK_TO_REXT) && 9054 (tp->t_flags & TF_RCVD_TSTMP) && 9055 (to->to_flags & TOF_TS) && 9056 (to->to_tsecr != 0) && 9057 (tp->t_flags & TF_PREVVALID)) { 9058 /* 9059 * We can use the timestamp to see 9060 * if this retransmission was from the 9061 * first transmit. If so we made a mistake. 9062 */ 9063 tp->t_flags &= ~TF_PREVVALID; 9064 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 9065 /* The first transmit is what this ack is for */ 9066 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__); 9067 } 9068 } 9069 left = th_ack - rsm->r_end; 9070 if (rack->app_limited_needs_set && newly_acked) 9071 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 9072 /* Free back to zone */ 9073 rack_free(rack, rsm); 9074 if (left) { 9075 goto more; 9076 } 9077 /* Check for reneging */ 9078 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9079 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 9080 /* 9081 * The peer has moved snd_una up to 9082 * the edge of this send, i.e. one 9083 * that it had previously acked. The only 9084 * way that can be true if the peer threw 9085 * away data (space issues) that it had 9086 * previously sacked (else it would have 9087 * given us snd_una up to (rsm->r_end). 9088 * We need to undo the acked markings here. 9089 * 9090 * Note we have to look to make sure th_ack is 9091 * our rsm->r_start in case we get an old ack 9092 * where th_ack is behind snd_una. 9093 */ 9094 rack_peer_reneges(rack, rsm, th_ack); 9095 } 9096 return; 9097 } 9098 if (rsm->r_flags & RACK_ACKED) { 9099 /* 9100 * It was acked on the scoreboard -- remove it from 9101 * total for the part being cum-acked. 9102 */ 9103 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 9104 } 9105 /* 9106 * Clear the dup ack count for 9107 * the piece that remains. 9108 */ 9109 rsm->r_dupack = 0; 9110 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9111 if (rsm->r_rtr_bytes) { 9112 /* 9113 * It was retransmitted adjust the 9114 * sack holes for what was acked. 
9115 */ 9116 int ack_am; 9117 9118 ack_am = (th_ack - rsm->r_start); 9119 if (ack_am >= rsm->r_rtr_bytes) { 9120 rack->r_ctl.rc_holes_rxt -= ack_am; 9121 rsm->r_rtr_bytes -= ack_am; 9122 } 9123 } 9124 /* 9125 * Update where the piece starts and record 9126 * the time of send of highest cumack sent. 9127 */ 9128 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9129 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 9130 /* Now we need to move our offset forward too */ 9131 if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) { 9132 /* Fix up the orig_m_len and possibly the mbuf offset */ 9133 rack_adjust_orig_mlen(rsm); 9134 } 9135 rsm->soff += (th_ack - rsm->r_start); 9136 rsm->r_start = th_ack; 9137 /* Now do we need to move the mbuf fwd too? */ 9138 if (rsm->m) { 9139 while (rsm->soff >= rsm->m->m_len) { 9140 rsm->soff -= rsm->m->m_len; 9141 rsm->m = rsm->m->m_next; 9142 KASSERT((rsm->m != NULL), 9143 (" nrsm:%p hit at soff:%u null m", 9144 rsm, rsm->soff)); 9145 } 9146 rsm->orig_m_len = rsm->m->m_len; 9147 } 9148 if (rack->app_limited_needs_set) 9149 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 9150 } 9151 9152 static void 9153 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 9154 { 9155 struct rack_sendmap *rsm; 9156 int sack_pass_fnd = 0; 9157 9158 if (rack->r_might_revert) { 9159 /* 9160 * Ok we have reordering, have not sent anything, we 9161 * might want to revert the congestion state if nothing 9162 * further has SACK_PASSED on it. Lets check. 9163 * 9164 * We also get here when we have DSACKs come in for 9165 * all the data that we FR'd. Note that a rxt or tlp 9166 * timer clears this from happening. 9167 */ 9168 9169 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 9170 if (rsm->r_flags & RACK_SACK_PASSED) { 9171 sack_pass_fnd = 1; 9172 break; 9173 } 9174 } 9175 if (sack_pass_fnd == 0) { 9176 /* 9177 * We went into recovery 9178 * incorrectly due to reordering! 9179 */ 9180 int orig_cwnd; 9181 9182 rack->r_ent_rec_ns = 0; 9183 orig_cwnd = tp->snd_cwnd; 9184 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 9185 tp->snd_recover = tp->snd_una; 9186 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 9187 EXIT_RECOVERY(tp->t_flags); 9188 } 9189 rack->r_might_revert = 0; 9190 } 9191 } 9192 9193 #ifdef NETFLIX_EXP_DETECTION 9194 static void 9195 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) 9196 { 9197 if ((rack->do_detection || tcp_force_detection) && 9198 tcp_sack_to_ack_thresh && 9199 tcp_sack_to_move_thresh && 9200 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 9201 /* 9202 * We have thresholds set to find 9203 * possible attackers and disable sack. 9204 * Check them. 
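 * Both ratios below are scaled by 1000. Illustrative numbers,
 * not from a real trace: 2500 sacks against 500 cum-acks
 * gives an ackratio of 5000, and 600 "extra move" sacks out
 * of 1000 total moves gives a moveratio of 600. Sack
 * processing is disabled only when both values exceed their
 * respective sysctl thresholds.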
9205 */ 9206 uint64_t ackratio, moveratio, movetotal; 9207 9208 /* Log detecting */ 9209 rack_log_sad(rack, 1); 9210 ackratio = (uint64_t)(rack->r_ctl.sack_count); 9211 ackratio *= (uint64_t)(1000); 9212 if (rack->r_ctl.ack_count) 9213 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 9214 else { 9215 /* We really should not hit here */ 9216 ackratio = 1000; 9217 } 9218 if ((rack->sack_attack_disable == 0) && 9219 (ackratio > rack_highest_sack_thresh_seen)) 9220 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 9221 movetotal = rack->r_ctl.sack_moved_extra; 9222 movetotal += rack->r_ctl.sack_noextra_move; 9223 moveratio = rack->r_ctl.sack_moved_extra; 9224 moveratio *= (uint64_t)1000; 9225 if (movetotal) 9226 moveratio /= movetotal; 9227 else { 9228 /* No moves, thats pretty good */ 9229 moveratio = 0; 9230 } 9231 if ((rack->sack_attack_disable == 0) && 9232 (moveratio > rack_highest_move_thresh_seen)) 9233 rack_highest_move_thresh_seen = (uint32_t)moveratio; 9234 if (rack->sack_attack_disable == 0) { 9235 if ((ackratio > tcp_sack_to_ack_thresh) && 9236 (moveratio > tcp_sack_to_move_thresh)) { 9237 /* Disable sack processing */ 9238 rack->sack_attack_disable = 1; 9239 if (rack->r_rep_attack == 0) { 9240 rack->r_rep_attack = 1; 9241 counter_u64_add(rack_sack_attacks_detected, 1); 9242 } 9243 if (tcp_attack_on_turns_on_logging) { 9244 /* 9245 * Turn on logging, used for debugging 9246 * false positives. 9247 */ 9248 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 9249 } 9250 /* Clamp the cwnd at flight size */ 9251 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 9252 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 9253 rack_log_sad(rack, 2); 9254 } 9255 } else { 9256 /* We are sack-disabled check for false positives */ 9257 if ((ackratio <= tcp_restoral_thresh) || 9258 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 9259 rack->sack_attack_disable = 0; 9260 rack_log_sad(rack, 3); 9261 /* Restart counting */ 9262 rack->r_ctl.sack_count = 0; 9263 rack->r_ctl.sack_moved_extra = 0; 9264 rack->r_ctl.sack_noextra_move = 1; 9265 rack->r_ctl.ack_count = max(1, 9266 (bytes_this_ack / segsiz)); 9267 9268 if (rack->r_rep_reverse == 0) { 9269 rack->r_rep_reverse = 1; 9270 counter_u64_add(rack_sack_attacks_reversed, 1); 9271 } 9272 /* Restore the cwnd */ 9273 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 9274 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 9275 } 9276 } 9277 } 9278 } 9279 #endif 9280 9281 static int 9282 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 9283 { 9284 9285 uint32_t am, l_end; 9286 int was_tlp = 0; 9287 9288 if (SEQ_GT(end, start)) 9289 am = end - start; 9290 else 9291 am = 0; 9292 if ((rack->rc_last_tlp_acked_set ) && 9293 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 9294 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 9295 /* 9296 * The DSACK is because of a TLP which we don't 9297 * do anything with the reordering window over since 9298 * it was not reordering that caused the DSACK but 9299 * our previous retransmit TLP. 9300 */ 9301 rack_log_dsack_event(rack, 7, __LINE__, start, end); 9302 was_tlp = 1; 9303 goto skip_dsack_round; 9304 } 9305 if (rack->rc_last_sent_tlp_seq_valid) { 9306 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 9307 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 9308 (SEQ_LEQ(end, l_end))) { 9309 /* 9310 * This dsack is from the last sent TLP, ignore it 9311 * for reordering purposes. 
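 * (A DSACK that matches our own probe only tells us the TLP
 * was spurious; it is not evidence that the network reordered
 * anything, so we also skip the dsack-round bookkeeping
 * below.)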
9312 */ 9313 rack_log_dsack_event(rack, 7, __LINE__, start, end); 9314 was_tlp = 1; 9315 goto skip_dsack_round; 9316 } 9317 } 9318 if (rack->rc_dsack_round_seen == 0) { 9319 rack->rc_dsack_round_seen = 1; 9320 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 9321 rack->r_ctl.num_dsack++; 9322 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 9323 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 9324 } 9325 skip_dsack_round: 9326 /* 9327 * We keep track of how many DSACK blocks we get 9328 * after a recovery incident. 9329 */ 9330 rack->r_ctl.dsack_byte_cnt += am; 9331 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 9332 rack->r_ctl.retran_during_recovery && 9333 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 9334 /* 9335 * False recovery most likely culprit is reordering. If 9336 * nothing else is missing we need to revert. 9337 */ 9338 rack->r_might_revert = 1; 9339 rack_handle_might_revert(rack->rc_tp, rack); 9340 rack->r_might_revert = 0; 9341 rack->r_ctl.retran_during_recovery = 0; 9342 rack->r_ctl.dsack_byte_cnt = 0; 9343 } 9344 return (was_tlp); 9345 } 9346 9347 static uint32_t 9348 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) 9349 { 9350 return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt); 9351 } 9352 9353 static int32_t 9354 rack_compute_pipe(struct tcpcb *tp) 9355 { 9356 return ((int32_t)do_rack_compute_pipe(tp, 9357 (struct tcp_rack *)tp->t_fb_ptr, 9358 tp->snd_una)); 9359 } 9360 9361 static void 9362 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 9363 { 9364 /* Deal with changed and PRR here (in recovery only) */ 9365 uint32_t pipe, snd_una; 9366 9367 rack->r_ctl.rc_prr_delivered += changed; 9368 9369 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 9370 /* 9371 * It is all outstanding, we are application limited 9372 * and thus we don't need more room to send anything. 9373 * Note we use tp->snd_una here and not th_ack because 9374 * the data as yet not been cut from the sb. 
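 * (sbavail() has likewise not shrunk yet, so measuring the
 * outstanding data from snd_una keeps both sides of the
 * comparison consistent.)
 *
 * When we do not take this early return and pipe exceeds
 * ssthresh, the proportional step below follows RFC 6937.
 * Illustrative byte counts, not from the source: with
 * ssthresh 14600, prr_delivered 5840, a recovery flight size
 * of 29200 and prr_out 1460, sndcnt works out to
 * 5840 * 14600 / 29200 + 1 - 1460 = 1461, i.e. roughly one
 * more segment of send credit.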
9375 */ 9376 rack->r_ctl.rc_prr_sndcnt = 0; 9377 return; 9378 } 9379 /* Compute prr_sndcnt */ 9380 if (SEQ_GT(tp->snd_una, th_ack)) { 9381 snd_una = tp->snd_una; 9382 } else { 9383 snd_una = th_ack; 9384 } 9385 pipe = do_rack_compute_pipe(tp, rack, snd_una); 9386 if (pipe > tp->snd_ssthresh) { 9387 long sndcnt; 9388 9389 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 9390 if (rack->r_ctl.rc_prr_recovery_fs > 0) 9391 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 9392 else { 9393 rack->r_ctl.rc_prr_sndcnt = 0; 9394 rack_log_to_prr(rack, 9, 0, __LINE__); 9395 sndcnt = 0; 9396 } 9397 sndcnt++; 9398 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 9399 sndcnt -= rack->r_ctl.rc_prr_out; 9400 else 9401 sndcnt = 0; 9402 rack->r_ctl.rc_prr_sndcnt = sndcnt; 9403 rack_log_to_prr(rack, 10, 0, __LINE__); 9404 } else { 9405 uint32_t limit; 9406 9407 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 9408 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 9409 else 9410 limit = 0; 9411 if (changed > limit) 9412 limit = changed; 9413 limit += ctf_fixed_maxseg(tp); 9414 if (tp->snd_ssthresh > pipe) { 9415 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 9416 rack_log_to_prr(rack, 11, 0, __LINE__); 9417 } else { 9418 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 9419 rack_log_to_prr(rack, 12, 0, __LINE__); 9420 } 9421 } 9422 } 9423 9424 static void 9425 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck) 9426 { 9427 uint32_t changed; 9428 struct tcp_rack *rack; 9429 struct rack_sendmap *rsm; 9430 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 9431 register uint32_t th_ack; 9432 int32_t i, j, k, num_sack_blks = 0; 9433 uint32_t cts, acked, ack_point; 9434 int loop_start = 0, moved_two = 0; 9435 uint32_t tsused; 9436 9437 9438 INP_WLOCK_ASSERT(tptoinpcb(tp)); 9439 if (tcp_get_flags(th) & TH_RST) { 9440 /* We don't log resets */ 9441 return; 9442 } 9443 rack = (struct tcp_rack *)tp->t_fb_ptr; 9444 cts = tcp_get_usecs(NULL); 9445 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9446 changed = 0; 9447 th_ack = th->th_ack; 9448 if (rack->sack_attack_disable == 0) 9449 rack_do_decay(rack); 9450 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 9451 /* 9452 * You only get credit for 9453 * MSS and greater (and you get extra 9454 * credit for larger cum-ack moves). 9455 */ 9456 int ac; 9457 9458 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 9459 rack->r_ctl.ack_count += ac; 9460 counter_u64_add(rack_ack_total, ac); 9461 } 9462 if (rack->r_ctl.ack_count > 0xfff00000) { 9463 /* 9464 * reduce the number to keep us under 9465 * a uint32_t. 9466 */ 9467 rack->r_ctl.ack_count /= 2; 9468 rack->r_ctl.sack_count /= 2; 9469 } 9470 if (SEQ_GT(th_ack, tp->snd_una)) { 9471 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 9472 tp->t_acktime = ticks; 9473 } 9474 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 9475 changed = th_ack - rsm->r_start; 9476 if (changed) { 9477 rack_process_to_cumack(tp, rack, th_ack, cts, to); 9478 } 9479 if ((to->to_flags & TOF_SACK) == 0) { 9480 /* We are done nothing left and no sack. */ 9481 rack_handle_might_revert(tp, rack); 9482 /* 9483 * For cases where we struck a dup-ack 9484 * with no SACK, add to the changes so 9485 * PRR will work right. 
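 * (Crediting one MSS worth of "changed" here lets
 * rack_update_prr() hand out send credit even though the
 * sack scoreboard itself did not move.)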
9486 */ 9487 if (dup_ack_struck && (changed == 0)) { 9488 changed += ctf_fixed_maxseg(rack->rc_tp); 9489 } 9490 goto out; 9491 } 9492 /* Sack block processing */ 9493 if (SEQ_GT(th_ack, tp->snd_una)) 9494 ack_point = th_ack; 9495 else 9496 ack_point = tp->snd_una; 9497 for (i = 0; i < to->to_nsacks; i++) { 9498 bcopy((to->to_sacks + i * TCPOLEN_SACK), 9499 &sack, sizeof(sack)); 9500 sack.start = ntohl(sack.start); 9501 sack.end = ntohl(sack.end); 9502 if (SEQ_GT(sack.end, sack.start) && 9503 SEQ_GT(sack.start, ack_point) && 9504 SEQ_LT(sack.start, tp->snd_max) && 9505 SEQ_GT(sack.end, ack_point) && 9506 SEQ_LEQ(sack.end, tp->snd_max)) { 9507 sack_blocks[num_sack_blks] = sack; 9508 num_sack_blks++; 9509 } else if (SEQ_LEQ(sack.start, th_ack) && 9510 SEQ_LEQ(sack.end, th_ack)) { 9511 int was_tlp; 9512 9513 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 9514 /* 9515 * Its a D-SACK block. 9516 */ 9517 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 9518 } 9519 } 9520 if (rack->rc_dsack_round_seen) { 9521 /* Is the dsack roound over? */ 9522 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 9523 /* Yes it is */ 9524 rack->rc_dsack_round_seen = 0; 9525 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 9526 } 9527 } 9528 /* 9529 * Sort the SACK blocks so we can update the rack scoreboard with 9530 * just one pass. 9531 */ 9532 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 9533 num_sack_blks, th->th_ack); 9534 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 9535 if (num_sack_blks == 0) { 9536 /* Nothing to sack (DSACKs?) */ 9537 goto out_with_totals; 9538 } 9539 if (num_sack_blks < 2) { 9540 /* Only one, we don't need to sort */ 9541 goto do_sack_work; 9542 } 9543 /* Sort the sacks */ 9544 for (i = 0; i < num_sack_blks; i++) { 9545 for (j = i + 1; j < num_sack_blks; j++) { 9546 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 9547 sack = sack_blocks[i]; 9548 sack_blocks[i] = sack_blocks[j]; 9549 sack_blocks[j] = sack; 9550 } 9551 } 9552 } 9553 /* 9554 * Now are any of the sack block ends the same (yes some 9555 * implementations send these)? 9556 */ 9557 again: 9558 if (num_sack_blks == 0) 9559 goto out_with_totals; 9560 if (num_sack_blks > 1) { 9561 for (i = 0; i < num_sack_blks; i++) { 9562 for (j = i + 1; j < num_sack_blks; j++) { 9563 if (sack_blocks[i].end == sack_blocks[j].end) { 9564 /* 9565 * Ok these two have the same end we 9566 * want the smallest end and then 9567 * throw away the larger and start 9568 * again. 9569 */ 9570 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 9571 /* 9572 * The second block covers 9573 * more area use that 9574 */ 9575 sack_blocks[i].start = sack_blocks[j].start; 9576 } 9577 /* 9578 * Now collapse out the dup-sack and 9579 * lower the count 9580 */ 9581 for (k = (j + 1); k < num_sack_blks; k++) { 9582 sack_blocks[j].start = sack_blocks[k].start; 9583 sack_blocks[j].end = sack_blocks[k].end; 9584 j++; 9585 } 9586 num_sack_blks--; 9587 goto again; 9588 } 9589 } 9590 } 9591 } 9592 do_sack_work: 9593 /* 9594 * First lets look to see if 9595 * we have retransmitted and 9596 * can use the transmit next? 9597 */ 9598 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9599 if (rsm && 9600 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 9601 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 9602 /* 9603 * We probably did the FR and the next 9604 * SACK in continues as we would expect. 
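 * (The head of the tmap is the oldest outstanding send and so
 * the natural target of the first sack block right after a
 * fast retransmit; trying it first lets rack_proc_sack_blk()
 * skip its rb-tree lookup in the common case.)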
9605 */ 9606 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 9607 if (acked) { 9608 rack->r_wanted_output = 1; 9609 changed += acked; 9610 } 9611 if (num_sack_blks == 1) { 9612 /* 9613 * This is what we would expect from 9614 * a normal implementation to happen 9615 * after we have retransmitted the FR, 9616 * i.e the sack-filter pushes down 9617 * to 1 block and the next to be retransmitted 9618 * is the sequence in the sack block (has more 9619 * are acked). Count this as ACK'd data to boost 9620 * up the chances of recovering any false positives. 9621 */ 9622 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 9623 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 9624 counter_u64_add(rack_express_sack, 1); 9625 if (rack->r_ctl.ack_count > 0xfff00000) { 9626 /* 9627 * reduce the number to keep us under 9628 * a uint32_t. 9629 */ 9630 rack->r_ctl.ack_count /= 2; 9631 rack->r_ctl.sack_count /= 2; 9632 } 9633 goto out_with_totals; 9634 } else { 9635 /* 9636 * Start the loop through the 9637 * rest of blocks, past the first block. 9638 */ 9639 moved_two = 0; 9640 loop_start = 1; 9641 } 9642 } 9643 /* Its a sack of some sort */ 9644 rack->r_ctl.sack_count++; 9645 if (rack->r_ctl.sack_count > 0xfff00000) { 9646 /* 9647 * reduce the number to keep us under 9648 * a uint32_t. 9649 */ 9650 rack->r_ctl.ack_count /= 2; 9651 rack->r_ctl.sack_count /= 2; 9652 } 9653 counter_u64_add(rack_sack_total, 1); 9654 if (rack->sack_attack_disable) { 9655 /* An attacker disablement is in place */ 9656 if (num_sack_blks > 1) { 9657 rack->r_ctl.sack_count += (num_sack_blks - 1); 9658 rack->r_ctl.sack_moved_extra++; 9659 counter_u64_add(rack_move_some, 1); 9660 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 9661 rack->r_ctl.sack_moved_extra /= 2; 9662 rack->r_ctl.sack_noextra_move /= 2; 9663 } 9664 } 9665 goto out; 9666 } 9667 rsm = rack->r_ctl.rc_sacklast; 9668 for (i = loop_start; i < num_sack_blks; i++) { 9669 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 9670 if (acked) { 9671 rack->r_wanted_output = 1; 9672 changed += acked; 9673 } 9674 if (moved_two) { 9675 /* 9676 * If we did not get a SACK for at least a MSS and 9677 * had to move at all, or if we moved more than our 9678 * threshold, it counts against the "extra" move. 9679 */ 9680 rack->r_ctl.sack_moved_extra += moved_two; 9681 counter_u64_add(rack_move_some, 1); 9682 } else { 9683 /* 9684 * else we did not have to move 9685 * any more than we would expect. 9686 */ 9687 rack->r_ctl.sack_noextra_move++; 9688 counter_u64_add(rack_move_none, 1); 9689 } 9690 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 9691 /* 9692 * If the SACK was not a full MSS then 9693 * we add to sack_count the number of 9694 * MSS's (or possibly more than 9695 * a MSS if its a TSO send) we had to skip by. 9696 */ 9697 rack->r_ctl.sack_count += moved_two; 9698 counter_u64_add(rack_sack_total, moved_two); 9699 } 9700 /* 9701 * Now we need to setup for the next 9702 * round. First we make sure we won't 9703 * exceed the size of our uint32_t on 9704 * the various counts, and then clear out 9705 * moved_two. 
9706 */ 9707 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 9708 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 9709 rack->r_ctl.sack_moved_extra /= 2; 9710 rack->r_ctl.sack_noextra_move /= 2; 9711 } 9712 if (rack->r_ctl.sack_count > 0xfff00000) { 9713 rack->r_ctl.ack_count /= 2; 9714 rack->r_ctl.sack_count /= 2; 9715 } 9716 moved_two = 0; 9717 } 9718 out_with_totals: 9719 if (num_sack_blks > 1) { 9720 /* 9721 * You get an extra stroke if 9722 * you have more than one sack-blk, this 9723 * could be where we are skipping forward 9724 * and the sack-filter is still working, or 9725 * it could be an attacker constantly 9726 * moving us. 9727 */ 9728 rack->r_ctl.sack_moved_extra++; 9729 counter_u64_add(rack_move_some, 1); 9730 } 9731 out: 9732 #ifdef NETFLIX_EXP_DETECTION 9733 rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); 9734 #endif 9735 if (changed) { 9736 /* Something changed cancel the rack timer */ 9737 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9738 } 9739 tsused = tcp_get_usecs(NULL); 9740 rsm = tcp_rack_output(tp, rack, tsused); 9741 if ((!IN_FASTRECOVERY(tp->t_flags)) && 9742 rsm && 9743 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 9744 /* Enter recovery */ 9745 entered_recovery = 1; 9746 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 9747 /* 9748 * When we enter recovery we need to assure we send 9749 * one packet. 9750 */ 9751 if (rack->rack_no_prr == 0) { 9752 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 9753 rack_log_to_prr(rack, 8, 0, __LINE__); 9754 } 9755 rack->r_timer_override = 1; 9756 rack->r_early = 0; 9757 rack->r_ctl.rc_agg_early = 0; 9758 } else if (IN_FASTRECOVERY(tp->t_flags) && 9759 rsm && 9760 (rack->r_rr_config == 3)) { 9761 /* 9762 * Assure we can output and we get no 9763 * remembered pace time except the retransmit. 9764 */ 9765 rack->r_timer_override = 1; 9766 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 9767 rack->r_ctl.rc_resend = rsm; 9768 } 9769 if (IN_FASTRECOVERY(tp->t_flags) && 9770 (rack->rack_no_prr == 0) && 9771 (entered_recovery == 0)) { 9772 rack_update_prr(tp, rack, changed, th_ack); 9773 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 9774 ((tcp_in_hpts(rack->rc_inp) == 0) && 9775 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 9776 /* 9777 * If you are pacing output you don't want 9778 * to override. 9779 */ 9780 rack->r_early = 0; 9781 rack->r_ctl.rc_agg_early = 0; 9782 rack->r_timer_override = 1; 9783 } 9784 } 9785 } 9786 9787 static void 9788 rack_strike_dupack(struct tcp_rack *rack) 9789 { 9790 struct rack_sendmap *rsm; 9791 9792 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9793 while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 9794 rsm = TAILQ_NEXT(rsm, r_tnext); 9795 if (rsm->r_flags & RACK_MUST_RXT) { 9796 /* Sendmap entries that are marked to 9797 * be retransmitted do not need dupack's 9798 * struck. We get these marks for a number 9799 * of reasons (rxt timeout with no sack, 9800 * mtu change, or rwnd collapses). When 9801 * these events occur, we know we must retransmit 9802 * them and mark the sendmap entries. Dupack counting 9803 * is not needed since we are already set to retransmit 9804 * it as soon as we can. 9805 */ 9806 continue; 9807 } 9808 } 9809 if (rsm && (rsm->r_dupack < 0xff)) { 9810 rsm->r_dupack++; 9811 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 9812 struct timeval tv; 9813 uint32_t cts; 9814 /* 9815 * Here we see if we need to retransmit. 
For 9816 * a SACK type connection if enough time has passed 9817 * we will get a return of the rsm. For a non-sack 9818 * connection we will get the rsm returned if the 9819 * dupack value is 3 or more. 9820 */ 9821 cts = tcp_get_usecs(&tv); 9822 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 9823 if (rack->r_ctl.rc_resend != NULL) { 9824 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 9825 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 9826 rack->rc_tp->snd_una, __LINE__); 9827 } 9828 rack->r_wanted_output = 1; 9829 rack->r_timer_override = 1; 9830 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 9831 } 9832 } else { 9833 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 9834 } 9835 } 9836 } 9837 9838 static void 9839 rack_check_bottom_drag(struct tcpcb *tp, 9840 struct tcp_rack *rack, 9841 struct socket *so, int32_t acked) 9842 { 9843 uint32_t segsiz, minseg; 9844 9845 segsiz = ctf_fixed_maxseg(tp); 9846 minseg = segsiz; 9847 9848 if (tp->snd_max == tp->snd_una) { 9849 /* 9850 * We are doing dynamic pacing and we are way 9851 * under. Basically everything got acked while 9852 * we were still waiting on the pacer to expire. 9853 * 9854 * This means we need to boost the b/w in 9855 * addition to any earlier boosting of 9856 * the multiplier. 9857 */ 9858 rack->rc_dragged_bottom = 1; 9859 rack_validate_multipliers_at_or_above100(rack); 9860 /* 9861 * Lets use the segment bytes acked plus 9862 * the lowest RTT seen as the basis to 9863 * form a b/w estimate. This will be off 9864 * due to the fact that the true estimate 9865 * should be around 1/2 the time of the RTT 9866 * but we can settle for that. 9867 */ 9868 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 9869 acked) { 9870 uint64_t bw, calc_bw, rtt; 9871 9872 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 9873 if (rtt == 0) { 9874 /* no us sample is there a ms one? */ 9875 if (rack->r_ctl.rack_rs.rs_rtt_lowest) { 9876 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 9877 } else { 9878 goto no_measurement; 9879 } 9880 } 9881 bw = acked; 9882 calc_bw = bw * 1000000; 9883 calc_bw /= rtt; 9884 if (rack->r_ctl.last_max_bw && 9885 (rack->r_ctl.last_max_bw < calc_bw)) { 9886 /* 9887 * If we have a last calculated max bw 9888 * enforce it. 9889 */ 9890 calc_bw = rack->r_ctl.last_max_bw; 9891 } 9892 /* now plop it in */ 9893 if (rack->rc_gp_filled == 0) { 9894 if (calc_bw > ONE_POINT_TWO_MEG) { 9895 /* 9896 * If we have no measurement 9897 * don't let us set in more than 9898 * 1.2Mbps. If we are still too 9899 * low after pacing with this we 9900 * will hopefully have a max b/w 9901 * available to sanity check things. 
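 * (calc_bw above is just the acked bytes scaled to
 * bytes/second, acked * 1000000 / rtt-in-usecs. As an
 * illustration, 29200 bytes acked against a 20000 usec low
 * rtt works out to 1,460,000 and would be clamped here.)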
9902 */ 9903 calc_bw = ONE_POINT_TWO_MEG; 9904 } 9905 rack->r_ctl.rc_rtt_diff = 0; 9906 rack->r_ctl.gp_bw = calc_bw; 9907 rack->rc_gp_filled = 1; 9908 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 9909 rack->r_ctl.num_measurements = RACK_REQ_AVG; 9910 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 9911 } else if (calc_bw > rack->r_ctl.gp_bw) { 9912 rack->r_ctl.rc_rtt_diff = 0; 9913 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 9914 rack->r_ctl.num_measurements = RACK_REQ_AVG; 9915 rack->r_ctl.gp_bw = calc_bw; 9916 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 9917 } else 9918 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9919 if ((rack->gp_ready == 0) && 9920 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 9921 /* We have enough measurements now */ 9922 rack->gp_ready = 1; 9923 rack_set_cc_pacing(rack); 9924 if (rack->defer_options) 9925 rack_apply_deferred_options(rack); 9926 } 9927 /* 9928 * For acks over 1mss we do a extra boost to simulate 9929 * where we would get 2 acks (we want 110 for the mul). 9930 */ 9931 if (acked > segsiz) 9932 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9933 } else { 9934 /* 9935 * zero rtt possibly?, settle for just an old increase. 9936 */ 9937 no_measurement: 9938 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9939 } 9940 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 9941 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 9942 minseg)) && 9943 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 9944 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 9945 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 9946 (segsiz * rack_req_segs))) { 9947 /* 9948 * We are doing dynamic GP pacing and 9949 * we have everything except 1MSS or less 9950 * bytes left out. We are still pacing away. 9951 * And there is data that could be sent, This 9952 * means we are inserting delayed ack time in 9953 * our measurements because we are pacing too slow. 9954 */ 9955 rack_validate_multipliers_at_or_above100(rack); 9956 rack->rc_dragged_bottom = 1; 9957 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9958 } 9959 } 9960 9961 9962 9963 static void 9964 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 9965 { 9966 /* 9967 * The fast output path is enabled and we 9968 * have moved the cumack forward. Lets see if 9969 * we can expand forward the fast path length by 9970 * that amount. What we would ideally like to 9971 * do is increase the number of bytes in the 9972 * fast path block (left_to_send) by the 9973 * acked amount. However we have to gate that 9974 * by two factors: 9975 * 1) The amount outstanding and the rwnd of the peer 9976 * (i.e. we don't want to exceed the rwnd of the peer). 9977 * <and> 9978 * 2) The amount of data left in the socket buffer (i.e. 9979 * we can't send beyond what is in the buffer). 9980 * 9981 * Note that this does not take into account any increase 9982 * in the cwnd. We will only extend the fast path by 9983 * what was acked. 
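 * Concretely, gating_val below is the smaller of the socket
 * buffer bytes not yet sent and the peer's window space not
 * yet used, both measured against the outstanding data
 * (snd_max - snd_una); left_to_send only grows when the new
 * total stays within that bound.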
9984 */ 9985 uint32_t new_total, gating_val; 9986 9987 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 9988 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 9989 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 9990 if (new_total <= gating_val) { 9991 /* We can increase left_to_send by the acked amount */ 9992 counter_u64_add(rack_extended_rfo, 1); 9993 rack->r_ctl.fsb.left_to_send = new_total; 9994 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 9995 ("rack:%p left_to_send:%u sbavail:%u out:%u", 9996 rack, rack->r_ctl.fsb.left_to_send, 9997 sbavail(&rack->rc_inp->inp_socket->so_snd), 9998 (tp->snd_max - tp->snd_una))); 9999 10000 } 10001 } 10002 10003 static void 10004 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) 10005 { 10006 /* 10007 * Here any sendmap entry that points to the 10008 * beginning mbuf must be adjusted to the correct 10009 * offset. This must be called with: 10010 * 1) The socket buffer locked 10011 * 2) snd_una adjusted to its new postion. 10012 * 10013 * Note that (2) implies rack_ack_received has also 10014 * been called. 10015 * 10016 * We grab the first mbuf in the socket buffer and 10017 * then go through the front of the sendmap, recalculating 10018 * the stored offset for any sendmap entry that has 10019 * that mbuf. We must use the sb functions to do this 10020 * since its possible an add was done has well as 10021 * the subtraction we may have just completed. This should 10022 * not be a penalty though, since we just referenced the sb 10023 * to go in and trim off the mbufs that we freed (of course 10024 * there will be a penalty for the sendmap references though). 10025 */ 10026 struct mbuf *m; 10027 struct rack_sendmap *rsm; 10028 10029 SOCKBUF_LOCK_ASSERT(sb); 10030 m = sb->sb_mb; 10031 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 10032 if ((rsm == NULL) || (m == NULL)) { 10033 /* Nothing outstanding */ 10034 return; 10035 } 10036 while (rsm->m && (rsm->m == m)) { 10037 /* one to adjust */ 10038 #ifdef INVARIANTS 10039 struct mbuf *tm; 10040 uint32_t soff; 10041 10042 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 10043 if (rsm->orig_m_len != m->m_len) { 10044 rack_adjust_orig_mlen(rsm); 10045 } 10046 if (rsm->soff != soff) { 10047 /* 10048 * This is not a fatal error, we anticipate it 10049 * might happen (the else code), so we count it here 10050 * so that under invariant we can see that it really 10051 * does happen. 10052 */ 10053 counter_u64_add(rack_adjust_map_bw, 1); 10054 } 10055 rsm->m = tm; 10056 rsm->soff = soff; 10057 if (tm) 10058 rsm->orig_m_len = rsm->m->m_len; 10059 else 10060 rsm->orig_m_len = 0; 10061 #else 10062 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 10063 if (rsm->m) 10064 rsm->orig_m_len = rsm->m->m_len; 10065 else 10066 rsm->orig_m_len = 0; 10067 #endif 10068 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 10069 rsm); 10070 if (rsm == NULL) 10071 break; 10072 } 10073 } 10074 10075 /* 10076 * Return value of 1, we do not need to call rack_process_data(). 10077 * return value of 0, rack_process_data can be called. 10078 * For ret_val if its 0 the TCP is locked, if its non-zero 10079 * its unlocked and probably unsafe to touch the TCB. 
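 * One unlocked (non-zero ret_val) case is handled near the
 * bottom of this function: if the peer keeps sending after
 * the socket is gone we set *ret_val, log the end status and
 * tcp_close() the connection before dropping with a reset.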
10080 */ 10081 static int 10082 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10083 struct tcpcb *tp, struct tcpopt *to, 10084 uint32_t tiwin, int32_t tlen, 10085 int32_t * ofia, int32_t thflags, int32_t *ret_val) 10086 { 10087 int32_t ourfinisacked = 0; 10088 int32_t nsegs, acked_amount; 10089 int32_t acked; 10090 struct mbuf *mfree; 10091 struct tcp_rack *rack; 10092 int32_t under_pacing = 0; 10093 int32_t recovery = 0; 10094 10095 INP_WLOCK_ASSERT(tptoinpcb(tp)); 10096 10097 rack = (struct tcp_rack *)tp->t_fb_ptr; 10098 if (SEQ_GT(th->th_ack, tp->snd_max)) { 10099 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 10100 &rack->r_ctl.challenge_ack_ts, 10101 &rack->r_ctl.challenge_ack_cnt); 10102 rack->r_wanted_output = 1; 10103 return (1); 10104 } 10105 if (rack->gp_ready && 10106 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 10107 under_pacing = 1; 10108 } 10109 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 10110 int in_rec, dup_ack_struck = 0; 10111 10112 in_rec = IN_FASTRECOVERY(tp->t_flags); 10113 if (rack->rc_in_persist) { 10114 tp->t_rxtshift = 0; 10115 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 10116 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 10117 } 10118 if ((th->th_ack == tp->snd_una) && 10119 (tiwin == tp->snd_wnd) && 10120 ((to->to_flags & TOF_SACK) == 0)) { 10121 rack_strike_dupack(rack); 10122 dup_ack_struck = 1; 10123 } 10124 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck); 10125 } 10126 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 10127 /* 10128 * Old ack, behind (or duplicate to) the last one rcv'd 10129 * Note: We mark reordering is occuring if its 10130 * less than and we have not closed our window. 10131 */ 10132 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 10133 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10134 } 10135 return (0); 10136 } 10137 /* 10138 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 10139 * something we sent. 10140 */ 10141 if (tp->t_flags & TF_NEEDSYN) { 10142 /* 10143 * T/TCP: Connection was half-synchronized, and our SYN has 10144 * been ACK'd (so connection is now fully synchronized). Go 10145 * to non-starred state, increment snd_una for ACK of SYN, 10146 * and check if we can do window scaling. 10147 */ 10148 tp->t_flags &= ~TF_NEEDSYN; 10149 tp->snd_una++; 10150 /* Do window scaling? */ 10151 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 10152 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 10153 tp->rcv_scale = tp->request_r_scale; 10154 /* Send window already scaled. */ 10155 } 10156 } 10157 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10158 10159 acked = BYTES_THIS_ACK(tp, th); 10160 if (acked) { 10161 /* 10162 * Any time we move the cum-ack forward clear 10163 * keep-alive tied probe-not-answered. The 10164 * persists clears its own on entry. 10165 */ 10166 rack->probe_not_answered = 0; 10167 } 10168 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 10169 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 10170 /* 10171 * If we just performed our first retransmit, and the ACK arrives 10172 * within our recovery window, then it was a mistake to do the 10173 * retransmit in the first place. Recover our original cwnd and 10174 * ssthresh, and proceed to transmit where we left off. 
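 * The check below only fires when TF_PREVVALID is set,
 * timestamps are not in use on the connection, this was the
 * first retransmit (t_rxtshift == 1) and the ACK arrived
 * while t_badrxtwin was still open, in which case
 * rack_cong_signal(CC_RTO_ERR) restores the prior
 * cwnd/ssthresh.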
10175 */ 10176 if ((tp->t_flags & TF_PREVVALID) && 10177 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 10178 tp->t_flags &= ~TF_PREVVALID; 10179 if (tp->t_rxtshift == 1 && 10180 (int)(ticks - tp->t_badrxtwin) < 0) 10181 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 10182 } 10183 if (acked) { 10184 /* assure we are not backed off */ 10185 tp->t_rxtshift = 0; 10186 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 10187 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 10188 rack->rc_tlp_in_progress = 0; 10189 rack->r_ctl.rc_tlp_cnt_out = 0; 10190 /* 10191 * If it is the RXT timer we want to 10192 * stop it, so we can restart a TLP. 10193 */ 10194 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 10195 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10196 #ifdef NETFLIX_HTTP_LOGGING 10197 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 10198 #endif 10199 } 10200 /* 10201 * If we have a timestamp reply, update smoothed round trip time. If 10202 * no timestamp is present but transmit timer is running and timed 10203 * sequence number was acked, update smoothed round trip time. Since 10204 * we now have an rtt measurement, cancel the timer backoff (cf., 10205 * Phil Karn's retransmit alg.). Recompute the initial retransmit 10206 * timer. 10207 * 10208 * Some boxes send broken timestamp replies during the SYN+ACK 10209 * phase, ignore timestamps of 0 or we could calculate a huge RTT 10210 * and blow up the retransmit timer. 10211 */ 10212 /* 10213 * If all outstanding data is acked, stop retransmit timer and 10214 * remember to restart (more output or persist). If there is more 10215 * data to be acked, restart retransmit timer, using current 10216 * (possibly backed-off) value. 10217 */ 10218 if (acked == 0) { 10219 if (ofia) 10220 *ofia = ourfinisacked; 10221 return (0); 10222 } 10223 if (IN_RECOVERY(tp->t_flags)) { 10224 if (SEQ_LT(th->th_ack, tp->snd_recover) && 10225 (SEQ_LT(th->th_ack, tp->snd_max))) { 10226 tcp_rack_partialack(tp); 10227 } else { 10228 rack_post_recovery(tp, th->th_ack); 10229 recovery = 1; 10230 } 10231 } 10232 /* 10233 * Let the congestion control algorithm update congestion control 10234 * related information. This typically means increasing the 10235 * congestion window. 10236 */ 10237 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); 10238 SOCKBUF_LOCK(&so->so_snd); 10239 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 10240 tp->snd_wnd -= acked_amount; 10241 mfree = sbcut_locked(&so->so_snd, acked_amount); 10242 if ((sbused(&so->so_snd) == 0) && 10243 (acked > acked_amount) && 10244 (tp->t_state >= TCPS_FIN_WAIT_1) && 10245 (tp->t_flags & TF_SENTFIN)) { 10246 /* 10247 * We must be sure our fin 10248 * was sent and acked (we can be 10249 * in FIN_WAIT_1 without having 10250 * sent the fin). 10251 */ 10252 ourfinisacked = 1; 10253 } 10254 tp->snd_una = th->th_ack; 10255 if (acked_amount && sbavail(&so->so_snd)) 10256 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 10257 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 10258 /* NB: sowwakeup_locked() does an implicit unlock. 
*/ 10259 sowwakeup_locked(so); 10260 m_freem(mfree); 10261 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 10262 tp->snd_recover = tp->snd_una; 10263 10264 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 10265 tp->snd_nxt = tp->snd_una; 10266 } 10267 if (under_pacing && 10268 (rack->use_fixed_rate == 0) && 10269 (rack->in_probe_rtt == 0) && 10270 rack->rc_gp_dyn_mul && 10271 rack->rc_always_pace) { 10272 /* Check if we are dragging bottom */ 10273 rack_check_bottom_drag(tp, rack, so, acked); 10274 } 10275 if (tp->snd_una == tp->snd_max) { 10276 /* Nothing left outstanding */ 10277 tp->t_flags &= ~TF_PREVVALID; 10278 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 10279 rack->r_ctl.retran_during_recovery = 0; 10280 rack->r_ctl.dsack_byte_cnt = 0; 10281 if (rack->r_ctl.rc_went_idle_time == 0) 10282 rack->r_ctl.rc_went_idle_time = 1; 10283 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 10284 if (sbavail(&tptosocket(tp)->so_snd) == 0) 10285 tp->t_acktime = 0; 10286 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10287 /* Set need output so persist might get set */ 10288 rack->r_wanted_output = 1; 10289 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 10290 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 10291 (sbavail(&so->so_snd) == 0) && 10292 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 10293 /* 10294 * The socket was gone and the 10295 * peer sent data (now or in the past), time to 10296 * reset him. 10297 */ 10298 *ret_val = 1; 10299 /* tcp_close will kill the inp pre-log the Reset */ 10300 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 10301 tp = tcp_close(tp); 10302 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 10303 return (1); 10304 } 10305 } 10306 if (ofia) 10307 *ofia = ourfinisacked; 10308 return (0); 10309 } 10310 10311 10312 static void 10313 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line, 10314 int dir, uint32_t flags, struct rack_sendmap *rsm) 10315 { 10316 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 10317 union tcp_log_stackspecific log; 10318 struct timeval tv; 10319 10320 memset(&log, 0, sizeof(log)); 10321 log.u_bbr.flex1 = cnt; 10322 log.u_bbr.flex2 = split; 10323 log.u_bbr.flex3 = out; 10324 log.u_bbr.flex4 = line; 10325 log.u_bbr.flex5 = rack->r_must_retran; 10326 log.u_bbr.flex6 = flags; 10327 log.u_bbr.flex7 = rack->rc_has_collapsed; 10328 log.u_bbr.flex8 = dir; /* 10329 * 1 is collapsed, 0 is uncollapsed, 10330 * 2 is log of a rsm being marked, 3 is a split. 10331 */ 10332 if (rsm == NULL) 10333 log.u_bbr.rttProp = 0; 10334 else 10335 log.u_bbr.rttProp = (uint64_t)rsm; 10336 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 10337 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10338 TCP_LOG_EVENTP(rack->rc_tp, NULL, 10339 &rack->rc_inp->inp_socket->so_rcv, 10340 &rack->rc_inp->inp_socket->so_snd, 10341 TCP_RACK_LOG_COLLAPSE, 0, 10342 0, &log, false, &tv); 10343 } 10344 } 10345 10346 static void 10347 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line) 10348 { 10349 /* 10350 * Here all we do is mark the collapsed point and set the flag. 10351 * This may happen again and again, but there is no 10352 * sense splitting our map until we know where the 10353 * peer finally lands in the collapse. 
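 * The collapse point recorded here is the new right edge of
 * the peer's window (snd_una + snd_wnd) and
 * high_collapse_point is snd_max; the actual sendmap
 * marking/splitting is deferred to rack_un_collapse_window()
 * once the window opens back up.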
10354 */ 10355 rack_trace_point(rack, RACK_TP_COLLAPSED_WND); 10356 if ((rack->rc_has_collapsed == 0) || 10357 (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd))) 10358 counter_u64_add(rack_collapsed_win_seen, 1); 10359 rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 10360 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; 10361 rack->rc_has_collapsed = 1; 10362 rack->r_collapse_point_valid = 1; 10363 rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); 10364 } 10365 10366 static void 10367 rack_un_collapse_window(struct tcp_rack *rack, int line) 10368 { 10369 struct rack_sendmap *nrsm, *rsm, fe; 10370 int cnt = 0, split = 0; 10371 #ifdef INVARIANTS 10372 struct rack_sendmap *insret; 10373 #endif 10374 10375 memset(&fe, 0, sizeof(fe)); 10376 rack->rc_has_collapsed = 0; 10377 fe.r_start = rack->r_ctl.last_collapse_point; 10378 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 10379 if (rsm == NULL) { 10380 /* Nothing to do maybe the peer ack'ed it all */ 10381 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 10382 return; 10383 } 10384 /* Now do we need to split this one? */ 10385 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) { 10386 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 10387 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm); 10388 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 10389 if (nrsm == NULL) { 10390 /* We can't get a rsm, mark all? */ 10391 nrsm = rsm; 10392 goto no_split; 10393 } 10394 /* Clone it */ 10395 split = 1; 10396 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); 10397 #ifndef INVARIANTS 10398 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 10399 #else 10400 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 10401 if (insret != NULL) { 10402 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 10403 nrsm, insret, rack, rsm); 10404 } 10405 #endif 10406 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 10407 rack->r_ctl.last_collapse_point, __LINE__); 10408 if (rsm->r_in_tmap) { 10409 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 10410 nrsm->r_in_tmap = 1; 10411 } 10412 /* 10413 * Set in the new RSM as the 10414 * collapsed starting point 10415 */ 10416 rsm = nrsm; 10417 } 10418 no_split: 10419 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 10420 nrsm->r_flags |= RACK_RWND_COLLAPSED; 10421 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); 10422 cnt++; 10423 } 10424 if (cnt) { 10425 counter_u64_add(rack_collapsed_win, 1); 10426 } 10427 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 10428 } 10429 10430 static void 10431 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 10432 int32_t tlen, int32_t tfo_syn) 10433 { 10434 if (DELAY_ACK(tp, tlen) || tfo_syn) { 10435 if (rack->rc_dack_mode && 10436 (tlen > 500) && 10437 (rack->rc_dack_toggle == 1)) { 10438 goto no_delayed_ack; 10439 } 10440 rack_timer_cancel(tp, rack, 10441 rack->r_ctl.rc_rcvtime, __LINE__); 10442 tp->t_flags |= TF_DELACK; 10443 } else { 10444 no_delayed_ack: 10445 rack->r_wanted_output = 1; 10446 tp->t_flags |= TF_ACKNOW; 10447 if (rack->rc_dack_mode) { 10448 if (tp->t_flags & TF_DELACK) 10449 rack->rc_dack_toggle = 1; 10450 else 10451 rack->rc_dack_toggle = 0; 10452 } 10453 } 10454 } 10455 10456 static void 10457 rack_validate_fo_sendwin_up(struct tcpcb 
*tp, struct tcp_rack *rack) 10458 { 10459 /* 10460 * If fast output is in progress, lets validate that 10461 * the new window did not shrink on us and make it 10462 * so fast output should end. 10463 */ 10464 if (rack->r_fast_output) { 10465 uint32_t out; 10466 10467 /* 10468 * Calculate what we will send if left as is 10469 * and compare that to our send window. 10470 */ 10471 out = ctf_outstanding(tp); 10472 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 10473 /* ok we have an issue */ 10474 if (out >= tp->snd_wnd) { 10475 /* Turn off fast output the window is met or collapsed */ 10476 rack->r_fast_output = 0; 10477 } else { 10478 /* we have some room left */ 10479 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 10480 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 10481 /* If not at least 1 full segment never mind */ 10482 rack->r_fast_output = 0; 10483 } 10484 } 10485 } 10486 } 10487 } 10488 10489 10490 /* 10491 * Return value of 1, the TCB is unlocked and most 10492 * likely gone, return value of 0, the TCP is still 10493 * locked. 10494 */ 10495 static int 10496 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 10497 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 10498 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 10499 { 10500 /* 10501 * Update window information. Don't look at window if no ACK: TAC's 10502 * send garbage on first SYN. 10503 */ 10504 int32_t nsegs; 10505 int32_t tfo_syn; 10506 struct tcp_rack *rack; 10507 10508 INP_WLOCK_ASSERT(tptoinpcb(tp)); 10509 10510 rack = (struct tcp_rack *)tp->t_fb_ptr; 10511 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10512 if ((thflags & TH_ACK) && 10513 (SEQ_LT(tp->snd_wl1, th->th_seq) || 10514 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 10515 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 10516 /* keep track of pure window updates */ 10517 if (tlen == 0 && 10518 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 10519 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 10520 tp->snd_wnd = tiwin; 10521 rack_validate_fo_sendwin_up(tp, rack); 10522 tp->snd_wl1 = th->th_seq; 10523 tp->snd_wl2 = th->th_ack; 10524 if (tp->snd_wnd > tp->max_sndwnd) 10525 tp->max_sndwnd = tp->snd_wnd; 10526 rack->r_wanted_output = 1; 10527 } else if (thflags & TH_ACK) { 10528 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 10529 tp->snd_wnd = tiwin; 10530 rack_validate_fo_sendwin_up(tp, rack); 10531 tp->snd_wl1 = th->th_seq; 10532 tp->snd_wl2 = th->th_ack; 10533 } 10534 } 10535 if (tp->snd_wnd < ctf_outstanding(tp)) 10536 /* The peer collapsed the window */ 10537 rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__); 10538 else if (rack->rc_has_collapsed) 10539 rack_un_collapse_window(rack, __LINE__); 10540 if ((rack->r_collapse_point_valid) && 10541 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point))) 10542 rack->r_collapse_point_valid = 0; 10543 /* Was persist timer active and now we have window space? */ 10544 if ((rack->rc_in_persist != 0) && 10545 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 10546 rack->r_ctl.rc_pace_min_segs))) { 10547 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10548 tp->snd_nxt = tp->snd_max; 10549 /* Make sure we output to start the timer */ 10550 rack->r_wanted_output = 1; 10551 } 10552 /* Do we enter persists? 
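 * This mirrors the exit test just above: we only drop into
 * persist when the offered window falls below
 * min(rc_high_rwnd/2, rc_pace_min_segs) and the socket
 * buffer holds more data than that window allows.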
*/ 10553 if ((rack->rc_in_persist == 0) && 10554 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 10555 TCPS_HAVEESTABLISHED(tp->t_state) && 10556 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 10557 sbavail(&tptosocket(tp)->so_snd) && 10558 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 10559 /* 10560 * Here the rwnd is less than 10561 * the pacing size, we are established, 10562 * nothing is outstanding, and there is 10563 * data to send. Enter persists. 10564 */ 10565 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10566 } 10567 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 10568 m_freem(m); 10569 return (0); 10570 } 10571 /* 10572 * don't process the URG bit, ignore them drag 10573 * along the up. 10574 */ 10575 tp->rcv_up = tp->rcv_nxt; 10576 10577 /* 10578 * Process the segment text, merging it into the TCP sequencing 10579 * queue, and arranging for acknowledgment of receipt if necessary. 10580 * This process logically involves adjusting tp->rcv_wnd as data is 10581 * presented to the user (this happens in tcp_usrreq.c, case 10582 * PRU_RCVD). If a FIN has already been received on this connection 10583 * then we just ignore the text. 10584 */ 10585 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 10586 IS_FASTOPEN(tp->t_flags)); 10587 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 10588 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10589 tcp_seq save_start = th->th_seq; 10590 tcp_seq save_rnxt = tp->rcv_nxt; 10591 int save_tlen = tlen; 10592 10593 m_adj(m, drop_hdrlen); /* delayed header drop */ 10594 /* 10595 * Insert segment which includes th into TCP reassembly 10596 * queue with control block tp. Set thflags to whether 10597 * reassembly now includes a segment with FIN. This handles 10598 * the common case inline (segment is the next to be 10599 * received on an established connection, and the queue is 10600 * empty), avoiding linkage into and removal from the queue 10601 * and repetition of various conversions. Set DELACK for 10602 * segments received in order, but ack immediately when 10603 * segments are out of order (so fast retransmit can work). 10604 */ 10605 if (th->th_seq == tp->rcv_nxt && 10606 SEGQ_EMPTY(tp) && 10607 (TCPS_HAVEESTABLISHED(tp->t_state) || 10608 tfo_syn)) { 10609 #ifdef NETFLIX_SB_LIMITS 10610 u_int mcnt, appended; 10611 10612 if (so->so_rcv.sb_shlim) { 10613 mcnt = m_memcnt(m); 10614 appended = 0; 10615 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10616 CFO_NOSLEEP, NULL) == false) { 10617 counter_u64_add(tcp_sb_shlim_fails, 1); 10618 m_freem(m); 10619 return (0); 10620 } 10621 } 10622 #endif 10623 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 10624 tp->rcv_nxt += tlen; 10625 if (tlen && 10626 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10627 (tp->t_fbyte_in == 0)) { 10628 tp->t_fbyte_in = ticks; 10629 if (tp->t_fbyte_in == 0) 10630 tp->t_fbyte_in = 1; 10631 if (tp->t_fbyte_out && tp->t_fbyte_in) 10632 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10633 } 10634 thflags = tcp_get_flags(th) & TH_FIN; 10635 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10636 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10637 SOCKBUF_LOCK(&so->so_rcv); 10638 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10639 m_freem(m); 10640 } else 10641 #ifdef NETFLIX_SB_LIMITS 10642 appended = 10643 #endif 10644 sbappendstream_locked(&so->so_rcv, m, 0); 10645 10646 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10647 /* NB: sorwakeup_locked() does an implicit unlock. 
*/ 10648 sorwakeup_locked(so); 10649 #ifdef NETFLIX_SB_LIMITS 10650 if (so->so_rcv.sb_shlim && appended != mcnt) 10651 counter_fo_release(so->so_rcv.sb_shlim, 10652 mcnt - appended); 10653 #endif 10654 } else { 10655 /* 10656 * XXX: Due to the header drop above "th" is 10657 * theoretically invalid by now. Fortunately 10658 * m_adj() doesn't actually frees any mbufs when 10659 * trimming from the head. 10660 */ 10661 tcp_seq temp = save_start; 10662 10663 thflags = tcp_reass(tp, th, &temp, &tlen, m); 10664 tp->t_flags |= TF_ACKNOW; 10665 if (tp->t_flags & TF_WAKESOR) { 10666 tp->t_flags &= ~TF_WAKESOR; 10667 /* NB: sorwakeup_locked() does an implicit unlock. */ 10668 sorwakeup_locked(so); 10669 } 10670 } 10671 if ((tp->t_flags & TF_SACK_PERMIT) && 10672 (save_tlen > 0) && 10673 TCPS_HAVEESTABLISHED(tp->t_state)) { 10674 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 10675 /* 10676 * DSACK actually handled in the fastpath 10677 * above. 10678 */ 10679 RACK_OPTS_INC(tcp_sack_path_1); 10680 tcp_update_sack_list(tp, save_start, 10681 save_start + save_tlen); 10682 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 10683 if ((tp->rcv_numsacks >= 1) && 10684 (tp->sackblks[0].end == save_start)) { 10685 /* 10686 * Partial overlap, recorded at todrop 10687 * above. 10688 */ 10689 RACK_OPTS_INC(tcp_sack_path_2a); 10690 tcp_update_sack_list(tp, 10691 tp->sackblks[0].start, 10692 tp->sackblks[0].end); 10693 } else { 10694 RACK_OPTS_INC(tcp_sack_path_2b); 10695 tcp_update_dsack_list(tp, save_start, 10696 save_start + save_tlen); 10697 } 10698 } else if (tlen >= save_tlen) { 10699 /* Update of sackblks. */ 10700 RACK_OPTS_INC(tcp_sack_path_3); 10701 tcp_update_dsack_list(tp, save_start, 10702 save_start + save_tlen); 10703 } else if (tlen > 0) { 10704 RACK_OPTS_INC(tcp_sack_path_4); 10705 tcp_update_dsack_list(tp, save_start, 10706 save_start + tlen); 10707 } 10708 } 10709 } else { 10710 m_freem(m); 10711 thflags &= ~TH_FIN; 10712 } 10713 10714 /* 10715 * If FIN is received ACK the FIN and let the user know that the 10716 * connection is closing. 10717 */ 10718 if (thflags & TH_FIN) { 10719 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10720 /* The socket upcall is handled by socantrcvmore. */ 10721 socantrcvmore(so); 10722 /* 10723 * If connection is half-synchronized (ie NEEDSYN 10724 * flag on) then delay ACK, so it may be piggybacked 10725 * when SYN is sent. Otherwise, since we received a 10726 * FIN then no more input can be expected, send ACK 10727 * now. 10728 */ 10729 if (tp->t_flags & TF_NEEDSYN) { 10730 rack_timer_cancel(tp, rack, 10731 rack->r_ctl.rc_rcvtime, __LINE__); 10732 tp->t_flags |= TF_DELACK; 10733 } else { 10734 tp->t_flags |= TF_ACKNOW; 10735 } 10736 tp->rcv_nxt++; 10737 } 10738 switch (tp->t_state) { 10739 /* 10740 * In SYN_RECEIVED and ESTABLISHED STATES enter the 10741 * CLOSE_WAIT state. 10742 */ 10743 case TCPS_SYN_RECEIVED: 10744 tp->t_starttime = ticks; 10745 /* FALLTHROUGH */ 10746 case TCPS_ESTABLISHED: 10747 rack_timer_cancel(tp, rack, 10748 rack->r_ctl.rc_rcvtime, __LINE__); 10749 tcp_state_change(tp, TCPS_CLOSE_WAIT); 10750 break; 10751 10752 /* 10753 * If still in FIN_WAIT_1 STATE FIN has not been 10754 * acked so enter the CLOSING state. 10755 */ 10756 case TCPS_FIN_WAIT_1: 10757 rack_timer_cancel(tp, rack, 10758 rack->r_ctl.rc_rcvtime, __LINE__); 10759 tcp_state_change(tp, TCPS_CLOSING); 10760 break; 10761 10762 /* 10763 * In FIN_WAIT_2 state enter the TIME_WAIT state, 10764 * starting the time-wait timer, turning off the 10765 * other standard timers. 
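 * After tcp_twstart() the tcb is no longer ours to touch
 * (it is unlocked and possibly gone), hence the return of 1
 * below, matching the convention documented above
 * rack_process_data().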
10766 */ 10767 case TCPS_FIN_WAIT_2: 10768 rack_timer_cancel(tp, rack, 10769 rack->r_ctl.rc_rcvtime, __LINE__); 10770 tcp_twstart(tp); 10771 return (1); 10772 } 10773 } 10774 /* 10775 * Return any desired output. 10776 */ 10777 if ((tp->t_flags & TF_ACKNOW) || 10778 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 10779 rack->r_wanted_output = 1; 10780 } 10781 return (0); 10782 } 10783 10784 /* 10785 * Here nothing is really faster, its just that we 10786 * have broken out the fast-data path also just like 10787 * the fast-ack. 10788 */ 10789 static int 10790 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 10791 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10792 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 10793 { 10794 int32_t nsegs; 10795 int32_t newsize = 0; /* automatic sockbuf scaling */ 10796 struct tcp_rack *rack; 10797 #ifdef NETFLIX_SB_LIMITS 10798 u_int mcnt, appended; 10799 #endif 10800 #ifdef TCPDEBUG 10801 /* 10802 * The size of tcp_saveipgen must be the size of the max ip header, 10803 * now IPv6. 10804 */ 10805 u_char tcp_saveipgen[IP6_HDR_LEN]; 10806 struct tcphdr tcp_savetcp; 10807 short ostate = 0; 10808 10809 #endif 10810 /* 10811 * If last ACK falls within this segment's sequence numbers, record 10812 * the timestamp. NOTE that the test is modified according to the 10813 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 10814 */ 10815 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 10816 return (0); 10817 } 10818 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10819 return (0); 10820 } 10821 if (tiwin && tiwin != tp->snd_wnd) { 10822 return (0); 10823 } 10824 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 10825 return (0); 10826 } 10827 if (__predict_false((to->to_flags & TOF_TS) && 10828 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 10829 return (0); 10830 } 10831 if (__predict_false((th->th_ack != tp->snd_una))) { 10832 return (0); 10833 } 10834 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 10835 return (0); 10836 } 10837 if ((to->to_flags & TOF_TS) != 0 && 10838 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 10839 tp->ts_recent_age = tcp_ts_getticks(); 10840 tp->ts_recent = to->to_tsval; 10841 } 10842 rack = (struct tcp_rack *)tp->t_fb_ptr; 10843 /* 10844 * This is a pure, in-sequence data packet with nothing on the 10845 * reassembly queue and we have enough buffer space to take it. 10846 */ 10847 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10848 10849 #ifdef NETFLIX_SB_LIMITS 10850 if (so->so_rcv.sb_shlim) { 10851 mcnt = m_memcnt(m); 10852 appended = 0; 10853 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10854 CFO_NOSLEEP, NULL) == false) { 10855 counter_u64_add(tcp_sb_shlim_fails, 1); 10856 m_freem(m); 10857 return (1); 10858 } 10859 } 10860 #endif 10861 /* Clean receiver SACK report if present */ 10862 if (tp->rcv_numsacks) 10863 tcp_clean_sackreport(tp); 10864 KMOD_TCPSTAT_INC(tcps_preddat); 10865 tp->rcv_nxt += tlen; 10866 if (tlen && 10867 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10868 (tp->t_fbyte_in == 0)) { 10869 tp->t_fbyte_in = ticks; 10870 if (tp->t_fbyte_in == 0) 10871 tp->t_fbyte_in = 1; 10872 if (tp->t_fbyte_out && tp->t_fbyte_in) 10873 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10874 } 10875 /* 10876 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 10877 */ 10878 tp->snd_wl1 = th->th_seq; 10879 /* 10880 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 
10881 */ 10882 tp->rcv_up = tp->rcv_nxt; 10883 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10884 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10885 #ifdef TCPDEBUG 10886 if (so->so_options & SO_DEBUG) 10887 tcp_trace(TA_INPUT, ostate, tp, 10888 (void *)tcp_saveipgen, &tcp_savetcp, 0); 10889 #endif 10890 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 10891 10892 /* Add data to socket buffer. */ 10893 SOCKBUF_LOCK(&so->so_rcv); 10894 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10895 m_freem(m); 10896 } else { 10897 /* 10898 * Set new socket buffer size. Give up when limit is 10899 * reached. 10900 */ 10901 if (newsize) 10902 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 10903 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 10904 m_adj(m, drop_hdrlen); /* delayed header drop */ 10905 #ifdef NETFLIX_SB_LIMITS 10906 appended = 10907 #endif 10908 sbappendstream_locked(&so->so_rcv, m, 0); 10909 ctf_calc_rwin(so, tp); 10910 } 10911 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10912 /* NB: sorwakeup_locked() does an implicit unlock. */ 10913 sorwakeup_locked(so); 10914 #ifdef NETFLIX_SB_LIMITS 10915 if (so->so_rcv.sb_shlim && mcnt != appended) 10916 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 10917 #endif 10918 rack_handle_delayed_ack(tp, rack, tlen, 0); 10919 if (tp->snd_una == tp->snd_max) 10920 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 10921 return (1); 10922 } 10923 10924 /* 10925 * This subfunction is used to try to highly optimize the 10926 * fast path. We again allow window updates that are 10927 * in sequence to remain in the fast-path. We also add 10928 * in the __predict's to attempt to help the compiler. 10929 * Note that if we return a 0, then we can *not* process 10930 * it and the caller should push the packet into the 10931 * slow-path. 10932 */ 10933 static int 10934 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10935 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10936 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 10937 { 10938 int32_t acked; 10939 int32_t nsegs; 10940 #ifdef TCPDEBUG 10941 /* 10942 * The size of tcp_saveipgen must be the size of the max ip header, 10943 * now IPv6. 10944 */ 10945 u_char tcp_saveipgen[IP6_HDR_LEN]; 10946 struct tcphdr tcp_savetcp; 10947 short ostate = 0; 10948 #endif 10949 int32_t under_pacing = 0; 10950 struct tcp_rack *rack; 10951 10952 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 10953 /* Old ack, behind (or duplicate to) the last one rcv'd */ 10954 return (0); 10955 } 10956 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 10957 /* Above what we have sent? */ 10958 return (0); 10959 } 10960 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10961 /* We are retransmitting */ 10962 return (0); 10963 } 10964 if (__predict_false(tiwin == 0)) { 10965 /* zero window */ 10966 return (0); 10967 } 10968 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 10969 /* We need a SYN or a FIN, unlikely.. */ 10970 return (0); 10971 } 10972 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 10973 /* Timestamp is behind .. old ack with seq wrap? 
*/ 10974 return (0); 10975 } 10976 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 10977 /* Still recovering */ 10978 return (0); 10979 } 10980 rack = (struct tcp_rack *)tp->t_fb_ptr; 10981 if (rack->r_ctl.rc_sacked) { 10982 /* We have sack holes on our scoreboard */ 10983 return (0); 10984 } 10985 /* Ok if we reach here, we can process a fast-ack */ 10986 if (rack->gp_ready && 10987 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 10988 under_pacing = 1; 10989 } 10990 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10991 rack_log_ack(tp, to, th, 0, 0); 10992 /* Did the window get updated? */ 10993 if (tiwin != tp->snd_wnd) { 10994 tp->snd_wnd = tiwin; 10995 rack_validate_fo_sendwin_up(tp, rack); 10996 tp->snd_wl1 = th->th_seq; 10997 if (tp->snd_wnd > tp->max_sndwnd) 10998 tp->max_sndwnd = tp->snd_wnd; 10999 } 11000 /* Do we exit persists? */ 11001 if ((rack->rc_in_persist != 0) && 11002 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 11003 rack->r_ctl.rc_pace_min_segs))) { 11004 rack_exit_persist(tp, rack, cts); 11005 } 11006 /* Do we enter persists? */ 11007 if ((rack->rc_in_persist == 0) && 11008 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 11009 TCPS_HAVEESTABLISHED(tp->t_state) && 11010 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 11011 sbavail(&tptosocket(tp)->so_snd) && 11012 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 11013 /* 11014 * Here the rwnd is less than 11015 * the pacing size, we are established, 11016 * nothing is outstanding, and there is 11017 * data to send. Enter persists. 11018 */ 11019 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 11020 } 11021 /* 11022 * If last ACK falls within this segment's sequence numbers, record 11023 * the timestamp. NOTE that the test is modified according to the 11024 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 11025 */ 11026 if ((to->to_flags & TOF_TS) != 0 && 11027 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 11028 tp->ts_recent_age = tcp_ts_getticks(); 11029 tp->ts_recent = to->to_tsval; 11030 } 11031 /* 11032 * This is a pure ack for outstanding data. 11033 */ 11034 KMOD_TCPSTAT_INC(tcps_predack); 11035 11036 /* 11037 * "bad retransmit" recovery. 11038 */ 11039 if ((tp->t_flags & TF_PREVVALID) && 11040 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 11041 tp->t_flags &= ~TF_PREVVALID; 11042 if (tp->t_rxtshift == 1 && 11043 (int)(ticks - tp->t_badrxtwin) < 0) 11044 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 11045 } 11046 /* 11047 * Recalculate the transmit timer / rtt. 11048 * 11049 * Some boxes send broken timestamp replies during the SYN+ACK 11050 * phase, ignore timestamps of 0 or we could calculate a huge RTT 11051 * and blow up the retransmit timer. 11052 */ 11053 acked = BYTES_THIS_ACK(tp, th); 11054 11055 #ifdef TCP_HHOOK 11056 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. 
*/ 11057 hhook_run_tcp_est_in(tp, th, to); 11058 #endif 11059 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 11060 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 11061 if (acked) { 11062 struct mbuf *mfree; 11063 11064 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 11065 SOCKBUF_LOCK(&so->so_snd); 11066 mfree = sbcut_locked(&so->so_snd, acked); 11067 tp->snd_una = th->th_ack; 11068 /* Note we want to hold the sb lock through the sendmap adjust */ 11069 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 11070 /* Wake up the socket if we have room to write more */ 11071 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 11072 sowwakeup_locked(so); 11073 m_freem(mfree); 11074 tp->t_rxtshift = 0; 11075 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 11076 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 11077 rack->rc_tlp_in_progress = 0; 11078 rack->r_ctl.rc_tlp_cnt_out = 0; 11079 /* 11080 * If it is the RXT timer we want to 11081 * stop it, so we can restart a TLP. 11082 */ 11083 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 11084 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11085 #ifdef NETFLIX_HTTP_LOGGING 11086 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 11087 #endif 11088 } 11089 /* 11090 * Let the congestion control algorithm update congestion control 11091 * related information. This typically means increasing the 11092 * congestion window. 11093 */ 11094 if (tp->snd_wnd < ctf_outstanding(tp)) { 11095 /* The peer collapsed the window */ 11096 rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__); 11097 } else if (rack->rc_has_collapsed) 11098 rack_un_collapse_window(rack, __LINE__); 11099 if ((rack->r_collapse_point_valid) && 11100 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point))) 11101 rack->r_collapse_point_valid = 0; 11102 /* 11103 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 11104 */ 11105 tp->snd_wl2 = th->th_ack; 11106 tp->t_dupacks = 0; 11107 m_freem(m); 11108 /* ND6_HINT(tp); *//* Some progress has been made. */ 11109 11110 /* 11111 * If all outstanding data are acked, stop retransmit timer, 11112 * otherwise restart timer using current (possibly backed-off) 11113 * value. If process is waiting for space, wakeup/selwakeup/signal. 11114 * If data are ready to send, let tcp_output decide between more 11115 * output or persist. 
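 * In this stack that work happens below: once snd_una
 * catches up to snd_max we cancel the rack timer and record
 * the idle time, and if the socket buffer still holds data
 * we set r_wanted_output so tcp_output can decide what to do
 * next.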
11116 */ 11117 #ifdef TCPDEBUG 11118 if (so->so_options & SO_DEBUG) 11119 tcp_trace(TA_INPUT, ostate, tp, 11120 (void *)tcp_saveipgen, 11121 &tcp_savetcp, 0); 11122 #endif 11123 if (under_pacing && 11124 (rack->use_fixed_rate == 0) && 11125 (rack->in_probe_rtt == 0) && 11126 rack->rc_gp_dyn_mul && 11127 rack->rc_always_pace) { 11128 /* Check if we are dragging bottom */ 11129 rack_check_bottom_drag(tp, rack, so, acked); 11130 } 11131 if (tp->snd_una == tp->snd_max) { 11132 tp->t_flags &= ~TF_PREVVALID; 11133 rack->r_ctl.retran_during_recovery = 0; 11134 rack->r_ctl.dsack_byte_cnt = 0; 11135 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 11136 if (rack->r_ctl.rc_went_idle_time == 0) 11137 rack->r_ctl.rc_went_idle_time = 1; 11138 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 11139 if (sbavail(&tptosocket(tp)->so_snd) == 0) 11140 tp->t_acktime = 0; 11141 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11142 } 11143 if (acked && rack->r_fast_output) 11144 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 11145 if (sbavail(&so->so_snd)) { 11146 rack->r_wanted_output = 1; 11147 } 11148 return (1); 11149 } 11150 11151 /* 11152 * Return value of 1, the TCB is unlocked and most 11153 * likely gone, return value of 0, the TCP is still 11154 * locked. 11155 */ 11156 static int 11157 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 11158 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11159 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11160 { 11161 int32_t ret_val = 0; 11162 int32_t todrop; 11163 int32_t ourfinisacked = 0; 11164 struct tcp_rack *rack; 11165 11166 INP_WLOCK_ASSERT(tptoinpcb(tp)); 11167 11168 ctf_calc_rwin(so, tp); 11169 /* 11170 * If the state is SYN_SENT: if seg contains an ACK, but not for our 11171 * SYN, drop the input. if seg contains a RST, then drop the 11172 * connection. if seg does not contain SYN, then drop it. Otherwise 11173 * this is an acceptable SYN segment initialize tp->rcv_nxt and 11174 * tp->irs if seg contains ack then advance tp->snd_una if seg 11175 * contains an ECE and ECN support is enabled, the stream is ECN 11176 * capable. if SYN has been acked change to ESTABLISHED else 11177 * SYN_RCVD state arrange for segment to be acked (eventually) 11178 * continue processing rest of data/controls. 11179 */ 11180 if ((thflags & TH_ACK) && 11181 (SEQ_LEQ(th->th_ack, tp->iss) || 11182 SEQ_GT(th->th_ack, tp->snd_max))) { 11183 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11184 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11185 return (1); 11186 } 11187 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 11188 TCP_PROBE5(connect__refused, NULL, tp, 11189 mtod(m, const char *), tp, th); 11190 tp = tcp_drop(tp, ECONNREFUSED); 11191 ctf_do_drop(m, tp); 11192 return (1); 11193 } 11194 if (thflags & TH_RST) { 11195 ctf_do_drop(m, tp); 11196 return (1); 11197 } 11198 if (!(thflags & TH_SYN)) { 11199 ctf_do_drop(m, tp); 11200 return (1); 11201 } 11202 tp->irs = th->th_seq; 11203 tcp_rcvseqinit(tp); 11204 rack = (struct tcp_rack *)tp->t_fb_ptr; 11205 if (thflags & TH_ACK) { 11206 int tfo_partial = 0; 11207 11208 KMOD_TCPSTAT_INC(tcps_connects); 11209 soisconnected(so); 11210 #ifdef MAC 11211 mac_socketpeer_set_from_mbuf(m, so); 11212 #endif 11213 /* Do window scaling on this connection? 
*/ 11214 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11215 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11216 tp->rcv_scale = tp->request_r_scale; 11217 } 11218 tp->rcv_adv += min(tp->rcv_wnd, 11219 TCP_MAXWIN << tp->rcv_scale); 11220 /* 11221 * If not all the data that was sent in the TFO SYN 11222 * has been acked, resend the remainder right away. 11223 */ 11224 if (IS_FASTOPEN(tp->t_flags) && 11225 (tp->snd_una != tp->snd_max)) { 11226 tp->snd_nxt = th->th_ack; 11227 tfo_partial = 1; 11228 } 11229 /* 11230 * If there's data, delay ACK; if there's also a FIN ACKNOW 11231 * will be turned on later. 11232 */ 11233 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 11234 rack_timer_cancel(tp, rack, 11235 rack->r_ctl.rc_rcvtime, __LINE__); 11236 tp->t_flags |= TF_DELACK; 11237 } else { 11238 rack->r_wanted_output = 1; 11239 tp->t_flags |= TF_ACKNOW; 11240 rack->rc_dack_toggle = 0; 11241 } 11242 11243 tcp_ecn_input_syn_sent(tp, thflags, iptos); 11244 11245 if (SEQ_GT(th->th_ack, tp->snd_una)) { 11246 /* 11247 * We advance snd_una for the 11248 * fast open case. If th_ack is 11249 * acknowledging data beyond 11250 * snd_una we can't just call 11251 * ack-processing since the 11252 * data stream in our send-map 11253 * will start at snd_una + 1 (one 11254 * beyond the SYN). If its just 11255 * equal we don't need to do that 11256 * and there is no send_map. 11257 */ 11258 tp->snd_una++; 11259 } 11260 /* 11261 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 11262 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 11263 */ 11264 tp->t_starttime = ticks; 11265 if (tp->t_flags & TF_NEEDFIN) { 11266 tcp_state_change(tp, TCPS_FIN_WAIT_1); 11267 tp->t_flags &= ~TF_NEEDFIN; 11268 thflags &= ~TH_SYN; 11269 } else { 11270 tcp_state_change(tp, TCPS_ESTABLISHED); 11271 TCP_PROBE5(connect__established, NULL, tp, 11272 mtod(m, const char *), tp, th); 11273 rack_cc_conn_init(tp); 11274 } 11275 } else { 11276 /* 11277 * Received initial SYN in SYN-SENT[*] state => simultaneous 11278 * open. If segment contains CC option and there is a 11279 * cached CC, apply TAO test. If it succeeds, connection is * 11280 * half-synchronized. Otherwise, do 3-way handshake: 11281 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 11282 * there was no CC option, clear cached CC value. 11283 */ 11284 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 11285 tcp_state_change(tp, TCPS_SYN_RECEIVED); 11286 } 11287 /* 11288 * Advance th->th_seq to correspond to first data byte. If data, 11289 * trim to stay within window, dropping FIN if necessary. 11290 */ 11291 th->th_seq++; 11292 if (tlen > tp->rcv_wnd) { 11293 todrop = tlen - tp->rcv_wnd; 11294 m_adj(m, -todrop); 11295 tlen = tp->rcv_wnd; 11296 thflags &= ~TH_FIN; 11297 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 11298 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 11299 } 11300 tp->snd_wl1 = th->th_seq - 1; 11301 tp->rcv_up = th->th_seq; 11302 /* 11303 * Client side of transaction: already sent SYN and data. If the 11304 * remote host used T/TCP to validate the SYN, our data will be 11305 * ACK'd; if so, enter normal data segment processing in the middle 11306 * of step 5, ack processing. Otherwise, goto step 6. 
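 * Before that ack processing we also take an RTT sample for
 * the SYN if the peer echoed our timestamp: the delta
 * (mcts - to->to_tsecr) is scaled to usecs with
 * HPTS_USEC_IN_MSEC and fed to the rack xmit timer below.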
11307 */ 11308 if (thflags & TH_ACK) { 11309 /* For syn-sent we need to possibly update the rtt */ 11310 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 11311 uint32_t t, mcts; 11312 11313 mcts = tcp_ts_getticks(); 11314 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 11315 if (!tp->t_rttlow || tp->t_rttlow > t) 11316 tp->t_rttlow = t; 11317 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 11318 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 11319 tcp_rack_xmit_timer_commit(rack, tp); 11320 } 11321 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 11322 return (ret_val); 11323 /* We may have changed to FIN_WAIT_1 above */ 11324 if (tp->t_state == TCPS_FIN_WAIT_1) { 11325 /* 11326 * In FIN_WAIT_1 STATE in addition to the processing 11327 * for the ESTABLISHED state if our FIN is now 11328 * acknowledged then enter FIN_WAIT_2. 11329 */ 11330 if (ourfinisacked) { 11331 /* 11332 * If we can't receive any more data, then 11333 * closing user can proceed. Starting the 11334 * timer is contrary to the specification, 11335 * but if we don't get a FIN we'll hang 11336 * forever. 11337 * 11338 * XXXjl: we should release the tp also, and 11339 * use a compressed state. 11340 */ 11341 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11342 soisdisconnected(so); 11343 tcp_timer_activate(tp, TT_2MSL, 11344 (tcp_fast_finwait2_recycle ? 11345 tcp_finwait2_timeout : 11346 TP_MAXIDLE(tp))); 11347 } 11348 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11349 } 11350 } 11351 } 11352 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11353 tiwin, thflags, nxt_pkt)); 11354 } 11355 11356 /* 11357 * Return value of 1, the TCB is unlocked and most 11358 * likely gone, return value of 0, the TCP is still 11359 * locked. 11360 */ 11361 static int 11362 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 11363 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11364 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11365 { 11366 struct tcp_rack *rack; 11367 int32_t ret_val = 0; 11368 int32_t ourfinisacked = 0; 11369 11370 ctf_calc_rwin(so, tp); 11371 if ((thflags & TH_ACK) && 11372 (SEQ_LEQ(th->th_ack, tp->snd_una) || 11373 SEQ_GT(th->th_ack, tp->snd_max))) { 11374 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11375 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11376 return (1); 11377 } 11378 rack = (struct tcp_rack *)tp->t_fb_ptr; 11379 if (IS_FASTOPEN(tp->t_flags)) { 11380 /* 11381 * When a TFO connection is in SYN_RECEIVED, the 11382 * only valid packets are the initial SYN, a 11383 * retransmit/copy of the initial SYN (possibly with 11384 * a subset of the original data), a valid ACK, a 11385 * FIN, or a RST. 
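 * The checks below enforce that: a SYN|ACK gets a reset, a
 * bare SYN is dropped while an RXT/TLP/RACK timer is pending
 * (otherwise it is let through as a retransmit of the
 * original SYN), and anything carrying none of ACK, FIN or
 * RST is simply dropped.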
11386 */ 11387 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 11388 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11389 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11390 return (1); 11391 } else if (thflags & TH_SYN) { 11392 /* non-initial SYN is ignored */ 11393 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 11394 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 11395 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 11396 ctf_do_drop(m, NULL); 11397 return (0); 11398 } 11399 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 11400 ctf_do_drop(m, NULL); 11401 return (0); 11402 } 11403 } 11404 11405 if ((thflags & TH_RST) || 11406 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11407 return (__ctf_process_rst(m, th, so, tp, 11408 &rack->r_ctl.challenge_ack_ts, 11409 &rack->r_ctl.challenge_ack_cnt)); 11410 /* 11411 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11412 * it's less than ts_recent, drop it. 11413 */ 11414 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11415 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11416 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11417 return (ret_val); 11418 } 11419 /* 11420 * In the SYN-RECEIVED state, validate that the packet belongs to 11421 * this connection before trimming the data to fit the receive 11422 * window. Check the sequence number versus IRS since we know the 11423 * sequence numbers haven't wrapped. This is a partial fix for the 11424 * "LAND" DoS attack. 11425 */ 11426 if (SEQ_LT(th->th_seq, tp->irs)) { 11427 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11428 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11429 return (1); 11430 } 11431 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11432 &rack->r_ctl.challenge_ack_ts, 11433 &rack->r_ctl.challenge_ack_cnt)) { 11434 return (ret_val); 11435 } 11436 /* 11437 * If last ACK falls within this segment's sequence numbers, record 11438 * its timestamp. NOTE: 1) That the test incorporates suggestions 11439 * from the latest proposal of the tcplw@cray.com list (Braden 11440 * 1993/04/26). 2) That updating only on newer timestamps interferes 11441 * with our earlier PAWS tests, so this check should be solely 11442 * predicated on the sequence space of this segment. 3) That we 11443 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11444 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11445 * SEG.Len, This modified check allows us to overcome RFC1323's 11446 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11447 * p.869. In such cases, we can still calculate the RTT correctly 11448 * when RCV.NXT == Last.ACK.Sent. 11449 */ 11450 if ((to->to_flags & TOF_TS) != 0 && 11451 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11452 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11453 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11454 tp->ts_recent_age = tcp_ts_getticks(); 11455 tp->ts_recent = to->to_tsval; 11456 } 11457 tp->snd_wnd = tiwin; 11458 rack_validate_fo_sendwin_up(tp, rack); 11459 /* 11460 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11461 * is on (half-synchronized state), then queue data for later 11462 * processing; else drop segment and return. 
11463 */ 11464 if ((thflags & TH_ACK) == 0) { 11465 if (IS_FASTOPEN(tp->t_flags)) { 11466 rack_cc_conn_init(tp); 11467 } 11468 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11469 tiwin, thflags, nxt_pkt)); 11470 } 11471 KMOD_TCPSTAT_INC(tcps_connects); 11472 if (tp->t_flags & TF_SONOTCONN) { 11473 tp->t_flags &= ~TF_SONOTCONN; 11474 soisconnected(so); 11475 } 11476 /* Do window scaling? */ 11477 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11478 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11479 tp->rcv_scale = tp->request_r_scale; 11480 } 11481 /* 11482 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 11483 * FIN-WAIT-1 11484 */ 11485 tp->t_starttime = ticks; 11486 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 11487 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 11488 tp->t_tfo_pending = NULL; 11489 } 11490 if (tp->t_flags & TF_NEEDFIN) { 11491 tcp_state_change(tp, TCPS_FIN_WAIT_1); 11492 tp->t_flags &= ~TF_NEEDFIN; 11493 } else { 11494 tcp_state_change(tp, TCPS_ESTABLISHED); 11495 TCP_PROBE5(accept__established, NULL, tp, 11496 mtod(m, const char *), tp, th); 11497 /* 11498 * TFO connections call cc_conn_init() during SYN 11499 * processing. Calling it again here for such connections 11500 * is not harmless as it would undo the snd_cwnd reduction 11501 * that occurs when a TFO SYN|ACK is retransmitted. 11502 */ 11503 if (!IS_FASTOPEN(tp->t_flags)) 11504 rack_cc_conn_init(tp); 11505 } 11506 /* 11507 * Account for the ACK of our SYN prior to 11508 * regular ACK processing below, except for 11509 * simultaneous SYN, which is handled later. 11510 */ 11511 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 11512 tp->snd_una++; 11513 /* 11514 * If segment contains data or ACK, will call tcp_reass() later; if 11515 * not, do so now to pass queued data to user. 11516 */ 11517 if (tlen == 0 && (thflags & TH_FIN) == 0) { 11518 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 11519 (struct mbuf *)0); 11520 if (tp->t_flags & TF_WAKESOR) { 11521 tp->t_flags &= ~TF_WAKESOR; 11522 /* NB: sorwakeup_locked() does an implicit unlock. */ 11523 sorwakeup_locked(so); 11524 } 11525 } 11526 tp->snd_wl1 = th->th_seq - 1; 11527 /* For syn-recv we need to possibly update the rtt */ 11528 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 11529 uint32_t t, mcts; 11530 11531 mcts = tcp_ts_getticks(); 11532 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 11533 if (!tp->t_rttlow || tp->t_rttlow > t) 11534 tp->t_rttlow = t; 11535 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 11536 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 11537 tcp_rack_xmit_timer_commit(rack, tp); 11538 } 11539 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11540 return (ret_val); 11541 } 11542 if (tp->t_state == TCPS_FIN_WAIT_1) { 11543 /* We could have went to FIN_WAIT_1 (or EST) above */ 11544 /* 11545 * In FIN_WAIT_1 STATE in addition to the processing for the 11546 * ESTABLISHED state if our FIN is now acknowledged then 11547 * enter FIN_WAIT_2. 11548 */ 11549 if (ourfinisacked) { 11550 /* 11551 * If we can't receive any more data, then closing 11552 * user can proceed. Starting the timer is contrary 11553 * to the specification, but if we don't get a FIN 11554 * we'll hang forever. 11555 * 11556 * XXXjl: we should release the tp also, and use a 11557 * compressed state. 
11558 */ 11559 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11560 soisdisconnected(so); 11561 tcp_timer_activate(tp, TT_2MSL, 11562 (tcp_fast_finwait2_recycle ? 11563 tcp_finwait2_timeout : 11564 TP_MAXIDLE(tp))); 11565 } 11566 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11567 } 11568 } 11569 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11570 tiwin, thflags, nxt_pkt)); 11571 } 11572 11573 /* 11574 * Return value of 1, the TCB is unlocked and most 11575 * likely gone, return value of 0, the TCP is still 11576 * locked. 11577 */ 11578 static int 11579 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 11580 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11581 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11582 { 11583 int32_t ret_val = 0; 11584 struct tcp_rack *rack; 11585 11586 /* 11587 * Header prediction: check for the two common cases of a 11588 * uni-directional data xfer. If the packet has no control flags, 11589 * is in-sequence, the window didn't change and we're not 11590 * retransmitting, it's a candidate. If the length is zero and the 11591 * ack moved forward, we're the sender side of the xfer. Just free 11592 * the data acked & wake any higher level process that was blocked 11593 * waiting for space. If the length is non-zero and the ack didn't 11594 * move, we're the receiver side. If we're getting packets in-order 11595 * (the reassembly queue is empty), add the data toc The socket 11596 * buffer and note that we need a delayed ack. Make sure that the 11597 * hidden state-flags are also off. Since we check for 11598 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 11599 */ 11600 rack = (struct tcp_rack *)tp->t_fb_ptr; 11601 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 11602 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 11603 __predict_true(SEGQ_EMPTY(tp)) && 11604 __predict_true(th->th_seq == tp->rcv_nxt)) { 11605 if (tlen == 0) { 11606 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 11607 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 11608 return (0); 11609 } 11610 } else { 11611 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 11612 tiwin, nxt_pkt, iptos)) { 11613 return (0); 11614 } 11615 } 11616 } 11617 ctf_calc_rwin(so, tp); 11618 11619 if ((thflags & TH_RST) || 11620 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11621 return (__ctf_process_rst(m, th, so, tp, 11622 &rack->r_ctl.challenge_ack_ts, 11623 &rack->r_ctl.challenge_ack_cnt)); 11624 11625 /* 11626 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11627 * synchronized state. 11628 */ 11629 if (thflags & TH_SYN) { 11630 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 11631 return (ret_val); 11632 } 11633 /* 11634 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11635 * it's less than ts_recent, drop it. 11636 */ 11637 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11638 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11639 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11640 return (ret_val); 11641 } 11642 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11643 &rack->r_ctl.challenge_ack_ts, 11644 &rack->r_ctl.challenge_ack_cnt)) { 11645 return (ret_val); 11646 } 11647 /* 11648 * If last ACK falls within this segment's sequence numbers, record 11649 * its timestamp. NOTE: 1) That the test incorporates suggestions 11650 * from the latest proposal of the tcplw@cray.com list (Braden 11651 * 1993/04/26). 
2) That updating only on newer timestamps interferes 11652 * with our earlier PAWS tests, so this check should be solely 11653 * predicated on the sequence space of this segment. 3) That we 11654 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11655 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11656 * SEG.Len, This modified check allows us to overcome RFC1323's 11657 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11658 * p.869. In such cases, we can still calculate the RTT correctly 11659 * when RCV.NXT == Last.ACK.Sent. 11660 */ 11661 if ((to->to_flags & TOF_TS) != 0 && 11662 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11663 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11664 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11665 tp->ts_recent_age = tcp_ts_getticks(); 11666 tp->ts_recent = to->to_tsval; 11667 } 11668 /* 11669 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11670 * is on (half-synchronized state), then queue data for later 11671 * processing; else drop segment and return. 11672 */ 11673 if ((thflags & TH_ACK) == 0) { 11674 if (tp->t_flags & TF_NEEDSYN) { 11675 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11676 tiwin, thflags, nxt_pkt)); 11677 11678 } else if (tp->t_flags & TF_ACKNOW) { 11679 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11680 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11681 return (ret_val); 11682 } else { 11683 ctf_do_drop(m, NULL); 11684 return (0); 11685 } 11686 } 11687 /* 11688 * Ack processing. 11689 */ 11690 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11691 return (ret_val); 11692 } 11693 if (sbavail(&so->so_snd)) { 11694 if (ctf_progress_timeout_check(tp, true)) { 11695 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 11696 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11697 return (1); 11698 } 11699 } 11700 /* State changes only happen in rack_process_data() */ 11701 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11702 tiwin, thflags, nxt_pkt)); 11703 } 11704 11705 /* 11706 * Return value of 1, the TCB is unlocked and most 11707 * likely gone, return value of 0, the TCP is still 11708 * locked. 11709 */ 11710 static int 11711 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 11712 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11713 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11714 { 11715 int32_t ret_val = 0; 11716 struct tcp_rack *rack; 11717 11718 rack = (struct tcp_rack *)tp->t_fb_ptr; 11719 ctf_calc_rwin(so, tp); 11720 if ((thflags & TH_RST) || 11721 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11722 return (__ctf_process_rst(m, th, so, tp, 11723 &rack->r_ctl.challenge_ack_ts, 11724 &rack->r_ctl.challenge_ack_cnt)); 11725 /* 11726 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11727 * synchronized state. 11728 */ 11729 if (thflags & TH_SYN) { 11730 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 11731 return (ret_val); 11732 } 11733 /* 11734 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11735 * it's less than ts_recent, drop it. 
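 * Hypothetical example: ts_recent == 1000 and a segment arrives
 * carrying tsval == 990; TSTMP_LT(990, 1000) is true, so the segment
 * is handed to ctf_ts_check(), which decides whether it really is an
 * old duplicate that can be dropped (after ACKing) or whether the
 * cached timestamp itself has gone stale.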
11736 */ 11737 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11738 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11739 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11740 return (ret_val); 11741 } 11742 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11743 &rack->r_ctl.challenge_ack_ts, 11744 &rack->r_ctl.challenge_ack_cnt)) { 11745 return (ret_val); 11746 } 11747 /* 11748 * If last ACK falls within this segment's sequence numbers, record 11749 * its timestamp. NOTE: 1) That the test incorporates suggestions 11750 * from the latest proposal of the tcplw@cray.com list (Braden 11751 * 1993/04/26). 2) That updating only on newer timestamps interferes 11752 * with our earlier PAWS tests, so this check should be solely 11753 * predicated on the sequence space of this segment. 3) That we 11754 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11755 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11756 * SEG.Len, This modified check allows us to overcome RFC1323's 11757 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11758 * p.869. In such cases, we can still calculate the RTT correctly 11759 * when RCV.NXT == Last.ACK.Sent. 11760 */ 11761 if ((to->to_flags & TOF_TS) != 0 && 11762 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11763 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11764 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11765 tp->ts_recent_age = tcp_ts_getticks(); 11766 tp->ts_recent = to->to_tsval; 11767 } 11768 /* 11769 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11770 * is on (half-synchronized state), then queue data for later 11771 * processing; else drop segment and return. 11772 */ 11773 if ((thflags & TH_ACK) == 0) { 11774 if (tp->t_flags & TF_NEEDSYN) { 11775 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11776 tiwin, thflags, nxt_pkt)); 11777 11778 } else if (tp->t_flags & TF_ACKNOW) { 11779 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11780 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11781 return (ret_val); 11782 } else { 11783 ctf_do_drop(m, NULL); 11784 return (0); 11785 } 11786 } 11787 /* 11788 * Ack processing. 
11789 */ 11790 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11791 return (ret_val); 11792 } 11793 if (sbavail(&so->so_snd)) { 11794 if (ctf_progress_timeout_check(tp, true)) { 11795 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11796 tp, tick, PROGRESS_DROP, __LINE__); 11797 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11798 return (1); 11799 } 11800 } 11801 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11802 tiwin, thflags, nxt_pkt)); 11803 } 11804 11805 static int 11806 rack_check_data_after_close(struct mbuf *m, 11807 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 11808 { 11809 struct tcp_rack *rack; 11810 11811 rack = (struct tcp_rack *)tp->t_fb_ptr; 11812 if (rack->rc_allow_data_af_clo == 0) { 11813 close_now: 11814 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11815 /* tcp_close will kill the inp pre-log the Reset */ 11816 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 11817 tp = tcp_close(tp); 11818 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 11819 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 11820 return (1); 11821 } 11822 if (sbavail(&so->so_snd) == 0) 11823 goto close_now; 11824 /* Ok we allow data that is ignored and a followup reset */ 11825 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11826 tp->rcv_nxt = th->th_seq + *tlen; 11827 tp->t_flags2 |= TF2_DROP_AF_DATA; 11828 rack->r_wanted_output = 1; 11829 *tlen = 0; 11830 return (0); 11831 } 11832 11833 /* 11834 * Return value of 1, the TCB is unlocked and most 11835 * likely gone, return value of 0, the TCP is still 11836 * locked. 11837 */ 11838 static int 11839 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 11840 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11841 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11842 { 11843 int32_t ret_val = 0; 11844 int32_t ourfinisacked = 0; 11845 struct tcp_rack *rack; 11846 11847 rack = (struct tcp_rack *)tp->t_fb_ptr; 11848 ctf_calc_rwin(so, tp); 11849 11850 if ((thflags & TH_RST) || 11851 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11852 return (__ctf_process_rst(m, th, so, tp, 11853 &rack->r_ctl.challenge_ack_ts, 11854 &rack->r_ctl.challenge_ack_cnt)); 11855 /* 11856 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11857 * synchronized state. 11858 */ 11859 if (thflags & TH_SYN) { 11860 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 11861 return (ret_val); 11862 } 11863 /* 11864 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11865 * it's less than ts_recent, drop it. 11866 */ 11867 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11868 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11869 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11870 return (ret_val); 11871 } 11872 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11873 &rack->r_ctl.challenge_ack_ts, 11874 &rack->r_ctl.challenge_ack_cnt)) { 11875 return (ret_val); 11876 } 11877 /* 11878 * If new data are received on a connection after the user processes 11879 * are gone, then RST the other end. 11880 */ 11881 if ((tp->t_flags & TF_CLOSED) && tlen && 11882 rack_check_data_after_close(m, tp, &tlen, th, so)) 11883 return (1); 11884 /* 11885 * If last ACK falls within this segment's sequence numbers, record 11886 * its timestamp. NOTE: 1) That the test incorporates suggestions 11887 * from the latest proposal of the tcplw@cray.com list (Braden 11888 * 1993/04/26). 
2) That updating only on newer timestamps interferes 11889 * with our earlier PAWS tests, so this check should be solely 11890 * predicated on the sequence space of this segment. 3) That we 11891 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11892 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11893 * SEG.Len, This modified check allows us to overcome RFC1323's 11894 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11895 * p.869. In such cases, we can still calculate the RTT correctly 11896 * when RCV.NXT == Last.ACK.Sent. 11897 */ 11898 if ((to->to_flags & TOF_TS) != 0 && 11899 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11900 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11901 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11902 tp->ts_recent_age = tcp_ts_getticks(); 11903 tp->ts_recent = to->to_tsval; 11904 } 11905 /* 11906 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11907 * is on (half-synchronized state), then queue data for later 11908 * processing; else drop segment and return. 11909 */ 11910 if ((thflags & TH_ACK) == 0) { 11911 if (tp->t_flags & TF_NEEDSYN) { 11912 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11913 tiwin, thflags, nxt_pkt)); 11914 } else if (tp->t_flags & TF_ACKNOW) { 11915 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11916 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11917 return (ret_val); 11918 } else { 11919 ctf_do_drop(m, NULL); 11920 return (0); 11921 } 11922 } 11923 /* 11924 * Ack processing. 11925 */ 11926 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11927 return (ret_val); 11928 } 11929 if (ourfinisacked) { 11930 /* 11931 * If we can't receive any more data, then closing user can 11932 * proceed. Starting the timer is contrary to the 11933 * specification, but if we don't get a FIN we'll hang 11934 * forever. 11935 * 11936 * XXXjl: we should release the tp also, and use a 11937 * compressed state. 11938 */ 11939 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11940 soisdisconnected(so); 11941 tcp_timer_activate(tp, TT_2MSL, 11942 (tcp_fast_finwait2_recycle ? 11943 tcp_finwait2_timeout : 11944 TP_MAXIDLE(tp))); 11945 } 11946 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11947 } 11948 if (sbavail(&so->so_snd)) { 11949 if (ctf_progress_timeout_check(tp, true)) { 11950 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11951 tp, tick, PROGRESS_DROP, __LINE__); 11952 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11953 return (1); 11954 } 11955 } 11956 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11957 tiwin, thflags, nxt_pkt)); 11958 } 11959 11960 /* 11961 * Return value of 1, the TCB is unlocked and most 11962 * likely gone, return value of 0, the TCP is still 11963 * locked. 
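 * In other words, callers must not touch tp (or assume the inp lock
 * is still held) after a non-zero return; only a return of 0 leaves
 * the connection in a state where normal post-input processing can
 * continue.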
11964 */ 11965 static int 11966 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 11967 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11968 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11969 { 11970 int32_t ret_val = 0; 11971 int32_t ourfinisacked = 0; 11972 struct tcp_rack *rack; 11973 11974 rack = (struct tcp_rack *)tp->t_fb_ptr; 11975 ctf_calc_rwin(so, tp); 11976 11977 if ((thflags & TH_RST) || 11978 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11979 return (__ctf_process_rst(m, th, so, tp, 11980 &rack->r_ctl.challenge_ack_ts, 11981 &rack->r_ctl.challenge_ack_cnt)); 11982 /* 11983 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11984 * synchronized state. 11985 */ 11986 if (thflags & TH_SYN) { 11987 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 11988 return (ret_val); 11989 } 11990 /* 11991 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11992 * it's less than ts_recent, drop it. 11993 */ 11994 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11995 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11996 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11997 return (ret_val); 11998 } 11999 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12000 &rack->r_ctl.challenge_ack_ts, 12001 &rack->r_ctl.challenge_ack_cnt)) { 12002 return (ret_val); 12003 } 12004 /* 12005 * If new data are received on a connection after the user processes 12006 * are gone, then RST the other end. 12007 */ 12008 if ((tp->t_flags & TF_CLOSED) && tlen && 12009 rack_check_data_after_close(m, tp, &tlen, th, so)) 12010 return (1); 12011 /* 12012 * If last ACK falls within this segment's sequence numbers, record 12013 * its timestamp. NOTE: 1) That the test incorporates suggestions 12014 * from the latest proposal of the tcplw@cray.com list (Braden 12015 * 1993/04/26). 2) That updating only on newer timestamps interferes 12016 * with our earlier PAWS tests, so this check should be solely 12017 * predicated on the sequence space of this segment. 3) That we 12018 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12019 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12020 * SEG.Len, This modified check allows us to overcome RFC1323's 12021 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12022 * p.869. In such cases, we can still calculate the RTT correctly 12023 * when RCV.NXT == Last.ACK.Sent. 12024 */ 12025 if ((to->to_flags & TOF_TS) != 0 && 12026 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12027 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12028 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12029 tp->ts_recent_age = tcp_ts_getticks(); 12030 tp->ts_recent = to->to_tsval; 12031 } 12032 /* 12033 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12034 * is on (half-synchronized state), then queue data for later 12035 * processing; else drop segment and return. 12036 */ 12037 if ((thflags & TH_ACK) == 0) { 12038 if (tp->t_flags & TF_NEEDSYN) { 12039 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12040 tiwin, thflags, nxt_pkt)); 12041 } else if (tp->t_flags & TF_ACKNOW) { 12042 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12043 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12044 return (ret_val); 12045 } else { 12046 ctf_do_drop(m, NULL); 12047 return (0); 12048 } 12049 } 12050 /* 12051 * Ack processing. 
12052 */ 12053 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12054 return (ret_val); 12055 } 12056 if (ourfinisacked) { 12057 tcp_twstart(tp); 12058 m_freem(m); 12059 return (1); 12060 } 12061 if (sbavail(&so->so_snd)) { 12062 if (ctf_progress_timeout_check(tp, true)) { 12063 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12064 tp, tick, PROGRESS_DROP, __LINE__); 12065 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12066 return (1); 12067 } 12068 } 12069 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12070 tiwin, thflags, nxt_pkt)); 12071 } 12072 12073 /* 12074 * Return value of 1, the TCB is unlocked and most 12075 * likely gone, return value of 0, the TCP is still 12076 * locked. 12077 */ 12078 static int 12079 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12080 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12081 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12082 { 12083 int32_t ret_val = 0; 12084 int32_t ourfinisacked = 0; 12085 struct tcp_rack *rack; 12086 12087 rack = (struct tcp_rack *)tp->t_fb_ptr; 12088 ctf_calc_rwin(so, tp); 12089 12090 if ((thflags & TH_RST) || 12091 (tp->t_fin_is_rst && (thflags & TH_FIN))) 12092 return (__ctf_process_rst(m, th, so, tp, 12093 &rack->r_ctl.challenge_ack_ts, 12094 &rack->r_ctl.challenge_ack_cnt)); 12095 /* 12096 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 12097 * synchronized state. 12098 */ 12099 if (thflags & TH_SYN) { 12100 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 12101 return (ret_val); 12102 } 12103 /* 12104 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12105 * it's less than ts_recent, drop it. 12106 */ 12107 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12108 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12109 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12110 return (ret_val); 12111 } 12112 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12113 &rack->r_ctl.challenge_ack_ts, 12114 &rack->r_ctl.challenge_ack_cnt)) { 12115 return (ret_val); 12116 } 12117 /* 12118 * If new data are received on a connection after the user processes 12119 * are gone, then RST the other end. 12120 */ 12121 if ((tp->t_flags & TF_CLOSED) && tlen && 12122 rack_check_data_after_close(m, tp, &tlen, th, so)) 12123 return (1); 12124 /* 12125 * If last ACK falls within this segment's sequence numbers, record 12126 * its timestamp. NOTE: 1) That the test incorporates suggestions 12127 * from the latest proposal of the tcplw@cray.com list (Braden 12128 * 1993/04/26). 2) That updating only on newer timestamps interferes 12129 * with our earlier PAWS tests, so this check should be solely 12130 * predicated on the sequence space of this segment. 3) That we 12131 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12132 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12133 * SEG.Len, This modified check allows us to overcome RFC1323's 12134 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12135 * p.869. In such cases, we can still calculate the RTT correctly 12136 * when RCV.NXT == Last.ACK.Sent. 
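 * Worked example (hypothetical numbers): last_ack_sent == 100 and a
 * bare ACK arrives with th_seq == 100 and tlen == 0. RFC1323's
 * strict test (100 < 100 + 0) fails, but the relaxed test below
 * (100 <= 100 + 0) passes, so ts_recent is still refreshed and RTT
 * samples taken from the echoed timestamp remain valid.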
12137 */ 12138 if ((to->to_flags & TOF_TS) != 0 && 12139 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12140 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12141 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12142 tp->ts_recent_age = tcp_ts_getticks(); 12143 tp->ts_recent = to->to_tsval; 12144 } 12145 /* 12146 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12147 * is on (half-synchronized state), then queue data for later 12148 * processing; else drop segment and return. 12149 */ 12150 if ((thflags & TH_ACK) == 0) { 12151 if (tp->t_flags & TF_NEEDSYN) { 12152 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12153 tiwin, thflags, nxt_pkt)); 12154 } else if (tp->t_flags & TF_ACKNOW) { 12155 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12156 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12157 return (ret_val); 12158 } else { 12159 ctf_do_drop(m, NULL); 12160 return (0); 12161 } 12162 } 12163 /* 12164 * case TCPS_LAST_ACK: Ack processing. 12165 */ 12166 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12167 return (ret_val); 12168 } 12169 if (ourfinisacked) { 12170 tp = tcp_close(tp); 12171 ctf_do_drop(m, tp); 12172 return (1); 12173 } 12174 if (sbavail(&so->so_snd)) { 12175 if (ctf_progress_timeout_check(tp, true)) { 12176 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12177 tp, tick, PROGRESS_DROP, __LINE__); 12178 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12179 return (1); 12180 } 12181 } 12182 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12183 tiwin, thflags, nxt_pkt)); 12184 } 12185 12186 /* 12187 * Return value of 1, the TCB is unlocked and most 12188 * likely gone, return value of 0, the TCP is still 12189 * locked. 12190 */ 12191 static int 12192 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 12193 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12194 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12195 { 12196 int32_t ret_val = 0; 12197 int32_t ourfinisacked = 0; 12198 struct tcp_rack *rack; 12199 12200 rack = (struct tcp_rack *)tp->t_fb_ptr; 12201 ctf_calc_rwin(so, tp); 12202 12203 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 12204 if ((thflags & TH_RST) || 12205 (tp->t_fin_is_rst && (thflags & TH_FIN))) 12206 return (__ctf_process_rst(m, th, so, tp, 12207 &rack->r_ctl.challenge_ack_ts, 12208 &rack->r_ctl.challenge_ack_cnt)); 12209 /* 12210 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 12211 * synchronized state. 12212 */ 12213 if (thflags & TH_SYN) { 12214 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 12215 return (ret_val); 12216 } 12217 /* 12218 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12219 * it's less than ts_recent, drop it. 12220 */ 12221 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12222 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12223 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12224 return (ret_val); 12225 } 12226 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12227 &rack->r_ctl.challenge_ack_ts, 12228 &rack->r_ctl.challenge_ack_cnt)) { 12229 return (ret_val); 12230 } 12231 /* 12232 * If new data are received on a connection after the user processes 12233 * are gone, then RST the other end. 
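 * rack_check_data_after_close() (defined above) either resets the
 * peer immediately (when rc_allow_data_af_clo is clear, or when
 * nothing remains to be sent) or quietly swallows the data: it
 * advances rcv_nxt, zeroes tlen, sets TF2_DROP_AF_DATA and lets a
 * follow-up reset complete the teardown.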
12234 */ 12235 if ((tp->t_flags & TF_CLOSED) && tlen && 12236 rack_check_data_after_close(m, tp, &tlen, th, so)) 12237 return (1); 12238 /* 12239 * If last ACK falls within this segment's sequence numbers, record 12240 * its timestamp. NOTE: 1) That the test incorporates suggestions 12241 * from the latest proposal of the tcplw@cray.com list (Braden 12242 * 1993/04/26). 2) That updating only on newer timestamps interferes 12243 * with our earlier PAWS tests, so this check should be solely 12244 * predicated on the sequence space of this segment. 3) That we 12245 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12246 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12247 * SEG.Len, This modified check allows us to overcome RFC1323's 12248 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12249 * p.869. In such cases, we can still calculate the RTT correctly 12250 * when RCV.NXT == Last.ACK.Sent. 12251 */ 12252 if ((to->to_flags & TOF_TS) != 0 && 12253 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12254 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12255 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12256 tp->ts_recent_age = tcp_ts_getticks(); 12257 tp->ts_recent = to->to_tsval; 12258 } 12259 /* 12260 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12261 * is on (half-synchronized state), then queue data for later 12262 * processing; else drop segment and return. 12263 */ 12264 if ((thflags & TH_ACK) == 0) { 12265 if (tp->t_flags & TF_NEEDSYN) { 12266 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12267 tiwin, thflags, nxt_pkt)); 12268 } else if (tp->t_flags & TF_ACKNOW) { 12269 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12270 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12271 return (ret_val); 12272 } else { 12273 ctf_do_drop(m, NULL); 12274 return (0); 12275 } 12276 } 12277 /* 12278 * Ack processing. 
12279 */ 12280 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12281 return (ret_val); 12282 } 12283 if (sbavail(&so->so_snd)) { 12284 if (ctf_progress_timeout_check(tp, true)) { 12285 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12286 tp, tick, PROGRESS_DROP, __LINE__); 12287 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12288 return (1); 12289 } 12290 } 12291 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12292 tiwin, thflags, nxt_pkt)); 12293 } 12294 12295 static void inline 12296 rack_clear_rate_sample(struct tcp_rack *rack) 12297 { 12298 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 12299 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 12300 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 12301 } 12302 12303 static void 12304 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 12305 { 12306 uint64_t bw_est, rate_wanted; 12307 int chged = 0; 12308 uint32_t user_max, orig_min, orig_max; 12309 12310 orig_min = rack->r_ctl.rc_pace_min_segs; 12311 orig_max = rack->r_ctl.rc_pace_max_segs; 12312 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 12313 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 12314 chged = 1; 12315 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 12316 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 12317 if (user_max != rack->r_ctl.rc_pace_max_segs) 12318 chged = 1; 12319 } 12320 if (rack->rc_force_max_seg) { 12321 rack->r_ctl.rc_pace_max_segs = user_max; 12322 } else if (rack->use_fixed_rate) { 12323 bw_est = rack_get_bw(rack); 12324 if ((rack->r_ctl.crte == NULL) || 12325 (bw_est != rack->r_ctl.crte->rate)) { 12326 rack->r_ctl.rc_pace_max_segs = user_max; 12327 } else { 12328 /* We are pacing right at the hardware rate */ 12329 uint32_t segsiz; 12330 12331 segsiz = min(ctf_fixed_maxseg(tp), 12332 rack->r_ctl.rc_pace_min_segs); 12333 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 12334 tp, bw_est, segsiz, 0, 12335 rack->r_ctl.crte, NULL); 12336 } 12337 } else if (rack->rc_always_pace) { 12338 if (rack->r_ctl.gp_bw || 12339 #ifdef NETFLIX_PEAKRATE 12340 rack->rc_tp->t_maxpeakrate || 12341 #endif 12342 rack->r_ctl.init_rate) { 12343 /* We have a rate of some sort set */ 12344 uint32_t orig; 12345 12346 bw_est = rack_get_bw(rack); 12347 orig = rack->r_ctl.rc_pace_max_segs; 12348 if (fill_override) 12349 rate_wanted = *fill_override; 12350 else 12351 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL); 12352 if (rate_wanted) { 12353 /* We have something */ 12354 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 12355 rate_wanted, 12356 ctf_fixed_maxseg(rack->rc_tp)); 12357 } else 12358 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 12359 if (orig != rack->r_ctl.rc_pace_max_segs) 12360 chged = 1; 12361 } else if ((rack->r_ctl.gp_bw == 0) && 12362 (rack->r_ctl.rc_pace_max_segs == 0)) { 12363 /* 12364 * If we have nothing limit us to bursting 12365 * out IW sized pieces. 
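 * As a rough illustration (assuming a 1448 byte MSS and the common
 * ten segment initial window), rc_init_window() would come to about
 * 14480 bytes, and that caps rc_pace_max_segs until a bandwidth
 * estimate (gp_bw or init_rate) becomes available.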
12366 */ 12367 chged = 1; 12368 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 12369 } 12370 } 12371 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 12372 chged = 1; 12373 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 12374 } 12375 if (chged) 12376 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 12377 } 12378 12379 12380 static void 12381 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) 12382 { 12383 #ifdef INET6 12384 struct ip6_hdr *ip6 = NULL; 12385 #endif 12386 #ifdef INET 12387 struct ip *ip = NULL; 12388 #endif 12389 struct udphdr *udp = NULL; 12390 12391 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 12392 #ifdef INET6 12393 if (rack->r_is_v6) { 12394 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12395 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 12396 if (tp->t_port) { 12397 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 12398 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 12399 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 12400 udp->uh_dport = tp->t_port; 12401 rack->r_ctl.fsb.udp = udp; 12402 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 12403 } else 12404 { 12405 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 12406 rack->r_ctl.fsb.udp = NULL; 12407 } 12408 tcpip_fillheaders(rack->rc_inp, 12409 tp->t_port, 12410 ip6, rack->r_ctl.fsb.th); 12411 } else 12412 #endif /* INET6 */ 12413 { 12414 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 12415 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 12416 if (tp->t_port) { 12417 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 12418 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 12419 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 12420 udp->uh_dport = tp->t_port; 12421 rack->r_ctl.fsb.udp = udp; 12422 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 12423 } else 12424 { 12425 rack->r_ctl.fsb.udp = NULL; 12426 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 12427 } 12428 tcpip_fillheaders(rack->rc_inp, 12429 tp->t_port, 12430 ip, rack->r_ctl.fsb.th); 12431 } 12432 rack->r_fsb_inited = 1; 12433 } 12434 12435 static int 12436 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 12437 { 12438 /* 12439 * Allocate the larger of spaces V6 if available else just 12440 * V4 and include udphdr (overbook) 12441 */ 12442 #ifdef INET6 12443 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 12444 #else 12445 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 12446 #endif 12447 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 12448 M_TCPFSB, M_NOWAIT|M_ZERO); 12449 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 12450 return (ENOMEM); 12451 } 12452 rack->r_fsb_inited = 0; 12453 return (0); 12454 } 12455 12456 static int 12457 rack_init(struct tcpcb *tp) 12458 { 12459 struct inpcb *inp = tptoinpcb(tp); 12460 struct tcp_rack *rack = NULL; 12461 #ifdef INVARIANTS 12462 struct rack_sendmap *insret; 12463 #endif 12464 uint32_t iwin, snt, us_cts; 12465 int err; 12466 12467 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 12468 if (tp->t_fb_ptr == NULL) { 12469 /* 12470 * We need to allocate memory but cant. The INP and INP_INFO 12471 * locks and they are recursive (happens during setup. 
So a 12472 * scheme to drop the locks fails :( 12473 * 12474 */ 12475 return (ENOMEM); 12476 } 12477 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 12478 12479 rack = (struct tcp_rack *)tp->t_fb_ptr; 12480 RB_INIT(&rack->r_ctl.rc_mtree); 12481 TAILQ_INIT(&rack->r_ctl.rc_free); 12482 TAILQ_INIT(&rack->r_ctl.rc_tmap); 12483 rack->rc_tp = tp; 12484 rack->rc_inp = inp; 12485 /* Set the flag */ 12486 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; 12487 /* Probably not needed but lets be sure */ 12488 rack_clear_rate_sample(rack); 12489 /* 12490 * Save off the default values, socket options will poke 12491 * at these if pacing is not on or we have not yet 12492 * reached where pacing is on (gp_ready/fixed enabled). 12493 * When they get set into the CC module (when gp_ready 12494 * is enabled or we enable fixed) then we will set these 12495 * values into the CC and place in here the old values 12496 * so we have a restoral. Then we will set the flag 12497 * rc_pacing_cc_set. That way whenever we turn off pacing 12498 * or switch off this stack, we will know to go restore 12499 * the saved values. 12500 */ 12501 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 12502 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 12503 /* We want abe like behavior as well */ 12504 rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; 12505 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 12506 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 12507 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 12508 rack->r_ctl.roundends = tp->snd_max; 12509 if (use_rack_rr) 12510 rack->use_rack_rr = 1; 12511 if (V_tcp_delack_enabled) 12512 tp->t_delayed_ack = 1; 12513 else 12514 tp->t_delayed_ack = 0; 12515 #ifdef TCP_ACCOUNTING 12516 if (rack_tcp_accounting) { 12517 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 12518 } 12519 #endif 12520 if (rack_enable_shared_cwnd) 12521 rack->rack_enable_scwnd = 1; 12522 rack->rc_user_set_max_segs = rack_hptsi_segments; 12523 rack->rc_force_max_seg = 0; 12524 if (rack_use_imac_dack) 12525 rack->rc_dack_mode = 1; 12526 TAILQ_INIT(&rack->r_ctl.opt_list); 12527 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 12528 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 12529 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 12530 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 12531 rack->r_ctl.rc_highest_us_rtt = 0; 12532 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 12533 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 12534 if (rack_use_cmp_acks) 12535 rack->r_use_cmp_ack = 1; 12536 if (rack_disable_prr) 12537 rack->rack_no_prr = 1; 12538 if (rack_gp_no_rec_chg) 12539 rack->rc_gp_no_rec_chg = 1; 12540 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 12541 rack->rc_always_pace = 1; 12542 if (rack->use_fixed_rate || rack->gp_ready) 12543 rack_set_cc_pacing(rack); 12544 } else 12545 rack->rc_always_pace = 0; 12546 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 12547 rack->r_mbuf_queue = 1; 12548 else 12549 rack->r_mbuf_queue = 0; 12550 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 12551 inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 12552 else 12553 inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12554 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12555 if (rack_limits_scwnd) 12556 rack->r_limit_scw = 1; 12557 else 12558 rack->r_limit_scw = 0; 12559 rack->rc_labc = V_tcp_abc_l_var; 12560 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 12561 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12562 rack->r_ctl.rc_rate_sample_method = 
rack_rate_sample_method; 12563 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 12564 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 12565 rack->r_ctl.rc_min_to = rack_min_to; 12566 microuptime(&rack->r_ctl.act_rcv_time); 12567 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 12568 rack->rc_init_win = rack_default_init_window; 12569 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 12570 if (rack_hw_up_only) 12571 rack->r_up_only = 1; 12572 if (rack_do_dyn_mul) { 12573 /* When dynamic adjustment is on CA needs to start at 100% */ 12574 rack->rc_gp_dyn_mul = 1; 12575 if (rack_do_dyn_mul >= 100) 12576 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 12577 } else 12578 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 12579 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 12580 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 12581 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 12582 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 12583 rack_probertt_filter_life); 12584 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12585 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12586 rack->r_ctl.rc_time_of_last_probertt = us_cts; 12587 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); 12588 rack->r_ctl.rc_time_probertt_starts = 0; 12589 if (rack_dsack_std_based & 0x1) { 12590 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 12591 rack->rc_rack_tmr_std_based = 1; 12592 } 12593 if (rack_dsack_std_based & 0x2) { 12594 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 12595 rack->rc_rack_use_dsack = 1; 12596 } 12597 /* We require at least one measurement, even if the sysctl is 0 */ 12598 if (rack_req_measurements) 12599 rack->r_ctl.req_measurements = rack_req_measurements; 12600 else 12601 rack->r_ctl.req_measurements = 1; 12602 if (rack_enable_hw_pacing) 12603 rack->rack_hdw_pace_ena = 1; 12604 if (rack_hw_rate_caps) 12605 rack->r_rack_hw_rate_caps = 1; 12606 /* Do we force on detection? */ 12607 #ifdef NETFLIX_EXP_DETECTION 12608 if (tcp_force_detection) 12609 rack->do_detection = 1; 12610 else 12611 #endif 12612 rack->do_detection = 0; 12613 if (rack_non_rxt_use_cr) 12614 rack->rack_rec_nonrxt_use_cr = 1; 12615 err = rack_init_fsb(tp, rack); 12616 if (err) { 12617 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12618 tp->t_fb_ptr = NULL; 12619 return (err); 12620 } 12621 if (tp->snd_una != tp->snd_max) { 12622 /* Create a send map for the current outstanding data */ 12623 struct rack_sendmap *rsm; 12624 12625 rsm = rack_alloc(rack); 12626 if (rsm == NULL) { 12627 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12628 tp->t_fb_ptr = NULL; 12629 return (ENOMEM); 12630 } 12631 rsm->r_no_rtt_allowed = 1; 12632 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 12633 rsm->r_rtr_cnt = 1; 12634 rsm->r_rtr_bytes = 0; 12635 if (tp->t_flags & TF_SENTFIN) 12636 rsm->r_flags |= RACK_HAS_FIN; 12637 if ((tp->snd_una == tp->iss) && 12638 !TCPS_HAVEESTABLISHED(tp->t_state)) 12639 rsm->r_flags |= RACK_HAS_SYN; 12640 rsm->r_start = tp->snd_una; 12641 rsm->r_end = tp->snd_max; 12642 rsm->r_dupack = 0; 12643 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 12644 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 12645 if (rsm->m) 12646 rsm->orig_m_len = rsm->m->m_len; 12647 else 12648 rsm->orig_m_len = 0; 12649 } else { 12650 /* 12651 * This can happen if we have a stand-alone FIN or 12652 * SYN. 
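 * In that case snd_una != snd_max purely because of SYN/FIN sequence
 * space; there is no mbuf data behind the map entry, so it is
 * recorded with m == NULL and a zero offset and length.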
12653 */ 12654 rsm->m = NULL; 12655 rsm->orig_m_len = 0; 12656 rsm->soff = 0; 12657 } 12658 #ifndef INVARIANTS 12659 (void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12660 #else 12661 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12662 if (insret != NULL) { 12663 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 12664 insret, rack, rsm); 12665 } 12666 #endif 12667 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 12668 rsm->r_in_tmap = 1; 12669 } 12670 /* 12671 * Timers in Rack are kept in microseconds so lets 12672 * convert any initial incoming variables 12673 * from ticks into usecs. Note that we 12674 * also change the values of t_srtt and t_rttvar, if 12675 * they are non-zero. They are kept with a 5 12676 * bit decimal so we have to carefully convert 12677 * these to get the full precision. 12678 */ 12679 rack_convert_rtts(tp); 12680 tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); 12681 if (rack_do_hystart) { 12682 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 12683 if (rack_do_hystart > 1) 12684 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 12685 if (rack_do_hystart > 2) 12686 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 12687 } 12688 if (rack_def_profile) 12689 rack_set_profile(rack, rack_def_profile); 12690 /* Cancel the GP measurement in progress */ 12691 tp->t_flags &= ~TF_GPUTINPROG; 12692 if (SEQ_GT(tp->snd_max, tp->iss)) 12693 snt = tp->snd_max - tp->iss; 12694 else 12695 snt = 0; 12696 iwin = rc_init_window(rack); 12697 if (snt < iwin) { 12698 /* We are not past the initial window 12699 * so we need to make sure cwnd is 12700 * correct. 12701 */ 12702 if (tp->snd_cwnd < iwin) 12703 tp->snd_cwnd = iwin; 12704 /* 12705 * If we are within the initial window 12706 * we want ssthresh to be unlimited. Setting 12707 * it to the rwnd (which the default stack does 12708 * and older racks) is not really a good idea 12709 * since we want to be in SS and grow both the 12710 * cwnd and the rwnd (via dynamic rwnd growth). If 12711 * we set it to the rwnd then as the peer grows its 12712 * rwnd we will be stuck in CA and never hit SS. 12713 * 12714 * Its far better to raise it up high (this takes the 12715 * risk that there as been a loss already, probably 12716 * we should have an indicator in all stacks of loss 12717 * but we don't), but considering the normal use this 12718 * is a risk worth taking. The consequences of not 12719 * hitting SS are far worse than going one more time 12720 * into it early on (before we have sent even a IW). 12721 * It is highly unlikely that we will have had a loss 12722 * before getting the IW out. 12723 */ 12724 tp->snd_ssthresh = 0xffffffff; 12725 } 12726 rack_stop_all_timers(tp); 12727 /* Lets setup the fsb block */ 12728 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 12729 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 12730 __LINE__, RACK_RTTS_INIT); 12731 return (0); 12732 } 12733 12734 static int 12735 rack_handoff_ok(struct tcpcb *tp) 12736 { 12737 if ((tp->t_state == TCPS_CLOSED) || 12738 (tp->t_state == TCPS_LISTEN)) { 12739 /* Sure no problem though it may not stick */ 12740 return (0); 12741 } 12742 if ((tp->t_state == TCPS_SYN_SENT) || 12743 (tp->t_state == TCPS_SYN_RECEIVED)) { 12744 /* 12745 * We really don't know if you support sack, 12746 * you have to get to ESTAB or beyond to tell. 12747 */ 12748 return (EAGAIN); 12749 } 12750 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 12751 /* 12752 * Rack will only send a FIN after all data is acknowledged. 
12753 * So in this case we have more data outstanding. We can't 12754 * switch stacks until either all data and only the FIN 12755 * is left (in which case rack_init() now knows how 12756 * to deal with that) <or> all is acknowledged and we 12757 * are only left with incoming data, though why you 12758 * would want to switch to rack after all data is acknowledged 12759 * I have no idea (rrs)! 12760 */ 12761 return (EAGAIN); 12762 } 12763 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 12764 return (0); 12765 } 12766 /* 12767 * If we reach here we don't do SACK on this connection so we can 12768 * never do rack. 12769 */ 12770 return (EINVAL); 12771 } 12772 12773 12774 static void 12775 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 12776 { 12777 struct inpcb *inp = tptoinpcb(tp); 12778 12779 if (tp->t_fb_ptr) { 12780 struct tcp_rack *rack; 12781 struct rack_sendmap *rsm, *nrsm; 12782 #ifdef INVARIANTS 12783 struct rack_sendmap *rm; 12784 #endif 12785 12786 rack = (struct tcp_rack *)tp->t_fb_ptr; 12787 if (tp->t_in_pkt) { 12788 /* 12789 * It is unsafe to process the packets since a 12790 * reset may be lurking in them (its rare but it 12791 * can occur). If we were to find a RST, then we 12792 * would end up dropping the connection and the 12793 * INP lock, so when we return the caller (tcp_usrreq) 12794 * will blow up when it trys to unlock the inp. 12795 */ 12796 struct mbuf *save, *m; 12797 12798 m = tp->t_in_pkt; 12799 tp->t_in_pkt = NULL; 12800 tp->t_tail_pkt = NULL; 12801 while (m) { 12802 save = m->m_nextpkt; 12803 m->m_nextpkt = NULL; 12804 m_freem(m); 12805 m = save; 12806 } 12807 } 12808 tp->t_flags &= ~TF_FORCEDATA; 12809 #ifdef NETFLIX_SHARED_CWND 12810 if (rack->r_ctl.rc_scw) { 12811 uint32_t limit; 12812 12813 if (rack->r_limit_scw) 12814 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 12815 else 12816 limit = 0; 12817 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 12818 rack->r_ctl.rc_scw_index, 12819 limit); 12820 rack->r_ctl.rc_scw = NULL; 12821 } 12822 #endif 12823 if (rack->r_ctl.fsb.tcp_ip_hdr) { 12824 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 12825 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 12826 rack->r_ctl.fsb.th = NULL; 12827 } 12828 /* Convert back to ticks, with */ 12829 if (tp->t_srtt > 1) { 12830 uint32_t val, frac; 12831 12832 val = USEC_2_TICKS(tp->t_srtt); 12833 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 12834 tp->t_srtt = val << TCP_RTT_SHIFT; 12835 /* 12836 * frac is the fractional part here is left 12837 * over from converting to hz and shifting. 12838 * We need to convert this to the 5 bit 12839 * remainder. 12840 */ 12841 if (frac) { 12842 if (hz == 1000) { 12843 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 12844 } else { 12845 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 12846 } 12847 tp->t_srtt += frac; 12848 } 12849 } 12850 if (tp->t_rttvar) { 12851 uint32_t val, frac; 12852 12853 val = USEC_2_TICKS(tp->t_rttvar); 12854 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 12855 tp->t_rttvar = val << TCP_RTTVAR_SHIFT; 12856 /* 12857 * frac is the fractional part here is left 12858 * over from converting to hz and shifting. 12859 * We need to convert this to the 5 bit 12860 * remainder. 
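 * Worked example of this tick conversion, assuming hz == 1000: a
 * value of 30500 usec gives val == 30 with frac == 500 usec left
 * over; scaling, 500 * TCP_RTT_SCALE / HPTS_USEC_IN_MSEC == 16,
 * which is the fixed-point remainder folded back in below.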
12861 */ 12862 if (frac) { 12863 if (hz == 1000) { 12864 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 12865 } else { 12866 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 12867 } 12868 tp->t_rttvar += frac; 12869 } 12870 } 12871 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur); 12872 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); 12873 if (rack->rc_always_pace) { 12874 tcp_decrement_paced_conn(); 12875 rack_undo_cc_pacing(rack); 12876 rack->rc_always_pace = 0; 12877 } 12878 /* Clean up any options if they were not applied */ 12879 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 12880 struct deferred_opt_list *dol; 12881 12882 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 12883 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 12884 free(dol, M_TCPDO); 12885 } 12886 /* rack does not use force data but other stacks may clear it */ 12887 if (rack->r_ctl.crte != NULL) { 12888 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 12889 rack->rack_hdrw_pacing = 0; 12890 rack->r_ctl.crte = NULL; 12891 } 12892 #ifdef TCP_BLACKBOX 12893 tcp_log_flowend(tp); 12894 #endif 12895 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 12896 #ifndef INVARIANTS 12897 (void)RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12898 #else 12899 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12900 if (rm != rsm) { 12901 panic("At fini, rack:%p rsm:%p rm:%p", 12902 rack, rsm, rm); 12903 } 12904 #endif 12905 uma_zfree(rack_zone, rsm); 12906 } 12907 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 12908 while (rsm) { 12909 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 12910 uma_zfree(rack_zone, rsm); 12911 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 12912 } 12913 rack->rc_free_cnt = 0; 12914 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12915 tp->t_fb_ptr = NULL; 12916 } 12917 inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12918 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 12919 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 12920 inp->inp_flags2 &= ~INP_MBUF_ACKCMP; 12921 /* Cancel the GP measurement in progress */ 12922 tp->t_flags &= ~TF_GPUTINPROG; 12923 inp->inp_flags2 &= ~INP_MBUF_L_ACKS; 12924 /* Make sure snd_nxt is correctly set */ 12925 tp->snd_nxt = tp->snd_max; 12926 } 12927 12928 static void 12929 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 12930 { 12931 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 12932 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0; 12933 } 12934 switch (tp->t_state) { 12935 case TCPS_SYN_SENT: 12936 rack->r_state = TCPS_SYN_SENT; 12937 rack->r_substate = rack_do_syn_sent; 12938 break; 12939 case TCPS_SYN_RECEIVED: 12940 rack->r_state = TCPS_SYN_RECEIVED; 12941 rack->r_substate = rack_do_syn_recv; 12942 break; 12943 case TCPS_ESTABLISHED: 12944 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12945 rack->r_state = TCPS_ESTABLISHED; 12946 rack->r_substate = rack_do_established; 12947 break; 12948 case TCPS_CLOSE_WAIT: 12949 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12950 rack->r_state = TCPS_CLOSE_WAIT; 12951 rack->r_substate = rack_do_close_wait; 12952 break; 12953 case TCPS_FIN_WAIT_1: 12954 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12955 rack->r_state = TCPS_FIN_WAIT_1; 12956 rack->r_substate = rack_do_fin_wait_1; 12957 break; 12958 case TCPS_CLOSING: 12959 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12960 rack->r_state = TCPS_CLOSING; 12961 rack->r_substate = rack_do_closing; 12962 break; 12963 case TCPS_LAST_ACK: 12964 
rack_set_pace_segments(tp, rack, __LINE__, NULL); 12965 rack->r_state = TCPS_LAST_ACK; 12966 rack->r_substate = rack_do_lastack; 12967 break; 12968 case TCPS_FIN_WAIT_2: 12969 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12970 rack->r_state = TCPS_FIN_WAIT_2; 12971 rack->r_substate = rack_do_fin_wait_2; 12972 break; 12973 case TCPS_LISTEN: 12974 case TCPS_CLOSED: 12975 case TCPS_TIME_WAIT: 12976 default: 12977 break; 12978 }; 12979 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 12980 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 12981 12982 } 12983 12984 static void 12985 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 12986 { 12987 /* 12988 * We received an ack, and then did not 12989 * call send or were bounced out due to the 12990 * hpts was running. Now a timer is up as well, is 12991 * it the right timer? 12992 */ 12993 struct rack_sendmap *rsm; 12994 int tmr_up; 12995 12996 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 12997 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 12998 return; 12999 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 13000 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 13001 (tmr_up == PACE_TMR_RXT)) { 13002 /* Should be an RXT */ 13003 return; 13004 } 13005 if (rsm == NULL) { 13006 /* Nothing outstanding? */ 13007 if (tp->t_flags & TF_DELACK) { 13008 if (tmr_up == PACE_TMR_DELACK) 13009 /* We are supposed to have delayed ack up and we do */ 13010 return; 13011 } else if (sbavail(&tptosocket(tp)->so_snd) && (tmr_up == PACE_TMR_RXT)) { 13012 /* 13013 * if we hit enobufs then we would expect the possibility 13014 * of nothing outstanding and the RXT up (and the hptsi timer). 13015 */ 13016 return; 13017 } else if (((V_tcp_always_keepalive || 13018 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 13019 (tp->t_state <= TCPS_CLOSING)) && 13020 (tmr_up == PACE_TMR_KEEP) && 13021 (tp->snd_max == tp->snd_una)) { 13022 /* We should have keep alive up and we do */ 13023 return; 13024 } 13025 } 13026 if (SEQ_GT(tp->snd_max, tp->snd_una) && 13027 ((tmr_up == PACE_TMR_TLP) || 13028 (tmr_up == PACE_TMR_RACK) || 13029 (tmr_up == PACE_TMR_RXT))) { 13030 /* 13031 * Either a Rack, TLP or RXT is fine if we 13032 * have outstanding data. 13033 */ 13034 return; 13035 } else if (tmr_up == PACE_TMR_DELACK) { 13036 /* 13037 * If the delayed ack was going to go off 13038 * before the rtx/tlp/rack timer were going to 13039 * expire, then that would be the timer in control. 13040 * Note we don't check the time here trusting the 13041 * code is correct. 13042 */ 13043 return; 13044 } 13045 /* 13046 * Ok the timer originally started is not what we want now. 13047 * We will force the hpts to be stopped if any, and restart 13048 * with the slot set to what was in the saved slot. 
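 * If a pacing slot (PACE_PKT_OUTPUT) is pending and its deadline is
 * still in the future, the unused time is credited to rc_agg_early
 * before the inp is pulled off the hpts wheel; the correct timer is
 * then re-armed via rack_start_hpts_timer().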
13049 */ 13050 if (tcp_in_hpts(rack->rc_inp)) { 13051 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 13052 uint32_t us_cts; 13053 13054 us_cts = tcp_get_usecs(NULL); 13055 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 13056 rack->r_early = 1; 13057 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 13058 } 13059 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 13060 } 13061 tcp_hpts_remove(rack->rc_inp); 13062 } 13063 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13064 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 13065 } 13066 13067 13068 static void 13069 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq) 13070 { 13071 if ((SEQ_LT(tp->snd_wl1, seq) || 13072 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 13073 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 13074 /* keep track of pure window updates */ 13075 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 13076 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 13077 tp->snd_wnd = tiwin; 13078 rack_validate_fo_sendwin_up(tp, rack); 13079 tp->snd_wl1 = seq; 13080 tp->snd_wl2 = ack; 13081 if (tp->snd_wnd > tp->max_sndwnd) 13082 tp->max_sndwnd = tp->snd_wnd; 13083 rack->r_wanted_output = 1; 13084 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 13085 tp->snd_wnd = tiwin; 13086 rack_validate_fo_sendwin_up(tp, rack); 13087 tp->snd_wl1 = seq; 13088 tp->snd_wl2 = ack; 13089 } else { 13090 /* Not a valid win update */ 13091 return; 13092 } 13093 /* Do we exit persists? */ 13094 if ((rack->rc_in_persist != 0) && 13095 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 13096 rack->r_ctl.rc_pace_min_segs))) { 13097 rack_exit_persist(tp, rack, cts); 13098 } 13099 /* Do we enter persists? */ 13100 if ((rack->rc_in_persist == 0) && 13101 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 13102 TCPS_HAVEESTABLISHED(tp->t_state) && 13103 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 13104 sbavail(&tptosocket(tp)->so_snd) && 13105 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 13106 /* 13107 * Here the rwnd is less than 13108 * the pacing size, we are established, 13109 * nothing is outstanding, and there is 13110 * data to send. Enter persists. 
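 * Hypothetical numbers: with rc_high_rwnd/2 == 32768 and
 * rc_pace_min_segs == 1448 the threshold is 1448 bytes, so a peer
 * window of, say, 1000 bytes with queued data and nothing in flight
 * enters persist here, while a later window of 1448 or more takes us
 * back out through rack_exit_persist() above.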
13111 */ 13112 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 13113 } 13114 } 13115 13116 static void 13117 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 13118 { 13119 13120 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13121 struct inpcb *inp = tptoinpcb(tp); 13122 union tcp_log_stackspecific log; 13123 struct timeval ltv; 13124 char tcp_hdr_buf[60]; 13125 struct tcphdr *th; 13126 struct timespec ts; 13127 uint32_t orig_snd_una; 13128 uint8_t xx = 0; 13129 13130 #ifdef NETFLIX_HTTP_LOGGING 13131 struct http_sendfile_track *http_req; 13132 13133 if (SEQ_GT(ae->ack, tp->snd_una)) { 13134 http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1)); 13135 } else { 13136 http_req = tcp_http_find_req_for_seq(tp, ae->ack); 13137 } 13138 #endif 13139 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13140 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 13141 if (rack->rack_no_prr == 0) 13142 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13143 else 13144 log.u_bbr.flex1 = 0; 13145 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 13146 log.u_bbr.use_lt_bw <<= 1; 13147 log.u_bbr.use_lt_bw |= rack->r_might_revert; 13148 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 13149 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 13150 log.u_bbr.pkts_out = tp->t_maxseg; 13151 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 13152 log.u_bbr.flex7 = 1; 13153 log.u_bbr.lost = ae->flags; 13154 log.u_bbr.cwnd_gain = ackval; 13155 log.u_bbr.pacing_gain = 0x2; 13156 if (ae->flags & TSTMP_HDWR) { 13157 /* Record the hardware timestamp if present */ 13158 log.u_bbr.flex3 = M_TSTMP; 13159 ts.tv_sec = ae->timestamp / 1000000000; 13160 ts.tv_nsec = ae->timestamp % 1000000000; 13161 ltv.tv_sec = ts.tv_sec; 13162 ltv.tv_usec = ts.tv_nsec / 1000; 13163 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 13164 } else if (ae->flags & TSTMP_LRO) { 13165 /* Record the LRO the arrival timestamp */ 13166 log.u_bbr.flex3 = M_TSTMP_LRO; 13167 ts.tv_sec = ae->timestamp / 1000000000; 13168 ts.tv_nsec = ae->timestamp % 1000000000; 13169 ltv.tv_sec = ts.tv_sec; 13170 ltv.tv_usec = ts.tv_nsec / 1000; 13171 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 13172 } 13173 log.u_bbr.timeStamp = tcp_get_usecs(<v); 13174 /* Log the rcv time */ 13175 log.u_bbr.delRate = ae->timestamp; 13176 #ifdef NETFLIX_HTTP_LOGGING 13177 log.u_bbr.applimited = tp->t_http_closed; 13178 log.u_bbr.applimited <<= 8; 13179 log.u_bbr.applimited |= tp->t_http_open; 13180 log.u_bbr.applimited <<= 8; 13181 log.u_bbr.applimited |= tp->t_http_req; 13182 if (http_req) { 13183 /* Copy out any client req info */ 13184 /* seconds */ 13185 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 13186 /* useconds */ 13187 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 13188 log.u_bbr.rttProp = http_req->timestamp; 13189 log.u_bbr.cur_del_rate = http_req->start; 13190 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 13191 log.u_bbr.flex8 |= 1; 13192 } else { 13193 log.u_bbr.flex8 |= 2; 13194 log.u_bbr.bw_inuse = http_req->end; 13195 } 13196 log.u_bbr.flex6 = http_req->start_seq; 13197 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 13198 log.u_bbr.flex8 |= 4; 13199 log.u_bbr.epoch = http_req->end_seq; 13200 } 13201 } 13202 #endif 13203 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 13204 th = (struct tcphdr *)tcp_hdr_buf; 13205 th->th_seq = ae->seq; 13206 th->th_ack = ae->ack; 13207 th->th_win = ae->win; 13208 /* Now fill in the ports */ 13209 th->th_sport = inp->inp_fport; 13210 th->th_dport = 
inp->inp_lport; 13211 tcp_set_flags(th, ae->flags); 13212 /* Now do we have a timestamp option? */ 13213 if (ae->flags & HAS_TSTMP) { 13214 u_char *cp; 13215 uint32_t val; 13216 13217 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 13218 cp = (u_char *)(th + 1); 13219 *cp = TCPOPT_NOP; 13220 cp++; 13221 *cp = TCPOPT_NOP; 13222 cp++; 13223 *cp = TCPOPT_TIMESTAMP; 13224 cp++; 13225 *cp = TCPOLEN_TIMESTAMP; 13226 cp++; 13227 val = htonl(ae->ts_value); 13228 bcopy((char *)&val, 13229 (char *)cp, sizeof(uint32_t)); 13230 val = htonl(ae->ts_echo); 13231 bcopy((char *)&val, 13232 (char *)(cp + 4), sizeof(uint32_t)); 13233 } else 13234 th->th_off = (sizeof(struct tcphdr) >> 2); 13235 13236 /* 13237 * For sane logging we need to play a little trick. 13238 * If the ack were fully processed we would have moved 13239 * snd_una to high_seq, but since compressed acks are 13240 * processed in two phases, at this point (logging) snd_una 13241 * won't be advanced. So we would see multiple acks showing 13242 * the advancement. We can prevent that by "pretending" that 13243 * snd_una was advanced and then un-advancing it so that the 13244 * logging code has the right value for tlb_snd_una. 13245 */ 13246 if (tp->snd_una != high_seq) { 13247 orig_snd_una = tp->snd_una; 13248 tp->snd_una = high_seq; 13249 xx = 1; 13250 } else 13251 xx = 0; 13252 TCP_LOG_EVENTP(tp, th, 13253 &tptosocket(tp)->so_rcv, 13254 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0, 13255 0, &log, true, &ltv); 13256 if (xx) { 13257 tp->snd_una = orig_snd_una; 13258 } 13259 } 13260 13261 } 13262 13263 static void 13264 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts) 13265 { 13266 uint32_t us_rtt; 13267 /* 13268 * A persist or keep-alive was forced out, update our 13269 * min rtt time. Note that we now worry about lost responses. 13270 * When a subsequent keep-alive or persist times out 13271 * and forced_ack is still on, then the last probe 13272 * was not responded to. In such cases we have a 13273 * sysctl that controls the behavior. Either we apply 13274 * the rtt but with reduced confidence (0). Or we just 13275 * plain don't apply the rtt estimate. Having data flow 13276 * will clear the probe_not_answered flag, i.e. the cum-ack 13277 * moves forward <or> we exit and reenter persists. 13278 */ 13279 13280 rack->forced_ack = 0; 13281 rack->rc_tp->t_rxtshift = 0; 13282 if ((rack->rc_in_persist && 13283 (tiwin == rack->rc_tp->snd_wnd)) || 13284 (rack->rc_in_persist == 0)) { 13285 /* 13286 * In persists only apply the RTT update if this is 13287 * a response to our window probe. And that 13288 * means the rwnd sent must match the current 13289 * snd_wnd. If it does not, then we got a 13290 * window update ack instead. For keepalive 13291 * we allow the answer no matter what the window. 13292 * 13293 * Note that if the probe_not_answered is set then 13294 * the forced_ack_ts is the oldest one i.e. the first 13295 * probe sent that might have been lost. This assures 13296 * us that if we do calculate an RTT it is longer, not 13297 * some short value.
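* Using the oldest probe's send time can only make the measured RTT longer, which errs on the conservative side.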
13298 */ 13299 if (rack->rc_in_persist) 13300 counter_u64_add(rack_persists_acks, 1); 13301 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 13302 if (us_rtt == 0) 13303 us_rtt = 1; 13304 if (rack->probe_not_answered == 0) { 13305 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 13306 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 13307 } else { 13308 /* We have a retransmitted probe here too */ 13309 if (rack_apply_rtt_with_reduced_conf) { 13310 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 13311 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 13312 } 13313 } 13314 } 13315 } 13316 13317 static int 13318 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 13319 { 13320 /* 13321 * Handle a "special" compressed ack mbuf. Each incoming 13322 * ack has only four possible dispositions: 13323 * 13324 * A) It moves the cum-ack forward 13325 * B) It is behind the cum-ack. 13326 * C) It is a window-update ack. 13327 * D) It is a dup-ack. 13328 * 13329 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 13330 * in the incoming mbuf. We also need to still pay attention 13331 * to nxt_pkt since there may be another packet after this 13332 * one. 13333 */ 13334 #ifdef TCP_ACCOUNTING 13335 uint64_t ts_val; 13336 uint64_t rdstc; 13337 #endif 13338 int segsiz; 13339 struct timespec ts; 13340 struct tcp_rack *rack; 13341 struct tcp_ackent *ae; 13342 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 13343 int cnt, i, did_out, ourfinisacked = 0; 13344 struct tcpopt to_holder, *to = NULL; 13345 #ifdef TCP_ACCOUNTING 13346 int win_up_req = 0; 13347 #endif 13348 int nsegs = 0; 13349 int under_pacing = 1; 13350 int recovery = 0; 13351 #ifdef TCP_ACCOUNTING 13352 sched_pin(); 13353 #endif 13354 rack = (struct tcp_rack *)tp->t_fb_ptr; 13355 if (rack->gp_ready && 13356 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 13357 under_pacing = 0; 13358 else 13359 under_pacing = 1; 13360 13361 if (rack->r_state != tp->t_state) 13362 rack_set_state(tp, rack); 13363 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 13364 (tp->t_flags & TF_GPUTINPROG)) { 13365 /* 13366 * We have a goodput in progress 13367 * and we have entered a late state. 13368 * Do we have enough data in the sb 13369 * to handle the GPUT request? 13370 */ 13371 uint32_t bytes; 13372 13373 bytes = tp->gput_ack - tp->gput_seq; 13374 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 13375 bytes += tp->gput_seq - tp->snd_una; 13376 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 13377 /* 13378 * There are not enough bytes in the socket 13379 * buffer that have been sent to cover this 13380 * measurement. Cancel it. 
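* (The ack that would complete the measurement can never arrive.)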
13381 */ 13382 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 13383 rack->r_ctl.rc_gp_srtt /*flex1*/, 13384 tp->gput_seq, 13385 0, 0, 18, __LINE__, NULL, 0); 13386 tp->t_flags &= ~TF_GPUTINPROG; 13387 } 13388 } 13389 to = &to_holder; 13390 to->to_flags = 0; 13391 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 13392 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 13393 cnt = m->m_len / sizeof(struct tcp_ackent); 13394 counter_u64_add(rack_multi_single_eq, cnt); 13395 high_seq = tp->snd_una; 13396 the_win = tp->snd_wnd; 13397 win_seq = tp->snd_wl1; 13398 win_upd_ack = tp->snd_wl2; 13399 cts = tcp_tv_to_usectick(tv); 13400 ms_cts = tcp_tv_to_mssectick(tv); 13401 rack->r_ctl.rc_rcvtime = cts; 13402 segsiz = ctf_fixed_maxseg(tp); 13403 if ((rack->rc_gp_dyn_mul) && 13404 (rack->use_fixed_rate == 0) && 13405 (rack->rc_always_pace)) { 13406 /* Check in on probertt */ 13407 rack_check_probe_rtt(rack, cts); 13408 } 13409 for (i = 0; i < cnt; i++) { 13410 #ifdef TCP_ACCOUNTING 13411 ts_val = get_cyclecount(); 13412 #endif 13413 rack_clear_rate_sample(rack); 13414 ae = ((mtod(m, struct tcp_ackent *)) + i); 13415 /* Setup the window */ 13416 tiwin = ae->win << tp->snd_scale; 13417 if (tiwin > rack->r_ctl.rc_high_rwnd) 13418 rack->r_ctl.rc_high_rwnd = tiwin; 13419 /* figure out the type of ack */ 13420 if (SEQ_LT(ae->ack, high_seq)) { 13421 /* Case B*/ 13422 ae->ack_val_set = ACK_BEHIND; 13423 } else if (SEQ_GT(ae->ack, high_seq)) { 13424 /* Case A */ 13425 ae->ack_val_set = ACK_CUMACK; 13426 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 13427 /* Case D */ 13428 ae->ack_val_set = ACK_DUPACK; 13429 } else { 13430 /* Case C */ 13431 ae->ack_val_set = ACK_RWND; 13432 } 13433 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 13434 /* Validate timestamp */ 13435 if (ae->flags & HAS_TSTMP) { 13436 /* Setup for a timestamp */ 13437 to->to_flags = TOF_TS; 13438 ae->ts_echo -= tp->ts_offset; 13439 to->to_tsecr = ae->ts_echo; 13440 to->to_tsval = ae->ts_value; 13441 /* 13442 * If echoed timestamp is later than the current time, fall back to 13443 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 13444 * were used when this connection was established. 13445 */ 13446 if (TSTMP_GT(ae->ts_echo, ms_cts)) 13447 to->to_tsecr = 0; 13448 if (tp->ts_recent && 13449 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 13450 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 13451 #ifdef TCP_ACCOUNTING 13452 rdstc = get_cyclecount(); 13453 if (rdstc > ts_val) { 13454 counter_u64_add(tcp_proc_time[ae->ack_val_set] , 13455 (rdstc - ts_val)); 13456 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13457 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 13458 } 13459 } 13460 #endif 13461 continue; 13462 } 13463 } 13464 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 13465 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 13466 tp->ts_recent_age = tcp_ts_getticks(); 13467 tp->ts_recent = ae->ts_value; 13468 } 13469 } else { 13470 /* Setup for a no options */ 13471 to->to_flags = 0; 13472 } 13473 /* Update the rcv time and perform idle reduction possibly */ 13474 if (tp->t_idle_reduce && 13475 (tp->snd_max == tp->snd_una) && 13476 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 13477 counter_u64_add(rack_input_idle_reduces, 1); 13478 rack_cc_after_idle(rack, tp); 13479 } 13480 tp->t_rcvtime = ticks; 13481 /* Now what about ECN of a chain of pure ACKs? 
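* Each compressed ack entry carries its own IP ECN codepoint, so feed it through the normal ECN input handling and treat a CE mark as a congestion signal.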
*/ 13482 if (tcp_ecn_input_segment(tp, ae->flags, 0, 13483 tcp_packets_this_ack(tp, ae->ack), 13484 ae->codepoint)) 13485 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__); 13486 #ifdef TCP_ACCOUNTING 13487 /* Count for the specific type of ack in */ 13488 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1); 13489 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13490 tp->tcp_cnt_counters[ae->ack_val_set]++; 13491 } 13492 #endif 13493 /* 13494 * Note how we could move up these in the determination 13495 * above, but we don't so that way the timestamp checks (and ECN) 13496 * is done first before we do any processing on the ACK. 13497 * The non-compressed path through the code has this 13498 * weakness (noted by @jtl) that it actually does some 13499 * processing before verifying the timestamp information. 13500 * We don't take that path here which is why we set 13501 * the ack_val_set first, do the timestamp and ecn 13502 * processing, and then look at what we have setup. 13503 */ 13504 if (ae->ack_val_set == ACK_BEHIND) { 13505 /* 13506 * Case B flag reordering, if window is not closed 13507 * or it could be a keep-alive or persists 13508 */ 13509 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 13510 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 13511 } 13512 } else if (ae->ack_val_set == ACK_DUPACK) { 13513 /* Case D */ 13514 rack_strike_dupack(rack); 13515 } else if (ae->ack_val_set == ACK_RWND) { 13516 /* Case C */ 13517 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 13518 ts.tv_sec = ae->timestamp / 1000000000; 13519 ts.tv_nsec = ae->timestamp % 1000000000; 13520 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13521 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13522 } else { 13523 rack->r_ctl.act_rcv_time = *tv; 13524 } 13525 if (rack->forced_ack) { 13526 rack_handle_probe_response(rack, tiwin, 13527 tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 13528 } 13529 #ifdef TCP_ACCOUNTING 13530 win_up_req = 1; 13531 #endif 13532 win_upd_ack = ae->ack; 13533 win_seq = ae->seq; 13534 the_win = tiwin; 13535 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); 13536 } else { 13537 /* Case A */ 13538 if (SEQ_GT(ae->ack, tp->snd_max)) { 13539 /* 13540 * We just send an ack since the incoming 13541 * ack is beyond the largest seq we sent. 
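* (Subject to the ack-war protection checks so we do not get drawn into an ACK storm with a misbehaving peer.)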
*/ 13543 if ((tp->t_flags & TF_ACKNOW) == 0) { 13544 ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); 13545 if (tp->t_flags & TF_ACKNOW) 13546 rack->r_wanted_output = 1; 13547 } 13548 } else { 13549 nsegs++; 13550 /* If the window changed setup to update */ 13551 if (tiwin != tp->snd_wnd) { 13552 win_upd_ack = ae->ack; 13553 win_seq = ae->seq; 13554 the_win = tiwin; 13555 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); 13556 } 13557 #ifdef TCP_ACCOUNTING 13558 /* Account for the acks */ 13559 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13560 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 13561 } 13562 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN], 13563 (((ae->ack - high_seq) + segsiz - 1) / segsiz)); 13564 #endif 13565 high_seq = ae->ack; 13566 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 13567 union tcp_log_stackspecific log; 13568 struct timeval tv; 13569 13570 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13571 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13572 log.u_bbr.flex1 = high_seq; 13573 log.u_bbr.flex2 = rack->r_ctl.roundends; 13574 log.u_bbr.flex3 = rack->r_ctl.current_round; 13575 log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround; 13576 log.u_bbr.flex8 = 8; 13577 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 13578 0, &log, false, NULL, NULL, 0, &tv); 13579 } 13580 /* 13581 * The draft (v3) calls for us to use SEQ_GEQ, but that 13582 * causes issues when we are just going app limited. Let's 13583 * instead use SEQ_GT <or> where it's equal but more data 13584 * is outstanding. 13585 */ 13586 if ((SEQ_GT(high_seq, rack->r_ctl.roundends)) || 13587 ((high_seq == rack->r_ctl.roundends) && 13588 SEQ_GT(tp->snd_max, tp->snd_una))) { 13589 rack->r_ctl.current_round++; 13590 rack->r_ctl.roundends = tp->snd_max; 13591 if (CC_ALGO(tp)->newround != NULL) { 13592 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 13593 } 13594 } 13595 /* Setup our act_rcv_time */ 13596 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 13597 ts.tv_sec = ae->timestamp / 1000000000; 13598 ts.tv_nsec = ae->timestamp % 1000000000; 13599 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13600 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13601 } else { 13602 rack->r_ctl.act_rcv_time = *tv; 13603 } 13604 rack_process_to_cumack(tp, rack, ae->ack, cts, to); 13605 if (rack->rc_dsack_round_seen) { 13606 /* Is the dsack round over?
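* A DSACK round ends once the cumulative ACK reaches the sequence recorded in dsack_round_end.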
*/ 13607 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) { 13608 /* Yes it is */ 13609 rack->rc_dsack_round_seen = 0; 13610 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 13611 } 13612 } 13613 } 13614 } 13615 /* And lets be sure to commit the rtt measurements for this ack */ 13616 tcp_rack_xmit_timer_commit(rack, tp); 13617 #ifdef TCP_ACCOUNTING 13618 rdstc = get_cyclecount(); 13619 if (rdstc > ts_val) { 13620 counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val)); 13621 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13622 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 13623 if (ae->ack_val_set == ACK_CUMACK) 13624 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 13625 } 13626 } 13627 #endif 13628 } 13629 #ifdef TCP_ACCOUNTING 13630 ts_val = get_cyclecount(); 13631 #endif 13632 /* Tend to any collapsed window */ 13633 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) { 13634 /* The peer collapsed the window */ 13635 rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__); 13636 } else if (rack->rc_has_collapsed) 13637 rack_un_collapse_window(rack, __LINE__); 13638 if ((rack->r_collapse_point_valid) && 13639 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point))) 13640 rack->r_collapse_point_valid = 0; 13641 acked_amount = acked = (high_seq - tp->snd_una); 13642 if (acked) { 13643 /* 13644 * Clear the probe not answered flag 13645 * since cum-ack moved forward. 13646 */ 13647 rack->probe_not_answered = 0; 13648 if (rack->sack_attack_disable == 0) 13649 rack_do_decay(rack); 13650 if (acked >= segsiz) { 13651 /* 13652 * You only get credit for 13653 * MSS and greater (and you get extra 13654 * credit for larger cum-ack moves). 13655 */ 13656 int ac; 13657 13658 ac = acked / segsiz; 13659 rack->r_ctl.ack_count += ac; 13660 counter_u64_add(rack_ack_total, ac); 13661 } 13662 if (rack->r_ctl.ack_count > 0xfff00000) { 13663 /* 13664 * reduce the number to keep us under 13665 * a uint32_t. 13666 */ 13667 rack->r_ctl.ack_count /= 2; 13668 rack->r_ctl.sack_count /= 2; 13669 } 13670 if (tp->t_flags & TF_NEEDSYN) { 13671 /* 13672 * T/TCP: Connection was half-synchronized, and our SYN has 13673 * been ACK'd (so connection is now fully synchronized). Go 13674 * to non-starred state, increment snd_una for ACK of SYN, 13675 * and check if we can do window scaling. 13676 */ 13677 tp->t_flags &= ~TF_NEEDSYN; 13678 tp->snd_una++; 13679 acked_amount = acked = (high_seq - tp->snd_una); 13680 } 13681 if (acked > sbavail(&so->so_snd)) 13682 acked_amount = sbavail(&so->so_snd); 13683 #ifdef NETFLIX_EXP_DETECTION 13684 /* 13685 * We only care on a cum-ack move if we are in a sack-disabled 13686 * state. We have already added in to the ack_count, and we never 13687 * would disable on a cum-ack move, so we only care to do the 13688 * detection if it may "undo" it, i.e. we were in disabled already. 
13689 */ 13690 if (rack->sack_attack_disable) 13691 rack_do_detection(tp, rack, acked_amount, segsiz); 13692 #endif 13693 if (IN_FASTRECOVERY(tp->t_flags) && 13694 (rack->rack_no_prr == 0)) 13695 rack_update_prr(tp, rack, acked_amount, high_seq); 13696 if (IN_RECOVERY(tp->t_flags)) { 13697 if (SEQ_LT(high_seq, tp->snd_recover) && 13698 (SEQ_LT(high_seq, tp->snd_max))) { 13699 tcp_rack_partialack(tp); 13700 } else { 13701 rack_post_recovery(tp, high_seq); 13702 recovery = 1; 13703 } 13704 } 13705 /* Handle the rack-log-ack part (sendmap) */ 13706 if ((sbused(&so->so_snd) == 0) && 13707 (acked > acked_amount) && 13708 (tp->t_state >= TCPS_FIN_WAIT_1) && 13709 (tp->t_flags & TF_SENTFIN)) { 13710 /* 13711 * We must be sure our fin 13712 * was sent and acked (we can be 13713 * in FIN_WAIT_1 without having 13714 * sent the fin). 13715 */ 13716 ourfinisacked = 1; 13717 /* 13718 * Lets make sure snd_una is updated 13719 * since most likely acked_amount = 0 (it 13720 * should be). 13721 */ 13722 tp->snd_una = high_seq; 13723 } 13724 /* Did we make a RTO error? */ 13725 if ((tp->t_flags & TF_PREVVALID) && 13726 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 13727 tp->t_flags &= ~TF_PREVVALID; 13728 if (tp->t_rxtshift == 1 && 13729 (int)(ticks - tp->t_badrxtwin) < 0) 13730 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__); 13731 } 13732 /* Handle the data in the socket buffer */ 13733 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 13734 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 13735 if (acked_amount > 0) { 13736 struct mbuf *mfree; 13737 13738 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); 13739 SOCKBUF_LOCK(&so->so_snd); 13740 mfree = sbcut_locked(&so->so_snd, acked_amount); 13741 tp->snd_una = high_seq; 13742 /* Note we want to hold the sb lock through the sendmap adjust */ 13743 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 13744 /* Wake up the socket if we have room to write more */ 13745 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 13746 sowwakeup_locked(so); 13747 m_freem(mfree); 13748 } 13749 /* update progress */ 13750 tp->t_acktime = ticks; 13751 rack_log_progress_event(rack, tp, tp->t_acktime, 13752 PROGRESS_UPDATE, __LINE__); 13753 /* Clear out shifts and such */ 13754 tp->t_rxtshift = 0; 13755 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 13756 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 13757 rack->rc_tlp_in_progress = 0; 13758 rack->r_ctl.rc_tlp_cnt_out = 0; 13759 /* Send recover and snd_nxt must be dragged along */ 13760 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 13761 tp->snd_recover = tp->snd_una; 13762 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 13763 tp->snd_nxt = tp->snd_una; 13764 /* 13765 * If the RXT timer is running we want to 13766 * stop it, so we can restart a TLP (or new RXT). 
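* The cum-ack changed what is oldest outstanding, so a running RXT timer would be measuring from a stale point.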
13767 */ 13768 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 13769 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13770 #ifdef NETFLIX_HTTP_LOGGING 13771 tcp_http_check_for_comp(rack->rc_tp, high_seq); 13772 #endif 13773 tp->snd_wl2 = high_seq; 13774 tp->t_dupacks = 0; 13775 if (under_pacing && 13776 (rack->use_fixed_rate == 0) && 13777 (rack->in_probe_rtt == 0) && 13778 rack->rc_gp_dyn_mul && 13779 rack->rc_always_pace) { 13780 /* Check if we are dragging bottom */ 13781 rack_check_bottom_drag(tp, rack, so, acked); 13782 } 13783 if (tp->snd_una == tp->snd_max) { 13784 tp->t_flags &= ~TF_PREVVALID; 13785 rack->r_ctl.retran_during_recovery = 0; 13786 rack->r_ctl.dsack_byte_cnt = 0; 13787 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13788 if (rack->r_ctl.rc_went_idle_time == 0) 13789 rack->r_ctl.rc_went_idle_time = 1; 13790 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13791 if (sbavail(&tptosocket(tp)->so_snd) == 0) 13792 tp->t_acktime = 0; 13793 /* Set so we might enter persists... */ 13794 rack->r_wanted_output = 1; 13795 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13796 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 13797 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 13798 (sbavail(&so->so_snd) == 0) && 13799 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 13800 /* 13801 * The socket was gone and the 13802 * peer sent data (not now in the past), time to 13803 * reset him. 13804 */ 13805 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13806 /* tcp_close will kill the inp pre-log the Reset */ 13807 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13808 #ifdef TCP_ACCOUNTING 13809 rdstc = get_cyclecount(); 13810 if (rdstc > ts_val) { 13811 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13812 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13813 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13814 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13815 } 13816 } 13817 #endif 13818 m_freem(m); 13819 tp = tcp_close(tp); 13820 if (tp == NULL) { 13821 #ifdef TCP_ACCOUNTING 13822 sched_unpin(); 13823 #endif 13824 return (1); 13825 } 13826 /* 13827 * We would normally do drop-with-reset which would 13828 * send back a reset. We can't since we don't have 13829 * all the needed bits. Instead lets arrange for 13830 * a call to tcp_output(). That way since we 13831 * are in the closed state we will generate a reset. 13832 * 13833 * Note if tcp_accounting is on we don't unpin since 13834 * we do that after the goto label. 13835 */ 13836 goto send_out_a_rst; 13837 } 13838 if ((sbused(&so->so_snd) == 0) && 13839 (tp->t_state >= TCPS_FIN_WAIT_1) && 13840 (tp->t_flags & TF_SENTFIN)) { 13841 /* 13842 * If we can't receive any more data, then closing user can 13843 * proceed. Starting the timer is contrary to the 13844 * specification, but if we don't get a FIN we'll hang 13845 * forever. 13846 * 13847 */ 13848 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13849 soisdisconnected(so); 13850 tcp_timer_activate(tp, TT_2MSL, 13851 (tcp_fast_finwait2_recycle ? 13852 tcp_finwait2_timeout : 13853 TP_MAXIDLE(tp))); 13854 } 13855 if (ourfinisacked == 0) { 13856 /* 13857 * We don't change to fin-wait-2 if we have our fin acked 13858 * which means we are probably in TCPS_CLOSING. 
13859 */ 13860 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13861 } 13862 } 13863 } 13864 /* Wake up the socket if we have room to write more */ 13865 if (sbavail(&so->so_snd)) { 13866 rack->r_wanted_output = 1; 13867 if (ctf_progress_timeout_check(tp, true)) { 13868 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13869 tp, tick, PROGRESS_DROP, __LINE__); 13870 /* 13871 * We cheat here and don't send a RST, we should send one 13872 * when the pacer drops the connection. 13873 */ 13874 #ifdef TCP_ACCOUNTING 13875 rdstc = get_cyclecount(); 13876 if (rdstc > ts_val) { 13877 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13878 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13879 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13880 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13881 } 13882 } 13883 sched_unpin(); 13884 #endif 13885 (void)tcp_drop(tp, ETIMEDOUT); 13886 m_freem(m); 13887 return (1); 13888 } 13889 } 13890 if (ourfinisacked) { 13891 switch(tp->t_state) { 13892 case TCPS_CLOSING: 13893 #ifdef TCP_ACCOUNTING 13894 rdstc = get_cyclecount(); 13895 if (rdstc > ts_val) { 13896 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13897 (rdstc - ts_val)); 13898 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13899 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13900 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13901 } 13902 } 13903 sched_unpin(); 13904 #endif 13905 tcp_twstart(tp); 13906 m_freem(m); 13907 return (1); 13908 break; 13909 case TCPS_LAST_ACK: 13910 #ifdef TCP_ACCOUNTING 13911 rdstc = get_cyclecount(); 13912 if (rdstc > ts_val) { 13913 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13914 (rdstc - ts_val)); 13915 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13916 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13917 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13918 } 13919 } 13920 sched_unpin(); 13921 #endif 13922 tp = tcp_close(tp); 13923 ctf_do_drop(m, tp); 13924 return (1); 13925 break; 13926 case TCPS_FIN_WAIT_1: 13927 #ifdef TCP_ACCOUNTING 13928 rdstc = get_cyclecount(); 13929 if (rdstc > ts_val) { 13930 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13931 (rdstc - ts_val)); 13932 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13933 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13934 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13935 } 13936 } 13937 #endif 13938 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13939 soisdisconnected(so); 13940 tcp_timer_activate(tp, TT_2MSL, 13941 (tcp_fast_finwait2_recycle ? 13942 tcp_finwait2_timeout : 13943 TP_MAXIDLE(tp))); 13944 } 13945 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13946 break; 13947 default: 13948 break; 13949 } 13950 } 13951 if (rack->r_fast_output) { 13952 /* 13953 * We re doing fast output.. can we expand that? 
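* The data just acked may let the current fast-send block grow.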
13954 */ 13955 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 13956 } 13957 #ifdef TCP_ACCOUNTING 13958 rdstc = get_cyclecount(); 13959 if (rdstc > ts_val) { 13960 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13961 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13962 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13963 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13964 } 13965 } 13966 13967 } else if (win_up_req) { 13968 rdstc = get_cyclecount(); 13969 if (rdstc > ts_val) { 13970 counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val)); 13971 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13972 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 13973 } 13974 } 13975 #endif 13976 } 13977 /* Now is there a next packet, if so we are done */ 13978 m_freem(m); 13979 did_out = 0; 13980 if (nxt_pkt) { 13981 #ifdef TCP_ACCOUNTING 13982 sched_unpin(); 13983 #endif 13984 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 13985 return (0); 13986 } 13987 rack_handle_might_revert(tp, rack); 13988 ctf_calc_rwin(so, tp); 13989 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 13990 send_out_a_rst: 13991 if (tcp_output(tp) < 0) { 13992 #ifdef TCP_ACCOUNTING 13993 sched_unpin(); 13994 #endif 13995 return (1); 13996 } 13997 did_out = 1; 13998 } 13999 rack_free_trim(rack); 14000 #ifdef TCP_ACCOUNTING 14001 sched_unpin(); 14002 #endif 14003 rack_timer_audit(tp, rack, &so->so_snd); 14004 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 14005 return (0); 14006 } 14007 14008 14009 static int 14010 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 14011 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 14012 int32_t nxt_pkt, struct timeval *tv) 14013 { 14014 struct inpcb *inp = tptoinpcb(tp); 14015 #ifdef TCP_ACCOUNTING 14016 uint64_t ts_val; 14017 #endif 14018 int32_t thflags, retval, did_out = 0; 14019 int32_t way_out = 0; 14020 /* 14021 * cts - is the current time from tv (caller gets ts) in microseconds. 14022 * ms_cts - is the current time from tv in milliseconds. 14023 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 14024 */ 14025 uint32_t cts, us_cts, ms_cts; 14026 uint32_t tiwin, high_seq; 14027 struct timespec ts; 14028 struct tcpopt to; 14029 struct tcp_rack *rack; 14030 struct rack_sendmap *rsm; 14031 int32_t prev_state = 0; 14032 #ifdef TCP_ACCOUNTING 14033 int ack_val_set = 0xf; 14034 #endif 14035 int nsegs; 14036 14037 NET_EPOCH_ASSERT(); 14038 INP_WLOCK_ASSERT(inp); 14039 14040 /* 14041 * tv passed from common code is from either M_TSTMP_LRO or 14042 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 14043 */ 14044 rack = (struct tcp_rack *)tp->t_fb_ptr; 14045 if (m->m_flags & M_ACKCMP) { 14046 /* 14047 * All compressed ack's are ack's by definition so 14048 * remove any ack required flag and then do the processing. 14049 */ 14050 rack->rc_ack_required = 0; 14051 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 14052 } 14053 if (m->m_flags & M_ACKCMP) { 14054 panic("Impossible reach m has ackcmp? 
m:%p tp:%p", m, tp); 14055 } 14056 cts = tcp_tv_to_usectick(tv); 14057 ms_cts = tcp_tv_to_mssectick(tv); 14058 nsegs = m->m_pkthdr.lro_nsegs; 14059 counter_u64_add(rack_proc_non_comp_ack, 1); 14060 thflags = tcp_get_flags(th); 14061 #ifdef TCP_ACCOUNTING 14062 sched_pin(); 14063 if (thflags & TH_ACK) 14064 ts_val = get_cyclecount(); 14065 #endif 14066 if ((m->m_flags & M_TSTMP) || 14067 (m->m_flags & M_TSTMP_LRO)) { 14068 mbuf_tstmp2timespec(m, &ts); 14069 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 14070 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 14071 } else 14072 rack->r_ctl.act_rcv_time = *tv; 14073 kern_prefetch(rack, &prev_state); 14074 prev_state = 0; 14075 /* 14076 * Unscale the window into a 32-bit value. For the SYN_SENT state 14077 * the scale is zero. 14078 */ 14079 tiwin = th->th_win << tp->snd_scale; 14080 #ifdef TCP_ACCOUNTING 14081 if (thflags & TH_ACK) { 14082 /* 14083 * We have a tradeoff here. We can either do what we are 14084 * doing i.e. pinning to this CPU and then doing the accounting 14085 * <or> we could do a critical enter, setup the rdtsc and cpu 14086 * as in below, and then validate we are on the same CPU on 14087 * exit. I have chosen not to do the critical enter since 14088 * that often will gain you a context switch, and instead lock 14089 * us (line above this if) to the same CPU with sched_pin(). This 14090 * means we may be context switched out for a higher priority 14091 * interrupt but we won't be moved to another CPU. 14092 * 14093 * If this occurs (which it won't very often since we most likely 14094 * are running this code in interrupt context and only a higher 14095 * priority will bump us ... clock?) we will falsely add in 14096 * to the time the interrupt processing time plus the ack processing 14097 * time. This is ok since it's a rare event. 14098 */ 14099 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, 14100 ctf_fixed_maxseg(tp)); 14101 } 14102 #endif 14103 /* 14104 * Parse options on any incoming segment. 14105 */ 14106 memset(&to, 0, sizeof(to)); 14107 tcp_dooptions(&to, (u_char *)(th + 1), 14108 (th->th_off << 2) - sizeof(struct tcphdr), 14109 (thflags & TH_SYN) ? TO_SYN : 0); 14110 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 14111 __func__)); 14112 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 14113 __func__)); 14114 14115 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 14116 (tp->t_flags & TF_GPUTINPROG)) { 14117 /* 14118 * We have a goodput in progress 14119 * and we have entered a late state. 14120 * Do we have enough data in the sb 14121 * to handle the GPUT request? 14122 */ 14123 uint32_t bytes; 14124 14125 bytes = tp->gput_ack - tp->gput_seq; 14126 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 14127 bytes += tp->gput_seq - tp->snd_una; 14128 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 14129 /* 14130 * There are not enough bytes in the socket 14131 * buffer that have been sent to cover this 14132 * measurement. Cancel it.
14133 */ 14134 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 14135 rack->r_ctl.rc_gp_srtt /*flex1*/, 14136 tp->gput_seq, 14137 0, 0, 18, __LINE__, NULL, 0); 14138 tp->t_flags &= ~TF_GPUTINPROG; 14139 } 14140 } 14141 high_seq = th->th_ack; 14142 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 14143 union tcp_log_stackspecific log; 14144 struct timeval ltv; 14145 #ifdef NETFLIX_HTTP_LOGGING 14146 struct http_sendfile_track *http_req; 14147 14148 if (SEQ_GT(th->th_ack, tp->snd_una)) { 14149 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 14150 } else { 14151 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 14152 } 14153 #endif 14154 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14155 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 14156 if (rack->rack_no_prr == 0) 14157 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 14158 else 14159 log.u_bbr.flex1 = 0; 14160 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 14161 log.u_bbr.use_lt_bw <<= 1; 14162 log.u_bbr.use_lt_bw |= rack->r_might_revert; 14163 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 14164 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14165 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 14166 log.u_bbr.flex3 = m->m_flags; 14167 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 14168 log.u_bbr.lost = thflags; 14169 log.u_bbr.pacing_gain = 0x1; 14170 #ifdef TCP_ACCOUNTING 14171 log.u_bbr.cwnd_gain = ack_val_set; 14172 #endif 14173 log.u_bbr.flex7 = 2; 14174 if (m->m_flags & M_TSTMP) { 14175 /* Record the hardware timestamp if present */ 14176 mbuf_tstmp2timespec(m, &ts); 14177 ltv.tv_sec = ts.tv_sec; 14178 ltv.tv_usec = ts.tv_nsec / 1000; 14179 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv); 14180 } else if (m->m_flags & M_TSTMP_LRO) { 14181 /* Record the LRO arrival timestamp */ 14182 mbuf_tstmp2timespec(m, &ts); 14183 ltv.tv_sec = ts.tv_sec; 14184 ltv.tv_usec = ts.tv_nsec / 1000; 14185 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv); 14186 } 14187 log.u_bbr.timeStamp = tcp_get_usecs(&ltv); 14188 /* Log the rcv time */ 14189 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 14190 #ifdef NETFLIX_HTTP_LOGGING 14191 log.u_bbr.applimited = tp->t_http_closed; 14192 log.u_bbr.applimited <<= 8; 14193 log.u_bbr.applimited |= tp->t_http_open; 14194 log.u_bbr.applimited <<= 8; 14195 log.u_bbr.applimited |= tp->t_http_req; 14196 if (http_req) { 14197 /* Copy out any client req info */ 14198 /* seconds */ 14199 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 14200 /* useconds */ 14201 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 14202 log.u_bbr.rttProp = http_req->timestamp; 14203 log.u_bbr.cur_del_rate = http_req->start; 14204 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 14205 log.u_bbr.flex8 |= 1; 14206 } else { 14207 log.u_bbr.flex8 |= 2; 14208 log.u_bbr.bw_inuse = http_req->end; 14209 } 14210 log.u_bbr.flex6 = http_req->start_seq; 14211 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 14212 log.u_bbr.flex8 |= 4; 14213 log.u_bbr.epoch = http_req->end_seq; 14214 } 14215 } 14216 #endif 14217 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 14218 tlen, &log, true, &ltv); 14219 } 14220 /* Remove ack required flag if set, we have one */ 14221 if (thflags & TH_ACK) 14222 rack->rc_ack_required = 0; 14223 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 14224 way_out = 4; 14225 retval = 0; 14226 m_freem(m); 14227 goto done_with_input; 14228 } 14229 /* 14230 * If a segment with the ACK-bit set arrives in the SYN-SENT state 14231 * check SEQ.ACK first as
described on page 66 of RFC 793, section 3.9. 14232 */ 14233 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 14234 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 14235 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 14236 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14237 #ifdef TCP_ACCOUNTING 14238 sched_unpin(); 14239 #endif 14240 return (1); 14241 } 14242 /* 14243 * If timestamps were negotiated during SYN/ACK and a 14244 * segment without a timestamp is received, silently drop 14245 * the segment, unless it is a RST segment or missing timestamps are 14246 * tolerated. 14247 * See section 3.2 of RFC 7323. 14248 */ 14249 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 14250 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 14251 way_out = 5; 14252 retval = 0; 14253 m_freem(m); 14254 goto done_with_input; 14255 } 14256 14257 /* 14258 * Segment received on connection. Reset idle time and keep-alive 14259 * timer. XXX: This should be done after segment validation to 14260 * ignore broken/spoofed segs. 14261 */ 14262 if (tp->t_idle_reduce && 14263 (tp->snd_max == tp->snd_una) && 14264 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 14265 counter_u64_add(rack_input_idle_reduces, 1); 14266 rack_cc_after_idle(rack, tp); 14267 } 14268 tp->t_rcvtime = ticks; 14269 #ifdef STATS 14270 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 14271 #endif 14272 if (tiwin > rack->r_ctl.rc_high_rwnd) 14273 rack->r_ctl.rc_high_rwnd = tiwin; 14274 /* 14275 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 14276 * this to occur after we've validated the segment. 14277 */ 14278 if (tcp_ecn_input_segment(tp, thflags, tlen, 14279 tcp_packets_this_ack(tp, th->th_ack), 14280 iptos)) 14281 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__); 14282 14283 /* 14284 * If echoed timestamp is later than the current time, fall back to 14285 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 14286 * were used when this connection was established. 14287 */ 14288 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 14289 to.to_tsecr -= tp->ts_offset; 14290 if (TSTMP_GT(to.to_tsecr, ms_cts)) 14291 to.to_tsecr = 0; 14292 } 14293 14294 /* 14295 * If its the first time in we need to take care of options and 14296 * verify we can do SACK for rack! 14297 */ 14298 if (rack->r_state == 0) { 14299 /* Should be init'd by rack_init() */ 14300 KASSERT(rack->rc_inp != NULL, 14301 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 14302 if (rack->rc_inp == NULL) { 14303 rack->rc_inp = inp; 14304 } 14305 14306 /* 14307 * Process options only when we get SYN/ACK back. The SYN 14308 * case for incoming connections is handled in tcp_syncache. 14309 * According to RFC1323 the window field in a SYN (i.e., a 14310 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 14311 * this is traditional behavior, may need to be cleaned up. 14312 */ 14313 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 14314 /* Handle parallel SYN for ECN */ 14315 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 14316 if ((to.to_flags & TOF_SCALE) && 14317 (tp->t_flags & TF_REQ_SCALE)) { 14318 tp->t_flags |= TF_RCVD_SCALE; 14319 tp->snd_scale = to.to_wscale; 14320 } else 14321 tp->t_flags &= ~TF_REQ_SCALE; 14322 /* 14323 * Initial send window. It will be updated with the 14324 * next incoming segment to the scaled value. 
14325 */ 14326 tp->snd_wnd = th->th_win; 14327 rack_validate_fo_sendwin_up(tp, rack); 14328 if ((to.to_flags & TOF_TS) && 14329 (tp->t_flags & TF_REQ_TSTMP)) { 14330 tp->t_flags |= TF_RCVD_TSTMP; 14331 tp->ts_recent = to.to_tsval; 14332 tp->ts_recent_age = cts; 14333 } else 14334 tp->t_flags &= ~TF_REQ_TSTMP; 14335 if (to.to_flags & TOF_MSS) { 14336 tcp_mss(tp, to.to_mss); 14337 } 14338 if ((tp->t_flags & TF_SACK_PERMIT) && 14339 (to.to_flags & TOF_SACKPERM) == 0) 14340 tp->t_flags &= ~TF_SACK_PERMIT; 14341 if (IS_FASTOPEN(tp->t_flags)) { 14342 if (to.to_flags & TOF_FASTOPEN) { 14343 uint16_t mss; 14344 14345 if (to.to_flags & TOF_MSS) 14346 mss = to.to_mss; 14347 else 14348 if ((inp->inp_vflag & INP_IPV6) != 0) 14349 mss = TCP6_MSS; 14350 else 14351 mss = TCP_MSS; 14352 tcp_fastopen_update_cache(tp, mss, 14353 to.to_tfo_len, to.to_tfo_cookie); 14354 } else 14355 tcp_fastopen_disable_path(tp); 14356 } 14357 } 14358 /* 14359 * At this point we are at the initial call. Here we decide 14360 * if we are doing RACK or not. We do this by seeing if 14361 * TF_SACK_PERMIT is set and the sack-not-required is clear. 14362 * The code now does do dup-ack counting so if you don't 14363 * switch back you won't get rack & TLP, but you will still 14364 * get this stack. 14365 */ 14366 14367 if ((rack_sack_not_required == 0) && 14368 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 14369 tcp_switch_back_to_default(tp); 14370 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 14371 tlen, iptos); 14372 #ifdef TCP_ACCOUNTING 14373 sched_unpin(); 14374 #endif 14375 return (1); 14376 } 14377 tcp_set_hpts(inp); 14378 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 14379 } 14380 if (thflags & TH_FIN) 14381 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 14382 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 14383 if ((rack->rc_gp_dyn_mul) && 14384 (rack->use_fixed_rate == 0) && 14385 (rack->rc_always_pace)) { 14386 /* Check in on probertt */ 14387 rack_check_probe_rtt(rack, us_cts); 14388 } 14389 rack_clear_rate_sample(rack); 14390 if ((rack->forced_ack) && 14391 ((tcp_get_flags(th) & TH_RST) == 0)) { 14392 rack_handle_probe_response(rack, tiwin, us_cts); 14393 } 14394 /* 14395 * This is the one exception case where we set the rack state 14396 * always. All other times (timers etc) we must have a rack-state 14397 * set (so we assure we have done the checks above for SACK). 14398 */ 14399 rack->r_ctl.rc_rcvtime = cts; 14400 if (rack->r_state != tp->t_state) 14401 rack_set_state(tp, rack); 14402 if (SEQ_GT(th->th_ack, tp->snd_una) && 14403 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 14404 kern_prefetch(rsm, &prev_state); 14405 prev_state = rack->r_state; 14406 retval = (*rack->r_substate) (m, th, so, 14407 tp, &to, drop_hdrlen, 14408 tlen, tiwin, thflags, nxt_pkt, iptos); 14409 if (retval == 0) { 14410 /* 14411 * If retval is 1 the tcb is unlocked and most likely the tp 14412 * is gone. 14413 */ 14414 INP_WLOCK_ASSERT(inp); 14415 if ((rack->rc_gp_dyn_mul) && 14416 (rack->rc_always_pace) && 14417 (rack->use_fixed_rate == 0) && 14418 rack->in_probe_rtt && 14419 (rack->r_ctl.rc_time_probertt_starts == 0)) { 14420 /* 14421 * If we are going for target, lets recheck before 14422 * we output. 14423 */ 14424 rack_check_probe_rtt(rack, us_cts); 14425 } 14426 if (rack->set_pacing_done_a_iw == 0) { 14427 /* How much has been acked? 
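* Once roughly ten full segments beyond the ISS have been cumulatively acked we consider the initial window done and size the pacing segments.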
*/ 14428 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 14429 /* We have enough to set in the pacing segment size */ 14430 rack->set_pacing_done_a_iw = 1; 14431 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14432 } 14433 } 14434 tcp_rack_xmit_timer_commit(rack, tp); 14435 #ifdef TCP_ACCOUNTING 14436 /* 14437 * If we set ack_val_set to what ack processing we are doing 14438 * we also want to track how many cycles we burned. Note 14439 * the bits after tcp_output we let be "free". This is because 14440 * we are also tracking the tcp_output times as well. Note the 14441 * use of 0xf here since we only have 11 counters (0 - 0xa) and 14442 * 0xf cannot be returned and is what we initialize it to, to 14443 * indicate we are not doing the tabulations. 14444 */ 14445 if (ack_val_set != 0xf) { 14446 uint64_t crtsc; 14447 14448 crtsc = get_cyclecount(); 14449 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 14450 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14451 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 14452 } 14453 } 14454 #endif 14455 if (nxt_pkt == 0) { 14456 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 14457 do_output_now: 14458 if (tcp_output(tp) < 0) 14459 return (1); 14460 did_out = 1; 14461 } 14462 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 14463 rack_free_trim(rack); 14464 } 14465 /* Update any rounds needed */ 14466 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 14467 union tcp_log_stackspecific log; 14468 struct timeval tv; 14469 14470 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14471 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14472 log.u_bbr.flex1 = high_seq; 14473 log.u_bbr.flex2 = rack->r_ctl.roundends; 14474 log.u_bbr.flex3 = rack->r_ctl.current_round; 14475 log.u_bbr.rttProp = (uint64_t)CC_ALGO(tp)->newround; 14476 log.u_bbr.flex8 = 9; 14477 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 14478 0, &log, false, NULL, NULL, 0, &tv); 14479 } 14480 /* 14481 * The draft (v3) calls for us to use SEQ_GEQ, but that 14482 * causes issues when we are just going app limited. Let's 14483 * instead use SEQ_GT <or> where it's equal but more data 14484 * is outstanding. 14485 */ 14486 if ((SEQ_GT(tp->snd_una, rack->r_ctl.roundends)) || 14487 ((tp->snd_una == rack->r_ctl.roundends) && SEQ_GT(tp->snd_max, tp->snd_una))) { 14488 rack->r_ctl.current_round++; 14489 rack->r_ctl.roundends = tp->snd_max; 14490 if (CC_ALGO(tp)->newround != NULL) { 14491 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 14492 } 14493 } 14494 if ((nxt_pkt == 0) && 14495 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 14496 (SEQ_GT(tp->snd_max, tp->snd_una) || 14497 (tp->t_flags & TF_DELACK) || 14498 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 14499 (tp->t_state <= TCPS_CLOSING)))) { 14500 /* We could not send (probably in the hpts but stopped the timer earlier)?
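* Make sure some timer is armed below so that outstanding data, a pending delayed ACK, or enabled keep-alives cannot leave the connection stalled.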
*/ 14501 if ((tp->snd_max == tp->snd_una) && 14502 ((tp->t_flags & TF_DELACK) == 0) && 14503 (tcp_in_hpts(rack->rc_inp)) && 14504 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 14505 /* keep alive not needed if we are hptsi output yet */ 14506 ; 14507 } else { 14508 int late = 0; 14509 if (tcp_in_hpts(inp)) { 14510 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14511 us_cts = tcp_get_usecs(NULL); 14512 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 14513 rack->r_early = 1; 14514 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 14515 } else 14516 late = 1; 14517 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 14518 } 14519 tcp_hpts_remove(inp); 14520 } 14521 if (late && (did_out == 0)) { 14522 /* 14523 * We are late in the sending 14524 * and we did not call the output 14525 * (this probably should not happen). 14526 */ 14527 goto do_output_now; 14528 } 14529 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 14530 } 14531 way_out = 1; 14532 } else if (nxt_pkt == 0) { 14533 /* Do we have the correct timer running? */ 14534 rack_timer_audit(tp, rack, &so->so_snd); 14535 way_out = 2; 14536 } 14537 done_with_input: 14538 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 14539 if (did_out) 14540 rack->r_wanted_output = 0; 14541 #ifdef TCP_ACCOUNTING 14542 } else { 14543 /* 14544 * Track the time (see above). 14545 */ 14546 if (ack_val_set != 0xf) { 14547 uint64_t crtsc; 14548 14549 crtsc = get_cyclecount(); 14550 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 14551 /* 14552 * Note we *DO NOT* increment the per-tcb counters since 14553 * in the else the TP may be gone!! 14554 */ 14555 } 14556 #endif 14557 } 14558 #ifdef TCP_ACCOUNTING 14559 sched_unpin(); 14560 #endif 14561 return (retval); 14562 } 14563 14564 void 14565 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 14566 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 14567 { 14568 struct timeval tv; 14569 14570 /* First lets see if we have old packets */ 14571 if (tp->t_in_pkt) { 14572 if (ctf_do_queued_segments(so, tp, 1)) { 14573 m_freem(m); 14574 return; 14575 } 14576 } 14577 if (m->m_flags & M_TSTMP_LRO) { 14578 mbuf_tstmp2timeval(m, &tv); 14579 } else { 14580 /* Should not be should we kassert instead? */ 14581 tcp_get_usecs(&tv); 14582 } 14583 if (rack_do_segment_nounlock(m, th, so, tp, 14584 drop_hdrlen, tlen, iptos, 0, &tv) == 0) { 14585 INP_WUNLOCK(tptoinpcb(tp)); 14586 } 14587 } 14588 14589 struct rack_sendmap * 14590 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 14591 { 14592 struct rack_sendmap *rsm = NULL; 14593 int32_t idx; 14594 uint32_t srtt = 0, thresh = 0, ts_low = 0; 14595 14596 /* Return the next guy to be re-transmitted */ 14597 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 14598 return (NULL); 14599 } 14600 if (tp->t_flags & TF_SENTFIN) { 14601 /* retran the end FIN? 
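* No, once a FIN has been sent we do not select anything for a RACK retransmission here.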
*/ 14602 return (NULL); 14603 } 14604 /* ok lets look at this one */ 14605 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 14606 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) { 14607 return (rsm); 14608 } 14609 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 14610 goto check_it; 14611 } 14612 rsm = rack_find_lowest_rsm(rack); 14613 if (rsm == NULL) { 14614 return (NULL); 14615 } 14616 check_it: 14617 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 14618 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 14619 /* 14620 * No sack so we automatically do the 3 strikes and 14621 * retransmit (no rack timer would be started). 14622 */ 14623 14624 return (rsm); 14625 } 14626 if (rsm->r_flags & RACK_ACKED) { 14627 return (NULL); 14628 } 14629 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 14630 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 14631 /* Its not yet ready */ 14632 return (NULL); 14633 } 14634 srtt = rack_grab_rtt(tp, rack); 14635 idx = rsm->r_rtr_cnt - 1; 14636 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 14637 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 14638 if ((tsused == ts_low) || 14639 (TSTMP_LT(tsused, ts_low))) { 14640 /* No time since sending */ 14641 return (NULL); 14642 } 14643 if ((tsused - ts_low) < thresh) { 14644 /* It has not been long enough yet */ 14645 return (NULL); 14646 } 14647 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 14648 ((rsm->r_flags & RACK_SACK_PASSED) && 14649 (rack->sack_attack_disable == 0))) { 14650 /* 14651 * We have passed the dup-ack threshold <or> 14652 * a SACK has indicated this is missing. 14653 * Note that if you are a declared attacker 14654 * it is only the dup-ack threshold that 14655 * will cause retransmits. 14656 */ 14657 /* log retransmit reason */ 14658 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 14659 rack->r_fast_output = 0; 14660 return (rsm); 14661 } 14662 return (NULL); 14663 } 14664 14665 static void 14666 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 14667 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 14668 int line, struct rack_sendmap *rsm, uint8_t quality) 14669 { 14670 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 14671 union tcp_log_stackspecific log; 14672 struct timeval tv; 14673 14674 memset(&log, 0, sizeof(log)); 14675 log.u_bbr.flex1 = slot; 14676 log.u_bbr.flex2 = len; 14677 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 14678 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 14679 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 14680 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 14681 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 14682 log.u_bbr.use_lt_bw <<= 1; 14683 log.u_bbr.use_lt_bw |= rack->r_late; 14684 log.u_bbr.use_lt_bw <<= 1; 14685 log.u_bbr.use_lt_bw |= rack->r_early; 14686 log.u_bbr.use_lt_bw <<= 1; 14687 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 14688 log.u_bbr.use_lt_bw <<= 1; 14689 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 14690 log.u_bbr.use_lt_bw <<= 1; 14691 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 14692 log.u_bbr.use_lt_bw <<= 1; 14693 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 14694 log.u_bbr.use_lt_bw <<= 1; 14695 log.u_bbr.use_lt_bw |= rack->gp_ready; 14696 log.u_bbr.pkt_epoch = line; 14697 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 14698 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 14699 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 14700 log.u_bbr.bw_inuse = bw_est; 14701 log.u_bbr.delRate = bw; 14702 if (rack->r_ctl.gp_bw == 0) 14703 log.u_bbr.cur_del_rate = 0; 
14704 else 14705 log.u_bbr.cur_del_rate = rack_get_bw(rack); 14706 log.u_bbr.rttProp = len_time; 14707 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 14708 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 14709 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 14710 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 14711 /* We are in slow start */ 14712 log.u_bbr.flex7 = 1; 14713 } else { 14714 /* we are on congestion avoidance */ 14715 log.u_bbr.flex7 = 0; 14716 } 14717 log.u_bbr.flex8 = method; 14718 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14719 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14720 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 14721 log.u_bbr.cwnd_gain <<= 1; 14722 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 14723 log.u_bbr.cwnd_gain <<= 1; 14724 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 14725 log.u_bbr.bbr_substate = quality; 14726 TCP_LOG_EVENTP(rack->rc_tp, NULL, 14727 &rack->rc_inp->inp_socket->so_rcv, 14728 &rack->rc_inp->inp_socket->so_snd, 14729 BBR_LOG_HPTSI_CALC, 0, 14730 0, &log, false, &tv); 14731 } 14732 } 14733 14734 static uint32_t 14735 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 14736 { 14737 uint32_t new_tso, user_max; 14738 14739 user_max = rack->rc_user_set_max_segs * mss; 14740 if (rack->rc_force_max_seg) { 14741 return (user_max); 14742 } 14743 if (rack->use_fixed_rate && 14744 ((rack->r_ctl.crte == NULL) || 14745 (bw != rack->r_ctl.crte->rate))) { 14746 /* Use the user mss since we are not exactly matched */ 14747 return (user_max); 14748 } 14749 new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 14750 if (new_tso > user_max) 14751 new_tso = user_max; 14752 return (new_tso); 14753 } 14754 14755 static int32_t 14756 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 14757 { 14758 uint64_t lentim, fill_bw; 14759 14760 /* Lets first see if we are full, if so continue with normal rate */ 14761 rack->r_via_fill_cw = 0; 14762 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 14763 return (slot); 14764 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 14765 return (slot); 14766 if (rack->r_ctl.rc_last_us_rtt == 0) 14767 return (slot); 14768 if (rack->rc_pace_fill_if_rttin_range && 14769 (rack->r_ctl.rc_last_us_rtt >= 14770 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 14771 /* The rtt is huge, N * smallest, lets not fill */ 14772 return (slot); 14773 } 14774 /* 14775 * first lets calculate the b/w based on the last us-rtt 14776 * and the sndwnd. 14777 */ 14778 fill_bw = rack->r_ctl.cwnd_to_use; 14779 /* Take the rwnd if its smaller */ 14780 if (fill_bw > rack->rc_tp->snd_wnd) 14781 fill_bw = rack->rc_tp->snd_wnd; 14782 if (rack->r_fill_less_agg) { 14783 /* 14784 * Now take away the inflight (this will reduce our 14785 * aggressiveness and yeah, if we get that much out in 1RTT 14786 * we will have had acks come back and still be behind). 
14787 */ 14788 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14789 } 14790 /* Now lets make it into a b/w */ 14791 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 14792 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 14793 /* We are below the min b/w */ 14794 if (non_paced) 14795 *rate_wanted = fill_bw; 14796 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 14797 return (slot); 14798 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) 14799 fill_bw = rack->r_ctl.bw_rate_cap; 14800 rack->r_via_fill_cw = 1; 14801 if (rack->r_rack_hw_rate_caps && 14802 (rack->r_ctl.crte != NULL)) { 14803 uint64_t high_rate; 14804 14805 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 14806 if (fill_bw > high_rate) { 14807 /* We are capping bw at the highest rate table entry */ 14808 if (*rate_wanted > high_rate) { 14809 /* The original rate was also capped */ 14810 rack->r_via_fill_cw = 0; 14811 } 14812 rack_log_hdwr_pacing(rack, 14813 fill_bw, high_rate, __LINE__, 14814 0, 3); 14815 fill_bw = high_rate; 14816 if (capped) 14817 *capped = 1; 14818 } 14819 } else if ((rack->r_ctl.crte == NULL) && 14820 (rack->rack_hdrw_pacing == 0) && 14821 (rack->rack_hdw_pace_ena) && 14822 rack->r_rack_hw_rate_caps && 14823 (rack->rack_attempt_hdwr_pace == 0) && 14824 (rack->rc_inp->inp_route.ro_nh != NULL) && 14825 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 14826 /* 14827 * Ok we may have a first attempt that is greater than our top rate 14828 * lets check. 14829 */ 14830 uint64_t high_rate; 14831 14832 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 14833 if (high_rate) { 14834 if (fill_bw > high_rate) { 14835 fill_bw = high_rate; 14836 if (capped) 14837 *capped = 1; 14838 } 14839 } 14840 } 14841 /* 14842 * Ok fill_bw holds our mythical b/w to fill the cwnd 14843 * in an rtt, what does that equate to time-wise? 14844 */ 14845 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 14846 lentim /= fill_bw; 14847 *rate_wanted = fill_bw; 14848 if (non_paced || (lentim < slot)) { 14849 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 14850 0, lentim, 12, __LINE__, NULL, 0); 14851 return ((int32_t)lentim); 14852 } else 14853 return (slot); 14854 } 14855 14856 static int32_t 14857 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 14858 { 14859 uint64_t srtt; 14860 int32_t slot = 0; 14861 int can_start_hw_pacing = 1; 14862 int err; 14863 14864 if (rack->rc_always_pace == 0) { 14865 /* 14866 * We use the most optimistic possible cwnd/srtt for 14867 * sending calculations. This will make our 14868 * calculation anticipate getting more through 14869 * quicker than possible. But that's ok; we don't want 14870 * the peer to have a gap in data sending. 14871 */ 14872 uint64_t cwnd, tr_perms = 0; 14873 int32_t reduce = 0; 14874 14875 old_method: 14876 /* 14877 * We keep no precise pacing with the old method; 14878 * instead we use the pacer to mitigate bursts.
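* As a rough example, a 100000 byte cwnd with a 10000 usec srtt gives tr_perms of 10000 bytes per msec, so a 30000 byte send maps to a 3 msec slot (before the optional reduction below).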
14879 */ 14880 if (rack->r_ctl.rc_rack_min_rtt) 14881 srtt = rack->r_ctl.rc_rack_min_rtt; 14882 else 14883 srtt = max(tp->t_srtt, 1); 14884 if (rack->r_ctl.rc_rack_largest_cwnd) 14885 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 14886 else 14887 cwnd = rack->r_ctl.cwnd_to_use; 14888 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 14889 tr_perms = (cwnd * 1000) / srtt; 14890 if (tr_perms == 0) { 14891 tr_perms = ctf_fixed_maxseg(tp); 14892 } 14893 /* 14894 * Calculate how long this will take to drain, if 14895 * the calculation comes out to zero, thats ok we 14896 * will use send_a_lot to possibly spin around for 14897 * more increasing tot_len_this_send to the point 14898 * that its going to require a pace, or we hit the 14899 * cwnd. Which in that case we are just waiting for 14900 * a ACK. 14901 */ 14902 slot = len / tr_perms; 14903 /* Now do we reduce the time so we don't run dry? */ 14904 if (slot && rack_slot_reduction) { 14905 reduce = (slot / rack_slot_reduction); 14906 if (reduce < slot) { 14907 slot -= reduce; 14908 } else 14909 slot = 0; 14910 } 14911 slot *= HPTS_USEC_IN_MSEC; 14912 if (rack->rc_pace_to_cwnd) { 14913 uint64_t rate_wanted = 0; 14914 14915 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 14916 rack->rc_ack_can_sendout_data = 1; 14917 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 14918 } else 14919 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 14920 } else { 14921 uint64_t bw_est, res, lentim, rate_wanted; 14922 uint32_t orig_val, segs, oh; 14923 int capped = 0; 14924 int prev_fill; 14925 14926 if ((rack->r_rr_config == 1) && rsm) { 14927 return (rack->r_ctl.rc_min_to); 14928 } 14929 if (rack->use_fixed_rate) { 14930 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 14931 } else if ((rack->r_ctl.init_rate == 0) && 14932 #ifdef NETFLIX_PEAKRATE 14933 (rack->rc_tp->t_maxpeakrate == 0) && 14934 #endif 14935 (rack->r_ctl.gp_bw == 0)) { 14936 /* no way to yet do an estimate */ 14937 bw_est = rate_wanted = 0; 14938 } else { 14939 bw_est = rack_get_bw(rack); 14940 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 14941 } 14942 if ((bw_est == 0) || (rate_wanted == 0) || 14943 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 14944 /* 14945 * No way yet to make a b/w estimate or 14946 * our raise is set incorrectly. 14947 */ 14948 goto old_method; 14949 } 14950 /* We need to account for all the overheads */ 14951 segs = (len + segsiz - 1) / segsiz; 14952 /* 14953 * We need the diff between 1514 bytes (e-mtu with e-hdr) 14954 * and how much data we put in each packet. Yes this 14955 * means we may be off if we are larger than 1500 bytes 14956 * or smaller. But this just makes us more conservative. 14957 */ 14958 if (rack_hw_rate_min && 14959 (bw_est < rack_hw_rate_min)) 14960 can_start_hw_pacing = 0; 14961 if (ETHERNET_SEGMENT_SIZE > segsiz) 14962 oh = ETHERNET_SEGMENT_SIZE - segsiz; 14963 else 14964 oh = 0; 14965 segs *= oh; 14966 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 14967 res = lentim / rate_wanted; 14968 slot = (uint32_t)res; 14969 orig_val = rack->r_ctl.rc_pace_max_segs; 14970 if (rack->r_ctl.crte == NULL) { 14971 /* 14972 * Only do this if we are not hardware pacing 14973 * since if we are doing hw-pacing below we will 14974 * set make a call after setting up or changing 14975 * the rate. 
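 *
 * For reference, a sketch of how the slot above is arrived at
 * (illustrative figures only): sending len = 14,480 bytes with
 * segsiz = 1448 gives segs = 10; at 1514 - 1448 = 66 bytes of
 * ethernet overhead per segment that is 660 extra bytes, so
 * lentim = 15,140 * HPTS_USEC_IN_SEC. With rate_wanted at
 * 12,500,000 bytes/sec (100 Mbit/s) the slot comes out to about
 * 15,140,000,000 / 12,500,000 = 1,211 usec.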
14976 */ 14977 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 14978 } else if (rack->rc_inp->inp_snd_tag == NULL) { 14979 /* 14980 * We lost our rate somehow, this can happen 14981 * if the interface changed underneath us. 14982 */ 14983 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 14984 rack->r_ctl.crte = NULL; 14985 /* Lets re-allow attempting to setup pacing */ 14986 rack->rack_hdrw_pacing = 0; 14987 rack->rack_attempt_hdwr_pace = 0; 14988 rack_log_hdwr_pacing(rack, 14989 rate_wanted, bw_est, __LINE__, 14990 0, 6); 14991 } 14992 /* Did we change the TSO size, if so log it */ 14993 if (rack->r_ctl.rc_pace_max_segs != orig_val) 14994 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0); 14995 prev_fill = rack->r_via_fill_cw; 14996 if ((rack->rc_pace_to_cwnd) && 14997 (capped == 0) && 14998 (rack->use_fixed_rate == 0) && 14999 (rack->in_probe_rtt == 0) && 15000 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 15001 /* 15002 * We want to pace at our rate *or* faster to 15003 * fill the cwnd to the max if its not full. 15004 */ 15005 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 15006 } 15007 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 15008 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 15009 if ((rack->rack_hdw_pace_ena) && 15010 (can_start_hw_pacing > 0) && 15011 (rack->rack_hdrw_pacing == 0) && 15012 (rack->rack_attempt_hdwr_pace == 0)) { 15013 /* 15014 * Lets attempt to turn on hardware pacing 15015 * if we can. 15016 */ 15017 rack->rack_attempt_hdwr_pace = 1; 15018 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 15019 rack->rc_inp->inp_route.ro_nh->nh_ifp, 15020 rate_wanted, 15021 RS_PACING_GEQ, 15022 &err, &rack->r_ctl.crte_prev_rate); 15023 if (rack->r_ctl.crte) { 15024 rack->rack_hdrw_pacing = 1; 15025 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz, 15026 0, rack->r_ctl.crte, 15027 NULL); 15028 rack_log_hdwr_pacing(rack, 15029 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 15030 err, 0); 15031 rack->r_ctl.last_hw_bw_req = rate_wanted; 15032 } else { 15033 counter_u64_add(rack_hw_pace_init_fail, 1); 15034 } 15035 } else if (rack->rack_hdrw_pacing && 15036 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 15037 /* Do we need to adjust our rate? */ 15038 const struct tcp_hwrate_limit_table *nrte; 15039 15040 if (rack->r_up_only && 15041 (rate_wanted < rack->r_ctl.crte->rate)) { 15042 /** 15043 * We have four possible states here 15044 * having to do with the previous time 15045 * and this time. 15046 * previous | this-time 15047 * A) 0 | 0 -- fill_cw not in the picture 15048 * B) 1 | 0 -- we were doing a fill-cw but now are not 15049 * C) 1 | 1 -- all rates from fill_cw 15050 * D) 0 | 1 -- we were doing non-fill and now we are filling 15051 * 15052 * For case A, C and D we don't allow a drop. But for 15053 * case B where we now our on our steady rate we do 15054 * allow a drop. 15055 * 15056 */ 15057 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 15058 goto done_w_hdwr; 15059 } 15060 if ((rate_wanted > rack->r_ctl.crte->rate) || 15061 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 15062 if (rack_hw_rate_to_low && 15063 (bw_est < rack_hw_rate_to_low)) { 15064 /* 15065 * The pacing rate is too low for hardware, but 15066 * do allow hardware pacing to be restarted. 
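 *
 * For instance (threshold purely illustrative), if
 * rack_hw_rate_to_low were set to the equivalent of 2 Mbit/s and
 * bw_est sinks to 1 Mbit/s, the code below logs the event,
 * releases the crte and clears rack_hdrw_pacing and
 * rack_attempt_hdwr_pace, so a later pass through here may try
 * hardware pacing again once the estimate recovers.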
15067 */ 15068 rack_log_hdwr_pacing(rack, 15069 bw_est, rack->r_ctl.crte->rate, __LINE__, 15070 0, 5); 15071 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 15072 rack->r_ctl.crte = NULL; 15073 rack->rack_attempt_hdwr_pace = 0; 15074 rack->rack_hdrw_pacing = 0; 15075 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 15076 goto done_w_hdwr; 15077 } 15078 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 15079 rack->rc_tp, 15080 rack->rc_inp->inp_route.ro_nh->nh_ifp, 15081 rate_wanted, 15082 RS_PACING_GEQ, 15083 &err, &rack->r_ctl.crte_prev_rate); 15084 if (nrte == NULL) { 15085 /* Lost the rate */ 15086 rack->rack_hdrw_pacing = 0; 15087 rack->r_ctl.crte = NULL; 15088 rack_log_hdwr_pacing(rack, 15089 rate_wanted, 0, __LINE__, 15090 err, 1); 15091 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 15092 counter_u64_add(rack_hw_pace_lost, 1); 15093 } else if (nrte != rack->r_ctl.crte) { 15094 rack->r_ctl.crte = nrte; 15095 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, 15096 segsiz, 0, 15097 rack->r_ctl.crte, 15098 NULL); 15099 rack_log_hdwr_pacing(rack, 15100 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 15101 err, 2); 15102 rack->r_ctl.last_hw_bw_req = rate_wanted; 15103 } 15104 } else { 15105 /* We just need to adjust the segment size */ 15106 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 15107 rack_log_hdwr_pacing(rack, 15108 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 15109 0, 4); 15110 rack->r_ctl.last_hw_bw_req = rate_wanted; 15111 } 15112 } 15113 } 15114 if ((rack->r_ctl.crte != NULL) && 15115 (rack->r_ctl.crte->rate == rate_wanted)) { 15116 /* 15117 * We need to add a extra if the rates 15118 * are exactly matched. The idea is 15119 * we want the software to make sure the 15120 * queue is empty before adding more, this 15121 * gives us N MSS extra pace times where 15122 * N is our sysctl 15123 */ 15124 slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots); 15125 } 15126 done_w_hdwr: 15127 if (rack_limit_time_with_srtt && 15128 (rack->use_fixed_rate == 0) && 15129 #ifdef NETFLIX_PEAKRATE 15130 (rack->rc_tp->t_maxpeakrate == 0) && 15131 #endif 15132 (rack->rack_hdrw_pacing == 0)) { 15133 /* 15134 * Sanity check, we do not allow the pacing delay 15135 * to be longer than the SRTT of the path. If it is 15136 * a slow path, then adding a packet should increase 15137 * the RTT and compensate for this i.e. the srtt will 15138 * be greater so the allowed pacing time will be greater. 15139 * 15140 * Note this restriction is not for where a peak rate 15141 * is set, we are doing fixed pacing or hardware pacing. 15142 */ 15143 if (rack->rc_tp->t_srtt) 15144 srtt = rack->rc_tp->t_srtt; 15145 else 15146 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 15147 if (srtt < (uint64_t)slot) { 15148 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 15149 slot = srtt; 15150 } 15151 } 15152 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 15153 } 15154 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 15155 /* 15156 * If this rate is seeing enobufs when it 15157 * goes to send then either the nic is out 15158 * of gas or we are mis-estimating the time 15159 * somehow and not letting the queue empty 15160 * completely. Lets add to the pacing time. 
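 *
 * A small worked example of the boost (the tunable values here are
 * illustrative, not the defaults): if the hardware rate entry
 * spaces packets crte->time_between = 50 usec apart and
 * rack_enobuf_hw_boost_mult is 3, the raw boost is 150 usec; it is
 * then clamped into [rack_enobuf_hw_min, rack_enobuf_hw_max], say
 * [10, 5000] usec, and simply added onto the slot.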
15161 */ 15162 int hw_boost_delay; 15163 15164 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 15165 if (hw_boost_delay > rack_enobuf_hw_max) 15166 hw_boost_delay = rack_enobuf_hw_max; 15167 else if (hw_boost_delay < rack_enobuf_hw_min) 15168 hw_boost_delay = rack_enobuf_hw_min; 15169 slot += hw_boost_delay; 15170 } 15171 return (slot); 15172 } 15173 15174 static void 15175 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 15176 tcp_seq startseq, uint32_t sb_offset) 15177 { 15178 struct rack_sendmap *my_rsm = NULL; 15179 struct rack_sendmap fe; 15180 15181 if (tp->t_state < TCPS_ESTABLISHED) { 15182 /* 15183 * We don't start any measurements if we are 15184 * not at least established. 15185 */ 15186 return; 15187 } 15188 if (tp->t_state >= TCPS_FIN_WAIT_1) { 15189 /* 15190 * We will get no more data into the SB 15191 * this means we need to have the data available 15192 * before we start a measurement. 15193 */ 15194 15195 if (sbavail(&tptosocket(tp)->so_snd) < 15196 max(rc_init_window(rack), 15197 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 15198 /* Nope not enough data */ 15199 return; 15200 } 15201 } 15202 tp->t_flags |= TF_GPUTINPROG; 15203 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 15204 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 15205 tp->gput_seq = startseq; 15206 rack->app_limited_needs_set = 0; 15207 if (rack->in_probe_rtt) 15208 rack->measure_saw_probe_rtt = 1; 15209 else if ((rack->measure_saw_probe_rtt) && 15210 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 15211 rack->measure_saw_probe_rtt = 0; 15212 if (rack->rc_gp_filled) 15213 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 15214 else { 15215 /* Special case initial measurement */ 15216 struct timeval tv; 15217 15218 tp->gput_ts = tcp_get_usecs(&tv); 15219 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 15220 } 15221 /* 15222 * We take a guess out into the future, 15223 * if we have no measurement and no 15224 * initial rate, we measure the first 15225 * initial-windows worth of data to 15226 * speed up getting some GP measurement and 15227 * thus start pacing. 15228 */ 15229 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 15230 rack->app_limited_needs_set = 1; 15231 tp->gput_ack = startseq + max(rc_init_window(rack), 15232 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 15233 rack_log_pacing_delay_calc(rack, 15234 tp->gput_seq, 15235 tp->gput_ack, 15236 0, 15237 tp->gput_ts, 15238 rack->r_ctl.rc_app_limited_cnt, 15239 9, 15240 __LINE__, NULL, 0); 15241 return; 15242 } 15243 if (sb_offset) { 15244 /* 15245 * We are out somewhere in the sb 15246 * can we use the already outstanding data? 15247 */ 15248 if (rack->r_ctl.rc_app_limited_cnt == 0) { 15249 /* 15250 * Yes first one is good and in this case 15251 * the tp->gput_ts is correctly set based on 15252 * the last ack that arrived (no need to 15253 * set things up when an ack comes in). 15254 */ 15255 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 15256 if ((my_rsm == NULL) || 15257 (my_rsm->r_rtr_cnt != 1)) { 15258 /* retransmission? */ 15259 goto use_latest; 15260 } 15261 } else { 15262 if (rack->r_ctl.rc_first_appl == NULL) { 15263 /* 15264 * If rc_first_appl is NULL 15265 * then the cnt should be 0. 15266 * This is probably an error, maybe 15267 * a KASSERT would be approprate. 
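 *
 * A sketch of what such an assertion might look like (not enabled
 * here, condition and wording only illustrative):
 *   KASSERT(rack->r_ctl.rc_first_appl != NULL,
 *       ("app_limited_cnt %u but rc_first_appl is NULL",
 *        rack->r_ctl.rc_app_limited_cnt));
 * For now we simply fall back to use_latest below.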
15268 */ 15269 goto use_latest; 15270 } 15271 /* 15272 * If we have a marker pointer to the last one that is 15273 * app limited we can use that, but we need to set 15274 * things up so that when it gets ack'ed we record 15275 * the ack time (if its not already acked). 15276 */ 15277 rack->app_limited_needs_set = 1; 15278 /* 15279 * We want to get to the rsm that is either 15280 * next with space i.e. over 1 MSS or the one 15281 * after that (after the app-limited). 15282 */ 15283 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 15284 rack->r_ctl.rc_first_appl); 15285 if (my_rsm) { 15286 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 15287 /* Have to use the next one */ 15288 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 15289 my_rsm); 15290 else { 15291 /* Use after the first MSS of it is acked */ 15292 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 15293 goto start_set; 15294 } 15295 } 15296 if ((my_rsm == NULL) || 15297 (my_rsm->r_rtr_cnt != 1)) { 15298 /* 15299 * Either its a retransmit or 15300 * the last is the app-limited one. 15301 */ 15302 goto use_latest; 15303 } 15304 } 15305 tp->gput_seq = my_rsm->r_start; 15306 start_set: 15307 if (my_rsm->r_flags & RACK_ACKED) { 15308 /* 15309 * This one has been acked use the arrival ack time 15310 */ 15311 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 15312 rack->app_limited_needs_set = 0; 15313 } 15314 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 15315 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 15316 rack_log_pacing_delay_calc(rack, 15317 tp->gput_seq, 15318 tp->gput_ack, 15319 (uint64_t)my_rsm, 15320 tp->gput_ts, 15321 rack->r_ctl.rc_app_limited_cnt, 15322 9, 15323 __LINE__, NULL, 0); 15324 return; 15325 } 15326 15327 use_latest: 15328 /* 15329 * We don't know how long we may have been 15330 * idle or if this is the first-send. Lets 15331 * setup the flag so we will trim off 15332 * the first ack'd data so we get a true 15333 * measurement. 15334 */ 15335 rack->app_limited_needs_set = 1; 15336 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 15337 /* Find this guy so we can pull the send time */ 15338 fe.r_start = startseq; 15339 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 15340 if (my_rsm) { 15341 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 15342 if (my_rsm->r_flags & RACK_ACKED) { 15343 /* 15344 * Unlikely since its probably what was 15345 * just transmitted (but I am paranoid). 15346 */ 15347 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 15348 rack->app_limited_needs_set = 0; 15349 } 15350 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 15351 /* This also is unlikely */ 15352 tp->gput_seq = my_rsm->r_start; 15353 } 15354 } else { 15355 /* 15356 * TSNH unless we have some send-map limit, 15357 * and even at that it should not be hitting 15358 * that limit (we should have stopped sending). 
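 *
 * In that unlikely case the fallback below simply stamps
 * rc_gp_output_ts with the current microuptime() so the goodput
 * measurement can still be armed. (TSNH above is the usual
 * shorthand for "this should not happen".)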
15359 */ 15360 struct timeval tv; 15361 15362 microuptime(&tv); 15363 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 15364 } 15365 rack_log_pacing_delay_calc(rack, 15366 tp->gput_seq, 15367 tp->gput_ack, 15368 (uint64_t)my_rsm, 15369 tp->gput_ts, 15370 rack->r_ctl.rc_app_limited_cnt, 15371 9, __LINE__, NULL, 0); 15372 } 15373 15374 static inline uint32_t 15375 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 15376 uint32_t avail, int32_t sb_offset) 15377 { 15378 uint32_t len; 15379 uint32_t sendwin; 15380 15381 if (tp->snd_wnd > cwnd_to_use) 15382 sendwin = cwnd_to_use; 15383 else 15384 sendwin = tp->snd_wnd; 15385 if (ctf_outstanding(tp) >= tp->snd_wnd) { 15386 /* We never want to go over our peers rcv-window */ 15387 len = 0; 15388 } else { 15389 uint32_t flight; 15390 15391 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 15392 if (flight >= sendwin) { 15393 /* 15394 * We have in flight what we are allowed by cwnd (if 15395 * it was rwnd blocking it would have hit above out 15396 * >= tp->snd_wnd). 15397 */ 15398 return (0); 15399 } 15400 len = sendwin - flight; 15401 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 15402 /* We would send too much (beyond the rwnd) */ 15403 len = tp->snd_wnd - ctf_outstanding(tp); 15404 } 15405 if ((len + sb_offset) > avail) { 15406 /* 15407 * We don't have that much in the SB, how much is 15408 * there? 15409 */ 15410 len = avail - sb_offset; 15411 } 15412 } 15413 return (len); 15414 } 15415 15416 static void 15417 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 15418 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 15419 int rsm_is_null, int optlen, int line, uint16_t mode) 15420 { 15421 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15422 union tcp_log_stackspecific log; 15423 struct timeval tv; 15424 15425 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15426 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 15427 log.u_bbr.flex1 = error; 15428 log.u_bbr.flex2 = flags; 15429 log.u_bbr.flex3 = rsm_is_null; 15430 log.u_bbr.flex4 = ipoptlen; 15431 log.u_bbr.flex5 = tp->rcv_numsacks; 15432 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15433 log.u_bbr.flex7 = optlen; 15434 log.u_bbr.flex8 = rack->r_fsb_inited; 15435 log.u_bbr.applimited = rack->r_fast_output; 15436 log.u_bbr.bw_inuse = rack_get_bw(rack); 15437 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15438 log.u_bbr.cwnd_gain = mode; 15439 log.u_bbr.pkts_out = orig_len; 15440 log.u_bbr.lt_epoch = len; 15441 log.u_bbr.delivered = line; 15442 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15443 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15444 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 15445 len, &log, false, NULL, NULL, 0, &tv); 15446 } 15447 } 15448 15449 15450 static struct mbuf * 15451 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 15452 struct rack_fast_send_blk *fsb, 15453 int32_t seglimit, int32_t segsize, int hw_tls) 15454 { 15455 #ifdef KERN_TLS 15456 struct ktls_session *tls, *ntls; 15457 #ifdef INVARIANTS 15458 struct mbuf *start; 15459 #endif 15460 #endif 15461 struct mbuf *m, *n, **np, *smb; 15462 struct mbuf *top; 15463 int32_t off, soff; 15464 int32_t len = *plen; 15465 int32_t fragsize; 15466 int32_t len_cp = 0; 15467 uint32_t mlen, frags; 15468 15469 soff = off = the_off; 15470 smb = m = the_m; 15471 np = ⊤ 15472 top = NULL; 15473 #ifdef KERN_TLS 15474 if (hw_tls && (m->m_flags & M_EXTPG)) 15475 tls = m->m_epg_tls; 
15476 else 15477 tls = NULL; 15478 #ifdef INVARIANTS 15479 start = m; 15480 #endif 15481 #endif 15482 while (len > 0) { 15483 if (m == NULL) { 15484 *plen = len_cp; 15485 break; 15486 } 15487 #ifdef KERN_TLS 15488 if (hw_tls) { 15489 if (m->m_flags & M_EXTPG) 15490 ntls = m->m_epg_tls; 15491 else 15492 ntls = NULL; 15493 15494 /* 15495 * Avoid mixing TLS records with handshake 15496 * data or TLS records from different 15497 * sessions. 15498 */ 15499 if (tls != ntls) { 15500 MPASS(m != start); 15501 *plen = len_cp; 15502 break; 15503 } 15504 } 15505 #endif 15506 mlen = min(len, m->m_len - off); 15507 if (seglimit) { 15508 /* 15509 * For M_EXTPG mbufs, add 3 segments 15510 * + 1 in case we are crossing page boundaries 15511 * + 2 in case the TLS hdr/trailer are used 15512 * It is cheaper to just add the segments 15513 * than it is to take the cache miss to look 15514 * at the mbuf ext_pgs state in detail. 15515 */ 15516 if (m->m_flags & M_EXTPG) { 15517 fragsize = min(segsize, PAGE_SIZE); 15518 frags = 3; 15519 } else { 15520 fragsize = segsize; 15521 frags = 0; 15522 } 15523 15524 /* Break if we really can't fit anymore. */ 15525 if ((frags + 1) >= seglimit) { 15526 *plen = len_cp; 15527 break; 15528 } 15529 15530 /* 15531 * Reduce size if you can't copy the whole 15532 * mbuf. If we can't copy the whole mbuf, also 15533 * adjust len so the loop will end after this 15534 * mbuf. 15535 */ 15536 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 15537 mlen = (seglimit - frags - 1) * fragsize; 15538 len = mlen; 15539 *plen = len_cp + len; 15540 } 15541 frags += howmany(mlen, fragsize); 15542 if (frags == 0) 15543 frags++; 15544 seglimit -= frags; 15545 KASSERT(seglimit > 0, 15546 ("%s: seglimit went too low", __func__)); 15547 } 15548 n = m_get(M_NOWAIT, m->m_type); 15549 *np = n; 15550 if (n == NULL) 15551 goto nospace; 15552 n->m_len = mlen; 15553 soff += mlen; 15554 len_cp += n->m_len; 15555 if (m->m_flags & (M_EXT|M_EXTPG)) { 15556 n->m_data = m->m_data + off; 15557 mb_dupcl(n, m); 15558 } else { 15559 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 15560 (u_int)n->m_len); 15561 } 15562 len -= n->m_len; 15563 off = 0; 15564 m = m->m_next; 15565 np = &n->m_next; 15566 if (len || (soff == smb->m_len)) { 15567 /* 15568 * We have more so we move forward or 15569 * we have consumed the entire mbuf and 15570 * len has fell to 0. 15571 */ 15572 soff = 0; 15573 smb = m; 15574 } 15575 15576 } 15577 if (fsb != NULL) { 15578 fsb->m = smb; 15579 fsb->off = soff; 15580 if (smb) { 15581 /* 15582 * Save off the size of the mbuf. We do 15583 * this so that we can recognize when it 15584 * has been trimmed by sbcut() as acks 15585 * come in. 15586 */ 15587 fsb->o_m_len = smb->m_len; 15588 } else { 15589 /* 15590 * This is the case where the next mbuf went to NULL. This 15591 * means with this copy we have sent everything in the sb. 15592 * In theory we could clear the fast_output flag, but lets 15593 * not since its possible that we could get more added 15594 * and acks that call the extend function which would let 15595 * us send more. 15596 */ 15597 fsb->o_m_len = 0; 15598 } 15599 } 15600 return (top); 15601 nospace: 15602 if (top) 15603 m_freem(top); 15604 return (NULL); 15605 15606 } 15607 15608 /* 15609 * This is a copy of m_copym(), taking the TSO segment size/limit 15610 * constraints into account, and advancing the sndptr as it goes. 
15611 */ 15612 static struct mbuf * 15613 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 15614 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 15615 { 15616 struct mbuf *m, *n; 15617 int32_t soff; 15618 15619 soff = rack->r_ctl.fsb.off; 15620 m = rack->r_ctl.fsb.m; 15621 if (rack->r_ctl.fsb.o_m_len > m->m_len) { 15622 /* 15623 * The mbuf had the front of it chopped off by an ack 15624 * we need to adjust the soff/off by that difference. 15625 */ 15626 uint32_t delta; 15627 15628 delta = rack->r_ctl.fsb.o_m_len - m->m_len; 15629 soff -= delta; 15630 } else if (rack->r_ctl.fsb.o_m_len < m->m_len) { 15631 /* 15632 * The mbuf was expanded probably by 15633 * a m_compress. Just update o_m_len. 15634 */ 15635 rack->r_ctl.fsb.o_m_len = m->m_len; 15636 } 15637 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 15638 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 15639 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 15640 __FUNCTION__, 15641 rack, *plen, m, m->m_len)); 15642 /* Save off the right location before we copy and advance */ 15643 *s_soff = soff; 15644 *s_mb = rack->r_ctl.fsb.m; 15645 n = rack_fo_base_copym(m, soff, plen, 15646 &rack->r_ctl.fsb, 15647 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 15648 return (n); 15649 } 15650 15651 static int 15652 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, 15653 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp) 15654 { 15655 /* 15656 * Enter the fast retransmit path. We are given that a sched_pin is 15657 * in place (if accounting is compliled in) and the cycle count taken 15658 * at the entry is in the ts_val. The concept her is that the rsm 15659 * now holds the mbuf offsets and such so we can directly transmit 15660 * without a lot of overhead, the len field is already set for 15661 * us to prohibit us from sending too much (usually its 1MSS). 
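 *
 * On success we return 0 after the pacing timer has been started;
 * every failure path returns -1, which the caller presumably
 * treats as "fall back to the normal output path" (that part is
 * inferred, not spelled out here). Note also that if acks have
 * trimmed the front of the rsm's leading mbuf since it was mapped,
 * rack_adjust_orig_mlen() below re-syncs the saved offset before
 * the copy is made.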
15662 */ 15663 struct ip *ip = NULL; 15664 struct udphdr *udp = NULL; 15665 struct tcphdr *th = NULL; 15666 struct mbuf *m = NULL; 15667 struct inpcb *inp; 15668 uint8_t *cpto; 15669 struct tcp_log_buffer *lgb; 15670 #ifdef TCP_ACCOUNTING 15671 uint64_t crtsc; 15672 int cnt_thru = 1; 15673 #endif 15674 struct tcpopt to; 15675 u_char opt[TCP_MAXOLEN]; 15676 uint32_t hdrlen, optlen; 15677 int32_t slot, segsiz, max_val, tso = 0, error, ulen = 0; 15678 uint16_t flags; 15679 uint32_t if_hw_tsomaxsegcount = 0, startseq; 15680 uint32_t if_hw_tsomaxsegsize; 15681 15682 #ifdef INET6 15683 struct ip6_hdr *ip6 = NULL; 15684 15685 if (rack->r_is_v6) { 15686 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 15687 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 15688 } else 15689 #endif /* INET6 */ 15690 { 15691 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 15692 hdrlen = sizeof(struct tcpiphdr); 15693 } 15694 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 15695 goto failed; 15696 } 15697 if (doing_tlp) { 15698 /* Its a TLP add the flag, it may already be there but be sure */ 15699 rsm->r_flags |= RACK_TLP; 15700 } else { 15701 /* If it was a TLP it is not not on this retransmit */ 15702 rsm->r_flags &= ~RACK_TLP; 15703 } 15704 startseq = rsm->r_start; 15705 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15706 inp = rack->rc_inp; 15707 to.to_flags = 0; 15708 flags = tcp_outflags[tp->t_state]; 15709 if (flags & (TH_SYN|TH_RST)) { 15710 goto failed; 15711 } 15712 if (rsm->r_flags & RACK_HAS_FIN) { 15713 /* We can't send a FIN here */ 15714 goto failed; 15715 } 15716 if (flags & TH_FIN) { 15717 /* We never send a FIN */ 15718 flags &= ~TH_FIN; 15719 } 15720 if (tp->t_flags & TF_RCVD_TSTMP) { 15721 to.to_tsval = ms_cts + tp->ts_offset; 15722 to.to_tsecr = tp->ts_recent; 15723 to.to_flags = TOF_TS; 15724 } 15725 optlen = tcp_addoptions(&to, opt); 15726 hdrlen += optlen; 15727 udp = rack->r_ctl.fsb.udp; 15728 if (udp) 15729 hdrlen += sizeof(struct udphdr); 15730 if (rack->r_ctl.rc_pace_max_segs) 15731 max_val = rack->r_ctl.rc_pace_max_segs; 15732 else if (rack->rc_user_set_max_segs) 15733 max_val = rack->rc_user_set_max_segs * segsiz; 15734 else 15735 max_val = len; 15736 if ((tp->t_flags & TF_TSO) && 15737 V_tcp_do_tso && 15738 (len > segsiz) && 15739 (tp->t_port == 0)) 15740 tso = 1; 15741 #ifdef INET6 15742 if (MHLEN < hdrlen + max_linkhdr) 15743 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 15744 else 15745 #endif 15746 m = m_gethdr(M_NOWAIT, MT_DATA); 15747 if (m == NULL) 15748 goto failed; 15749 m->m_data += max_linkhdr; 15750 m->m_len = hdrlen; 15751 th = rack->r_ctl.fsb.th; 15752 /* Establish the len to send */ 15753 if (len > max_val) 15754 len = max_val; 15755 if ((tso) && (len + optlen > tp->t_maxseg)) { 15756 uint32_t if_hw_tsomax; 15757 int32_t max_len; 15758 15759 /* extract TSO information */ 15760 if_hw_tsomax = tp->t_tsomax; 15761 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 15762 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 15763 /* 15764 * Check if we should limit by maximum payload 15765 * length: 15766 */ 15767 if (if_hw_tsomax != 0) { 15768 /* compute maximum TSO length */ 15769 max_len = (if_hw_tsomax - hdrlen - 15770 max_linkhdr); 15771 if (max_len <= 0) { 15772 goto failed; 15773 } else if (len > max_len) { 15774 len = max_len; 15775 } 15776 } 15777 if (len <= segsiz) { 15778 /* 15779 * In case there are too many small fragments don't 15780 * use TSO: 15781 */ 15782 tso = 0; 15783 } 15784 } else { 15785 tso = 0; 15786 } 15787 if ((tso == 0) && (len > 
segsiz)) 15788 len = segsiz; 15789 if ((len == 0) || 15790 (len <= MHLEN - hdrlen - max_linkhdr)) { 15791 goto failed; 15792 } 15793 th->th_seq = htonl(rsm->r_start); 15794 th->th_ack = htonl(tp->rcv_nxt); 15795 /* 15796 * The PUSH bit should only be applied 15797 * if the full retransmission is made. If 15798 * we are sending less than this is the 15799 * left hand edge and should not have 15800 * the PUSH bit. 15801 */ 15802 if ((rsm->r_flags & RACK_HAD_PUSH) && 15803 (len == (rsm->r_end - rsm->r_start))) 15804 flags |= TH_PUSH; 15805 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 15806 if (th->th_win == 0) { 15807 tp->t_sndzerowin++; 15808 tp->t_flags |= TF_RXWIN0SENT; 15809 } else 15810 tp->t_flags &= ~TF_RXWIN0SENT; 15811 if (rsm->r_flags & RACK_TLP) { 15812 /* 15813 * TLP should not count in retran count, but 15814 * in its own bin 15815 */ 15816 counter_u64_add(rack_tlp_retran, 1); 15817 counter_u64_add(rack_tlp_retran_bytes, len); 15818 } else { 15819 tp->t_sndrexmitpack++; 15820 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 15821 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 15822 } 15823 #ifdef STATS 15824 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 15825 len); 15826 #endif 15827 if (rsm->m == NULL) 15828 goto failed; 15829 if (rsm->orig_m_len != rsm->m->m_len) { 15830 /* Fix up the orig_m_len and possibly the mbuf offset */ 15831 rack_adjust_orig_mlen(rsm); 15832 } 15833 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 15834 if (len <= segsiz) { 15835 /* 15836 * Must have ran out of mbufs for the copy 15837 * shorten it to no longer need tso. Lets 15838 * not put on sendalot since we are low on 15839 * mbufs. 15840 */ 15841 tso = 0; 15842 } 15843 if ((m->m_next == NULL) || (len <= 0)){ 15844 goto failed; 15845 } 15846 if (udp) { 15847 if (rack->r_is_v6) 15848 ulen = hdrlen + len - sizeof(struct ip6_hdr); 15849 else 15850 ulen = hdrlen + len - sizeof(struct ip); 15851 udp->uh_ulen = htons(ulen); 15852 } 15853 m->m_pkthdr.rcvif = (struct ifnet *)0; 15854 if (TCPS_HAVERCVDSYN(tp->t_state) && 15855 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 15856 int ect = tcp_ecn_output_established(tp, &flags, len, true); 15857 if ((tp->t_state == TCPS_SYN_RECEIVED) && 15858 (tp->t_flags2 & TF2_ECN_SND_ECE)) 15859 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 15860 #ifdef INET6 15861 if (rack->r_is_v6) { 15862 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 15863 ip6->ip6_flow |= htonl(ect << 20); 15864 } 15865 else 15866 #endif 15867 { 15868 ip->ip_tos &= ~IPTOS_ECN_MASK; 15869 ip->ip_tos |= ect; 15870 } 15871 } 15872 tcp_set_flags(th, flags); 15873 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 15874 #ifdef INET6 15875 if (rack->r_is_v6) { 15876 if (tp->t_port) { 15877 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 15878 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15879 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 15880 th->th_sum = htons(0); 15881 UDPSTAT_INC(udps_opackets); 15882 } else { 15883 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 15884 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15885 th->th_sum = in6_cksum_pseudo(ip6, 15886 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 15887 0); 15888 } 15889 } 15890 #endif 15891 #if defined(INET6) && defined(INET) 15892 else 15893 #endif 15894 #ifdef INET 15895 { 15896 if (tp->t_port) { 15897 m->m_pkthdr.csum_flags = CSUM_UDP; 15898 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15899 udp->uh_sum = 
in_pseudo(ip->ip_src.s_addr, 15900 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 15901 th->th_sum = htons(0); 15902 UDPSTAT_INC(udps_opackets); 15903 } else { 15904 m->m_pkthdr.csum_flags = CSUM_TCP; 15905 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15906 th->th_sum = in_pseudo(ip->ip_src.s_addr, 15907 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 15908 IPPROTO_TCP + len + optlen)); 15909 } 15910 /* IP version must be set here for ipv4/ipv6 checking later */ 15911 KASSERT(ip->ip_v == IPVERSION, 15912 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 15913 } 15914 #endif 15915 if (tso) { 15916 KASSERT(len > tp->t_maxseg - optlen, 15917 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 15918 m->m_pkthdr.csum_flags |= CSUM_TSO; 15919 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 15920 } 15921 #ifdef INET6 15922 if (rack->r_is_v6) { 15923 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 15924 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 15925 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 15926 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15927 else 15928 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15929 } 15930 #endif 15931 #if defined(INET) && defined(INET6) 15932 else 15933 #endif 15934 #ifdef INET 15935 { 15936 ip->ip_len = htons(m->m_pkthdr.len); 15937 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 15938 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 15939 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15940 if (tp->t_port == 0 || len < V_tcp_minmss) { 15941 ip->ip_off |= htons(IP_DF); 15942 } 15943 } else { 15944 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15945 } 15946 } 15947 #endif 15948 /* Time to copy in our header */ 15949 cpto = mtod(m, uint8_t *); 15950 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 15951 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 15952 if (optlen) { 15953 bcopy(opt, th + 1, optlen); 15954 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 15955 } else { 15956 th->th_off = sizeof(struct tcphdr) >> 2; 15957 } 15958 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15959 union tcp_log_stackspecific log; 15960 15961 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 15962 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 15963 counter_u64_add(rack_collapsed_win_rxt, 1); 15964 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 15965 } 15966 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15967 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 15968 if (rack->rack_no_prr) 15969 log.u_bbr.flex1 = 0; 15970 else 15971 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15972 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 15973 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 15974 log.u_bbr.flex4 = max_val; 15975 log.u_bbr.flex5 = 0; 15976 /* Save off the early/late values */ 15977 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15978 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 15979 log.u_bbr.bw_inuse = rack_get_bw(rack); 15980 if (doing_tlp == 0) 15981 log.u_bbr.flex8 = 1; 15982 else 15983 log.u_bbr.flex8 = 2; 15984 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15985 log.u_bbr.flex7 = 55; 15986 log.u_bbr.pkts_out = tp->t_maxseg; 15987 log.u_bbr.timeStamp = cts; 15988 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15989 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 15990 log.u_bbr.delivered = 0; 15991 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15992 len, &log, false, NULL, NULL, 0, tv); 15993 } else 15994 
lgb = NULL; 15995 #ifdef INET6 15996 if (rack->r_is_v6) { 15997 error = ip6_output(m, NULL, 15998 &inp->inp_route6, 15999 0, NULL, NULL, inp); 16000 } 16001 #endif 16002 #if defined(INET) && defined(INET6) 16003 else 16004 #endif 16005 #ifdef INET 16006 { 16007 error = ip_output(m, NULL, 16008 &inp->inp_route, 16009 0, 0, inp); 16010 } 16011 #endif 16012 m = NULL; 16013 if (lgb) { 16014 lgb->tlb_errno = error; 16015 lgb = NULL; 16016 } 16017 if (error) { 16018 goto failed; 16019 } 16020 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 16021 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls); 16022 if (doing_tlp && (rack->fast_rsm_hack == 0)) { 16023 rack->rc_tlp_in_progress = 1; 16024 rack->r_ctl.rc_tlp_cnt_out++; 16025 } 16026 if (error == 0) { 16027 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 16028 if (doing_tlp) { 16029 rack->rc_last_sent_tlp_past_cumack = 0; 16030 rack->rc_last_sent_tlp_seq_valid = 1; 16031 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 16032 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 16033 } 16034 } 16035 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 16036 rack->forced_ack = 0; /* If we send something zap the FA flag */ 16037 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 16038 rack->r_ctl.retran_during_recovery += len; 16039 { 16040 int idx; 16041 16042 idx = (len / segsiz) + 3; 16043 if (idx >= TCP_MSS_ACCT_ATIMER) 16044 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 16045 else 16046 counter_u64_add(rack_out_size[idx], 1); 16047 } 16048 if (tp->t_rtttime == 0) { 16049 tp->t_rtttime = ticks; 16050 tp->t_rtseq = startseq; 16051 KMOD_TCPSTAT_INC(tcps_segstimed); 16052 } 16053 counter_u64_add(rack_fto_rsm_send, 1); 16054 if (error && (error == ENOBUFS)) { 16055 if (rack->r_ctl.crte != NULL) { 16056 rack_trace_point(rack, RACK_TP_HWENOBUF); 16057 } else 16058 rack_trace_point(rack, RACK_TP_ENOBUF); 16059 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 16060 if (rack->rc_enobuf < 0x7f) 16061 rack->rc_enobuf++; 16062 if (slot < (10 * HPTS_USEC_IN_MSEC)) 16063 slot = 10 * HPTS_USEC_IN_MSEC; 16064 } else 16065 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); 16066 if ((slot == 0) || 16067 (rack->rc_always_pace == 0) || 16068 (rack->r_rr_config == 1)) { 16069 /* 16070 * We have no pacing set or we 16071 * are using old-style rack or 16072 * we are overridden to use the old 1ms pacing. 16073 */ 16074 slot = rack->r_ctl.rc_min_to; 16075 } 16076 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 16077 #ifdef TCP_ACCOUNTING 16078 crtsc = get_cyclecount(); 16079 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16080 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 16081 } 16082 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 16083 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16084 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 16085 } 16086 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 16087 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16088 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 16089 } 16090 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz)); 16091 sched_unpin(); 16092 #endif 16093 return (0); 16094 failed: 16095 if (m) 16096 m_free(m); 16097 return (-1); 16098 } 16099 16100 static void 16101 rack_sndbuf_autoscale(struct tcp_rack *rack) 16102 { 16103 /* 16104 * Automatic sizing of send socket buffer. 
Often the send buffer 16105 * size is not optimally adjusted to the actual network conditions 16106 * at hand (delay bandwidth product). Setting the buffer size too 16107 * small limits throughput on links with high bandwidth and high 16108 * delay (eg. trans-continental/oceanic links). Setting the 16109 * buffer size too big consumes too much real kernel memory, 16110 * especially with many connections on busy servers. 16111 * 16112 * The criteria to step up the send buffer one notch are: 16113 * 1. receive window of remote host is larger than send buffer 16114 * (with a fudge factor of 5/4th); 16115 * 2. send buffer is filled to 7/8th with data (so we actually 16116 * have data to make use of it); 16117 * 3. send buffer fill has not hit maximal automatic size; 16118 * 4. our send window (slow start and cogestion controlled) is 16119 * larger than sent but unacknowledged data in send buffer. 16120 * 16121 * Note that the rack version moves things much faster since 16122 * we want to avoid hitting cache lines in the rack_fast_output() 16123 * path so this is called much less often and thus moves 16124 * the SB forward by a percentage. 16125 */ 16126 struct socket *so; 16127 struct tcpcb *tp; 16128 uint32_t sendwin, scaleup; 16129 16130 tp = rack->rc_tp; 16131 so = rack->rc_inp->inp_socket; 16132 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 16133 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 16134 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 16135 sbused(&so->so_snd) >= 16136 (so->so_snd.sb_hiwat / 8 * 7) && 16137 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 16138 sendwin >= (sbused(&so->so_snd) - 16139 (tp->snd_nxt - tp->snd_una))) { 16140 if (rack_autosndbuf_inc) 16141 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 16142 else 16143 scaleup = V_tcp_autosndbuf_inc; 16144 if (scaleup < V_tcp_autosndbuf_inc) 16145 scaleup = V_tcp_autosndbuf_inc; 16146 scaleup += so->so_snd.sb_hiwat; 16147 if (scaleup > V_tcp_autosndbuf_max) 16148 scaleup = V_tcp_autosndbuf_max; 16149 if (!sbreserve_locked(so, SO_SND, scaleup, curthread)) 16150 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 16151 } 16152 } 16153 } 16154 16155 static int 16156 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 16157 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) 16158 { 16159 /* 16160 * Enter to do fast output. We are given that the sched_pin is 16161 * in place (if accounting is compiled in) and the cycle count taken 16162 * at entry is in place in ts_val. The idea here is that 16163 * we know how many more bytes needs to be sent (presumably either 16164 * during pacing or to fill the cwnd and that was greater than 16165 * the max-burst). We have how much to send and all the info we 16166 * need to just send. 
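 *
 * An illustrative walk-through of the sizing below (numbers made
 * up): with fsb.left_to_send = 40,000 bytes, segsiz = 1448 and
 * rc_pace_max_segs = 14,480, len is clipped to 14,480 for this
 * pass. After the send left_to_send drops to 25,520, and once it
 * eventually falls below segsiz, r_fast_output is cleared and the
 * send buffer is given a chance to autoscale.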
16167 */ 16168 struct ip *ip = NULL; 16169 struct udphdr *udp = NULL; 16170 struct tcphdr *th = NULL; 16171 struct mbuf *m, *s_mb; 16172 struct inpcb *inp; 16173 uint8_t *cpto; 16174 struct tcp_log_buffer *lgb; 16175 #ifdef TCP_ACCOUNTING 16176 uint64_t crtsc; 16177 #endif 16178 struct tcpopt to; 16179 u_char opt[TCP_MAXOLEN]; 16180 uint32_t hdrlen, optlen; 16181 #ifdef TCP_ACCOUNTING 16182 int cnt_thru = 1; 16183 #endif 16184 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; 16185 uint16_t flags; 16186 uint32_t s_soff; 16187 uint32_t if_hw_tsomaxsegcount = 0, startseq; 16188 uint32_t if_hw_tsomaxsegsize; 16189 uint16_t add_flag = RACK_SENT_FP; 16190 #ifdef INET6 16191 struct ip6_hdr *ip6 = NULL; 16192 16193 if (rack->r_is_v6) { 16194 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 16195 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 16196 } else 16197 #endif /* INET6 */ 16198 { 16199 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 16200 hdrlen = sizeof(struct tcpiphdr); 16201 } 16202 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 16203 m = NULL; 16204 goto failed; 16205 } 16206 startseq = tp->snd_max; 16207 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16208 inp = rack->rc_inp; 16209 len = rack->r_ctl.fsb.left_to_send; 16210 to.to_flags = 0; 16211 flags = rack->r_ctl.fsb.tcp_flags; 16212 if (tp->t_flags & TF_RCVD_TSTMP) { 16213 to.to_tsval = ms_cts + tp->ts_offset; 16214 to.to_tsecr = tp->ts_recent; 16215 to.to_flags = TOF_TS; 16216 } 16217 optlen = tcp_addoptions(&to, opt); 16218 hdrlen += optlen; 16219 udp = rack->r_ctl.fsb.udp; 16220 if (udp) 16221 hdrlen += sizeof(struct udphdr); 16222 if (rack->r_ctl.rc_pace_max_segs) 16223 max_val = rack->r_ctl.rc_pace_max_segs; 16224 else if (rack->rc_user_set_max_segs) 16225 max_val = rack->rc_user_set_max_segs * segsiz; 16226 else 16227 max_val = len; 16228 if ((tp->t_flags & TF_TSO) && 16229 V_tcp_do_tso && 16230 (len > segsiz) && 16231 (tp->t_port == 0)) 16232 tso = 1; 16233 again: 16234 #ifdef INET6 16235 if (MHLEN < hdrlen + max_linkhdr) 16236 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 16237 else 16238 #endif 16239 m = m_gethdr(M_NOWAIT, MT_DATA); 16240 if (m == NULL) 16241 goto failed; 16242 m->m_data += max_linkhdr; 16243 m->m_len = hdrlen; 16244 th = rack->r_ctl.fsb.th; 16245 /* Establish the len to send */ 16246 if (len > max_val) 16247 len = max_val; 16248 if ((tso) && (len + optlen > tp->t_maxseg)) { 16249 uint32_t if_hw_tsomax; 16250 int32_t max_len; 16251 16252 /* extract TSO information */ 16253 if_hw_tsomax = tp->t_tsomax; 16254 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 16255 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 16256 /* 16257 * Check if we should limit by maximum payload 16258 * length: 16259 */ 16260 if (if_hw_tsomax != 0) { 16261 /* compute maximum TSO length */ 16262 max_len = (if_hw_tsomax - hdrlen - 16263 max_linkhdr); 16264 if (max_len <= 0) { 16265 goto failed; 16266 } else if (len > max_len) { 16267 len = max_len; 16268 } 16269 } 16270 if (len <= segsiz) { 16271 /* 16272 * In case there are too many small fragments don't 16273 * use TSO: 16274 */ 16275 tso = 0; 16276 } 16277 } else { 16278 tso = 0; 16279 } 16280 if ((tso == 0) && (len > segsiz)) 16281 len = segsiz; 16282 if ((len == 0) || 16283 (len <= MHLEN - hdrlen - max_linkhdr)) { 16284 goto failed; 16285 } 16286 sb_offset = tp->snd_max - tp->snd_una; 16287 th->th_seq = htonl(tp->snd_max); 16288 th->th_ack = htonl(tp->rcv_nxt); 16289 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> 
tp->rcv_scale)); 16290 if (th->th_win == 0) { 16291 tp->t_sndzerowin++; 16292 tp->t_flags |= TF_RXWIN0SENT; 16293 } else 16294 tp->t_flags &= ~TF_RXWIN0SENT; 16295 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 16296 KMOD_TCPSTAT_INC(tcps_sndpack); 16297 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 16298 #ifdef STATS 16299 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 16300 len); 16301 #endif 16302 if (rack->r_ctl.fsb.m == NULL) 16303 goto failed; 16304 16305 /* s_mb and s_soff are saved for rack_log_output */ 16306 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 16307 &s_mb, &s_soff); 16308 if (len <= segsiz) { 16309 /* 16310 * Must have ran out of mbufs for the copy 16311 * shorten it to no longer need tso. Lets 16312 * not put on sendalot since we are low on 16313 * mbufs. 16314 */ 16315 tso = 0; 16316 } 16317 if (rack->r_ctl.fsb.rfo_apply_push && 16318 (len == rack->r_ctl.fsb.left_to_send)) { 16319 flags |= TH_PUSH; 16320 add_flag |= RACK_HAD_PUSH; 16321 } 16322 if ((m->m_next == NULL) || (len <= 0)){ 16323 goto failed; 16324 } 16325 if (udp) { 16326 if (rack->r_is_v6) 16327 ulen = hdrlen + len - sizeof(struct ip6_hdr); 16328 else 16329 ulen = hdrlen + len - sizeof(struct ip); 16330 udp->uh_ulen = htons(ulen); 16331 } 16332 m->m_pkthdr.rcvif = (struct ifnet *)0; 16333 if (TCPS_HAVERCVDSYN(tp->t_state) && 16334 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 16335 int ect = tcp_ecn_output_established(tp, &flags, len, false); 16336 if ((tp->t_state == TCPS_SYN_RECEIVED) && 16337 (tp->t_flags2 & TF2_ECN_SND_ECE)) 16338 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 16339 #ifdef INET6 16340 if (rack->r_is_v6) { 16341 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 16342 ip6->ip6_flow |= htonl(ect << 20); 16343 } 16344 else 16345 #endif 16346 { 16347 ip->ip_tos &= ~IPTOS_ECN_MASK; 16348 ip->ip_tos |= ect; 16349 } 16350 } 16351 tcp_set_flags(th, flags); 16352 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 16353 #ifdef INET6 16354 if (rack->r_is_v6) { 16355 if (tp->t_port) { 16356 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 16357 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 16358 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 16359 th->th_sum = htons(0); 16360 UDPSTAT_INC(udps_opackets); 16361 } else { 16362 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 16363 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 16364 th->th_sum = in6_cksum_pseudo(ip6, 16365 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 16366 0); 16367 } 16368 } 16369 #endif 16370 #if defined(INET6) && defined(INET) 16371 else 16372 #endif 16373 #ifdef INET 16374 { 16375 if (tp->t_port) { 16376 m->m_pkthdr.csum_flags = CSUM_UDP; 16377 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 16378 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 16379 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 16380 th->th_sum = htons(0); 16381 UDPSTAT_INC(udps_opackets); 16382 } else { 16383 m->m_pkthdr.csum_flags = CSUM_TCP; 16384 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 16385 th->th_sum = in_pseudo(ip->ip_src.s_addr, 16386 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 16387 IPPROTO_TCP + len + optlen)); 16388 } 16389 /* IP version must be set here for ipv4/ipv6 checking later */ 16390 KASSERT(ip->ip_v == IPVERSION, 16391 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 16392 } 16393 #endif 16394 if (tso) { 16395 KASSERT(len > tp->t_maxseg - optlen, 16396 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 16397 m->m_pkthdr.csum_flags |= CSUM_TSO; 
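/*
 * tso_segsz is the payload carried per TSO segment; e.g. with a
 * t_maxseg of 1460 and 12 bytes of timestamp options each segment
 * would carry 1448 bytes (figures illustrative).
 */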
16398 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 16399 } 16400 #ifdef INET6 16401 if (rack->r_is_v6) { 16402 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 16403 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 16404 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 16405 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 16406 else 16407 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 16408 } 16409 #endif 16410 #if defined(INET) && defined(INET6) 16411 else 16412 #endif 16413 #ifdef INET 16414 { 16415 ip->ip_len = htons(m->m_pkthdr.len); 16416 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 16417 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 16418 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 16419 if (tp->t_port == 0 || len < V_tcp_minmss) { 16420 ip->ip_off |= htons(IP_DF); 16421 } 16422 } else { 16423 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 16424 } 16425 } 16426 #endif 16427 /* Time to copy in our header */ 16428 cpto = mtod(m, uint8_t *); 16429 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 16430 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 16431 if (optlen) { 16432 bcopy(opt, th + 1, optlen); 16433 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 16434 } else { 16435 th->th_off = sizeof(struct tcphdr) >> 2; 16436 } 16437 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 16438 union tcp_log_stackspecific log; 16439 16440 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16441 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 16442 if (rack->rack_no_prr) 16443 log.u_bbr.flex1 = 0; 16444 else 16445 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16446 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 16447 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 16448 log.u_bbr.flex4 = max_val; 16449 log.u_bbr.flex5 = 0; 16450 /* Save off the early/late values */ 16451 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 16452 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 16453 log.u_bbr.bw_inuse = rack_get_bw(rack); 16454 log.u_bbr.flex8 = 0; 16455 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 16456 log.u_bbr.flex7 = 44; 16457 log.u_bbr.pkts_out = tp->t_maxseg; 16458 log.u_bbr.timeStamp = cts; 16459 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16460 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 16461 log.u_bbr.delivered = 0; 16462 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 16463 len, &log, false, NULL, NULL, 0, tv); 16464 } else 16465 lgb = NULL; 16466 #ifdef INET6 16467 if (rack->r_is_v6) { 16468 error = ip6_output(m, NULL, 16469 &inp->inp_route6, 16470 0, NULL, NULL, inp); 16471 } 16472 #endif 16473 #if defined(INET) && defined(INET6) 16474 else 16475 #endif 16476 #ifdef INET 16477 { 16478 error = ip_output(m, NULL, 16479 &inp->inp_route, 16480 0, 0, inp); 16481 } 16482 #endif 16483 if (lgb) { 16484 lgb->tlb_errno = error; 16485 lgb = NULL; 16486 } 16487 if (error) { 16488 *send_err = error; 16489 m = NULL; 16490 goto failed; 16491 } 16492 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 16493 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls); 16494 m = NULL; 16495 if (tp->snd_una == tp->snd_max) { 16496 rack->r_ctl.rc_tlp_rxt_last_time = cts; 16497 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 16498 tp->t_acktime = ticks; 16499 } 16500 if (error == 0) 16501 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 16502 16503 rack->forced_ack = 0; /* If we send something zap the FA flag */ 16504 tot_len += len; 16505 if ((tp->t_flags & TF_GPUTINPROG) == 
0) 16506 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 16507 tp->snd_max += len; 16508 tp->snd_nxt = tp->snd_max; 16509 { 16510 int idx; 16511 16512 idx = (len / segsiz) + 3; 16513 if (idx >= TCP_MSS_ACCT_ATIMER) 16514 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 16515 else 16516 counter_u64_add(rack_out_size[idx], 1); 16517 } 16518 if (len <= rack->r_ctl.fsb.left_to_send) 16519 rack->r_ctl.fsb.left_to_send -= len; 16520 else 16521 rack->r_ctl.fsb.left_to_send = 0; 16522 if (rack->r_ctl.fsb.left_to_send < segsiz) { 16523 rack->r_fast_output = 0; 16524 rack->r_ctl.fsb.left_to_send = 0; 16525 /* At the end of fast_output scale up the sb */ 16526 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 16527 rack_sndbuf_autoscale(rack); 16528 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 16529 } 16530 if (tp->t_rtttime == 0) { 16531 tp->t_rtttime = ticks; 16532 tp->t_rtseq = startseq; 16533 KMOD_TCPSTAT_INC(tcps_segstimed); 16534 } 16535 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 16536 (max_val > len) && 16537 (tso == 0)) { 16538 max_val -= len; 16539 len = segsiz; 16540 th = rack->r_ctl.fsb.th; 16541 #ifdef TCP_ACCOUNTING 16542 cnt_thru++; 16543 #endif 16544 goto again; 16545 } 16546 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 16547 counter_u64_add(rack_fto_send, 1); 16548 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); 16549 rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 16550 #ifdef TCP_ACCOUNTING 16551 crtsc = get_cyclecount(); 16552 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16553 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 16554 } 16555 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 16556 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16557 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 16558 } 16559 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 16560 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16561 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 16562 } 16563 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz)); 16564 sched_unpin(); 16565 #endif 16566 return (0); 16567 failed: 16568 if (m) 16569 m_free(m); 16570 rack->r_fast_output = 0; 16571 return (-1); 16572 } 16573 16574 static struct rack_sendmap * 16575 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) 16576 { 16577 struct rack_sendmap *rsm = NULL; 16578 struct rack_sendmap fe; 16579 int thresh; 16580 16581 restart: 16582 fe.r_start = rack->r_ctl.last_collapse_point; 16583 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 16584 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { 16585 /* Nothing, strange turn off validity */ 16586 rack->r_collapse_point_valid = 0; 16587 return (NULL); 16588 } 16589 /* Can we send it yet? */ 16590 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) { 16591 /* 16592 * Receiver window has not grown enough for 16593 * the segment to be put on the wire. 16594 */ 16595 return (NULL); 16596 } 16597 if (rsm->r_flags & RACK_ACKED) { 16598 /* 16599 * It has been sacked, lets move to the 16600 * next one if possible. 16601 */ 16602 rack->r_ctl.last_collapse_point = rsm->r_end; 16603 /* Are we done? */ 16604 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 16605 rack->r_ctl.high_collapse_point)) { 16606 rack->r_collapse_point_valid = 0; 16607 return (NULL); 16608 } 16609 goto restart; 16610 } 16611 /* Now has it been long enough ? 
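 *
 * The yardstick below comes from rack_calc_thresh_rack() fed with
 * the current rtt estimate. As an illustration (values made up):
 * if the rsm was last sent 30,000 usec ago and the threshold works
 * out to 25,000 usec, we hand it back for (re)transmission;
 * otherwise we log it (code 7) and return NULL to wait a little
 * longer.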
*/ 16612 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts); 16613 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { 16614 rack_log_collapse(rack, rsm->r_start, 16615 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 16616 thresh, __LINE__, 6, rsm->r_flags, rsm); 16617 return (rsm); 16618 } 16619 /* Not enough time */ 16620 rack_log_collapse(rack, rsm->r_start, 16621 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 16622 thresh, __LINE__, 7, rsm->r_flags, rsm); 16623 return (NULL); 16624 } 16625 16626 static int 16627 rack_output(struct tcpcb *tp) 16628 { 16629 struct socket *so; 16630 uint32_t recwin; 16631 uint32_t sb_offset, s_moff = 0; 16632 int32_t len, error = 0; 16633 uint16_t flags; 16634 struct mbuf *m, *s_mb = NULL; 16635 struct mbuf *mb; 16636 uint32_t if_hw_tsomaxsegcount = 0; 16637 uint32_t if_hw_tsomaxsegsize; 16638 int32_t segsiz, minseg; 16639 long tot_len_this_send = 0; 16640 #ifdef INET 16641 struct ip *ip = NULL; 16642 #endif 16643 struct udphdr *udp = NULL; 16644 struct tcp_rack *rack; 16645 struct tcphdr *th; 16646 uint8_t pass = 0; 16647 uint8_t mark = 0; 16648 uint8_t wanted_cookie = 0; 16649 u_char opt[TCP_MAXOLEN]; 16650 unsigned ipoptlen, optlen, hdrlen, ulen=0; 16651 uint32_t rack_seq; 16652 16653 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16654 unsigned ipsec_optlen = 0; 16655 16656 #endif 16657 int32_t idle, sendalot; 16658 int32_t sub_from_prr = 0; 16659 volatile int32_t sack_rxmit; 16660 struct rack_sendmap *rsm = NULL; 16661 int32_t tso, mtu; 16662 struct tcpopt to; 16663 int32_t slot = 0; 16664 int32_t sup_rack = 0; 16665 uint32_t cts, ms_cts, delayed, early; 16666 uint16_t add_flag = RACK_SENT_SP; 16667 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 16668 uint8_t hpts_calling, doing_tlp = 0; 16669 uint32_t cwnd_to_use, pace_max_seg; 16670 int32_t do_a_prefetch = 0; 16671 int32_t prefetch_rsm = 0; 16672 int32_t orig_len = 0; 16673 struct timeval tv; 16674 int32_t prefetch_so_done = 0; 16675 struct tcp_log_buffer *lgb; 16676 struct inpcb *inp = tptoinpcb(tp); 16677 struct sockbuf *sb; 16678 uint64_t ts_val = 0; 16679 #ifdef TCP_ACCOUNTING 16680 uint64_t crtsc; 16681 #endif 16682 #ifdef INET6 16683 struct ip6_hdr *ip6 = NULL; 16684 int32_t isipv6; 16685 #endif 16686 bool hw_tls = false; 16687 16688 NET_EPOCH_ASSERT(); 16689 INP_WLOCK_ASSERT(inp); 16690 16691 /* setup and take the cache hits here */ 16692 rack = (struct tcp_rack *)tp->t_fb_ptr; 16693 #ifdef TCP_ACCOUNTING 16694 sched_pin(); 16695 ts_val = get_cyclecount(); 16696 #endif 16697 hpts_calling = inp->inp_hpts_calls; 16698 #ifdef TCP_OFFLOAD 16699 if (tp->t_flags & TF_TOE) { 16700 #ifdef TCP_ACCOUNTING 16701 sched_unpin(); 16702 #endif 16703 return (tcp_offload_output(tp)); 16704 } 16705 #endif 16706 /* 16707 * For TFO connections in SYN_RECEIVED, only allow the initial 16708 * SYN|ACK and those sent by the retransmit timer. 
16709 */ 16710 if (IS_FASTOPEN(tp->t_flags) && 16711 (tp->t_state == TCPS_SYN_RECEIVED) && 16712 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 16713 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 16714 #ifdef TCP_ACCOUNTING 16715 sched_unpin(); 16716 #endif 16717 return (0); 16718 } 16719 #ifdef INET6 16720 if (rack->r_state) { 16721 /* Use the cache line loaded if possible */ 16722 isipv6 = rack->r_is_v6; 16723 } else { 16724 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 16725 } 16726 #endif 16727 early = 0; 16728 cts = tcp_get_usecs(&tv); 16729 ms_cts = tcp_tv_to_mssectick(&tv); 16730 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 16731 tcp_in_hpts(rack->rc_inp)) { 16732 /* 16733 * We are on the hpts for some timer but not hptsi output. 16734 * Remove from the hpts unconditionally. 16735 */ 16736 rack_timer_cancel(tp, rack, cts, __LINE__); 16737 } 16738 /* Are we pacing and late? */ 16739 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16740 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 16741 /* We are delayed */ 16742 delayed = cts - rack->r_ctl.rc_last_output_to; 16743 } else { 16744 delayed = 0; 16745 } 16746 /* Do the timers, which may override the pacer */ 16747 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 16748 int retval; 16749 16750 retval = rack_process_timers(tp, rack, cts, hpts_calling, 16751 &doing_tlp); 16752 if (retval != 0) { 16753 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 16754 #ifdef TCP_ACCOUNTING 16755 sched_unpin(); 16756 #endif 16757 /* 16758 * If timers want tcp_drop(), then pass error out, 16759 * otherwise suppress it. 16760 */ 16761 return (retval < 0 ? retval : 0); 16762 } 16763 } 16764 if (rack->rc_in_persist) { 16765 if (tcp_in_hpts(rack->rc_inp) == 0) { 16766 /* Timer is not running */ 16767 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16768 } 16769 #ifdef TCP_ACCOUNTING 16770 sched_unpin(); 16771 #endif 16772 return (0); 16773 } 16774 if ((rack->rc_ack_required == 1) && 16775 (rack->r_timer_override == 0)){ 16776 /* A timeout occurred and no ack has arrived */ 16777 if (tcp_in_hpts(rack->rc_inp) == 0) { 16778 /* Timer is not running */ 16779 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16780 } 16781 #ifdef TCP_ACCOUNTING 16782 sched_unpin(); 16783 #endif 16784 return (0); 16785 } 16786 if ((rack->r_timer_override) || 16787 (rack->rc_ack_can_sendout_data) || 16788 (delayed) || 16789 (tp->t_state < TCPS_ESTABLISHED)) { 16790 rack->rc_ack_can_sendout_data = 0; 16791 if (tcp_in_hpts(rack->rc_inp)) 16792 tcp_hpts_remove(rack->rc_inp); 16793 } else if (tcp_in_hpts(rack->rc_inp)) { 16794 /* 16795 * On the hpts you can't pass even if ACKNOW is on, we will 16796 * when the hpts fires. 
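 * The connection is already queued on the HPTS pacer for a packet
 * output run, so rather than sending now we charge the elapsed
 * cycles to the SND_BLOCKED bucket (when TCP_ACCOUNTING is compiled
 * in), bump the TCP_MSS_ACCT_INPACE counter and return 0; the
 * deferred transmit happens when the pacer slot fires.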
16797 */ 16798 #ifdef TCP_ACCOUNTING 16799 crtsc = get_cyclecount(); 16800 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16801 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 16802 } 16803 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val)); 16804 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16805 tp->tcp_cnt_counters[SND_BLOCKED]++; 16806 } 16807 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1); 16808 sched_unpin(); 16809 #endif 16810 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 16811 return (0); 16812 } 16813 rack->rc_inp->inp_hpts_calls = 0; 16814 /* Finish out both pacing early and late accounting */ 16815 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16816 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 16817 early = rack->r_ctl.rc_last_output_to - cts; 16818 } else 16819 early = 0; 16820 if (delayed) { 16821 rack->r_ctl.rc_agg_delayed += delayed; 16822 rack->r_late = 1; 16823 } else if (early) { 16824 rack->r_ctl.rc_agg_early += early; 16825 rack->r_early = 1; 16826 } 16827 /* Now that early/late accounting is done turn off the flag */ 16828 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16829 rack->r_wanted_output = 0; 16830 rack->r_timer_override = 0; 16831 if ((tp->t_state != rack->r_state) && 16832 TCPS_HAVEESTABLISHED(tp->t_state)) { 16833 rack_set_state(tp, rack); 16834 } 16835 if ((rack->r_fast_output) && 16836 (doing_tlp == 0) && 16837 (tp->rcv_numsacks == 0)) { 16838 int ret; 16839 16840 error = 0; 16841 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 16842 if (ret >= 0) 16843 return(ret); 16844 else if (error) { 16845 inp = rack->rc_inp; 16846 so = inp->inp_socket; 16847 sb = &so->so_snd; 16848 goto nomore; 16849 } 16850 } 16851 inp = rack->rc_inp; 16852 /* 16853 * For TFO connections in SYN_SENT or SYN_RECEIVED, 16854 * only allow the initial SYN or SYN|ACK and those sent 16855 * by the retransmit timer. 16856 */ 16857 if (IS_FASTOPEN(tp->t_flags) && 16858 ((tp->t_state == TCPS_SYN_RECEIVED) || 16859 (tp->t_state == TCPS_SYN_SENT)) && 16860 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 16861 (tp->t_rxtshift == 0)) { /* not a retransmit */ 16862 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16863 so = inp->inp_socket; 16864 sb = &so->so_snd; 16865 goto just_return_nolock; 16866 } 16867 /* 16868 * Determine length of data that should be transmitted, and flags 16869 * that will be used. If there is some data or critical controls 16870 * (SYN, RST) to send, then transmit; otherwise, investigate 16871 * further. 
16872 */ 16873 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 16874 if (tp->t_idle_reduce) { 16875 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 16876 rack_cc_after_idle(rack, tp); 16877 } 16878 tp->t_flags &= ~TF_LASTIDLE; 16879 if (idle) { 16880 if (tp->t_flags & TF_MORETOCOME) { 16881 tp->t_flags |= TF_LASTIDLE; 16882 idle = 0; 16883 } 16884 } 16885 if ((tp->snd_una == tp->snd_max) && 16886 rack->r_ctl.rc_went_idle_time && 16887 TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { 16888 idle = cts - rack->r_ctl.rc_went_idle_time; 16889 if (idle > rack_min_probertt_hold) { 16890 /* Count as a probe rtt */ 16891 if (rack->in_probe_rtt == 0) { 16892 rack->r_ctl.rc_lower_rtt_us_cts = cts; 16893 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 16894 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 16895 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 16896 } else { 16897 rack_exit_probertt(rack, cts); 16898 } 16899 } 16900 idle = 0; 16901 } 16902 if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED)) 16903 rack_init_fsb_block(tp, rack); 16904 again: 16905 /* 16906 * If we've recently taken a timeout, snd_max will be greater than 16907 * snd_nxt. There may be SACK information that allows us to avoid 16908 * resending already delivered data. Adjust snd_nxt accordingly. 16909 */ 16910 sendalot = 0; 16911 cts = tcp_get_usecs(&tv); 16912 ms_cts = tcp_tv_to_mssectick(&tv); 16913 tso = 0; 16914 mtu = 0; 16915 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16916 minseg = segsiz; 16917 if (rack->r_ctl.rc_pace_max_segs == 0) 16918 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 16919 else 16920 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 16921 sb_offset = tp->snd_max - tp->snd_una; 16922 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16923 flags = tcp_outflags[tp->t_state]; 16924 while (rack->rc_free_cnt < rack_free_cache) { 16925 rsm = rack_alloc(rack); 16926 if (rsm == NULL) { 16927 if (inp->inp_hpts_calls) 16928 /* Retry in a ms */ 16929 slot = (1 * HPTS_USEC_IN_MSEC); 16930 so = inp->inp_socket; 16931 sb = &so->so_snd; 16932 goto just_return_nolock; 16933 } 16934 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 16935 rack->rc_free_cnt++; 16936 rsm = NULL; 16937 } 16938 if (inp->inp_hpts_calls) 16939 inp->inp_hpts_calls = 0; 16940 sack_rxmit = 0; 16941 len = 0; 16942 rsm = NULL; 16943 if (flags & TH_RST) { 16944 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 16945 so = inp->inp_socket; 16946 sb = &so->so_snd; 16947 goto send; 16948 } 16949 if (rack->r_ctl.rc_resend) { 16950 /* Retransmit timer */ 16951 rsm = rack->r_ctl.rc_resend; 16952 rack->r_ctl.rc_resend = NULL; 16953 len = rsm->r_end - rsm->r_start; 16954 sack_rxmit = 1; 16955 sendalot = 0; 16956 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16957 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16958 __func__, __LINE__, 16959 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16960 sb_offset = rsm->r_start - tp->snd_una; 16961 if (len >= segsiz) 16962 len = segsiz; 16963 } else if (rack->r_collapse_point_valid && 16964 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { 16965 /* 16966 * If an RSM is returned then enough time has passed 16967 * for us to retransmit it. Move up the collapse point, 16968 * since this rsm has its chance to retransmit now. 
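 * Once last_collapse_point reaches high_collapse_point the whole
 * collapsed range has been walked and r_collapse_point_valid is
 * cleared. Note that the retransmit length is clamped to a single
 * segsiz below unless full_size_rxt or shape_rxt_to_pacing_min is
 * set, so a large collapsed region is re-sent one paced segment at a
 * time.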
16969 */ 16970 rack_trace_point(rack, RACK_TP_COLLAPSED_RXT); 16971 rack->r_ctl.last_collapse_point = rsm->r_end; 16972 /* Are we done? */ 16973 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 16974 rack->r_ctl.high_collapse_point)) 16975 rack->r_collapse_point_valid = 0; 16976 sack_rxmit = 1; 16977 /* We are not doing a TLP */ 16978 doing_tlp = 0; 16979 len = rsm->r_end - rsm->r_start; 16980 sb_offset = rsm->r_start - tp->snd_una; 16981 sendalot = 0; 16982 if ((rack->full_size_rxt == 0) && 16983 (rack->shape_rxt_to_pacing_min == 0) && 16984 (len >= segsiz)) 16985 len = segsiz; 16986 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 16987 /* We have a retransmit that takes precedence */ 16988 if ((!IN_FASTRECOVERY(tp->t_flags)) && 16989 ((rsm->r_flags & RACK_MUST_RXT) == 0) && 16990 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 16991 /* Enter recovery if not induced by a time-out */ 16992 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 16993 } 16994 #ifdef INVARIANTS 16995 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 16996 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 16997 tp, rack, rsm, rsm->r_start, tp->snd_una); 16998 } 16999 #endif 17000 len = rsm->r_end - rsm->r_start; 17001 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 17002 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 17003 __func__, __LINE__, 17004 rsm->r_start, tp->snd_una, tp, rack, rsm)); 17005 sb_offset = rsm->r_start - tp->snd_una; 17006 sendalot = 0; 17007 if (len >= segsiz) 17008 len = segsiz; 17009 if (len > 0) { 17010 sack_rxmit = 1; 17011 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 17012 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 17013 min(len, segsiz)); 17014 } 17015 } else if (rack->r_ctl.rc_tlpsend) { 17016 /* Tail loss probe */ 17017 long cwin; 17018 long tlen; 17019 17020 /* 17021 * Check if we can do a TLP with a RACK'd packet 17022 * this can happen if we are not doing the rack 17023 * cheat and we skipped to a TLP and it 17024 * went off. 17025 */ 17026 rsm = rack->r_ctl.rc_tlpsend; 17027 /* We are doing a TLP make sure the flag is preent */ 17028 rsm->r_flags |= RACK_TLP; 17029 rack->r_ctl.rc_tlpsend = NULL; 17030 sack_rxmit = 1; 17031 tlen = rsm->r_end - rsm->r_start; 17032 if (tlen > segsiz) 17033 tlen = segsiz; 17034 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 17035 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 17036 __func__, __LINE__, 17037 rsm->r_start, tp->snd_una, tp, rack, rsm)); 17038 sb_offset = rsm->r_start - tp->snd_una; 17039 cwin = min(tp->snd_wnd, tlen); 17040 len = cwin; 17041 } 17042 if (rack->r_must_retran && 17043 (doing_tlp == 0) && 17044 (SEQ_GT(tp->snd_max, tp->snd_una)) && 17045 (rsm == NULL)) { 17046 /* 17047 * There are two different ways that we 17048 * can get into this block: 17049 * a) This is a non-sack connection, we had a time-out 17050 * and thus r_must_retran was set and everything 17051 * left outstanding as been marked for retransmit. 17052 * b) The MTU of the path shrank, so that everything 17053 * was marked to be retransmitted with the smaller 17054 * mtu and r_must_retran was set. 17055 * 17056 * This means that we expect the sendmap (outstanding) 17057 * to all be marked must. We can use the tmap to 17058 * look at them. 17059 * 17060 */ 17061 int sendwin, flight; 17062 17063 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 17064 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 17065 if (flight >= sendwin) { 17066 /* 17067 * We can't send yet. 
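 * Everything marked RACK_MUST_RXT is still counted as in flight (the
 * flight size is computed against rc_out_at_rto) and that flight
 * already fills the send window, i.e. min(snd_wnd, snd_cwnd).
 * Sending more now would overshoot the window, so we just-return and
 * let incoming acks drain the flight before the forced
 * retransmissions continue.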
17068 */ 17069 so = inp->inp_socket; 17070 sb = &so->so_snd; 17071 goto just_return_nolock; 17072 } 17073 /* 17074 * This is the case a/b mentioned above. All 17075 * outstanding/not-acked should be marked. 17076 * We can use the tmap to find them. 17077 */ 17078 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 17079 if (rsm == NULL) { 17080 /* TSNH */ 17081 rack->r_must_retran = 0; 17082 rack->r_ctl.rc_out_at_rto = 0; 17083 so = inp->inp_socket; 17084 sb = &so->so_snd; 17085 goto just_return_nolock; 17086 } 17087 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 17088 /* 17089 * The first one does not have the flag, did we collapse 17090 * further up in our list? 17091 */ 17092 rack->r_must_retran = 0; 17093 rack->r_ctl.rc_out_at_rto = 0; 17094 rsm = NULL; 17095 sack_rxmit = 0; 17096 } else { 17097 sack_rxmit = 1; 17098 len = rsm->r_end - rsm->r_start; 17099 sb_offset = rsm->r_start - tp->snd_una; 17100 sendalot = 0; 17101 if ((rack->full_size_rxt == 0) && 17102 (rack->shape_rxt_to_pacing_min == 0) && 17103 (len >= segsiz)) 17104 len = segsiz; 17105 /* 17106 * Delay removing the flag RACK_MUST_RXT so 17107 * that the fastpath for retransmit will 17108 * work with this rsm. 17109 */ 17110 } 17111 } 17112 /* 17113 * Enforce a connection sendmap count limit if set 17114 * as long as we are not retransmiting. 17115 */ 17116 if ((rsm == NULL) && 17117 (rack->do_detection == 0) && 17118 (V_tcp_map_entries_limit > 0) && 17119 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 17120 counter_u64_add(rack_to_alloc_limited, 1); 17121 if (!rack->alloc_limit_reported) { 17122 rack->alloc_limit_reported = 1; 17123 counter_u64_add(rack_alloc_limited_conns, 1); 17124 } 17125 so = inp->inp_socket; 17126 sb = &so->so_snd; 17127 goto just_return_nolock; 17128 } 17129 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 17130 /* we are retransmitting the fin */ 17131 len--; 17132 if (len) { 17133 /* 17134 * When retransmitting data do *not* include the 17135 * FIN. This could happen from a TLP probe. 
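 * The FIN consumes one sequence number, which is why len was
 * decremented above; if any payload bytes remain after dropping that
 * one, the FIN flag is cleared so the retransmission carries data
 * only.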
17136 */ 17137 flags &= ~TH_FIN; 17138 } 17139 } 17140 if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo && 17141 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 17142 int ret; 17143 17144 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 17145 if (ret == 0) 17146 return (0); 17147 } 17148 so = inp->inp_socket; 17149 sb = &so->so_snd; 17150 if (do_a_prefetch == 0) { 17151 kern_prefetch(sb, &do_a_prefetch); 17152 do_a_prefetch = 1; 17153 } 17154 #ifdef NETFLIX_SHARED_CWND 17155 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 17156 rack->rack_enable_scwnd) { 17157 /* We are doing cwnd sharing */ 17158 if (rack->gp_ready && 17159 (rack->rack_attempted_scwnd == 0) && 17160 (rack->r_ctl.rc_scw == NULL) && 17161 tp->t_lib) { 17162 /* The pcbid is in, lets make an attempt */ 17163 counter_u64_add(rack_try_scwnd, 1); 17164 rack->rack_attempted_scwnd = 1; 17165 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 17166 &rack->r_ctl.rc_scw_index, 17167 segsiz); 17168 } 17169 if (rack->r_ctl.rc_scw && 17170 (rack->rack_scwnd_is_idle == 1) && 17171 sbavail(&so->so_snd)) { 17172 /* we are no longer out of data */ 17173 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 17174 rack->rack_scwnd_is_idle = 0; 17175 } 17176 if (rack->r_ctl.rc_scw) { 17177 /* First lets update and get the cwnd */ 17178 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 17179 rack->r_ctl.rc_scw_index, 17180 tp->snd_cwnd, tp->snd_wnd, segsiz); 17181 } 17182 } 17183 #endif 17184 /* 17185 * Get standard flags, and add SYN or FIN if requested by 'hidden' 17186 * state flags. 17187 */ 17188 if (tp->t_flags & TF_NEEDFIN) 17189 flags |= TH_FIN; 17190 if (tp->t_flags & TF_NEEDSYN) 17191 flags |= TH_SYN; 17192 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 17193 void *end_rsm; 17194 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 17195 if (end_rsm) 17196 kern_prefetch(end_rsm, &prefetch_rsm); 17197 prefetch_rsm = 1; 17198 } 17199 SOCKBUF_LOCK(sb); 17200 /* 17201 * If snd_nxt == snd_max and we have transmitted a FIN, the 17202 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 17203 * negative length. This can also occur when TCP opens up its 17204 * congestion window while receiving additional duplicate acks after 17205 * fast-retransmit because TCP will reset snd_nxt to snd_max after 17206 * the fast-retransmit. 17207 * 17208 * In the normal retransmit-FIN-only case, however, snd_nxt will be 17209 * set to snd_una, the sb_offset will be 0, and the length may wind 17210 * up 0. 17211 * 17212 * If sack_rxmit is true we are retransmitting from the scoreboard 17213 * in which case len is already set. 
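 * For the non-retransmit path below, len starts from what is sitting
 * in the socket buffer (sbavail() minus sb_offset) and is then
 * narrowed by whichever constraint applies: a TLP forcing out new
 * data (rc_tlp_new_data), the rack_what_can_we_send() clamp, or,
 * inside fast recovery with PRR active, the PRR send count
 * rc_prr_sndcnt. As a purely illustrative example (assumed numbers):
 * avail = 30000, sb_offset = 20000 and rc_prr_sndcnt = 1448 during
 * recovery would allow at most 1448 bytes out on this pass.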
17214 */ 17215 if ((sack_rxmit == 0) && 17216 (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { 17217 uint32_t avail; 17218 17219 avail = sbavail(sb); 17220 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 17221 sb_offset = tp->snd_nxt - tp->snd_una; 17222 else 17223 sb_offset = 0; 17224 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 17225 if (rack->r_ctl.rc_tlp_new_data) { 17226 /* TLP is forcing out new data */ 17227 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 17228 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 17229 } 17230 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 17231 if (tp->snd_wnd > sb_offset) 17232 len = tp->snd_wnd - sb_offset; 17233 else 17234 len = 0; 17235 } else { 17236 len = rack->r_ctl.rc_tlp_new_data; 17237 } 17238 rack->r_ctl.rc_tlp_new_data = 0; 17239 } else { 17240 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 17241 } 17242 if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { 17243 /* 17244 * For prr=off, we need to send only 1 MSS 17245 * at a time. We do this because another sack could 17246 * be arriving that causes us to send retransmits and 17247 * we don't want to be on a long pace due to a larger send 17248 * that keeps us from sending out the retransmit. 17249 */ 17250 len = segsiz; 17251 } 17252 } else { 17253 uint32_t outstanding; 17254 /* 17255 * We are inside of a Fast recovery episode, this 17256 * is caused by a SACK or 3 dup acks. At this point 17257 * we have sent all the retransmissions and we rely 17258 * on PRR to dictate what we will send in the form of 17259 * new data. 17260 */ 17261 17262 outstanding = tp->snd_max - tp->snd_una; 17263 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 17264 if (tp->snd_wnd > outstanding) { 17265 len = tp->snd_wnd - outstanding; 17266 /* Check to see if we have the data */ 17267 if ((sb_offset + len) > avail) { 17268 /* It does not all fit */ 17269 if (avail > sb_offset) 17270 len = avail - sb_offset; 17271 else 17272 len = 0; 17273 } 17274 } else { 17275 len = 0; 17276 } 17277 } else if (avail > sb_offset) { 17278 len = avail - sb_offset; 17279 } else { 17280 len = 0; 17281 } 17282 if (len > 0) { 17283 if (len > rack->r_ctl.rc_prr_sndcnt) { 17284 len = rack->r_ctl.rc_prr_sndcnt; 17285 } 17286 if (len > 0) { 17287 sub_from_prr = 1; 17288 } 17289 } 17290 if (len > segsiz) { 17291 /* 17292 * We should never send more than a MSS when 17293 * retransmitting or sending new data in prr 17294 * mode unless the override flag is on. Most 17295 * likely the PRR algorithm is not going to 17296 * let us send a lot as well :-) 17297 */ 17298 if (rack->r_ctl.rc_prr_sendalot == 0) { 17299 len = segsiz; 17300 } 17301 } else if (len < segsiz) { 17302 /* 17303 * Do we send any? The idea here is if the 17304 * send empty's the socket buffer we want to 17305 * do it. However if not then lets just wait 17306 * for our prr_sndcnt to get bigger. 17307 */ 17308 long leftinsb; 17309 17310 leftinsb = sbavail(sb) - sb_offset; 17311 if (leftinsb > len) { 17312 /* This send does not empty the sb */ 17313 len = 0; 17314 } 17315 } 17316 } 17317 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 17318 /* 17319 * If you have not established 17320 * and are not doing FAST OPEN 17321 * no data please. 
17322 */ 17323 if ((sack_rxmit == 0) && 17324 (!IS_FASTOPEN(tp->t_flags))){ 17325 len = 0; 17326 sb_offset = 0; 17327 } 17328 } 17329 if (prefetch_so_done == 0) { 17330 kern_prefetch(so, &prefetch_so_done); 17331 prefetch_so_done = 1; 17332 } 17333 /* 17334 * Lop off SYN bit if it has already been sent. However, if this is 17335 * SYN-SENT state and if segment contains data and if we don't know 17336 * that foreign host supports TAO, suppress sending segment. 17337 */ 17338 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 17339 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 17340 /* 17341 * When sending additional segments following a TFO SYN|ACK, 17342 * do not include the SYN bit. 17343 */ 17344 if (IS_FASTOPEN(tp->t_flags) && 17345 (tp->t_state == TCPS_SYN_RECEIVED)) 17346 flags &= ~TH_SYN; 17347 } 17348 /* 17349 * Be careful not to send data and/or FIN on SYN segments. This 17350 * measure is needed to prevent interoperability problems with not 17351 * fully conformant TCP implementations. 17352 */ 17353 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 17354 len = 0; 17355 flags &= ~TH_FIN; 17356 } 17357 /* 17358 * On TFO sockets, ensure no data is sent in the following cases: 17359 * 17360 * - When retransmitting SYN|ACK on a passively-created socket 17361 * 17362 * - When retransmitting SYN on an actively created socket 17363 * 17364 * - When sending a zero-length cookie (cookie request) on an 17365 * actively created socket 17366 * 17367 * - When the socket is in the CLOSED state (RST is being sent) 17368 */ 17369 if (IS_FASTOPEN(tp->t_flags) && 17370 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 17371 ((tp->t_state == TCPS_SYN_SENT) && 17372 (tp->t_tfo_client_cookie_len == 0)) || 17373 (flags & TH_RST))) { 17374 sack_rxmit = 0; 17375 len = 0; 17376 } 17377 /* Without fast-open there should never be data sent on a SYN */ 17378 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 17379 tp->snd_nxt = tp->iss; 17380 len = 0; 17381 } 17382 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 17383 /* We only send 1 MSS if we have a DSACK block */ 17384 add_flag |= RACK_SENT_W_DSACK; 17385 len = segsiz; 17386 } 17387 orig_len = len; 17388 if (len <= 0) { 17389 /* 17390 * If FIN has been sent but not acked, but we haven't been 17391 * called to retransmit, len will be < 0. Otherwise, window 17392 * shrank after we sent into it. If window shrank to 0, 17393 * cancel pending retransmit, pull snd_nxt back to (closed) 17394 * window, and set the persist timer if it isn't already 17395 * going. If the window didn't close completely, just wait 17396 * for an ACK. 17397 * 17398 * We also do a general check here to ensure that we will 17399 * set the persist timer when we have data to send, but a 17400 * 0-byte window. This makes sure the persist timer is set 17401 * even if the packet hits one of the "goto send" lines 17402 * below. 17403 */ 17404 len = 0; 17405 if ((tp->snd_wnd == 0) && 17406 (TCPS_HAVEESTABLISHED(tp->t_state)) && 17407 (tp->snd_una == tp->snd_max) && 17408 (sb_offset < (int)sbavail(sb))) { 17409 rack_enter_persist(tp, rack, cts); 17410 } 17411 } else if ((rsm == NULL) && 17412 (doing_tlp == 0) && 17413 (len < pace_max_seg)) { 17414 /* 17415 * We are not sending a maximum sized segment for 17416 * some reason. Should we not send anything (think 17417 * sws or persists)? 
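 * The checks below answer that by forcing len to 0 when:
 * (1) the peer's window has shrunk below our small-send threshold
 *     and this is not the tail of the socket buffer (we may enter
 *     persists if nothing is outstanding);
 * (2) the cwnd, with two or more segments already in flight, leaves
 *     room for less than a minimum pacing segment;
 * (3) the send window is essentially full, again with at least two
 *     segments in flight, and this send would not empty the buffer;
 * (4) hardware pacing is active and we prefer to wait for acks so a
 *     full pace_max_seg can be handed to the NIC in one burst.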
17418 */ 17419 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 17420 (TCPS_HAVEESTABLISHED(tp->t_state)) && 17421 (len < minseg) && 17422 (len < (int)(sbavail(sb) - sb_offset))) { 17423 /* 17424 * Here the rwnd is less than 17425 * the minimum pacing size, this is not a retransmit, 17426 * we are established and 17427 * the send is not the last in the socket buffer 17428 * we send nothing, and we may enter persists 17429 * if nothing is outstanding. 17430 */ 17431 len = 0; 17432 if (tp->snd_max == tp->snd_una) { 17433 /* 17434 * Nothing out we can 17435 * go into persists. 17436 */ 17437 rack_enter_persist(tp, rack, cts); 17438 } 17439 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 17440 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 17441 (len < (int)(sbavail(sb) - sb_offset)) && 17442 (len < minseg)) { 17443 /* 17444 * Here we are not retransmitting, and 17445 * the cwnd is not so small that we could 17446 * not send at least a min size (rxt timer 17447 * not having gone off), We have 2 segments or 17448 * more already in flight, its not the tail end 17449 * of the socket buffer and the cwnd is blocking 17450 * us from sending out a minimum pacing segment size. 17451 * Lets not send anything. 17452 */ 17453 len = 0; 17454 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 17455 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 17456 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 17457 (len < (int)(sbavail(sb) - sb_offset)) && 17458 (TCPS_HAVEESTABLISHED(tp->t_state))) { 17459 /* 17460 * Here we have a send window but we have 17461 * filled it up and we can't send another pacing segment. 17462 * We also have in flight more than 2 segments 17463 * and we are not completing the sb i.e. we allow 17464 * the last bytes of the sb to go out even if 17465 * its not a full pacing segment. 17466 */ 17467 len = 0; 17468 } else if ((rack->r_ctl.crte != NULL) && 17469 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 17470 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 17471 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 17472 (len < (int)(sbavail(sb) - sb_offset))) { 17473 /* 17474 * Here we are doing hardware pacing, this is not a TLP, 17475 * we are not sending a pace max segment size, there is rwnd 17476 * room to send at least N pace_max_seg, the cwnd is greater 17477 * than or equal to a full pacing segments plus 4 mss and we have 2 or 17478 * more segments in flight and its not the tail of the socket buffer. 17479 * 17480 * We don't want to send instead we need to get more ack's in to 17481 * allow us to send a full pacing segment. Normally, if we are pacing 17482 * about the right speed, we should have finished our pacing 17483 * send as most of the acks have come back if we are at the 17484 * right rate. This is a bit fuzzy since return path delay 17485 * can delay the acks, which is why we want to make sure we 17486 * have cwnd space to have a bit more than a max pace segments in flight. 17487 * 17488 * If we have not gotten our acks back we are pacing at too high a 17489 * rate delaying will not hurt and will bring our GP estimate down by 17490 * injecting the delay. If we don't do this we will send 17491 * 2 MSS out in response to the acks being clocked in which 17492 * defeats the point of hw-pacing (i.e. to help us get 17493 * larger TSO's out). 17494 */ 17495 len = 0; 17496 17497 } 17498 17499 } 17500 /* len will be >= 0 after this point. 
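 * (Every branch above either leaves len alone or forces it to 0, and
 * the earlier len <= 0 handling already normalized negative values,
 * which is exactly what the KASSERT below verifies.)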
*/ 17501 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 17502 rack_sndbuf_autoscale(rack); 17503 /* 17504 * Decide if we can use TCP Segmentation Offloading (if supported by 17505 * hardware). 17506 * 17507 * TSO may only be used if we are in a pure bulk sending state. The 17508 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 17509 * options prevent using TSO. With TSO the TCP header is the same 17510 * (except for the sequence number) for all generated packets. This 17511 * makes it impossible to transmit any options which vary per 17512 * generated segment or packet. 17513 * 17514 * IPv4 handling has a clear separation of ip options and ip header 17515 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 17516 * the right thing below to provide length of just ip options and thus 17517 * checking for ipoptlen is enough to decide if ip options are present. 17518 */ 17519 ipoptlen = 0; 17520 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 17521 /* 17522 * Pre-calculate here as we save another lookup into the darknesses 17523 * of IPsec that way and can actually decide if TSO is ok. 17524 */ 17525 #ifdef INET6 17526 if (isipv6 && IPSEC_ENABLED(ipv6)) 17527 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 17528 #ifdef INET 17529 else 17530 #endif 17531 #endif /* INET6 */ 17532 #ifdef INET 17533 if (IPSEC_ENABLED(ipv4)) 17534 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 17535 #endif /* INET */ 17536 #endif 17537 17538 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 17539 ipoptlen += ipsec_optlen; 17540 #endif 17541 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 17542 (tp->t_port == 0) && 17543 ((tp->t_flags & TF_SIGNATURE) == 0) && 17544 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 17545 ipoptlen == 0) 17546 tso = 1; 17547 { 17548 uint32_t outstanding __unused; 17549 17550 outstanding = tp->snd_max - tp->snd_una; 17551 if (tp->t_flags & TF_SENTFIN) { 17552 /* 17553 * If we sent a fin, snd_max is 1 higher than 17554 * snd_una 17555 */ 17556 outstanding--; 17557 } 17558 if (sack_rxmit) { 17559 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 17560 flags &= ~TH_FIN; 17561 } else { 17562 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 17563 sbused(sb))) 17564 flags &= ~TH_FIN; 17565 } 17566 } 17567 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 17568 (long)TCP_MAXWIN << tp->rcv_scale); 17569 17570 /* 17571 * Sender silly window avoidance. We transmit under the following 17572 * conditions when len is non-zero: 17573 * 17574 * - We have a full segment (or more with TSO) - This is the last 17575 * buffer in a write()/send() and we are either idle or running 17576 * NODELAY - we've timed out (e.g. persist timer) - we have more 17577 * then 1/2 the maximum send window's worth of data (receiver may be 17578 * limited the window size) - we need to retransmit 17579 */ 17580 if (len) { 17581 if (len >= segsiz) { 17582 goto send; 17583 } 17584 /* 17585 * NOTE! on localhost connections an 'ack' from the remote 17586 * end may occur synchronously with the output and cause us 17587 * to flush a buffer queued with moretocome. 
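 * The transmit-trigger checks below are the classic sender-side
 * rules: send when a full segment is ready, when this write empties
 * the buffer and we are idle or running NODELAY, when nothing is
 * outstanding, when at least half of the peer's largest advertised
 * window can be used, when this is a (SACK) retransmission, or when
 * letting a small segment out keeps two packets in flight against a
 * delayed-ack peer. Each branch records a distinct "pass" value,
 * presumably so logging can show which rule allowed the send.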
XXX 17588 * 17589 */ 17590 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 17591 (idle || (tp->t_flags & TF_NODELAY)) && 17592 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17593 (tp->t_flags & TF_NOPUSH) == 0) { 17594 pass = 2; 17595 goto send; 17596 } 17597 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 17598 pass = 22; 17599 goto send; 17600 } 17601 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 17602 pass = 4; 17603 goto send; 17604 } 17605 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 17606 pass = 5; 17607 goto send; 17608 } 17609 if (sack_rxmit) { 17610 pass = 6; 17611 goto send; 17612 } 17613 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 17614 (ctf_outstanding(tp) < (segsiz * 2))) { 17615 /* 17616 * We have less than two MSS outstanding (delayed ack) 17617 * and our rwnd will not let us send a full sized 17618 * MSS. Lets go ahead and let this small segment 17619 * out because we want to try to have at least two 17620 * packets inflight to not be caught by delayed ack. 17621 */ 17622 pass = 12; 17623 goto send; 17624 } 17625 } 17626 /* 17627 * Sending of standalone window updates. 17628 * 17629 * Window updates are important when we close our window due to a 17630 * full socket buffer and are opening it again after the application 17631 * reads data from it. Once the window has opened again and the 17632 * remote end starts to send again the ACK clock takes over and 17633 * provides the most current window information. 17634 * 17635 * We must avoid the silly window syndrome whereas every read from 17636 * the receive buffer, no matter how small, causes a window update 17637 * to be sent. We also should avoid sending a flurry of window 17638 * updates when the socket buffer had queued a lot of data and the 17639 * application is doing small reads. 17640 * 17641 * Prevent a flurry of pointless window updates by only sending an 17642 * update when we can increase the advertized window by more than 17643 * 1/4th of the socket buffer capacity. When the buffer is getting 17644 * full or is very small be more aggressive and send an update 17645 * whenever we can increase by two mss sized segments. In all other 17646 * situations the ACK's to new incoming data will carry further 17647 * window increases. 17648 * 17649 * Don't send an independent window update if a delayed ACK is 17650 * pending (it will get piggy-backed on it) or the remote side 17651 * already has done a half-close and won't send more data. Skip 17652 * this if the connection is in T/TCP half-open state. 17653 */ 17654 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 17655 !(tp->t_flags & TF_DELACK) && 17656 !TCPS_HAVERCVDFIN(tp->t_state)) { 17657 /* 17658 * "adv" is the amount we could increase the window, taking 17659 * into account that we are limited by TCP_MAXWIN << 17660 * tp->rcv_scale. 17661 */ 17662 int32_t adv; 17663 int oldwin; 17664 17665 adv = recwin; 17666 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 17667 oldwin = (tp->rcv_adv - tp->rcv_nxt); 17668 if (adv > oldwin) 17669 adv -= oldwin; 17670 else { 17671 /* We can't increase the window */ 17672 adv = 0; 17673 } 17674 } else 17675 oldwin = 0; 17676 17677 /* 17678 * If the new window size ends up being the same as or less 17679 * than the old size when it is scaled, then don't force 17680 * a window update. 
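 * As an illustrative example (assumed values): with rcv_scale = 7
 * the advertised window moves in units of 128 bytes, so an increase
 * that does not change the scaled value (oldwin >> rcv_scale) is
 * skipped outright. Past that test, a standalone update is only sent
 * once the window can open by at least two full segments and a
 * quarter of the receive buffer; it goes out sooner if the
 * advertised window has fallen to an eighth of the buffer, if the
 * buffer itself is no larger than eight segments, or if half the
 * buffer can be opened at once.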
17681 */ 17682 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 17683 goto dontupdate; 17684 17685 if (adv >= (int32_t)(2 * segsiz) && 17686 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 17687 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 17688 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 17689 pass = 7; 17690 goto send; 17691 } 17692 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 17693 pass = 23; 17694 goto send; 17695 } 17696 } 17697 dontupdate: 17698 17699 /* 17700 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 17701 * is also a catch-all for the retransmit timer timeout case. 17702 */ 17703 if (tp->t_flags & TF_ACKNOW) { 17704 pass = 8; 17705 goto send; 17706 } 17707 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 17708 pass = 9; 17709 goto send; 17710 } 17711 /* 17712 * If our state indicates that FIN should be sent and we have not 17713 * yet done so, then we need to send. 17714 */ 17715 if ((flags & TH_FIN) && 17716 (tp->snd_nxt == tp->snd_una)) { 17717 pass = 11; 17718 goto send; 17719 } 17720 /* 17721 * No reason to send a segment, just return. 17722 */ 17723 just_return: 17724 SOCKBUF_UNLOCK(sb); 17725 just_return_nolock: 17726 { 17727 int app_limited = CTF_JR_SENT_DATA; 17728 17729 if (tot_len_this_send > 0) { 17730 /* Make sure snd_nxt is up to max */ 17731 rack->r_ctl.fsb.recwin = recwin; 17732 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 17733 if ((error == 0) && 17734 rack_use_rfo && 17735 ((flags & (TH_SYN|TH_FIN)) == 0) && 17736 (ipoptlen == 0) && 17737 (tp->snd_nxt == tp->snd_max) && 17738 (tp->rcv_numsacks == 0) && 17739 rack->r_fsb_inited && 17740 TCPS_HAVEESTABLISHED(tp->t_state) && 17741 (rack->r_must_retran == 0) && 17742 ((tp->t_flags & TF_NEEDFIN) == 0) && 17743 (len > 0) && (orig_len > 0) && 17744 (orig_len > len) && 17745 ((orig_len - len) >= segsiz) && 17746 ((optlen == 0) || 17747 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 17748 /* We can send at least one more MSS using our fsb */ 17749 17750 rack->r_fast_output = 1; 17751 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 17752 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 17753 rack->r_ctl.fsb.tcp_flags = flags; 17754 rack->r_ctl.fsb.left_to_send = orig_len - len; 17755 if (hw_tls) 17756 rack->r_ctl.fsb.hw_tls = 1; 17757 else 17758 rack->r_ctl.fsb.hw_tls = 0; 17759 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 17760 ("rack:%p left_to_send:%u sbavail:%u out:%u", 17761 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 17762 (tp->snd_max - tp->snd_una))); 17763 if (rack->r_ctl.fsb.left_to_send < segsiz) 17764 rack->r_fast_output = 0; 17765 else { 17766 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 17767 rack->r_ctl.fsb.rfo_apply_push = 1; 17768 else 17769 rack->r_ctl.fsb.rfo_apply_push = 0; 17770 } 17771 } else 17772 rack->r_fast_output = 0; 17773 17774 17775 rack_log_fsb(rack, tp, so, flags, 17776 ipoptlen, orig_len, len, 0, 17777 1, optlen, __LINE__, 1); 17778 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 17779 tp->snd_nxt = tp->snd_max; 17780 } else { 17781 int end_window = 0; 17782 uint32_t seq = tp->gput_ack; 17783 17784 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17785 if (rsm) { 17786 /* 17787 * Mark the last sent that we just-returned (hinting 17788 * that delayed ack may play a role in any rtt measurement). 
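 * Below, the reason nothing is being sent is classified into one of
 * the CTF_JR_* buckets (rwnd limited, app limited, cwnd limited, PRR
 * limited, or "assessing" when no rule matched). That classification
 * decides whether the goodput measurement window is closed, marks
 * the last rsm RACK_APP_LIMITED where appropriate, and is passed to
 * rack_log_type_just_return() for later analysis.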
17789 */ 17790 rsm->r_just_ret = 1; 17791 } 17792 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 17793 rack->r_ctl.rc_agg_delayed = 0; 17794 rack->r_early = 0; 17795 rack->r_late = 0; 17796 rack->r_ctl.rc_agg_early = 0; 17797 if ((ctf_outstanding(tp) + 17798 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 17799 minseg)) >= tp->snd_wnd) { 17800 /* We are limited by the rwnd */ 17801 app_limited = CTF_JR_RWND_LIMITED; 17802 if (IN_FASTRECOVERY(tp->t_flags)) 17803 rack->r_ctl.rc_prr_sndcnt = 0; 17804 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 17805 /* We are limited by whats available -- app limited */ 17806 app_limited = CTF_JR_APP_LIMITED; 17807 if (IN_FASTRECOVERY(tp->t_flags)) 17808 rack->r_ctl.rc_prr_sndcnt = 0; 17809 } else if ((idle == 0) && 17810 ((tp->t_flags & TF_NODELAY) == 0) && 17811 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17812 (len < segsiz)) { 17813 /* 17814 * No delay is not on and the 17815 * user is sending less than 1MSS. This 17816 * brings out SWS avoidance so we 17817 * don't send. Another app-limited case. 17818 */ 17819 app_limited = CTF_JR_APP_LIMITED; 17820 } else if (tp->t_flags & TF_NOPUSH) { 17821 /* 17822 * The user has requested no push of 17823 * the last segment and we are 17824 * at the last segment. Another app 17825 * limited case. 17826 */ 17827 app_limited = CTF_JR_APP_LIMITED; 17828 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 17829 /* Its the cwnd */ 17830 app_limited = CTF_JR_CWND_LIMITED; 17831 } else if (IN_FASTRECOVERY(tp->t_flags) && 17832 (rack->rack_no_prr == 0) && 17833 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 17834 app_limited = CTF_JR_PRR; 17835 } else { 17836 /* Now why here are we not sending? */ 17837 #ifdef NOW 17838 #ifdef INVARIANTS 17839 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 17840 #endif 17841 #endif 17842 app_limited = CTF_JR_ASSESSING; 17843 } 17844 /* 17845 * App limited in some fashion, for our pacing GP 17846 * measurements we don't want any gap (even cwnd). 17847 * Close down the measurement window. 17848 */ 17849 if (rack_cwnd_block_ends_measure && 17850 ((app_limited == CTF_JR_CWND_LIMITED) || 17851 (app_limited == CTF_JR_PRR))) { 17852 /* 17853 * The reason we are not sending is 17854 * the cwnd (or prr). We have been configured 17855 * to end the measurement window in 17856 * this case. 17857 */ 17858 end_window = 1; 17859 } else if (rack_rwnd_block_ends_measure && 17860 (app_limited == CTF_JR_RWND_LIMITED)) { 17861 /* 17862 * We are rwnd limited and have been 17863 * configured to end the measurement 17864 * window in this case. 17865 */ 17866 end_window = 1; 17867 } else if (app_limited == CTF_JR_APP_LIMITED) { 17868 /* 17869 * A true application limited period, we have 17870 * ran out of data. 17871 */ 17872 end_window = 1; 17873 } else if (app_limited == CTF_JR_ASSESSING) { 17874 /* 17875 * In the assessing case we hit the end of 17876 * the if/else and had no known reason 17877 * This will panic us under invariants.. 17878 * 17879 * If we get this out in logs we need to 17880 * investagate which reason we missed. 17881 */ 17882 end_window = 1; 17883 } 17884 if (end_window) { 17885 uint8_t log = 0; 17886 17887 /* Adjust the Gput measurement */ 17888 if ((tp->t_flags & TF_GPUTINPROG) && 17889 SEQ_GT(tp->gput_ack, tp->snd_max)) { 17890 tp->gput_ack = tp->snd_max; 17891 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 17892 /* 17893 * There is not enough to measure. 
17894 */ 17895 tp->t_flags &= ~TF_GPUTINPROG; 17896 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 17897 rack->r_ctl.rc_gp_srtt /*flex1*/, 17898 tp->gput_seq, 17899 0, 0, 18, __LINE__, NULL, 0); 17900 } else 17901 log = 1; 17902 } 17903 /* Mark the last packet has app limited */ 17904 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17905 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 17906 if (rack->r_ctl.rc_app_limited_cnt == 0) 17907 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 17908 else { 17909 /* 17910 * Go out to the end app limited and mark 17911 * this new one as next and move the end_appl up 17912 * to this guy. 17913 */ 17914 if (rack->r_ctl.rc_end_appl) 17915 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 17916 rack->r_ctl.rc_end_appl = rsm; 17917 } 17918 rsm->r_flags |= RACK_APP_LIMITED; 17919 rack->r_ctl.rc_app_limited_cnt++; 17920 } 17921 if (log) 17922 rack_log_pacing_delay_calc(rack, 17923 rack->r_ctl.rc_app_limited_cnt, seq, 17924 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 17925 } 17926 } 17927 /* Check if we need to go into persists or not */ 17928 if ((tp->snd_max == tp->snd_una) && 17929 TCPS_HAVEESTABLISHED(tp->t_state) && 17930 sbavail(sb) && 17931 (sbavail(sb) > tp->snd_wnd) && 17932 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 17933 /* Yes lets make sure to move to persist before timer-start */ 17934 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 17935 } 17936 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 17937 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 17938 } 17939 #ifdef NETFLIX_SHARED_CWND 17940 if ((sbavail(sb) == 0) && 17941 rack->r_ctl.rc_scw) { 17942 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 17943 rack->rack_scwnd_is_idle = 1; 17944 } 17945 #endif 17946 #ifdef TCP_ACCOUNTING 17947 if (tot_len_this_send > 0) { 17948 crtsc = get_cyclecount(); 17949 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17950 tp->tcp_cnt_counters[SND_OUT_DATA]++; 17951 } 17952 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 17953 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17954 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 17955 } 17956 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 17957 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17958 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 17959 } 17960 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz)); 17961 } else { 17962 crtsc = get_cyclecount(); 17963 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17964 tp->tcp_cnt_counters[SND_LIMITED]++; 17965 } 17966 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1); 17967 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17968 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 17969 } 17970 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val)); 17971 } 17972 sched_unpin(); 17973 #endif 17974 return (0); 17975 17976 send: 17977 if (rsm || sack_rxmit) 17978 counter_u64_add(rack_nfto_resend, 1); 17979 else 17980 counter_u64_add(rack_non_fto_send, 1); 17981 if ((flags & TH_FIN) && 17982 sbavail(sb)) { 17983 /* 17984 * We do not transmit a FIN 17985 * with data outstanding. We 17986 * need to make it so all data 17987 * is acked first. 
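 * More precisely: as long as sbavail() still reports bytes in the
 * send buffer the FIN flag is stripped here; it will be set again on
 * a later pass once the peer's acks have drained the buffer.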
17988 */ 17989 flags &= ~TH_FIN; 17990 } 17991 /* Enforce stack imposed max seg size if we have one */ 17992 if (rack->r_ctl.rc_pace_max_segs && 17993 (len > rack->r_ctl.rc_pace_max_segs)) { 17994 mark = 1; 17995 len = rack->r_ctl.rc_pace_max_segs; 17996 } 17997 SOCKBUF_LOCK_ASSERT(sb); 17998 if (len > 0) { 17999 if (len >= segsiz) 18000 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 18001 else 18002 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 18003 } 18004 /* 18005 * Before ESTABLISHED, force sending of initial options unless TCP 18006 * set not to do any options. NOTE: we assume that the IP/TCP header 18007 * plus TCP options always fit in a single mbuf, leaving room for a 18008 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 18009 * + optlen <= MCLBYTES 18010 */ 18011 optlen = 0; 18012 #ifdef INET6 18013 if (isipv6) 18014 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 18015 else 18016 #endif 18017 hdrlen = sizeof(struct tcpiphdr); 18018 18019 /* 18020 * Compute options for segment. We only have to care about SYN and 18021 * established connection segments. Options for SYN-ACK segments 18022 * are handled in TCP syncache. 18023 */ 18024 to.to_flags = 0; 18025 if ((tp->t_flags & TF_NOOPT) == 0) { 18026 /* Maximum segment size. */ 18027 if (flags & TH_SYN) { 18028 tp->snd_nxt = tp->iss; 18029 to.to_mss = tcp_mssopt(&inp->inp_inc); 18030 if (tp->t_port) 18031 to.to_mss -= V_tcp_udp_tunneling_overhead; 18032 to.to_flags |= TOF_MSS; 18033 18034 /* 18035 * On SYN or SYN|ACK transmits on TFO connections, 18036 * only include the TFO option if it is not a 18037 * retransmit, as the presence of the TFO option may 18038 * have caused the original SYN or SYN|ACK to have 18039 * been dropped by a middlebox. 18040 */ 18041 if (IS_FASTOPEN(tp->t_flags) && 18042 (tp->t_rxtshift == 0)) { 18043 if (tp->t_state == TCPS_SYN_RECEIVED) { 18044 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 18045 to.to_tfo_cookie = 18046 (u_int8_t *)&tp->t_tfo_cookie.server; 18047 to.to_flags |= TOF_FASTOPEN; 18048 wanted_cookie = 1; 18049 } else if (tp->t_state == TCPS_SYN_SENT) { 18050 to.to_tfo_len = 18051 tp->t_tfo_client_cookie_len; 18052 to.to_tfo_cookie = 18053 tp->t_tfo_cookie.client; 18054 to.to_flags |= TOF_FASTOPEN; 18055 wanted_cookie = 1; 18056 /* 18057 * If we wind up having more data to 18058 * send with the SYN than can fit in 18059 * one segment, don't send any more 18060 * until the SYN|ACK comes back from 18061 * the other end. 18062 */ 18063 sendalot = 0; 18064 } 18065 } 18066 } 18067 /* Window scaling. */ 18068 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 18069 to.to_wscale = tp->request_r_scale; 18070 to.to_flags |= TOF_SCALE; 18071 } 18072 /* Timestamps. */ 18073 if ((tp->t_flags & TF_RCVD_TSTMP) || 18074 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 18075 to.to_tsval = ms_cts + tp->ts_offset; 18076 to.to_tsecr = tp->ts_recent; 18077 to.to_flags |= TOF_TS; 18078 } 18079 /* Set receive buffer autosizing timestamp. */ 18080 if (tp->rfbuf_ts == 0 && 18081 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 18082 tp->rfbuf_ts = tcp_ts_getticks(); 18083 /* Selective ACK's. */ 18084 if (tp->t_flags & TF_SACK_PERMIT) { 18085 if (flags & TH_SYN) 18086 to.to_flags |= TOF_SACKPERM; 18087 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 18088 tp->rcv_numsacks > 0) { 18089 to.to_flags |= TOF_SACK; 18090 to.to_nsacks = tp->rcv_numsacks; 18091 to.to_sacks = (u_char *)tp->sackblks; 18092 } 18093 } 18094 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18095 /* TCP-MD5 (RFC2385). 
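 * When TF_SIGNATURE is set we only reserve the option here by
 * flagging TOF_SIGNATURE; the MD5 digest itself is computed after
 * the header and options have been laid out, via TCPMD5_OUTPUT(),
 * and the segment is dropped if that computation fails.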
*/ 18096 if (tp->t_flags & TF_SIGNATURE) 18097 to.to_flags |= TOF_SIGNATURE; 18098 #endif /* TCP_SIGNATURE */ 18099 18100 /* Processing the options. */ 18101 hdrlen += optlen = tcp_addoptions(&to, opt); 18102 /* 18103 * If we wanted a TFO option to be added, but it was unable 18104 * to fit, ensure no data is sent. 18105 */ 18106 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 18107 !(to.to_flags & TOF_FASTOPEN)) 18108 len = 0; 18109 } 18110 if (tp->t_port) { 18111 if (V_tcp_udp_tunneling_port == 0) { 18112 /* The port was removed?? */ 18113 SOCKBUF_UNLOCK(&so->so_snd); 18114 #ifdef TCP_ACCOUNTING 18115 crtsc = get_cyclecount(); 18116 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18117 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18118 } 18119 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18120 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18121 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18122 } 18123 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18124 sched_unpin(); 18125 #endif 18126 return (EHOSTUNREACH); 18127 } 18128 hdrlen += sizeof(struct udphdr); 18129 } 18130 #ifdef INET6 18131 if (isipv6) 18132 ipoptlen = ip6_optlen(inp); 18133 else 18134 #endif 18135 if (inp->inp_options) 18136 ipoptlen = inp->inp_options->m_len - 18137 offsetof(struct ipoption, ipopt_list); 18138 else 18139 ipoptlen = 0; 18140 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 18141 ipoptlen += ipsec_optlen; 18142 #endif 18143 18144 /* 18145 * Adjust data length if insertion of options will bump the packet 18146 * length beyond the t_maxseg length. Clear the FIN bit because we 18147 * cut off the tail of the segment. 18148 */ 18149 if (len + optlen + ipoptlen > tp->t_maxseg) { 18150 if (tso) { 18151 uint32_t if_hw_tsomax; 18152 uint32_t moff; 18153 int32_t max_len; 18154 18155 /* extract TSO information */ 18156 if_hw_tsomax = tp->t_tsomax; 18157 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18158 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18159 KASSERT(ipoptlen == 0, 18160 ("%s: TSO can't do IP options", __func__)); 18161 18162 /* 18163 * Check if we should limit by maximum payload 18164 * length: 18165 */ 18166 if (if_hw_tsomax != 0) { 18167 /* compute maximum TSO length */ 18168 max_len = (if_hw_tsomax - hdrlen - 18169 max_linkhdr); 18170 if (max_len <= 0) { 18171 len = 0; 18172 } else if (len > max_len) { 18173 sendalot = 1; 18174 len = max_len; 18175 mark = 2; 18176 } 18177 } 18178 /* 18179 * Prevent the last segment from being fractional 18180 * unless the send sockbuf can be emptied: 18181 */ 18182 max_len = (tp->t_maxseg - optlen); 18183 if ((sb_offset + len) < sbavail(sb)) { 18184 moff = len % (u_int)max_len; 18185 if (moff != 0) { 18186 mark = 3; 18187 len -= moff; 18188 } 18189 } 18190 /* 18191 * In case there are too many small fragments don't 18192 * use TSO: 18193 */ 18194 if (len <= segsiz) { 18195 mark = 4; 18196 tso = 0; 18197 } 18198 /* 18199 * Send the FIN in a separate segment after the bulk 18200 * sending is done. We don't trust the TSO 18201 * implementations to clear the FIN flag on all but 18202 * the last segment. 18203 */ 18204 if (tp->t_flags & TF_NEEDFIN) { 18205 sendalot = 4; 18206 } 18207 } else { 18208 mark = 5; 18209 if (optlen + ipoptlen >= tp->t_maxseg) { 18210 /* 18211 * Since we don't have enough space to put 18212 * the IP header chain and the TCP header in 18213 * one packet as required by RFC 7112, don't 18214 * send it. Also ensure that at least one 18215 * byte of the payload can be put into the 18216 * TCP segment. 
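 * For the non-TSO path that means: fail the send with EMSGSIZE if
 * the options alone fill the segment, otherwise clamp the payload to
 * t_maxseg - optlen - ipoptlen and set sendalot so the remainder
 * goes out on the next loop iteration. Illustrative numbers (assumed,
 * not measured): t_maxseg = 1460 with a 12-byte timestamp option
 * block and no IP options gives len = 1460 - 12 = 1448 per pass.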
18217 */ 18218 SOCKBUF_UNLOCK(&so->so_snd); 18219 error = EMSGSIZE; 18220 sack_rxmit = 0; 18221 goto out; 18222 } 18223 len = tp->t_maxseg - optlen - ipoptlen; 18224 sendalot = 5; 18225 } 18226 } else { 18227 tso = 0; 18228 mark = 6; 18229 } 18230 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 18231 ("%s: len > IP_MAXPACKET", __func__)); 18232 #ifdef DIAGNOSTIC 18233 #ifdef INET6 18234 if (max_linkhdr + hdrlen > MCLBYTES) 18235 #else 18236 if (max_linkhdr + hdrlen > MHLEN) 18237 #endif 18238 panic("tcphdr too big"); 18239 #endif 18240 18241 /* 18242 * This KASSERT is here to catch edge cases at a well defined place. 18243 * Before, those had triggered (random) panic conditions further 18244 * down. 18245 */ 18246 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 18247 if ((len == 0) && 18248 (flags & TH_FIN) && 18249 (sbused(sb))) { 18250 /* 18251 * We have outstanding data, don't send a fin by itself!. 18252 */ 18253 goto just_return; 18254 } 18255 /* 18256 * Grab a header mbuf, attaching a copy of data to be transmitted, 18257 * and initialize the header from the template for sends on this 18258 * connection. 18259 */ 18260 hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0; 18261 if (len) { 18262 uint32_t max_val; 18263 uint32_t moff; 18264 18265 if (rack->r_ctl.rc_pace_max_segs) 18266 max_val = rack->r_ctl.rc_pace_max_segs; 18267 else if (rack->rc_user_set_max_segs) 18268 max_val = rack->rc_user_set_max_segs * segsiz; 18269 else 18270 max_val = len; 18271 /* 18272 * We allow a limit on sending with hptsi. 18273 */ 18274 if (len > max_val) { 18275 mark = 7; 18276 len = max_val; 18277 } 18278 #ifdef INET6 18279 if (MHLEN < hdrlen + max_linkhdr) 18280 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18281 else 18282 #endif 18283 m = m_gethdr(M_NOWAIT, MT_DATA); 18284 18285 if (m == NULL) { 18286 SOCKBUF_UNLOCK(sb); 18287 error = ENOBUFS; 18288 sack_rxmit = 0; 18289 goto out; 18290 } 18291 m->m_data += max_linkhdr; 18292 m->m_len = hdrlen; 18293 18294 /* 18295 * Start the m_copy functions from the closest mbuf to the 18296 * sb_offset in the socket buffer chain. 18297 */ 18298 mb = sbsndptr_noadv(sb, sb_offset, &moff); 18299 s_mb = mb; 18300 s_moff = moff; 18301 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 18302 m_copydata(mb, moff, (int)len, 18303 mtod(m, caddr_t)+hdrlen); 18304 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 18305 sbsndptr_adv(sb, mb, len); 18306 m->m_len += len; 18307 } else { 18308 struct sockbuf *msb; 18309 18310 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 18311 msb = NULL; 18312 else 18313 msb = sb; 18314 m->m_next = tcp_m_copym( 18315 mb, moff, &len, 18316 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 18317 ((rsm == NULL) ? hw_tls : 0) 18318 #ifdef NETFLIX_COPY_ARGS 18319 , &s_mb, &s_moff 18320 #endif 18321 ); 18322 if (len <= (tp->t_maxseg - optlen)) { 18323 /* 18324 * Must have ran out of mbufs for the copy 18325 * shorten it to no longer need tso. Lets 18326 * not put on sendalot since we are low on 18327 * mbufs. 
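 * tcp_m_copym() may have trimmed len when it could not gather enough
 * mbufs; if what remains now fits in a single non-TSO segment
 * (len <= t_maxseg - optlen) we simply turn TSO off for this
 * transmission instead of scheduling another pass.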
18328 */ 18329 tso = 0; 18330 } 18331 if (m->m_next == NULL) { 18332 SOCKBUF_UNLOCK(sb); 18333 (void)m_free(m); 18334 error = ENOBUFS; 18335 sack_rxmit = 0; 18336 goto out; 18337 } 18338 } 18339 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 18340 if (rsm && (rsm->r_flags & RACK_TLP)) { 18341 /* 18342 * TLP should not count in retran count, but 18343 * in its own bin 18344 */ 18345 counter_u64_add(rack_tlp_retran, 1); 18346 counter_u64_add(rack_tlp_retran_bytes, len); 18347 } else { 18348 tp->t_sndrexmitpack++; 18349 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 18350 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 18351 } 18352 #ifdef STATS 18353 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 18354 len); 18355 #endif 18356 } else { 18357 KMOD_TCPSTAT_INC(tcps_sndpack); 18358 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 18359 #ifdef STATS 18360 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 18361 len); 18362 #endif 18363 } 18364 /* 18365 * If we're sending everything we've got, set PUSH. (This 18366 * will keep happy those implementations which only give 18367 * data to the user when a buffer fills or a PUSH comes in.) 18368 */ 18369 if (sb_offset + len == sbused(sb) && 18370 sbused(sb) && 18371 !(flags & TH_SYN)) { 18372 flags |= TH_PUSH; 18373 add_flag |= RACK_HAD_PUSH; 18374 } 18375 18376 SOCKBUF_UNLOCK(sb); 18377 } else { 18378 SOCKBUF_UNLOCK(sb); 18379 if (tp->t_flags & TF_ACKNOW) 18380 KMOD_TCPSTAT_INC(tcps_sndacks); 18381 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 18382 KMOD_TCPSTAT_INC(tcps_sndctrl); 18383 else 18384 KMOD_TCPSTAT_INC(tcps_sndwinup); 18385 18386 m = m_gethdr(M_NOWAIT, MT_DATA); 18387 if (m == NULL) { 18388 error = ENOBUFS; 18389 sack_rxmit = 0; 18390 goto out; 18391 } 18392 #ifdef INET6 18393 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 18394 MHLEN >= hdrlen) { 18395 M_ALIGN(m, hdrlen); 18396 } else 18397 #endif 18398 m->m_data += max_linkhdr; 18399 m->m_len = hdrlen; 18400 } 18401 SOCKBUF_UNLOCK_ASSERT(sb); 18402 m->m_pkthdr.rcvif = (struct ifnet *)0; 18403 #ifdef MAC 18404 mac_inpcb_create_mbuf(inp, m); 18405 #endif 18406 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 18407 #ifdef INET6 18408 if (isipv6) 18409 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 18410 else 18411 #endif /* INET6 */ 18412 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 18413 th = rack->r_ctl.fsb.th; 18414 udp = rack->r_ctl.fsb.udp; 18415 if (udp) { 18416 #ifdef INET6 18417 if (isipv6) 18418 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18419 else 18420 #endif /* INET6 */ 18421 ulen = hdrlen + len - sizeof(struct ip); 18422 udp->uh_ulen = htons(ulen); 18423 } 18424 } else { 18425 #ifdef INET6 18426 if (isipv6) { 18427 ip6 = mtod(m, struct ip6_hdr *); 18428 if (tp->t_port) { 18429 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 18430 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 18431 udp->uh_dport = tp->t_port; 18432 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18433 udp->uh_ulen = htons(ulen); 18434 th = (struct tcphdr *)(udp + 1); 18435 } else 18436 th = (struct tcphdr *)(ip6 + 1); 18437 tcpip_fillheaders(inp, tp->t_port, ip6, th); 18438 } else 18439 #endif /* INET6 */ 18440 { 18441 ip = mtod(m, struct ip *); 18442 if (tp->t_port) { 18443 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 18444 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 18445 udp->uh_dport = tp->t_port; 18446 ulen = hdrlen + len - sizeof(struct ip); 18447 udp->uh_ulen = htons(ulen); 18448 th = (struct tcphdr *)(udp + 1); 18449 } else 18450 th = (struct 
tcphdr *)(ip + 1); 18451 tcpip_fillheaders(inp, tp->t_port, ip, th); 18452 } 18453 } 18454 /* 18455 * Fill in fields, remembering maximum advertised window for use in 18456 * delaying messages about window sizes. If resending a FIN, be sure 18457 * not to use a new sequence number. 18458 */ 18459 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 18460 tp->snd_nxt == tp->snd_max) 18461 tp->snd_nxt--; 18462 /* 18463 * If we are starting a connection, send ECN setup SYN packet. If we 18464 * are on a retransmit, we may resend those bits a number of times 18465 * as per RFC 3168. 18466 */ 18467 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 18468 flags |= tcp_ecn_output_syn_sent(tp); 18469 } 18470 /* Also handle parallel SYN for ECN */ 18471 if (TCPS_HAVERCVDSYN(tp->t_state) && 18472 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 18473 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); 18474 if ((tp->t_state == TCPS_SYN_RECEIVED) && 18475 (tp->t_flags2 & TF2_ECN_SND_ECE)) 18476 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 18477 #ifdef INET6 18478 if (isipv6) { 18479 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 18480 ip6->ip6_flow |= htonl(ect << 20); 18481 } 18482 else 18483 #endif 18484 { 18485 ip->ip_tos &= ~IPTOS_ECN_MASK; 18486 ip->ip_tos |= ect; 18487 } 18488 } 18489 /* 18490 * If we are doing retransmissions, then snd_nxt will not reflect 18491 * the first unsent octet. For ACK only packets, we do not want the 18492 * sequence number of the retransmitted packet, we want the sequence 18493 * number of the next unsent octet. So, if there is no data (and no 18494 * SYN or FIN), use snd_max instead of snd_nxt when filling in 18495 * ti_seq. But if we are in persist state, snd_max might reflect 18496 * one byte beyond the right edge of the window, so use snd_nxt in 18497 * that case, since we know we aren't doing a retransmission. 18498 * (retransmit and persist are mutually exclusive...) 18499 */ 18500 if (sack_rxmit == 0) { 18501 if (len || (flags & (TH_SYN | TH_FIN))) { 18502 th->th_seq = htonl(tp->snd_nxt); 18503 rack_seq = tp->snd_nxt; 18504 } else { 18505 th->th_seq = htonl(tp->snd_max); 18506 rack_seq = tp->snd_max; 18507 } 18508 } else { 18509 th->th_seq = htonl(rsm->r_start); 18510 rack_seq = rsm->r_start; 18511 } 18512 th->th_ack = htonl(tp->rcv_nxt); 18513 tcp_set_flags(th, flags); 18514 /* 18515 * Calculate receive window. Don't shrink window, but avoid silly 18516 * window syndrome. 18517 * If a RST segment is sent, advertise a window of zero. 18518 */ 18519 if (flags & TH_RST) { 18520 recwin = 0; 18521 } else { 18522 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 18523 recwin < (long)segsiz) { 18524 recwin = 0; 18525 } 18526 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 18527 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 18528 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 18529 } 18530 18531 /* 18532 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 18533 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 18534 * handled in syncache. 18535 */ 18536 if (flags & TH_SYN) 18537 th->th_win = htons((u_short) 18538 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 18539 else { 18540 /* Avoid shrinking window with window scaling. */ 18541 recwin = roundup2(recwin, 1 << tp->rcv_scale); 18542 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 18543 } 18544 /* 18545 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 18546 * window. This may cause the remote transmitter to stall. 
This 18547 * flag tells soreceive() to disable delayed acknowledgements when 18548 * draining the buffer. This can occur if the receiver is 18549 * attempting to read more data than can be buffered prior to 18550 * transmitting on the connection. 18551 */ 18552 if (th->th_win == 0) { 18553 tp->t_sndzerowin++; 18554 tp->t_flags |= TF_RXWIN0SENT; 18555 } else 18556 tp->t_flags &= ~TF_RXWIN0SENT; 18557 tp->snd_up = tp->snd_una; /* drag it along, it's deprecated */ 18558 /* Are we using the fsb? If so, copy the template data to the mbuf. */ 18559 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 18560 uint8_t *cpto; 18561 18562 cpto = mtod(m, uint8_t *); 18563 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 18564 /* 18565 * We have just copied in: 18566 * IP/IP6 18567 * <optional udphdr> 18568 * tcphdr (no options) 18569 * 18570 * We need to grab the correct pointers into the mbuf 18571 * for both the tcp header, and possibly the udp header (if tunneling). 18572 * We do this by using the offset in the copy buffer and adding it 18573 * to the mbuf base pointer (cpto). 18574 */ 18575 #ifdef INET6 18576 if (isipv6) 18577 ip6 = mtod(m, struct ip6_hdr *); 18578 else 18579 #endif /* INET6 */ 18580 ip = mtod(m, struct ip *); 18581 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 18582 /* If we have a udp header, set it into the mbuf as well */ 18583 if (udp) 18584 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 18585 } 18586 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18587 if (to.to_flags & TOF_SIGNATURE) { 18588 /* 18589 * Calculate MD5 signature and put it into the place 18590 * determined before. 18591 * NOTE: since TCP options buffer doesn't point into 18592 * mbuf's data, calculate offset and use it. 18593 */ 18594 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 18595 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 18596 /* 18597 * Do not send segment if the calculation of MD5 18598 * digest has failed. 18599 */ 18600 goto out; 18601 } 18602 } 18603 #endif 18604 if (optlen) { 18605 bcopy(opt, th + 1, optlen); 18606 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 18607 } 18608 /* 18609 * Put TCP length in extended header, and then checksum extended 18610 * header and data. 18611 */ 18612 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */ 18613 #ifdef INET6 18614 if (isipv6) { 18615 /* 18616 * ip6_plen need not be filled in now; it will be set 18617 * in ip6_output.
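 * Note: in the block just below, only the pseudo-header checksum is seeded
 * (TCP, or UDP when tunneling over t_port) and the CSUM_* flags are set so
 * that the remainder of the checksum is computed at output time, possibly
 * by offloading hardware.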
18618 */ 18619 if (tp->t_port) { 18620 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 18621 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18622 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 18623 th->th_sum = htons(0); 18624 UDPSTAT_INC(udps_opackets); 18625 } else { 18626 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 18627 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18628 th->th_sum = in6_cksum_pseudo(ip6, 18629 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 18630 0); 18631 } 18632 } 18633 #endif 18634 #if defined(INET6) && defined(INET) 18635 else 18636 #endif 18637 #ifdef INET 18638 { 18639 if (tp->t_port) { 18640 m->m_pkthdr.csum_flags = CSUM_UDP; 18641 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18642 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 18643 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 18644 th->th_sum = htons(0); 18645 UDPSTAT_INC(udps_opackets); 18646 } else { 18647 m->m_pkthdr.csum_flags = CSUM_TCP; 18648 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18649 th->th_sum = in_pseudo(ip->ip_src.s_addr, 18650 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 18651 IPPROTO_TCP + len + optlen)); 18652 } 18653 /* IP version must be set here for ipv4/ipv6 checking later */ 18654 KASSERT(ip->ip_v == IPVERSION, 18655 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 18656 } 18657 #endif 18658 /* 18659 * Enable TSO and specify the size of the segments. The TCP pseudo 18660 * header checksum is always provided. XXX: Fixme: This is currently 18661 * not the case for IPv6. 18662 */ 18663 if (tso) { 18664 KASSERT(len > tp->t_maxseg - optlen, 18665 ("%s: len <= tso_segsz", __func__)); 18666 m->m_pkthdr.csum_flags |= CSUM_TSO; 18667 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 18668 } 18669 KASSERT(len + hdrlen == m_length(m, NULL), 18670 ("%s: mbuf chain different than expected: %d + %u != %u", 18671 __func__, len, hdrlen, m_length(m, NULL))); 18672 18673 #ifdef TCP_HHOOK 18674 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 18675 hhook_run_tcp_est_out(tp, th, &to, len, tso); 18676 #endif 18677 /* We're getting ready to send; log now. 
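 * When logging is enabled for the connection, the block below fills a
 * tcp_log_stackspecific record (PRR send count, pacing min/max segment
 * sizes, early/late aggregates, the current b/w estimate, and a
 * TLP/retransmit marker in flex8) and emits a TCP_LOG_OUT event.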
*/ 18678 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 18679 union tcp_log_stackspecific log; 18680 18681 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18682 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 18683 if (rack->rack_no_prr) 18684 log.u_bbr.flex1 = 0; 18685 else 18686 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 18687 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 18688 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 18689 log.u_bbr.flex4 = orig_len; 18690 /* Save off the early/late values */ 18691 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18692 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 18693 log.u_bbr.bw_inuse = rack_get_bw(rack); 18694 log.u_bbr.flex8 = 0; 18695 if (rsm) { 18696 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 18697 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 18698 counter_u64_add(rack_collapsed_win_rxt, 1); 18699 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 18700 } 18701 if (doing_tlp) 18702 log.u_bbr.flex8 = 2; 18703 else 18704 log.u_bbr.flex8 = 1; 18705 } else { 18706 if (doing_tlp) 18707 log.u_bbr.flex8 = 3; 18708 else 18709 log.u_bbr.flex8 = 0; 18710 } 18711 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 18712 log.u_bbr.flex7 = mark; 18713 log.u_bbr.flex7 <<= 8; 18714 log.u_bbr.flex7 |= pass; 18715 log.u_bbr.pkts_out = tp->t_maxseg; 18716 log.u_bbr.timeStamp = cts; 18717 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18718 log.u_bbr.lt_epoch = cwnd_to_use; 18719 log.u_bbr.delivered = sendalot; 18720 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 18721 len, &log, false, NULL, NULL, 0, &tv); 18722 } else 18723 lgb = NULL; 18724 18725 /* 18726 * Fill in IP length and desired time to live and send to IP level. 18727 * There should be a better way to handle ttl and tos; we could keep 18728 * them in the template, but need a way to checksum without them. 18729 */ 18730 /* 18731 * m->m_pkthdr.len should have been set before cksum calcuration, 18732 * because in6_cksum() need it. 18733 */ 18734 #ifdef INET6 18735 if (isipv6) { 18736 /* 18737 * we separately set hoplimit for every segment, since the 18738 * user might want to change the value via setsockopt. Also, 18739 * desired default hop limit might be changed via Neighbor 18740 * Discovery. 18741 */ 18742 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 18743 18744 /* 18745 * Set the packet size here for the benefit of DTrace 18746 * probes. ip6_output() will set it properly; it's supposed 18747 * to include the option header lengths as well. 18748 */ 18749 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 18750 18751 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 18752 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18753 else 18754 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18755 18756 if (tp->t_state == TCPS_SYN_SENT) 18757 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 18758 18759 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 18760 /* TODO: IPv6 IP6TOS_ECT bit on */ 18761 error = ip6_output(m, 18762 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 18763 inp->in6p_outputopts, 18764 #else 18765 NULL, 18766 #endif 18767 &inp->inp_route6, 18768 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 18769 NULL, NULL, inp); 18770 18771 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 18772 mtu = inp->inp_route6.ro_nh->nh_mtu; 18773 } 18774 #endif /* INET6 */ 18775 #if defined(INET) && defined(INET6) 18776 else 18777 #endif 18778 #ifdef INET 18779 { 18780 ip->ip_len = htons(m->m_pkthdr.len); 18781 #ifdef INET6 18782 if (inp->inp_vflag & INP_IPV6PROTO) 18783 ip->ip_ttl = in6_selecthlim(inp, NULL); 18784 #endif /* INET6 */ 18785 rack->r_ctl.fsb.hoplimit = ip->ip_ttl; 18786 /* 18787 * If we do path MTU discovery, then we set DF on every 18788 * packet. This might not be the best thing to do according 18789 * to RFC3390 Section 2. However, the tcp hostcache mitigates 18790 * the problem so it affects only the first tcp connection 18791 * with a host. 18792 * 18793 * NB: Don't set DF on small MTU/MSS to have a safe 18794 * fallback. 18795 */ 18796 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 18797 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18798 if (tp->t_port == 0 || len < V_tcp_minmss) { 18799 ip->ip_off |= htons(IP_DF); 18800 } 18801 } else { 18802 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18803 } 18804 18805 if (tp->t_state == TCPS_SYN_SENT) 18806 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 18807 18808 TCP_PROBE5(send, NULL, tp, ip, tp, th); 18809 18810 error = ip_output(m, 18811 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 18812 inp->inp_options, 18813 #else 18814 NULL, 18815 #endif 18816 &inp->inp_route, 18817 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 18818 inp); 18819 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 18820 mtu = inp->inp_route.ro_nh->nh_mtu; 18821 } 18822 #endif /* INET */ 18823 18824 out: 18825 if (lgb) { 18826 lgb->tlb_errno = error; 18827 lgb = NULL; 18828 } 18829 /* 18830 * In transmit state, time the transmission and arrange for the 18831 * retransmit. In persist state, just set snd_max.
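 * On a successful send the code below also accounts the transmit, records
 * TLP state, notes whether the send was in slow start, congestion
 * avoidance or recovery for the goodput logic, and charges the bytes
 * against any outstanding PRR allowance.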
18832 */ 18833 if (error == 0) { 18834 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 18835 if (rsm && doing_tlp) { 18836 rack->rc_last_sent_tlp_past_cumack = 0; 18837 rack->rc_last_sent_tlp_seq_valid = 1; 18838 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 18839 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 18840 } 18841 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18842 if (rsm && (doing_tlp == 0)) { 18843 /* Set we retransmitted */ 18844 rack->rc_gp_saw_rec = 1; 18845 } else { 18846 if (cwnd_to_use > tp->snd_ssthresh) { 18847 /* Set we sent in CA */ 18848 rack->rc_gp_saw_ca = 1; 18849 } else { 18850 /* Set we sent in SS */ 18851 rack->rc_gp_saw_ss = 1; 18852 } 18853 } 18854 if (TCPS_HAVEESTABLISHED(tp->t_state) && 18855 (tp->t_flags & TF_SACK_PERMIT) && 18856 tp->rcv_numsacks > 0) 18857 tcp_clean_dsack_blocks(tp); 18858 tot_len_this_send += len; 18859 if (len == 0) 18860 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 18861 else if (len == 1) { 18862 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 18863 } else if (len > 1) { 18864 int idx; 18865 18866 idx = (len / segsiz) + 3; 18867 if (idx >= TCP_MSS_ACCT_ATIMER) 18868 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18869 else 18870 counter_u64_add(rack_out_size[idx], 1); 18871 } 18872 } 18873 if ((rack->rack_no_prr == 0) && 18874 sub_from_prr && 18875 (error == 0)) { 18876 if (rack->r_ctl.rc_prr_sndcnt >= len) 18877 rack->r_ctl.rc_prr_sndcnt -= len; 18878 else 18879 rack->r_ctl.rc_prr_sndcnt = 0; 18880 } 18881 sub_from_prr = 0; 18882 if (doing_tlp) { 18883 /* Make sure the TLP is added */ 18884 add_flag |= RACK_TLP; 18885 } else if (rsm) { 18886 /* If its a resend without TLP then it must not have the flag */ 18887 rsm->r_flags &= ~RACK_TLP; 18888 } 18889 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 18890 rack_to_usec_ts(&tv), 18891 rsm, add_flag, s_mb, s_moff, hw_tls); 18892 18893 18894 if ((error == 0) && 18895 (len > 0) && 18896 (tp->snd_una == tp->snd_max)) 18897 rack->r_ctl.rc_tlp_rxt_last_time = cts; 18898 { 18899 tcp_seq startseq = tp->snd_nxt; 18900 18901 /* Track our lost count */ 18902 if (rsm && (doing_tlp == 0)) 18903 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 18904 /* 18905 * Advance snd_nxt over sequence space of this segment. 18906 */ 18907 if (error) 18908 /* We don't log or do anything with errors */ 18909 goto nomore; 18910 if (doing_tlp == 0) { 18911 if (rsm == NULL) { 18912 /* 18913 * Not a retransmission of some 18914 * sort, new data is going out so 18915 * clear our TLP count and flag. 18916 */ 18917 rack->rc_tlp_in_progress = 0; 18918 rack->r_ctl.rc_tlp_cnt_out = 0; 18919 } 18920 } else { 18921 /* 18922 * We have just sent a TLP, mark that it is true 18923 * and make sure our in progress is set so we 18924 * continue to check the count. 18925 */ 18926 rack->rc_tlp_in_progress = 1; 18927 rack->r_ctl.rc_tlp_cnt_out++; 18928 } 18929 if (flags & (TH_SYN | TH_FIN)) { 18930 if (flags & TH_SYN) 18931 tp->snd_nxt++; 18932 if (flags & TH_FIN) { 18933 tp->snd_nxt++; 18934 tp->t_flags |= TF_SENTFIN; 18935 } 18936 } 18937 /* In the ENOBUFS case we do *not* update snd_max */ 18938 if (sack_rxmit) 18939 goto nomore; 18940 18941 tp->snd_nxt += len; 18942 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 18943 if (tp->snd_una == tp->snd_max) { 18944 /* 18945 * Update the time we just added data since 18946 * none was outstanding. 
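 * (t_acktime is re-stamped and a PROGRESS_START event is logged, so
 * progress tracking restarts from this transmission.)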
18947 */ 18948 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 18949 tp->t_acktime = ticks; 18950 } 18951 tp->snd_max = tp->snd_nxt; 18952 /* 18953 * Time this transmission if not a retransmission and 18954 * not currently timing anything. 18955 * This is only relevant in case of switching back to 18956 * the base stack. 18957 */ 18958 if (tp->t_rtttime == 0) { 18959 tp->t_rtttime = ticks; 18960 tp->t_rtseq = startseq; 18961 KMOD_TCPSTAT_INC(tcps_segstimed); 18962 } 18963 if (len && 18964 ((tp->t_flags & TF_GPUTINPROG) == 0)) 18965 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 18966 } 18967 /* 18968 * If we are doing FO we need to update the mbuf position and subtract 18969 * this happens when the peer sends us duplicate information and 18970 * we thus want to send a DSACK. 18971 * 18972 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 18973 * turned off? If not then we are going to echo multiple DSACK blocks 18974 * out (with the TSO), which we should not be doing. 18975 */ 18976 if (rack->r_fast_output && len) { 18977 if (rack->r_ctl.fsb.left_to_send > len) 18978 rack->r_ctl.fsb.left_to_send -= len; 18979 else 18980 rack->r_ctl.fsb.left_to_send = 0; 18981 if (rack->r_ctl.fsb.left_to_send < segsiz) 18982 rack->r_fast_output = 0; 18983 if (rack->r_fast_output) { 18984 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18985 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18986 } 18987 } 18988 } 18989 nomore: 18990 if (error) { 18991 rack->r_ctl.rc_agg_delayed = 0; 18992 rack->r_early = 0; 18993 rack->r_late = 0; 18994 rack->r_ctl.rc_agg_early = 0; 18995 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 18996 /* 18997 * Failures do not advance the seq counter above. For the 18998 * case of ENOBUFS we will fall out and retry in 1ms with 18999 * the hpts. Everything else will just have to retransmit 19000 * with the timer. 19001 * 19002 * In any case, we do not want to loop around for another 19003 * send without a good reason. 19004 */ 19005 sendalot = 0; 19006 switch (error) { 19007 case EPERM: 19008 tp->t_softerror = error; 19009 #ifdef TCP_ACCOUNTING 19010 crtsc = get_cyclecount(); 19011 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19012 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 19013 } 19014 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 19015 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19016 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 19017 } 19018 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 19019 sched_unpin(); 19020 #endif 19021 return (error); 19022 case ENOBUFS: 19023 /* 19024 * Pace us right away to retry in a some 19025 * time 19026 */ 19027 if (rack->r_ctl.crte != NULL) { 19028 rack_trace_point(rack, RACK_TP_HWENOBUF); 19029 } else 19030 rack_trace_point(rack, RACK_TP_ENOBUF); 19031 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 19032 if (rack->rc_enobuf < 0x7f) 19033 rack->rc_enobuf++; 19034 if (slot < (10 * HPTS_USEC_IN_MSEC)) 19035 slot = 10 * HPTS_USEC_IN_MSEC; 19036 if (rack->r_ctl.crte != NULL) { 19037 counter_u64_add(rack_saw_enobuf_hw, 1); 19038 tcp_rl_log_enobuf(rack->r_ctl.crte); 19039 } 19040 counter_u64_add(rack_saw_enobuf, 1); 19041 goto enobufs; 19042 case EMSGSIZE: 19043 /* 19044 * For some reason the interface we used initially 19045 * to send segments changed to another or lowered 19046 * its MTU. If TSO was active we either got an 19047 * interface without TSO capabilits or TSO was 19048 * turned off. 
If we obtained mtu from ip_output() 19049 * then update it and try again. 19050 */ 19051 if (tso) 19052 tp->t_flags &= ~TF_TSO; 19053 if (mtu != 0) { 19054 tcp_mss_update(tp, -1, mtu, NULL, NULL); 19055 goto again; 19056 } 19057 slot = 10 * HPTS_USEC_IN_MSEC; 19058 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 19059 #ifdef TCP_ACCOUNTING 19060 crtsc = get_cyclecount(); 19061 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19062 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 19063 } 19064 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 19065 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19066 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 19067 } 19068 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 19069 sched_unpin(); 19070 #endif 19071 return (error); 19072 case ENETUNREACH: 19073 counter_u64_add(rack_saw_enetunreach, 1); 19074 case EHOSTDOWN: 19075 case EHOSTUNREACH: 19076 case ENETDOWN: 19077 if (TCPS_HAVERCVDSYN(tp->t_state)) { 19078 tp->t_softerror = error; 19079 } 19080 /* FALLTHROUGH */ 19081 default: 19082 slot = 10 * HPTS_USEC_IN_MSEC; 19083 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 19084 #ifdef TCP_ACCOUNTING 19085 crtsc = get_cyclecount(); 19086 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19087 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 19088 } 19089 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 19090 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19091 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 19092 } 19093 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 19094 sched_unpin(); 19095 #endif 19096 return (error); 19097 } 19098 } else { 19099 rack->rc_enobuf = 0; 19100 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 19101 rack->r_ctl.retran_during_recovery += len; 19102 } 19103 KMOD_TCPSTAT_INC(tcps_sndtotal); 19104 19105 /* 19106 * Data sent (as far as we can tell). If this advertises a larger 19107 * window than any other segment, then remember the size of the 19108 * advertised window. Any pending ACK has now been sent. 19109 */ 19110 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 19111 tp->rcv_adv = tp->rcv_nxt + recwin; 19112 19113 tp->last_ack_sent = tp->rcv_nxt; 19114 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19115 enobufs: 19116 if (sendalot) { 19117 /* Do we need to turn off sendalot? */ 19118 if (rack->r_ctl.rc_pace_max_segs && 19119 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 19120 /* We hit our max. */ 19121 sendalot = 0; 19122 } else if ((rack->rc_user_set_max_segs) && 19123 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 19124 /* We hit the user defined max */ 19125 sendalot = 0; 19126 } 19127 } 19128 if ((error == 0) && (flags & TH_FIN)) 19129 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 19130 if (flags & TH_RST) { 19131 /* 19132 * We don't send again after sending a RST. 19133 */ 19134 slot = 0; 19135 sendalot = 0; 19136 if (error == 0) 19137 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 19138 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 19139 /* 19140 * Get our pacing rate, if an error 19141 * occurred in sending (ENOBUF) we would 19142 * hit the else if with slot preset. Other 19143 * errors return. 19144 */ 19145 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 19146 } 19147 if (rsm && 19148 (rsm->r_flags & RACK_HAS_SYN) == 0 && 19149 rack->use_rack_rr) { 19150 /* Its a retransmit and we use the rack cheat? 
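 * That is, with use_rack_rr a retransmission that has no pacing slot, is
 * not always-paced, or is configured for the old behaviour simply waits
 * the minimum rack timeout (rc_min_to) instead of a computed pacing delay.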
*/ 19151 if ((slot == 0) || 19152 (rack->rc_always_pace == 0) || 19153 (rack->r_rr_config == 1)) { 19154 /* 19155 * We have no pacing set or we 19156 * are using old-style rack or 19157 * we are overridden to use the old 1ms pacing. 19158 */ 19159 slot = rack->r_ctl.rc_min_to; 19160 } 19161 } 19162 /* We have sent clear the flag */ 19163 rack->r_ent_rec_ns = 0; 19164 if (rack->r_must_retran) { 19165 if (rsm) { 19166 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 19167 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 19168 /* 19169 * We have retransmitted all. 19170 */ 19171 rack->r_must_retran = 0; 19172 rack->r_ctl.rc_out_at_rto = 0; 19173 } 19174 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 19175 /* 19176 * Sending new data will also kill 19177 * the loop. 19178 */ 19179 rack->r_must_retran = 0; 19180 rack->r_ctl.rc_out_at_rto = 0; 19181 } 19182 } 19183 rack->r_ctl.fsb.recwin = recwin; 19184 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 19185 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 19186 /* 19187 * We hit an RTO and now have past snd_max at the RTO 19188 * clear all the WAS flags. 19189 */ 19190 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 19191 } 19192 if (slot) { 19193 /* set the rack tcb into the slot N */ 19194 if ((error == 0) && 19195 rack_use_rfo && 19196 ((flags & (TH_SYN|TH_FIN)) == 0) && 19197 (rsm == NULL) && 19198 (tp->snd_nxt == tp->snd_max) && 19199 (ipoptlen == 0) && 19200 (tp->rcv_numsacks == 0) && 19201 rack->r_fsb_inited && 19202 TCPS_HAVEESTABLISHED(tp->t_state) && 19203 (rack->r_must_retran == 0) && 19204 ((tp->t_flags & TF_NEEDFIN) == 0) && 19205 (len > 0) && (orig_len > 0) && 19206 (orig_len > len) && 19207 ((orig_len - len) >= segsiz) && 19208 ((optlen == 0) || 19209 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 19210 /* We can send at least one more MSS using our fsb */ 19211 19212 rack->r_fast_output = 1; 19213 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19214 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19215 rack->r_ctl.fsb.tcp_flags = flags; 19216 rack->r_ctl.fsb.left_to_send = orig_len - len; 19217 if (hw_tls) 19218 rack->r_ctl.fsb.hw_tls = 1; 19219 else 19220 rack->r_ctl.fsb.hw_tls = 0; 19221 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19222 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19223 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19224 (tp->snd_max - tp->snd_una))); 19225 if (rack->r_ctl.fsb.left_to_send < segsiz) 19226 rack->r_fast_output = 0; 19227 else { 19228 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19229 rack->r_ctl.fsb.rfo_apply_push = 1; 19230 else 19231 rack->r_ctl.fsb.rfo_apply_push = 0; 19232 } 19233 } else 19234 rack->r_fast_output = 0; 19235 rack_log_fsb(rack, tp, so, flags, 19236 ipoptlen, orig_len, len, error, 19237 (rsm == NULL), optlen, __LINE__, 2); 19238 } else if (sendalot) { 19239 int ret; 19240 19241 sack_rxmit = 0; 19242 if ((error == 0) && 19243 rack_use_rfo && 19244 ((flags & (TH_SYN|TH_FIN)) == 0) && 19245 (rsm == NULL) && 19246 (ipoptlen == 0) && 19247 (tp->rcv_numsacks == 0) && 19248 (tp->snd_nxt == tp->snd_max) && 19249 (rack->r_must_retran == 0) && 19250 rack->r_fsb_inited && 19251 TCPS_HAVEESTABLISHED(tp->t_state) && 19252 ((tp->t_flags & TF_NEEDFIN) == 0) && 19253 (len > 0) && (orig_len > 0) && 19254 (orig_len > len) && 19255 ((orig_len - len) >= segsiz) && 19256 ((optlen == 0) || 19257 ((optlen == 
TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 19258 /* we can use fast_output for more */ 19259 19260 rack->r_fast_output = 1; 19261 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19262 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19263 rack->r_ctl.fsb.tcp_flags = flags; 19264 rack->r_ctl.fsb.left_to_send = orig_len - len; 19265 if (hw_tls) 19266 rack->r_ctl.fsb.hw_tls = 1; 19267 else 19268 rack->r_ctl.fsb.hw_tls = 0; 19269 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19270 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19271 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19272 (tp->snd_max - tp->snd_una))); 19273 if (rack->r_ctl.fsb.left_to_send < segsiz) { 19274 rack->r_fast_output = 0; 19275 } 19276 if (rack->r_fast_output) { 19277 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19278 rack->r_ctl.fsb.rfo_apply_push = 1; 19279 else 19280 rack->r_ctl.fsb.rfo_apply_push = 0; 19281 rack_log_fsb(rack, tp, so, flags, 19282 ipoptlen, orig_len, len, error, 19283 (rsm == NULL), optlen, __LINE__, 3); 19284 error = 0; 19285 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 19286 if (ret >= 0) 19287 return (ret); 19288 else if (error) 19289 goto nomore; 19290 19291 } 19292 } 19293 goto again; 19294 } 19295 /* Assure when we leave that snd_nxt will point to top */ 19296 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 19297 tp->snd_nxt = tp->snd_max; 19298 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 19299 #ifdef TCP_ACCOUNTING 19300 crtsc = get_cyclecount() - ts_val; 19301 if (tot_len_this_send) { 19302 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19303 tp->tcp_cnt_counters[SND_OUT_DATA]++; 19304 } 19305 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 19306 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19307 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 19308 } 19309 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc); 19310 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19311 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 19312 } 19313 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz)); 19314 } else { 19315 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19316 tp->tcp_cnt_counters[SND_OUT_ACK]++; 19317 } 19318 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1); 19319 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19320 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 19321 } 19322 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc); 19323 } 19324 sched_unpin(); 19325 #endif 19326 if (error == ENOBUFS) 19327 error = 0; 19328 return (error); 19329 } 19330 19331 static void 19332 rack_update_seg(struct tcp_rack *rack) 19333 { 19334 uint32_t orig_val; 19335 19336 orig_val = rack->r_ctl.rc_pace_max_segs; 19337 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 19338 if (orig_val != rack->r_ctl.rc_pace_max_segs) 19339 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 19340 } 19341 19342 static void 19343 rack_mtu_change(struct tcpcb *tp) 19344 { 19345 /* 19346 * The MSS may have changed 19347 */ 19348 struct tcp_rack *rack; 19349 struct rack_sendmap *rsm; 19350 19351 rack = (struct tcp_rack *)tp->t_fb_ptr; 19352 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 19353 /* 19354 * The MTU has changed we need to resend everything 19355 * since all we have sent is lost. We first fix 19356 * up the mtu though. 
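 * Below we reset the pacing segment sizes, run the retransmit-timeout
 * style recovery (rack_remxt_tmr), flag r_must_retran, and mark every
 * outstanding rsm RACK_MUST_RXT so it will be resent.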
19357 */ 19358 rack_set_pace_segments(tp, rack, __LINE__, NULL); 19359 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 19360 rack_remxt_tmr(tp); 19361 rack->r_fast_output = 0; 19362 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 19363 rack->r_ctl.rc_sacked); 19364 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 19365 rack->r_must_retran = 1; 19366 /* Mark all inflight to needing to be rxt'd */ 19367 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 19368 rsm->r_flags |= RACK_MUST_RXT; 19369 } 19370 } 19371 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 19372 /* We don't use snd_nxt to retransmit */ 19373 tp->snd_nxt = tp->snd_max; 19374 } 19375 19376 static int 19377 rack_set_profile(struct tcp_rack *rack, int prof) 19378 { 19379 int err = EINVAL; 19380 if (prof == 1) { 19381 /* pace_always=1 */ 19382 if (rack->rc_always_pace == 0) { 19383 if (tcp_can_enable_pacing() == 0) 19384 return (EBUSY); 19385 } 19386 rack->rc_always_pace = 1; 19387 if (rack->use_fixed_rate || rack->gp_ready) 19388 rack_set_cc_pacing(rack); 19389 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19390 rack->rack_attempt_hdwr_pace = 0; 19391 /* cmpack=1 */ 19392 if (rack_use_cmp_acks) 19393 rack->r_use_cmp_ack = 1; 19394 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 19395 rack->r_use_cmp_ack) 19396 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19397 /* scwnd=1 */ 19398 rack->rack_enable_scwnd = 1; 19399 /* dynamic=100 */ 19400 rack->rc_gp_dyn_mul = 1; 19401 /* gp_inc_ca */ 19402 rack->r_ctl.rack_per_of_gp_ca = 100; 19403 /* rrr_conf=3 */ 19404 rack->r_rr_config = 3; 19405 /* npush=2 */ 19406 rack->r_ctl.rc_no_push_at_mrtt = 2; 19407 /* fillcw=1 */ 19408 rack->rc_pace_to_cwnd = 1; 19409 rack->rc_pace_fill_if_rttin_range = 0; 19410 rack->rtt_limit_mul = 0; 19411 /* noprr=1 */ 19412 rack->rack_no_prr = 1; 19413 /* lscwnd=1 */ 19414 rack->r_limit_scw = 1; 19415 /* gp_inc_rec */ 19416 rack->r_ctl.rack_per_of_gp_rec = 90; 19417 err = 0; 19418 19419 } else if (prof == 3) { 19420 /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */ 19421 /* pace_always=1 */ 19422 if (rack->rc_always_pace == 0) { 19423 if (tcp_can_enable_pacing() == 0) 19424 return (EBUSY); 19425 } 19426 rack->rc_always_pace = 1; 19427 if (rack->use_fixed_rate || rack->gp_ready) 19428 rack_set_cc_pacing(rack); 19429 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19430 rack->rack_attempt_hdwr_pace = 0; 19431 /* cmpack=1 */ 19432 if (rack_use_cmp_acks) 19433 rack->r_use_cmp_ack = 1; 19434 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 19435 rack->r_use_cmp_ack) 19436 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19437 /* scwnd=1 */ 19438 rack->rack_enable_scwnd = 1; 19439 /* dynamic=100 */ 19440 rack->rc_gp_dyn_mul = 1; 19441 /* gp_inc_ca */ 19442 rack->r_ctl.rack_per_of_gp_ca = 100; 19443 /* rrr_conf=3 */ 19444 rack->r_rr_config = 3; 19445 /* npush=2 */ 19446 rack->r_ctl.rc_no_push_at_mrtt = 2; 19447 /* fillcw=2 */ 19448 rack->rc_pace_to_cwnd = 1; 19449 rack->r_fill_less_agg = 1; 19450 rack->rc_pace_fill_if_rttin_range = 0; 19451 rack->rtt_limit_mul = 0; 19452 /* noprr=1 */ 19453 rack->rack_no_prr = 1; 19454 /* lscwnd=1 */ 19455 rack->r_limit_scw = 1; 19456 /* gp_inc_rec */ 19457 rack->r_ctl.rack_per_of_gp_rec = 90; 19458 err = 0; 19459 19460 19461 } else if (prof == 2) { 19462 /* cmpack=1 */ 19463 if (rack->rc_always_pace == 0) { 19464 if (tcp_can_enable_pacing() == 0) 19465 return (EBUSY); 19466 } 19467 rack->rc_always_pace = 1; 19468 if (rack->use_fixed_rate || rack->gp_ready) 19469 
rack_set_cc_pacing(rack); 19470 rack->r_use_cmp_ack = 1; 19471 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 19472 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19473 /* pace_always=1 */ 19474 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19475 /* scwnd=1 */ 19476 rack->rack_enable_scwnd = 1; 19477 /* dynamic=100 */ 19478 rack->rc_gp_dyn_mul = 1; 19479 rack->r_ctl.rack_per_of_gp_ca = 100; 19480 /* rrr_conf=3 */ 19481 rack->r_rr_config = 3; 19482 /* npush=2 */ 19483 rack->r_ctl.rc_no_push_at_mrtt = 2; 19484 /* fillcw=1 */ 19485 rack->rc_pace_to_cwnd = 1; 19486 rack->rc_pace_fill_if_rttin_range = 0; 19487 rack->rtt_limit_mul = 0; 19488 /* noprr=1 */ 19489 rack->rack_no_prr = 1; 19490 /* lscwnd=0 */ 19491 rack->r_limit_scw = 0; 19492 err = 0; 19493 } else if (prof == 0) { 19494 /* This changes things back to the default settings */ 19495 err = 0; 19496 if (rack->rc_always_pace) { 19497 tcp_decrement_paced_conn(); 19498 rack_undo_cc_pacing(rack); 19499 rack->rc_always_pace = 0; 19500 } 19501 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 19502 rack->rc_always_pace = 1; 19503 if (rack->use_fixed_rate || rack->gp_ready) 19504 rack_set_cc_pacing(rack); 19505 } else 19506 rack->rc_always_pace = 0; 19507 if (rack_dsack_std_based & 0x1) { 19508 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 19509 rack->rc_rack_tmr_std_based = 1; 19510 } 19511 if (rack_dsack_std_based & 0x2) { 19512 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 19513 rack->rc_rack_use_dsack = 1; 19514 } 19515 if (rack_use_cmp_acks) 19516 rack->r_use_cmp_ack = 1; 19517 else 19518 rack->r_use_cmp_ack = 0; 19519 if (rack_disable_prr) 19520 rack->rack_no_prr = 1; 19521 else 19522 rack->rack_no_prr = 0; 19523 if (rack_gp_no_rec_chg) 19524 rack->rc_gp_no_rec_chg = 1; 19525 else 19526 rack->rc_gp_no_rec_chg = 0; 19527 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 19528 rack->r_mbuf_queue = 1; 19529 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 19530 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19531 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19532 } else { 19533 rack->r_mbuf_queue = 0; 19534 rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19535 } 19536 if (rack_enable_shared_cwnd) 19537 rack->rack_enable_scwnd = 1; 19538 else 19539 rack->rack_enable_scwnd = 0; 19540 if (rack_do_dyn_mul) { 19541 /* When dynamic adjustment is on CA needs to start at 100% */ 19542 rack->rc_gp_dyn_mul = 1; 19543 if (rack_do_dyn_mul >= 100) 19544 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 19545 } else { 19546 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 19547 rack->rc_gp_dyn_mul = 0; 19548 } 19549 rack->r_rr_config = 0; 19550 rack->r_ctl.rc_no_push_at_mrtt = 0; 19551 rack->rc_pace_to_cwnd = 0; 19552 rack->rc_pace_fill_if_rttin_range = 0; 19553 rack->rtt_limit_mul = 0; 19554 19555 if (rack_enable_hw_pacing) 19556 rack->rack_hdw_pace_ena = 1; 19557 else 19558 rack->rack_hdw_pace_ena = 0; 19559 if (rack_disable_prr) 19560 rack->rack_no_prr = 1; 19561 else 19562 rack->rack_no_prr = 0; 19563 if (rack_limits_scwnd) 19564 rack->r_limit_scw = 1; 19565 else 19566 rack->r_limit_scw = 0; 19567 err = 0; 19568 } 19569 return (err); 19570 } 19571 19572 static int 19573 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 19574 { 19575 struct deferred_opt_list *dol; 19576 19577 dol = malloc(sizeof(struct deferred_opt_list), 19578 M_TCPFSB, M_NOWAIT|M_ZERO); 19579 if (dol == NULL) { 19580 /* 19581 * No space yikes -- fail out.. 
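 * Returning 0 tells the caller the option could not be queued; a
 * successful deferral returns 1.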
19582 */ 19583 return (0); 19584 } 19585 dol->optname = sopt_name; 19586 dol->optval = loptval; 19587 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 19588 return (1); 19589 } 19590 19591 static int 19592 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 19593 uint32_t optval, uint64_t loptval) 19594 { 19595 struct epoch_tracker et; 19596 struct sockopt sopt; 19597 struct cc_newreno_opts opt; 19598 struct inpcb *inp = tptoinpcb(tp); 19599 uint64_t val; 19600 int error = 0; 19601 uint16_t ca, ss; 19602 19603 switch (sopt_name) { 19604 19605 case TCP_RACK_DSACK_OPT: 19606 RACK_OPTS_INC(tcp_rack_dsack_opt); 19607 if (optval & 0x1) { 19608 rack->rc_rack_tmr_std_based = 1; 19609 } else { 19610 rack->rc_rack_tmr_std_based = 0; 19611 } 19612 if (optval & 0x2) { 19613 rack->rc_rack_use_dsack = 1; 19614 } else { 19615 rack->rc_rack_use_dsack = 0; 19616 } 19617 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 19618 break; 19619 case TCP_RACK_PACING_BETA: 19620 RACK_OPTS_INC(tcp_rack_beta); 19621 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 19622 /* This only works for newreno. */ 19623 error = EINVAL; 19624 break; 19625 } 19626 if (rack->rc_pacing_cc_set) { 19627 /* 19628 * Set them into the real CC module 19629 * whats in the rack pcb is the old values 19630 * to be used on restoral/ 19631 */ 19632 sopt.sopt_dir = SOPT_SET; 19633 opt.name = CC_NEWRENO_BETA; 19634 opt.val = optval; 19635 if (CC_ALGO(tp)->ctl_output != NULL) 19636 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 19637 else { 19638 error = ENOENT; 19639 break; 19640 } 19641 } else { 19642 /* 19643 * Not pacing yet so set it into our local 19644 * rack pcb storage. 19645 */ 19646 rack->r_ctl.rc_saved_beta.beta = optval; 19647 } 19648 break; 19649 case TCP_RACK_TIMER_SLOP: 19650 RACK_OPTS_INC(tcp_rack_timer_slop); 19651 rack->r_ctl.timer_slop = optval; 19652 if (rack->rc_tp->t_srtt) { 19653 /* 19654 * If we have an SRTT lets update t_rxtcur 19655 * to have the new slop. 19656 */ 19657 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 19658 rack_rto_min, rack_rto_max, 19659 rack->r_ctl.timer_slop); 19660 } 19661 break; 19662 case TCP_RACK_PACING_BETA_ECN: 19663 RACK_OPTS_INC(tcp_rack_beta_ecn); 19664 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 19665 /* This only works for newreno. */ 19666 error = EINVAL; 19667 break; 19668 } 19669 if (rack->rc_pacing_cc_set) { 19670 /* 19671 * Set them into the real CC module 19672 * whats in the rack pcb is the old values 19673 * to be used on restoral/ 19674 */ 19675 sopt.sopt_dir = SOPT_SET; 19676 opt.name = CC_NEWRENO_BETA_ECN; 19677 opt.val = optval; 19678 if (CC_ALGO(tp)->ctl_output != NULL) 19679 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 19680 else 19681 error = ENOENT; 19682 } else { 19683 /* 19684 * Not pacing yet so set it into our local 19685 * rack pcb storage. 
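 * Only rc_saved_beta.beta_ecn (and its enable flag) is updated here; the
 * CC module itself is not touched on this path.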
19686 */ 19687 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 19688 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED; 19689 } 19690 break; 19691 case TCP_DEFER_OPTIONS: 19692 RACK_OPTS_INC(tcp_defer_opt); 19693 if (optval) { 19694 if (rack->gp_ready) { 19695 /* Too late */ 19696 error = EINVAL; 19697 break; 19698 } 19699 rack->defer_options = 1; 19700 } else 19701 rack->defer_options = 0; 19702 break; 19703 case TCP_RACK_MEASURE_CNT: 19704 RACK_OPTS_INC(tcp_rack_measure_cnt); 19705 if (optval && (optval <= 0xff)) { 19706 rack->r_ctl.req_measurements = optval; 19707 } else 19708 error = EINVAL; 19709 break; 19710 case TCP_REC_ABC_VAL: 19711 RACK_OPTS_INC(tcp_rec_abc_val); 19712 if (optval > 0) 19713 rack->r_use_labc_for_rec = 1; 19714 else 19715 rack->r_use_labc_for_rec = 0; 19716 break; 19717 case TCP_RACK_ABC_VAL: 19718 RACK_OPTS_INC(tcp_rack_abc_val); 19719 if ((optval > 0) && (optval < 255)) 19720 rack->rc_labc = optval; 19721 else 19722 error = EINVAL; 19723 break; 19724 case TCP_HDWR_UP_ONLY: 19725 RACK_OPTS_INC(tcp_pacing_up_only); 19726 if (optval) 19727 rack->r_up_only = 1; 19728 else 19729 rack->r_up_only = 0; 19730 break; 19731 case TCP_PACING_RATE_CAP: 19732 RACK_OPTS_INC(tcp_pacing_rate_cap); 19733 rack->r_ctl.bw_rate_cap = loptval; 19734 break; 19735 case TCP_RACK_PROFILE: 19736 RACK_OPTS_INC(tcp_profile); 19737 error = rack_set_profile(rack, optval); 19738 break; 19739 case TCP_USE_CMP_ACKS: 19740 RACK_OPTS_INC(tcp_use_cmp_acks); 19741 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) { 19742 /* You can't turn it off once its on! */ 19743 error = EINVAL; 19744 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 19745 rack->r_use_cmp_ack = 1; 19746 rack->r_mbuf_queue = 1; 19747 inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19748 } 19749 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 19750 inp->inp_flags2 |= INP_MBUF_ACKCMP; 19751 break; 19752 case TCP_SHARED_CWND_TIME_LIMIT: 19753 RACK_OPTS_INC(tcp_lscwnd); 19754 if (optval) 19755 rack->r_limit_scw = 1; 19756 else 19757 rack->r_limit_scw = 0; 19758 break; 19759 case TCP_RACK_PACE_TO_FILL: 19760 RACK_OPTS_INC(tcp_fillcw); 19761 if (optval == 0) 19762 rack->rc_pace_to_cwnd = 0; 19763 else { 19764 rack->rc_pace_to_cwnd = 1; 19765 if (optval > 1) 19766 rack->r_fill_less_agg = 1; 19767 } 19768 if ((optval >= rack_gp_rtt_maxmul) && 19769 rack_gp_rtt_maxmul && 19770 (optval < 0xf)) { 19771 rack->rc_pace_fill_if_rttin_range = 1; 19772 rack->rtt_limit_mul = optval; 19773 } else { 19774 rack->rc_pace_fill_if_rttin_range = 0; 19775 rack->rtt_limit_mul = 0; 19776 } 19777 break; 19778 case TCP_RACK_NO_PUSH_AT_MAX: 19779 RACK_OPTS_INC(tcp_npush); 19780 if (optval == 0) 19781 rack->r_ctl.rc_no_push_at_mrtt = 0; 19782 else if (optval < 0xff) 19783 rack->r_ctl.rc_no_push_at_mrtt = optval; 19784 else 19785 error = EINVAL; 19786 break; 19787 case TCP_SHARED_CWND_ENABLE: 19788 RACK_OPTS_INC(tcp_rack_scwnd); 19789 if (optval == 0) 19790 rack->rack_enable_scwnd = 0; 19791 else 19792 rack->rack_enable_scwnd = 1; 19793 break; 19794 case TCP_RACK_MBUF_QUEUE: 19795 /* Now do we use the LRO mbuf-queue feature */ 19796 RACK_OPTS_INC(tcp_rack_mbufq); 19797 if (optval || rack->r_use_cmp_ack) 19798 rack->r_mbuf_queue = 1; 19799 else 19800 rack->r_mbuf_queue = 0; 19801 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19802 inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19803 else 19804 inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19805 break; 19806 case TCP_RACK_NONRXT_CFG_RATE: 19807 
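		/* Bool: let recovery non-retransmit sends use the configured rate (sets rack_rec_nonrxt_use_cr) */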
RACK_OPTS_INC(tcp_rack_cfg_rate); 19808 if (optval == 0) 19809 rack->rack_rec_nonrxt_use_cr = 0; 19810 else 19811 rack->rack_rec_nonrxt_use_cr = 1; 19812 break; 19813 case TCP_NO_PRR: 19814 RACK_OPTS_INC(tcp_rack_noprr); 19815 if (optval == 0) 19816 rack->rack_no_prr = 0; 19817 else if (optval == 1) 19818 rack->rack_no_prr = 1; 19819 else if (optval == 2) 19820 rack->no_prr_addback = 1; 19821 else 19822 error = EINVAL; 19823 break; 19824 case TCP_TIMELY_DYN_ADJ: 19825 RACK_OPTS_INC(tcp_timely_dyn); 19826 if (optval == 0) 19827 rack->rc_gp_dyn_mul = 0; 19828 else { 19829 rack->rc_gp_dyn_mul = 1; 19830 if (optval >= 100) { 19831 /* 19832 * If the user sets something 100 or more 19833 * its the gp_ca value. 19834 */ 19835 rack->r_ctl.rack_per_of_gp_ca = optval; 19836 } 19837 } 19838 break; 19839 case TCP_RACK_DO_DETECTION: 19840 RACK_OPTS_INC(tcp_rack_do_detection); 19841 if (optval == 0) 19842 rack->do_detection = 0; 19843 else 19844 rack->do_detection = 1; 19845 break; 19846 case TCP_RACK_TLP_USE: 19847 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 19848 error = EINVAL; 19849 break; 19850 } 19851 RACK_OPTS_INC(tcp_tlp_use); 19852 rack->rack_tlp_threshold_use = optval; 19853 break; 19854 case TCP_RACK_TLP_REDUCE: 19855 /* RACK TLP cwnd reduction (bool) */ 19856 RACK_OPTS_INC(tcp_rack_tlp_reduce); 19857 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 19858 break; 19859 /* Pacing related ones */ 19860 case TCP_RACK_PACE_ALWAYS: 19861 /* 19862 * zero is old rack method, 1 is new 19863 * method using a pacing rate. 19864 */ 19865 RACK_OPTS_INC(tcp_rack_pace_always); 19866 if (optval > 0) { 19867 if (rack->rc_always_pace) { 19868 error = EALREADY; 19869 break; 19870 } else if (tcp_can_enable_pacing()) { 19871 rack->rc_always_pace = 1; 19872 if (rack->use_fixed_rate || rack->gp_ready) 19873 rack_set_cc_pacing(rack); 19874 } 19875 else { 19876 error = ENOSPC; 19877 break; 19878 } 19879 } else { 19880 if (rack->rc_always_pace) { 19881 tcp_decrement_paced_conn(); 19882 rack->rc_always_pace = 0; 19883 rack_undo_cc_pacing(rack); 19884 } 19885 } 19886 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19887 inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19888 else 19889 inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19890 /* A rate may be set irate or other, if so set seg size */ 19891 rack_update_seg(rack); 19892 break; 19893 case TCP_BBR_RACK_INIT_RATE: 19894 RACK_OPTS_INC(tcp_initial_rate); 19895 val = optval; 19896 /* Change from kbits per second to bytes per second */ 19897 val *= 1000; 19898 val /= 8; 19899 rack->r_ctl.init_rate = val; 19900 if (rack->rc_init_win != rack_default_init_window) { 19901 uint32_t win, snt; 19902 19903 /* 19904 * Options don't always get applied 19905 * in the order you think. So in order 19906 * to assure we update a cwnd we need 19907 * to check and see if we are still 19908 * where we should raise the cwnd. 
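 * We only raise snd_cwnd to the configured initial window when less than
 * that window has been sent so far and the current cwnd is still below it.
 * (The optval above is in kilobits per second; e.g. an irate of 10000
 * becomes 10000 * 1000 / 8 = 1,250,000 bytes per second.)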
19909 */ 19910 win = rc_init_window(rack); 19911 if (SEQ_GT(tp->snd_max, tp->iss)) 19912 snt = tp->snd_max - tp->iss; 19913 else 19914 snt = 0; 19915 if ((snt < win) && 19916 (tp->snd_cwnd < win)) 19917 tp->snd_cwnd = win; 19918 } 19919 if (rack->rc_always_pace) 19920 rack_update_seg(rack); 19921 break; 19922 case TCP_BBR_IWINTSO: 19923 RACK_OPTS_INC(tcp_initial_win); 19924 if (optval && (optval <= 0xff)) { 19925 uint32_t win, snt; 19926 19927 rack->rc_init_win = optval; 19928 win = rc_init_window(rack); 19929 if (SEQ_GT(tp->snd_max, tp->iss)) 19930 snt = tp->snd_max - tp->iss; 19931 else 19932 snt = 0; 19933 if ((snt < win) && 19934 (tp->t_srtt | 19935 #ifdef NETFLIX_PEAKRATE 19936 tp->t_maxpeakrate | 19937 #endif 19938 rack->r_ctl.init_rate)) { 19939 /* 19940 * We are not past the initial window 19941 * and we have some bases for pacing, 19942 * so we need to possibly adjust up 19943 * the cwnd. Note even if we don't set 19944 * the cwnd, its still ok to raise the rc_init_win 19945 * which can be used coming out of idle when we 19946 * would have a rate. 19947 */ 19948 if (tp->snd_cwnd < win) 19949 tp->snd_cwnd = win; 19950 } 19951 if (rack->rc_always_pace) 19952 rack_update_seg(rack); 19953 } else 19954 error = EINVAL; 19955 break; 19956 case TCP_RACK_FORCE_MSEG: 19957 RACK_OPTS_INC(tcp_rack_force_max_seg); 19958 if (optval) 19959 rack->rc_force_max_seg = 1; 19960 else 19961 rack->rc_force_max_seg = 0; 19962 break; 19963 case TCP_RACK_PACE_MAX_SEG: 19964 /* Max segments size in a pace in bytes */ 19965 RACK_OPTS_INC(tcp_rack_max_seg); 19966 rack->rc_user_set_max_segs = optval; 19967 rack_set_pace_segments(tp, rack, __LINE__, NULL); 19968 break; 19969 case TCP_RACK_PACE_RATE_REC: 19970 /* Set the fixed pacing rate in Bytes per second ca */ 19971 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 19972 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 19973 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 19974 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 19975 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 19976 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 19977 rack->use_fixed_rate = 1; 19978 if (rack->rc_always_pace) 19979 rack_set_cc_pacing(rack); 19980 rack_log_pacing_delay_calc(rack, 19981 rack->r_ctl.rc_fixed_pacing_rate_ss, 19982 rack->r_ctl.rc_fixed_pacing_rate_ca, 19983 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 19984 __LINE__, NULL,0); 19985 break; 19986 19987 case TCP_RACK_PACE_RATE_SS: 19988 /* Set the fixed pacing rate in Bytes per second ca */ 19989 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 19990 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 19991 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 19992 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 19993 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 19994 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 19995 rack->use_fixed_rate = 1; 19996 if (rack->rc_always_pace) 19997 rack_set_cc_pacing(rack); 19998 rack_log_pacing_delay_calc(rack, 19999 rack->r_ctl.rc_fixed_pacing_rate_ss, 20000 rack->r_ctl.rc_fixed_pacing_rate_ca, 20001 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 20002 __LINE__, NULL, 0); 20003 break; 20004 20005 case TCP_RACK_PACE_RATE_CA: 20006 /* Set the fixed pacing rate in Bytes per second ca */ 20007 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 20008 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 20009 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 20010 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 20011 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 20012 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 20013 rack->use_fixed_rate = 1; 20014 
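		/* Any unset SS/REC rates were defaulted to this CA rate above; the connection now uses fixed-rate pacing. */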
if (rack->rc_always_pace) 20015 rack_set_cc_pacing(rack); 20016 rack_log_pacing_delay_calc(rack, 20017 rack->r_ctl.rc_fixed_pacing_rate_ss, 20018 rack->r_ctl.rc_fixed_pacing_rate_ca, 20019 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 20020 __LINE__, NULL, 0); 20021 break; 20022 case TCP_RACK_GP_INCREASE_REC: 20023 RACK_OPTS_INC(tcp_gp_inc_rec); 20024 rack->r_ctl.rack_per_of_gp_rec = optval; 20025 rack_log_pacing_delay_calc(rack, 20026 rack->r_ctl.rack_per_of_gp_ss, 20027 rack->r_ctl.rack_per_of_gp_ca, 20028 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 20029 __LINE__, NULL, 0); 20030 break; 20031 case TCP_RACK_GP_INCREASE_CA: 20032 RACK_OPTS_INC(tcp_gp_inc_ca); 20033 ca = optval; 20034 if (ca < 100) { 20035 /* 20036 * We don't allow any reduction 20037 * over the GP b/w. 20038 */ 20039 error = EINVAL; 20040 break; 20041 } 20042 rack->r_ctl.rack_per_of_gp_ca = ca; 20043 rack_log_pacing_delay_calc(rack, 20044 rack->r_ctl.rack_per_of_gp_ss, 20045 rack->r_ctl.rack_per_of_gp_ca, 20046 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 20047 __LINE__, NULL, 0); 20048 break; 20049 case TCP_RACK_GP_INCREASE_SS: 20050 RACK_OPTS_INC(tcp_gp_inc_ss); 20051 ss = optval; 20052 if (ss < 100) { 20053 /* 20054 * We don't allow any reduction 20055 * over the GP b/w. 20056 */ 20057 error = EINVAL; 20058 break; 20059 } 20060 rack->r_ctl.rack_per_of_gp_ss = ss; 20061 rack_log_pacing_delay_calc(rack, 20062 rack->r_ctl.rack_per_of_gp_ss, 20063 rack->r_ctl.rack_per_of_gp_ca, 20064 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 20065 __LINE__, NULL, 0); 20066 break; 20067 case TCP_RACK_RR_CONF: 20068 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 20069 if (optval && optval <= 3) 20070 rack->r_rr_config = optval; 20071 else 20072 rack->r_rr_config = 0; 20073 break; 20074 case TCP_HDWR_RATE_CAP: 20075 RACK_OPTS_INC(tcp_hdwr_rate_cap); 20076 if (optval) { 20077 if (rack->r_rack_hw_rate_caps == 0) 20078 rack->r_rack_hw_rate_caps = 1; 20079 else 20080 error = EALREADY; 20081 } else { 20082 rack->r_rack_hw_rate_caps = 0; 20083 } 20084 break; 20085 case TCP_BBR_HDWR_PACE: 20086 RACK_OPTS_INC(tcp_hdwr_pacing); 20087 if (optval){ 20088 if (rack->rack_hdrw_pacing == 0) { 20089 rack->rack_hdw_pace_ena = 1; 20090 rack->rack_attempt_hdwr_pace = 0; 20091 } else 20092 error = EALREADY; 20093 } else { 20094 rack->rack_hdw_pace_ena = 0; 20095 #ifdef RATELIMIT 20096 if (rack->r_ctl.crte != NULL) { 20097 rack->rack_hdrw_pacing = 0; 20098 rack->rack_attempt_hdwr_pace = 0; 20099 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 20100 rack->r_ctl.crte = NULL; 20101 } 20102 #endif 20103 } 20104 break; 20105 /* End Pacing related ones */ 20106 case TCP_RACK_PRR_SENDALOT: 20107 /* Allow PRR to send more than one seg */ 20108 RACK_OPTS_INC(tcp_rack_prr_sendalot); 20109 rack->r_ctl.rc_prr_sendalot = optval; 20110 break; 20111 case TCP_RACK_MIN_TO: 20112 /* Minimum time between rack t-o's in ms */ 20113 RACK_OPTS_INC(tcp_rack_min_to); 20114 rack->r_ctl.rc_min_to = optval; 20115 break; 20116 case TCP_RACK_EARLY_SEG: 20117 /* If early recovery max segments */ 20118 RACK_OPTS_INC(tcp_rack_early_seg); 20119 rack->r_ctl.rc_early_recovery_segs = optval; 20120 break; 20121 case TCP_RACK_ENABLE_HYSTART: 20122 { 20123 if (optval) { 20124 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 20125 if (rack_do_hystart > RACK_HYSTART_ON) 20126 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 20127 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 20128 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 20129 } else { 20130 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 20131 
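			/* optval == 0 clears every hystart-related CC flag, fully disabling hystart for this connection. */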
} 20132 } 20133 break; 20134 case TCP_RACK_REORD_THRESH: 20135 /* RACK reorder threshold (shift amount) */ 20136 RACK_OPTS_INC(tcp_rack_reord_thresh); 20137 if ((optval > 0) && (optval < 31)) 20138 rack->r_ctl.rc_reorder_shift = optval; 20139 else 20140 error = EINVAL; 20141 break; 20142 case TCP_RACK_REORD_FADE: 20143 /* Does reordering fade after ms time */ 20144 RACK_OPTS_INC(tcp_rack_reord_fade); 20145 rack->r_ctl.rc_reorder_fade = optval; 20146 break; 20147 case TCP_RACK_TLP_THRESH: 20148 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 20149 RACK_OPTS_INC(tcp_rack_tlp_thresh); 20150 if (optval) 20151 rack->r_ctl.rc_tlp_threshold = optval; 20152 else 20153 error = EINVAL; 20154 break; 20155 case TCP_BBR_USE_RACK_RR: 20156 RACK_OPTS_INC(tcp_rack_rr); 20157 if (optval) 20158 rack->use_rack_rr = 1; 20159 else 20160 rack->use_rack_rr = 0; 20161 break; 20162 case TCP_FAST_RSM_HACK: 20163 RACK_OPTS_INC(tcp_rack_fastrsm_hack); 20164 if (optval) 20165 rack->fast_rsm_hack = 1; 20166 else 20167 rack->fast_rsm_hack = 0; 20168 break; 20169 case TCP_RACK_PKT_DELAY: 20170 /* RACK added ms i.e. rack-rtt + reord + N */ 20171 RACK_OPTS_INC(tcp_rack_pkt_delay); 20172 rack->r_ctl.rc_pkt_delay = optval; 20173 break; 20174 case TCP_DELACK: 20175 RACK_OPTS_INC(tcp_rack_delayed_ack); 20176 if (optval == 0) 20177 tp->t_delayed_ack = 0; 20178 else 20179 tp->t_delayed_ack = 1; 20180 if (tp->t_flags & TF_DELACK) { 20181 tp->t_flags &= ~TF_DELACK; 20182 tp->t_flags |= TF_ACKNOW; 20183 NET_EPOCH_ENTER(et); 20184 rack_output(tp); 20185 NET_EPOCH_EXIT(et); 20186 } 20187 break; 20188 20189 case TCP_BBR_RACK_RTT_USE: 20190 RACK_OPTS_INC(tcp_rack_rtt_use); 20191 if ((optval != USE_RTT_HIGH) && 20192 (optval != USE_RTT_LOW) && 20193 (optval != USE_RTT_AVG)) 20194 error = EINVAL; 20195 else 20196 rack->r_ctl.rc_rate_sample_method = optval; 20197 break; 20198 case TCP_DATA_AFTER_CLOSE: 20199 RACK_OPTS_INC(tcp_data_after_close); 20200 if (optval) 20201 rack->rc_allow_data_af_clo = 1; 20202 else 20203 rack->rc_allow_data_af_clo = 0; 20204 break; 20205 default: 20206 break; 20207 } 20208 #ifdef NETFLIX_STATS 20209 tcp_log_socket_option(tp, sopt_name, optval, error); 20210 #endif 20211 return (error); 20212 } 20213 20214 20215 static void 20216 rack_apply_deferred_options(struct tcp_rack *rack) 20217 { 20218 struct deferred_opt_list *dol, *sdol; 20219 uint32_t s_optval; 20220 20221 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { 20222 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 20223 /* Disadvantage of deferal is you loose the error return */ 20224 s_optval = (uint32_t)dol->optval; 20225 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval); 20226 free(dol, M_TCPDO); 20227 } 20228 } 20229 20230 static void 20231 rack_hw_tls_change(struct tcpcb *tp, int chg) 20232 { 20233 /* 20234 * HW tls state has changed.. fix all 20235 * rsm's in flight. 
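 * Every rsm in the send map gets its r_hw_tls bit set to the new state,
 * and the fast-send-block template is updated to match.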
20236 */ 20237 struct tcp_rack *rack; 20238 struct rack_sendmap *rsm; 20239 20240 rack = (struct tcp_rack *)tp->t_fb_ptr; 20241 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 20242 if (chg) 20243 rsm->r_hw_tls = 1; 20244 else 20245 rsm->r_hw_tls = 0; 20246 } 20247 if (chg) 20248 rack->r_ctl.fsb.hw_tls = 1; 20249 else 20250 rack->r_ctl.fsb.hw_tls = 0; 20251 } 20252 20253 static int 20254 rack_pru_options(struct tcpcb *tp, int flags) 20255 { 20256 if (flags & PRUS_OOB) 20257 return (EOPNOTSUPP); 20258 return (0); 20259 } 20260 20261 static struct tcp_function_block __tcp_rack = { 20262 .tfb_tcp_block_name = __XSTRING(STACKNAME), 20263 .tfb_tcp_output = rack_output, 20264 .tfb_do_queued_segments = ctf_do_queued_segments, 20265 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 20266 .tfb_tcp_do_segment = rack_do_segment, 20267 .tfb_tcp_ctloutput = rack_ctloutput, 20268 .tfb_tcp_fb_init = rack_init, 20269 .tfb_tcp_fb_fini = rack_fini, 20270 .tfb_tcp_timer_stop_all = rack_stopall, 20271 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 20272 .tfb_tcp_handoff_ok = rack_handoff_ok, 20273 .tfb_tcp_mtu_chg = rack_mtu_change, 20274 .tfb_pru_options = rack_pru_options, 20275 .tfb_hwtls_change = rack_hw_tls_change, 20276 .tfb_compute_pipe = rack_compute_pipe, 20277 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, 20278 }; 20279 20280 /* 20281 * rack_ctloutput() must drop the inpcb lock before performing copyin on 20282 * socket option arguments. When it re-acquires the lock after the copy, it 20283 * has to revalidate that the connection is still valid for the socket 20284 * option. 20285 */ 20286 static int 20287 rack_set_sockopt(struct inpcb *inp, struct sockopt *sopt) 20288 { 20289 #ifdef INET6 20290 struct ip6_hdr *ip6; 20291 #endif 20292 #ifdef INET 20293 struct ip *ip; 20294 #endif 20295 struct tcpcb *tp; 20296 struct tcp_rack *rack; 20297 uint64_t loptval; 20298 int32_t error = 0, optval; 20299 20300 tp = intotcpcb(inp); 20301 rack = (struct tcp_rack *)tp->t_fb_ptr; 20302 if (rack == NULL) { 20303 INP_WUNLOCK(inp); 20304 return (EINVAL); 20305 } 20306 #ifdef INET6 20307 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 20308 #endif 20309 #ifdef INET 20310 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 20311 #endif 20312 20313 switch (sopt->sopt_level) { 20314 #ifdef INET6 20315 case IPPROTO_IPV6: 20316 MPASS(inp->inp_vflag & INP_IPV6PROTO); 20317 switch (sopt->sopt_name) { 20318 case IPV6_USE_MIN_MTU: 20319 tcp6_use_min_mtu(tp); 20320 break; 20321 case IPV6_TCLASS: 20322 /* 20323 * The DSCP codepoint has changed, update the fsb. 20324 */ 20325 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 20326 (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK); 20327 break; 20328 } 20329 INP_WUNLOCK(inp); 20330 return (0); 20331 #endif 20332 #ifdef INET 20333 case IPPROTO_IP: 20334 switch (sopt->sopt_name) { 20335 case IP_TOS: 20336 /* 20337 * The DSCP codepoint has changed, update the fsb. 20338 */ 20339 ip->ip_tos = rack->rc_inp->inp_ip_tos; 20340 break; 20341 case IP_TTL: 20342 /* 20343 * The TTL has changed, update the fsb. 
20344 */ 20345 ip->ip_ttl = rack->rc_inp->inp_ip_ttl; 20346 break; 20347 } 20348 INP_WUNLOCK(inp); 20349 return (0); 20350 #endif 20351 } 20352 20353 switch (sopt->sopt_name) { 20354 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 20355 /* Pacing related ones */ 20356 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 20357 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 20358 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 20359 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 20360 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 20361 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 20362 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 20363 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 20364 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 20365 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 20366 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 20367 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 20368 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 20369 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 20370 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 20371 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 20372 /* End pacing related */ 20373 case TCP_FAST_RSM_HACK: /* URL:frsm_hack */ 20374 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 20375 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 20376 case TCP_RACK_MIN_TO: /* URL:min_to */ 20377 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 20378 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 20379 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 20380 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 20381 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 20382 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 20383 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 20384 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 20385 case TCP_RACK_DO_DETECTION: /* URL:detect */ 20386 case TCP_NO_PRR: /* URL:noprr */ 20387 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 20388 case TCP_DATA_AFTER_CLOSE: /* no URL */ 20389 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 20390 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 20391 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 20392 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 20393 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 20394 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 20395 case TCP_RACK_PROFILE: /* URL:profile */ 20396 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 20397 case TCP_RACK_ABC_VAL: /* URL:labc */ 20398 case TCP_REC_ABC_VAL: /* URL:reclabc */ 20399 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 20400 case TCP_DEFER_OPTIONS: /* URL:defer */ 20401 case TCP_RACK_DSACK_OPT: /* URL:dsack */ 20402 case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ 20403 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 20404 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 20405 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ 20406 break; 20407 default: 20408 /* Filter off all unknown options to the base stack */ 20409 return (tcp_default_ctloutput(inp, sopt)); 20410 break; 20411 } 20412 INP_WUNLOCK(inp); 20413 if (sopt->sopt_name == TCP_PACING_RATE_CAP) { 20414 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); 20415 /* 20416 * We truncate it down to 32 bits for the socket-option trace this 20417 * means rates > 34Gbps won't show right, but thats probably ok. 
20418 */ 20419 optval = (uint32_t)loptval; 20420 } else { 20421 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 20422 /* Save it in 64 bit form too */ 20423 loptval = optval; 20424 } 20425 if (error) 20426 return (error); 20427 INP_WLOCK(inp); 20428 if (inp->inp_flags & INP_DROPPED) { 20429 INP_WUNLOCK(inp); 20430 return (ECONNRESET); 20431 } 20432 if (tp->t_fb != &__tcp_rack) { 20433 INP_WUNLOCK(inp); 20434 return (ENOPROTOOPT); 20435 } 20436 if (rack->defer_options && (rack->gp_ready == 0) && 20437 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 20438 (sopt->sopt_name != TCP_RACK_PACING_BETA) && 20439 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 20440 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 20441 /* Options are beind deferred */ 20442 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 20443 INP_WUNLOCK(inp); 20444 return (0); 20445 } else { 20446 /* No memory to defer, fail */ 20447 INP_WUNLOCK(inp); 20448 return (ENOMEM); 20449 } 20450 } 20451 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval); 20452 INP_WUNLOCK(inp); 20453 return (error); 20454 } 20455 20456 static void 20457 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 20458 { 20459 20460 INP_WLOCK_ASSERT(tptoinpcb(tp)); 20461 bzero(ti, sizeof(*ti)); 20462 20463 ti->tcpi_state = tp->t_state; 20464 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 20465 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 20466 if (tp->t_flags & TF_SACK_PERMIT) 20467 ti->tcpi_options |= TCPI_OPT_SACK; 20468 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 20469 ti->tcpi_options |= TCPI_OPT_WSCALE; 20470 ti->tcpi_snd_wscale = tp->snd_scale; 20471 ti->tcpi_rcv_wscale = tp->rcv_scale; 20472 } 20473 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) 20474 ti->tcpi_options |= TCPI_OPT_ECN; 20475 if (tp->t_flags & TF_FASTOPEN) 20476 ti->tcpi_options |= TCPI_OPT_TFO; 20477 /* still kept in ticks is t_rcvtime */ 20478 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 20479 /* Since we hold everything in precise useconds this is easy */ 20480 ti->tcpi_rtt = tp->t_srtt; 20481 ti->tcpi_rttvar = tp->t_rttvar; 20482 ti->tcpi_rto = tp->t_rxtcur; 20483 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 20484 ti->tcpi_snd_cwnd = tp->snd_cwnd; 20485 /* 20486 * FreeBSD-specific extension fields for tcp_info. 20487 */ 20488 ti->tcpi_rcv_space = tp->rcv_wnd; 20489 ti->tcpi_rcv_nxt = tp->rcv_nxt; 20490 ti->tcpi_snd_wnd = tp->snd_wnd; 20491 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. 
*/ 20492 ti->tcpi_snd_nxt = tp->snd_nxt; 20493 ti->tcpi_snd_mss = tp->t_maxseg; 20494 ti->tcpi_rcv_mss = tp->t_maxseg; 20495 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 20496 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 20497 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 20498 #ifdef NETFLIX_STATS 20499 ti->tcpi_total_tlp = tp->t_sndtlppack; 20500 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; 20501 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); 20502 #endif 20503 #ifdef TCP_OFFLOAD 20504 if (tp->t_flags & TF_TOE) { 20505 ti->tcpi_options |= TCPI_OPT_TOE; 20506 tcp_offload_tcp_info(tp, ti); 20507 } 20508 #endif 20509 } 20510 20511 static int 20512 rack_get_sockopt(struct inpcb *inp, struct sockopt *sopt) 20513 { 20514 struct tcpcb *tp; 20515 struct tcp_rack *rack; 20516 int32_t error, optval; 20517 uint64_t val, loptval; 20518 struct tcp_info ti; 20519 /* 20520 * Because all our options are either boolean or an int, we can just 20521 * pull everything into optval and then unlock and copy. If we ever 20522 * add a option that is not a int, then this will have quite an 20523 * impact to this routine. 20524 */ 20525 error = 0; 20526 tp = intotcpcb(inp); 20527 rack = (struct tcp_rack *)tp->t_fb_ptr; 20528 if (rack == NULL) { 20529 INP_WUNLOCK(inp); 20530 return (EINVAL); 20531 } 20532 switch (sopt->sopt_name) { 20533 case TCP_INFO: 20534 /* First get the info filled */ 20535 rack_fill_info(tp, &ti); 20536 /* Fix up the rtt related fields if needed */ 20537 INP_WUNLOCK(inp); 20538 error = sooptcopyout(sopt, &ti, sizeof ti); 20539 return (error); 20540 /* 20541 * Beta is the congestion control value for NewReno that influences how 20542 * much of a backoff happens when loss is detected. It is normally set 20543 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value 20544 * when you exit recovery. 20545 */ 20546 case TCP_RACK_PACING_BETA: 20547 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 20548 error = EINVAL; 20549 else if (rack->rc_pacing_cc_set == 0) 20550 optval = rack->r_ctl.rc_saved_beta.beta; 20551 else { 20552 /* 20553 * Reach out into the CC data and report back what 20554 * I have previously set. Yeah it looks hackish but 20555 * we don't want to report the saved values. 20556 */ 20557 if (tp->t_ccv.cc_data) 20558 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; 20559 else 20560 error = EINVAL; 20561 } 20562 break; 20563 /* 20564 * Beta_ecn is the congestion control value for NewReno that influences how 20565 * much of a backoff happens when a ECN mark is detected. It is normally set 20566 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when 20567 * you exit recovery. Note that classic ECN has a beta of 50, it is only 20568 * ABE Ecn that uses this "less" value, but we do too with pacing :) 20569 */ 20570 20571 case TCP_RACK_PACING_BETA_ECN: 20572 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 20573 error = EINVAL; 20574 else if (rack->rc_pacing_cc_set == 0) 20575 optval = rack->r_ctl.rc_saved_beta.beta_ecn; 20576 else { 20577 /* 20578 * Reach out into the CC data and report back what 20579 * I have previously set. Yeah it looks hackish but 20580 * we don't want to report the saved values. 
20581 */ 20582 if (tp->t_ccv.cc_data) 20583 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn; 20584 else 20585 error = EINVAL; 20586 } 20587 break; 20588 case TCP_RACK_DSACK_OPT: 20589 optval = 0; 20590 if (rack->rc_rack_tmr_std_based) { 20591 optval |= 1; 20592 } 20593 if (rack->rc_rack_use_dsack) { 20594 optval |= 2; 20595 } 20596 break; 20597 case TCP_RACK_ENABLE_HYSTART: 20598 { 20599 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 20600 optval = RACK_HYSTART_ON; 20601 if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND) 20602 optval = RACK_HYSTART_ON_W_SC; 20603 if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH) 20604 optval = RACK_HYSTART_ON_W_SC_C; 20605 } else { 20606 optval = RACK_HYSTART_OFF; 20607 } 20608 } 20609 break; 20610 case TCP_FAST_RSM_HACK: 20611 optval = rack->fast_rsm_hack; 20612 break; 20613 case TCP_DEFER_OPTIONS: 20614 optval = rack->defer_options; 20615 break; 20616 case TCP_RACK_MEASURE_CNT: 20617 optval = rack->r_ctl.req_measurements; 20618 break; 20619 case TCP_REC_ABC_VAL: 20620 optval = rack->r_use_labc_for_rec; 20621 break; 20622 case TCP_RACK_ABC_VAL: 20623 optval = rack->rc_labc; 20624 break; 20625 case TCP_HDWR_UP_ONLY: 20626 optval= rack->r_up_only; 20627 break; 20628 case TCP_PACING_RATE_CAP: 20629 loptval = rack->r_ctl.bw_rate_cap; 20630 break; 20631 case TCP_RACK_PROFILE: 20632 /* You cannot retrieve a profile, its write only */ 20633 error = EINVAL; 20634 break; 20635 case TCP_USE_CMP_ACKS: 20636 optval = rack->r_use_cmp_ack; 20637 break; 20638 case TCP_RACK_PACE_TO_FILL: 20639 optval = rack->rc_pace_to_cwnd; 20640 if (optval && rack->r_fill_less_agg) 20641 optval++; 20642 break; 20643 case TCP_RACK_NO_PUSH_AT_MAX: 20644 optval = rack->r_ctl.rc_no_push_at_mrtt; 20645 break; 20646 case TCP_SHARED_CWND_ENABLE: 20647 optval = rack->rack_enable_scwnd; 20648 break; 20649 case TCP_RACK_NONRXT_CFG_RATE: 20650 optval = rack->rack_rec_nonrxt_use_cr; 20651 break; 20652 case TCP_NO_PRR: 20653 if (rack->rack_no_prr == 1) 20654 optval = 1; 20655 else if (rack->no_prr_addback == 1) 20656 optval = 2; 20657 else 20658 optval = 0; 20659 break; 20660 case TCP_RACK_DO_DETECTION: 20661 optval = rack->do_detection; 20662 break; 20663 case TCP_RACK_MBUF_QUEUE: 20664 /* Now do we use the LRO mbuf-queue feature */ 20665 optval = rack->r_mbuf_queue; 20666 break; 20667 case TCP_TIMELY_DYN_ADJ: 20668 optval = rack->rc_gp_dyn_mul; 20669 break; 20670 case TCP_BBR_IWINTSO: 20671 optval = rack->rc_init_win; 20672 break; 20673 case TCP_RACK_TLP_REDUCE: 20674 /* RACK TLP cwnd reduction (bool) */ 20675 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 20676 break; 20677 case TCP_BBR_RACK_INIT_RATE: 20678 val = rack->r_ctl.init_rate; 20679 /* convert to kbits per sec */ 20680 val *= 8; 20681 val /= 1000; 20682 optval = (uint32_t)val; 20683 break; 20684 case TCP_RACK_FORCE_MSEG: 20685 optval = rack->rc_force_max_seg; 20686 break; 20687 case TCP_RACK_PACE_MAX_SEG: 20688 /* Max segments in a pace */ 20689 optval = rack->rc_user_set_max_segs; 20690 break; 20691 case TCP_RACK_PACE_ALWAYS: 20692 /* Use the always pace method */ 20693 optval = rack->rc_always_pace; 20694 break; 20695 case TCP_RACK_PRR_SENDALOT: 20696 /* Allow PRR to send more than one seg */ 20697 optval = rack->r_ctl.rc_prr_sendalot; 20698 break; 20699 case TCP_RACK_MIN_TO: 20700 /* Minimum time between rack t-o's in ms */ 20701 optval = rack->r_ctl.rc_min_to; 20702 break; 20703 case TCP_RACK_EARLY_SEG: 20704 /* If early recovery max segments */ 20705 optval = rack->r_ctl.rc_early_recovery_segs; 20706 break; 20707 case 
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_BBR_USE_RACK_RR:
		/* Do we use the rack cheat for rxt */
		optval = rack->use_rack_rr;
		break;
	case TCP_RACK_RR_CONF:
		optval = rack->r_rr_config;
		break;
	case TCP_HDWR_RATE_CAP:
		optval = rack->r_rack_hw_rate_caps;
		break;
	case TCP_BBR_HDWR_PACE:
		optval = rack->rack_hdw_pace_ena;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_PACE_RATE_CA:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
		break;
	case TCP_RACK_PACE_RATE_SS:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
		break;
	case TCP_RACK_PACE_RATE_REC:
		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
		break;
	case TCP_RACK_GP_INCREASE_SS:
		optval = rack->r_ctl.rack_per_of_gp_ss;
		break;
	case TCP_RACK_GP_INCREASE_CA:
		optval = rack->r_ctl.rack_per_of_gp_ca;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	case TCP_SHARED_CWND_TIME_LIMIT:
		optval = rack->r_limit_scw;
		break;
	case TCP_RACK_TIMER_SLOP:
		optval = rack->r_ctl.timer_slop;
		break;
	default:
		return (tcp_default_ctloutput(inp, sopt));
		break;
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		if (sopt->sopt_name == TCP_PACING_RATE_CAP)
			error = sooptcopyout(sopt, &loptval, sizeof loptval);
		else
			error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}

static int
rack_ctloutput(struct inpcb *inp, struct sockopt *sopt)
{
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(inp, sopt));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(inp, sopt));
	} else {
		panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
	}
}

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

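		/*
		 * Added note: the two zones created above back the stack's
		 * allocations -- rack_zone holds the per-segment rack_sendmap
		 * entries and rack_pcb_zone the per-connection tcp_rack state.
		 */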
		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
#ifdef STACKALIAS
		    __XSTRING(STACKALIAS),
#else
		    __XSTRING(STACKNAME),
#endif
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		tcp_lro_reg_mbufq();
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		tcp_lro_dereg_mbufq();
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
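
/*
 * Usage illustration only (not compiled into the module): a minimal userland
 * sketch of how a connection might be moved onto this stack and one of the
 * options handled above set.  It assumes a FreeBSD system where the stack is
 * registered under the name "rack" and where <netinet/tcp.h> exposes
 * TCP_FUNCTION_BLK and TCP_RACK_PACE_ALWAYS; the helper name and the
 * already-connected descriptor "fd" are hypothetical.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	static int
 *	use_rack_with_pacing(int fd)
 *	{
 *		struct tcp_function_set tfs;
 *		int one = 1;
 *
 *		memset(&tfs, 0, sizeof(tfs));
 *		strlcpy(tfs.function_set_name, "rack",
 *		    sizeof(tfs.function_set_name));
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &tfs, sizeof(tfs)) == -1)
 *			return (-1);
 *		return (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *		    &one, sizeof(one)));
 *	}
 *
 * The TCP_FUNCTION_BLK call has to come first: until the connection is on
 * this stack, the TCP_RACK_* options are handled (or rejected) by whatever
 * stack the connection is currently using.
 */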