/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
/**
 * Author: Randall Stewart <rrs@netflix.com>
 * This work is based on the ACM Queue paper
 * BBR - Congestion Based Congestion Control
 * and also numerous discussions with Neal, Yuchung and Van.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/eventhandler.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/tim_filter.h>
#include <sys/time.h>
#include <sys/protosw.h>
#include <vm/uma.h>
#include <sys/kern_prefetch.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_hpts.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_lro.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_fastopen.h>

#include <netipsec/ipsec_support.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/ethernet.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif

#include "sack_filter.h"
#include "tcp_bbr.h"
#include "rack_bbr_common.h"
uma_zone_t bbr_zone;
uma_zone_t bbr_pcb_zone;

struct sysctl_ctx_list bbr_sysctl_ctx;
struct sysctl_oid *bbr_sysctl_root;

#define	TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
	(tv) = (value); \
	if ((u_long)(tv) < (u_long)(tvmin)) \
		(tv) = (tvmin); \
	if ((u_long)(tv) > (u_long)(tvmax)) \
		(tv) = (tvmax); \
} while(0)

/*#define BBR_INVARIANT 1*/

/*
 * initial window
 */
static uint32_t bbr_def_init_win = 10;
static int32_t bbr_persist_min = 250000;	/* 250ms */
static int32_t bbr_persist_max = 1000000;	/* 1 Second */
static int32_t bbr_cwnd_may_shrink = 0;
static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
static int32_t bbr_hardware_pacing_limit = 8000;
static int32_t bbr_quanta = 3;	/* How much extra quanta do we get? */
static int32_t bbr_no_retran = 0;

static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
static int32_t bbr_max_net_error_cnt = 10;
/* Should the following be dynamic too -- loss wise */
static int32_t bbr_rtt_gain_thresh = 0;
/* Measurement controls */
static int32_t bbr_use_google_algo = 1;
static int32_t bbr_ts_limiting = 1;
static int32_t bbr_ts_can_raise = 0;
static int32_t bbr_do_red = 600;
static int32_t bbr_red_scale = 20000;
static int32_t bbr_red_mul = 1;
static int32_t bbr_red_div = 2;
static int32_t bbr_red_growth_restrict = 1;
static int32_t bbr_target_is_bbunit = 0;
static int32_t bbr_drop_limit = 0;
/*
 * How much gain do we need to see to
 * stay in startup?
 */
static int32_t bbr_marks_rxt_sack_passed = 0;
static int32_t bbr_start_exit = 25;
static int32_t bbr_low_start_exit = 25;		/* When we are in reduced gain */
static int32_t bbr_startup_loss_thresh = 2000;	/* 20.00% loss */
static int32_t bbr_hptsi_max_mul = 1;	/* These two mul/div assure a min pacing */
static int32_t bbr_hptsi_max_div = 2;	/* time, 0 means turned off. We need this
					 * if we go back ever to where the pacer
					 * has priority over timers.
					 */
static int32_t bbr_policer_call_from_rack_to = 0;
static int32_t bbr_policer_detection_enabled = 1;
static int32_t bbr_min_measurements_req = 1;	/* We need at least 2
						 * measurements before we are
						 * "good"; note that 2 == 1.
						 * This is because we use a >
						 * comparison. This means if
						 * min_measure was 0, it takes
						 * num-measures > min(0) and
						 * you get 1 measurement and
						 * you are good. Set to 1, you
						 * have to have two
						 * measurements (this is done
						 * to prevent it from being ok
						 * to have no measurements).
						 */
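/*
 * Example (values illustrative only, derived from the comment above): with
 * bbr_min_measurements_req at its default of 1, the "num_measures > min"
 * test only passes once two measurements have been taken; a setting of 0
 * would accept a single measurement.
 */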
static int32_t bbr_no_pacing_until = 4;

static int32_t bbr_min_usec_delta = 20000;	/* 20,000 usecs */
static int32_t bbr_min_peer_delta = 20;		/* 20 units */
static int32_t bbr_delta_percent = 150;		/* 15.0 % */

static int32_t bbr_target_cwnd_mult_limit = 8;
/*
 * bbr_cwnd_min_val is the number of
 * segments we hold to in the RTT probe
 * state typically 4.
 */
static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;

static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;

static int32_t bbr_gain_to_target = 1;
static int32_t bbr_gain_gets_extra_too = 1;
/*
 * bbr_high_gain is the 2/ln(2) value we need
 * to double the sending rate in startup. This
 * is used for both cwnd and hptsi gains.
 */
static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
static int32_t bbr_use_lower_gain_in_startup = 1;

/* thresholds for reduction on drain in sub-states/drain */
static int32_t bbr_drain_rtt = BBR_SRTT;
static int32_t bbr_drain_floor = 88;
static int32_t google_allow_early_out = 1;
static int32_t google_consider_lost = 1;
static int32_t bbr_drain_drop_mul = 4;
static int32_t bbr_drain_drop_div = 5;
static int32_t bbr_rand_ot = 50;
static int32_t bbr_can_force_probertt = 0;
static int32_t bbr_can_adjust_probertt = 1;
static int32_t bbr_probertt_sets_rtt = 0;
static int32_t bbr_can_use_ts_for_rtt = 1;
static int32_t bbr_is_ratio = 0;
static int32_t bbr_sub_drain_app_limit = 1;
static int32_t bbr_prtt_slam_cwnd = 1;
static int32_t bbr_sub_drain_slam_cwnd = 1;
static int32_t bbr_slam_cwnd_in_main_drain = 1;
static int32_t bbr_filter_len_sec = 6;	/* How long does the rttProp filter hold */
static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
/*
 * bbr_drain_gain is the reverse of the high_gain
 * designed to drain back out the standing queue
 * that is formed in startup by causing a larger
 * hptsi gain and thus draining the packets
 * in flight.
 */
static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
static int32_t bbr_rttprobe_gain = 192;
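
/*
 * For reference (derived from the constants above, not part of the original
 * source): with BBR_UNIT representing 1.0, bbr_high_gain works out to roughly
 * 2/ln(2) ~= 2.885 and bbr_drain_gain to its inverse ln(2)/2 ~= 0.3466, so
 * one RTT paced at the drain gain removes approximately the queue built by
 * one RTT paced at the high gain (2.885 * 0.3466 ~= 1.0).
 */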

/*
 * The cwnd_gain is the default cwnd gain applied when
 * calculating a target cwnd. Note that the cwnd is
 * a secondary factor in the way BBR works (see the
 * paper and think about it, it will take some time).
 * Basically the hptsi_gain spreads the packets out
 * so you never get more than BDP to the peer even
 * if the cwnd is high. In our implementation that
 * means in non-recovery/retransmission scenarios
 * cwnd will never be reached by the flight-size.
 */
static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
static int32_t bbr_tlp_type_to_use = BBR_SRTT;
static int32_t bbr_delack_time = 100000;	/* 100ms in useconds */
static int32_t bbr_sack_not_required = 0;	/* set to one to allow non-sack to use bbr */
static int32_t bbr_initial_bw_bps = 62500;	/* 500kbps in bytes ps */
static int32_t bbr_ignore_data_after_close = 1;
static int16_t bbr_hptsi_gain[] = {
	(BBR_UNIT * 5 / 4),
	(BBR_UNIT * 3 / 4),
	BBR_UNIT,
	BBR_UNIT,
	BBR_UNIT,
	BBR_UNIT,
	BBR_UNIT,
	BBR_UNIT
};
int32_t bbr_use_rack_resend_cheat = 1;
int32_t bbr_sends_full_iwnd = 1;

#define	BBR_HPTSI_GAIN_MAX 8
/*
 * The BBR module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * - Van Jacobson et al.'s BBR.
 *
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement BBR and RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Data processing
 * of inbound segments also now happens in the hpts_do_segment in general
 * with only one exception. This is so we can keep the connection on
 * a single CPU.
 *
 * Each state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard and has had hptsi
 * integrated as a requirement. Still todo is to eliminate the
 * use of the callout_() system and use the hpts for all
 * timers as well.
 */
static uint32_t bbr_rtt_probe_time = 200000;	/* 200ms in micro seconds */
static uint32_t bbr_rtt_probe_cwndtarg = 4;	/* How many mss's outstanding */
static const int32_t bbr_min_req_free = 2;	/* The min we must have on the free list */
static int32_t bbr_tlp_thresh = 1;
static int32_t bbr_reorder_thresh = 2;
static int32_t bbr_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000 - 60 seconds */
static int32_t bbr_pkt_delay = 1000;
static int32_t bbr_min_to = 1000;	/* Number of usec's minimum timeout */
static int32_t bbr_incr_timers = 1;

static int32_t bbr_tlp_min = 10000;	/* 10ms in usecs */
static int32_t bbr_delayed_ack_time = 200000;	/* 200ms in usecs */
static int32_t bbr_exit_startup_at_loss = 1;

/*
 * bbr_lt_bw_ratio is 1/8th
 * bbr_lt_bw_diff is  < 4 Kbit/sec
 */
static uint64_t bbr_lt_bw_diff = 4000 / 8;	/* In bytes per second */
static uint64_t bbr_lt_bw_ratio = 8;		/* For 1/8th */
static uint32_t bbr_lt_bw_max_rtts = 48;	/* How many rtt's do we use the lt_bw for */
static uint32_t bbr_lt_intvl_min_rtts = 4;	/* Min num of RTT's to measure lt_bw */
static int32_t bbr_lt_intvl_fp = 0;		/* False positive epoch diff */
static int32_t bbr_lt_loss_thresh = 196;	/* Lost vs delivered % */
static int32_t bbr_lt_fd_thresh = 100;		/* false detection % */

static int32_t bbr_verbose_logging = 0;
/*
 * Currently regular tcp has a rto_min of 30ms;
 * the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
static int32_t bbr_rto_min_ms = 30;	/* 30ms same as main freebsd */
static int32_t bbr_rto_max_sec = 4;	/* 4 seconds */

/****************************************************/
/* DEFAULT TSO SIZING  (cpu performance impacting)  */
/****************************************************/
/* What amount is our formula using to get TSO size */
static int32_t bbr_hptsi_per_second = 1000;

/*
 * For hptsi under bbr_cross_over connections, what is the delay
 * target? 7ms (in usec) combined with a seg_max of 2
 * gets us close to identical google behavior in
 * TSO size selection (possibly more 1MSS sends).
 */
static int32_t bbr_hptsi_segments_delay_tar = 7000;

/* Does pacing delay include overheads in its time calculations? */
static int32_t bbr_include_enet_oh = 0;
static int32_t bbr_include_ip_oh = 1;
static int32_t bbr_include_tcp_oh = 1;
static int32_t bbr_google_discount = 10;

/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
static int32_t bbr_state_is_pkt_epoch = 0;
static int32_t bbr_state_drain_2_tar = 1;
/*
 * What is the max the 0 - bbr_cross_over MBPS TSO target
 * can reach using our delay target. Note that this
 * value becomes the floor for the cross over
 * algorithm.
 */
static int32_t bbr_hptsi_segments_max = 2;
static int32_t bbr_hptsi_segments_floor = 1;
static int32_t bbr_hptsi_utter_max = 0;

/* What is the min the 0 - bbr_cross-over MBPS TSO target can be */
static int32_t bbr_hptsi_bytes_min = 1460;
static int32_t bbr_all_get_min = 0;

/* Cross over point from algo-a to algo-b */
static uint32_t bbr_cross_over = TWENTY_THREE_MBPS;

/* Do we deal with our restart state? */
static int32_t bbr_uses_idle_restart = 0;
static int32_t bbr_idle_restart_threshold = 100000;	/* 100ms in useconds */

/* Do we allow hardware pacing? */
static int32_t bbr_allow_hdwr_pacing = 0;
static int32_t bbr_hdwr_pace_adjust = 2;	/* multiplier when we calc the tso size */
static int32_t bbr_hdwr_pace_floor = 1;
static int32_t bbr_hdwr_pacing_delay_cnt = 10;

/****************************************************/
static int32_t bbr_resends_use_tso = 0;
static int32_t bbr_tlp_max_resend = 2;
static int32_t bbr_sack_block_limit = 128;

#define	BBR_MAX_STAT 19
counter_u64_t bbr_state_time[BBR_MAX_STAT];
counter_u64_t bbr_state_lost[BBR_MAX_STAT];
counter_u64_t bbr_state_resend[BBR_MAX_STAT];
counter_u64_t bbr_stat_arry[BBR_STAT_SIZE];
counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE];
counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t bbr_flows_whdwr_pacing;
counter_u64_t bbr_flows_nohdwr_pacing;

counter_u64_t bbr_nohdwr_pacing_enobuf;
counter_u64_t bbr_hdwr_pacing_enobuf;

static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr);

/*
 * Static definitions we need for forward declarations.
 */
static uint32_t
bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain,
    uint32_t useconds_time, uint64_t bw);
static uint32_t
bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain);
static void
bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win);
static void
bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses);
static void
bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line,
    int dolog);
static uint32_t
bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain);
static void
bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch,
    int32_t pkt_epoch, uint32_t losses);
static uint32_t
bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts,
    struct bbr_sendmap *rsm);
static uint32_t
bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp);
static uint32_t
bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
    struct bbr_sendmap *rsm, uint32_t srtt, uint32_t cts);
static void
bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
    int32_t line);
static void
bbr_set_state_target(struct tcp_bbr *bbr, int line);
static void
bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line);
static void
bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick,
    int event, int line);
static void
tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied,
    uint32_t rtt, uint32_t line, uint8_t is_start,
    uint16_t set);
static struct bbr_sendmap *
bbr_find_lowest_rsm(struct tcp_bbr *bbr);
static __inline uint32_t
bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
static void
bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot,
    uint8_t which);
static void
bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts,
    uint32_t time_since_sent, uint32_t srtt,
    uint32_t thresh, uint32_t to);
static void
bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag);
static void
bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot,
    uint32_t del_by, uint32_t cts, uint32_t sloton,
    uint32_t prev_delay);
static void
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
    int32_t line);
static void
bbr_stop_all_timers(struct tcpcb *tp);
static void
bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts);
static void
bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts);
static void
bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len,
    uint32_t cts, uint32_t usecs, uint64_t bw,
    uint32_t override, int mod);
static int
bbr_ctloutput(struct inpcb *inp, struct sockopt *sopt);

static inline uint8_t
bbr_state_val(struct tcp_bbr *bbr)
{
	return(bbr->rc_bbr_substate);
}

static inline uint32_t
get_min_cwnd(struct tcp_bbr *bbr)
{
	int mss;

	mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options),
	    bbr->r_ctl.rc_pace_max_segs);
	if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED)
		return (bbr_cwnd_min_val_hs * mss);
	else
		return (bbr_cwnd_min_val * mss);
}

static uint32_t
bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr)
{
	uint64_t srtt, var;
	uint64_t ret_val;

	bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
	if (tp->t_srtt == 0) {
		srtt = (uint64_t)BBR_INITIAL_RTO;
		var = 0;
	} else {
		srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
		var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT);
	}
	TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]),
	    bbr_persist_min, bbr_persist_max);
	return ((uint32_t)ret_val);
}
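
/*
 * Worked example of the clamp above (values illustrative only): with a
 * smoothed RTT of 100ms, an rttvar of 20ms and a t_rxtshift of 2 (a backoff
 * factor of 4 from tcp_backoff[]), (100000 + 20000) * 4 = 480000 usecs,
 * which lies inside [bbr_persist_min, bbr_persist_max] and is returned
 * as-is; smaller products are raised to 250ms and larger ones are capped
 * at 1 second by TCPT_RANGESET_NOSLOP().
 */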

static uint32_t
bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts)
{
	/*
	 * Start the FR timer, we do this based on getting the first one in
	 * the rc_tmap. Note that if it is NULL we must stop the timer. In all
	 * events we need to stop the running timer (if it is running) before
	 * starting the new one.
	 */
	uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
	int32_t idx;
	int32_t is_tlp_timer = 0;
	struct bbr_sendmap *rsm;

	if (bbr->rc_all_timers_stopped) {
		/* All timers have been stopped none are to run */
		return (0);
	}
	if (bbr->rc_in_persist) {
		/* We can't start any timer in persists */
		return (bbr_get_persists_timer_val(tp, bbr));
	}
	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
	if ((rsm == NULL) ||
	    ((tp->t_flags & TF_SACK_PERMIT) == 0) ||
	    (tp->t_state < TCPS_ESTABLISHED)) {
		/* Nothing on the send map */
activate_rxt:
		if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
			uint64_t tov;

			time_since_sent = 0;
			rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
			if (rsm) {
				idx = rsm->r_rtr_cnt - 1;
				if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
					tstmp_touse = rsm->r_tim_lastsent[idx];
				else
					tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
				if (TSTMP_GT(tstmp_touse, cts))
					time_since_sent = cts - tstmp_touse;
			}
			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
			if (tp->t_srtt == 0)
				tov = BBR_INITIAL_RTO;
			else
				tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) +
				    ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT);
			if (tp->t_rxtshift)
				tov *= tcp_backoff[tp->t_rxtshift];
			if (tov > time_since_sent)
				tov -= time_since_sent;
			else
				tov = bbr->r_ctl.rc_min_to;
			TCPT_RANGESET_NOSLOP(to, tov,
			    (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC),
			    (bbr->rc_max_rto_sec * USECS_IN_SECOND));
			bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to);
			return (to);
		}
		return (0);
	}
	if (rsm->r_flags & BBR_ACKED) {
		rsm = bbr_find_lowest_rsm(bbr);
		if (rsm == NULL) {
			/* No lowest? */
			goto activate_rxt;
		}
	}
	/* Convert from ms to usecs */
	if (rsm->r_flags & BBR_SACK_PASSED) {
		if ((tp->t_flags & TF_SENTFIN) &&
		    ((tp->snd_max - tp->snd_una) == 1) &&
		    (rsm->r_flags & BBR_HAS_FIN)) {
			/*
			 * We don't start a bbr rack timer if all we have is
			 * a FIN outstanding.
			 */
			goto activate_rxt;
		}
		srtt = bbr_get_rtt(bbr, BBR_RTT_RACK);
		thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm);
		idx = rsm->r_rtr_cnt - 1;
		exp = rsm->r_tim_lastsent[idx] + thresh;
		if (SEQ_GEQ(exp, cts)) {
			to = exp - cts;
			if (to < bbr->r_ctl.rc_min_to) {
				to = bbr->r_ctl.rc_min_to;
			}
		} else {
			to = bbr->r_ctl.rc_min_to;
		}
	} else {
		/* Ok we need to do a TLP not RACK */
		if (bbr->rc_tlp_in_progress != 0) {
			/*
			 * The previous send was a TLP.
			 */
			goto activate_rxt;
		}
		rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext);
		if (rsm == NULL) {
			/* We found no rsm to TLP with. */
			goto activate_rxt;
		}
		if (rsm->r_flags & BBR_HAS_FIN) {
			/* If its a FIN we don't do TLP */
			rsm = NULL;
			goto activate_rxt;
		}
		time_since_sent = 0;
		idx = rsm->r_rtr_cnt - 1;
		if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time))
			tstmp_touse = rsm->r_tim_lastsent[idx];
		else
			tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time;
		if (TSTMP_GT(tstmp_touse, cts))
			time_since_sent = cts - tstmp_touse;
		is_tlp_timer = 1;
		srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use);
		thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts);
		if (thresh > time_since_sent)
			to = thresh - time_since_sent;
		else
			to = bbr->r_ctl.rc_min_to;
		if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) {
			/*
			 * If the TLP time works out to larger than the max
			 * RTO lets not do TLP.. just RTO.
			 */
			goto activate_rxt;
		}
		if ((bbr->rc_tlp_rtx_out == 1) &&
		    (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) {
			/*
			 * Second retransmit of the same TLP
			 * lets not.
			 */
			bbr->rc_tlp_rtx_out = 0;
			goto activate_rxt;
		}
		if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) {
			/*
			 * The tail is no longer the last one I did a probe
			 * on
			 */
			bbr->r_ctl.rc_tlp_seg_send_cnt = 0;
			bbr->r_ctl.rc_last_tlp_seq = rsm->r_start;
		}
	}
	if (is_tlp_timer == 0) {
		BBR_STAT_INC(bbr_to_arm_rack);
		bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
	} else {
		bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to);
		if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) {
			/*
			 * We have exceeded how many times we can retran the
			 * current TLP timer, switch to the RTO timer.
			 */
			goto activate_rxt;
		} else {
			BBR_STAT_INC(bbr_to_arm_tlp);
			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
		}
	}
	return (to);
}

static inline int32_t
bbr_minseg(struct tcp_bbr *bbr)
{
	return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options);
}

static void
bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len)
{
	struct inpcb *inp;
	struct hpts_diag diag;
	uint32_t delayed_ack = 0;
	uint32_t left = 0;
	uint32_t hpts_timeout;
	uint8_t stopped;
	int32_t delay_calc = 0;
	uint32_t prev_delay = 0;

	inp = tp->t_inpcb;
	if (tcp_in_hpts(inp)) {
		/* A previous call is already set up */
		return;
	}
	if ((tp->t_state == TCPS_CLOSED) ||
	    (tp->t_state == TCPS_LISTEN)) {
		return;
	}
	stopped = bbr->rc_tmr_stopped;
	if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
		left = bbr->r_ctl.rc_timer_exp - cts;
	}
	bbr->r_ctl.rc_hpts_flags = 0;
	bbr->r_ctl.rc_timer_exp = 0;
	prev_delay = bbr->r_ctl.rc_last_delay_val;
	if (bbr->r_ctl.rc_last_delay_val &&
	    (slot == 0)) {
		/*
		 * If a previous pacer delay was in place we
		 * are not coming from the output side (where
		 * we calculate a delay, more likely a timer).
		 */
		slot = bbr->r_ctl.rc_last_delay_val;
		if (TSTMP_GT(cts, bbr->rc_pacer_started)) {
			/* Compensate for time passed  */
			delay_calc = cts - bbr->rc_pacer_started;
			if (delay_calc <= slot)
				slot -= delay_calc;
		}
	}
	/* Do we have early to make up for by pushing out the pacing time? */
	if (bbr->r_agg_early_set) {
		bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2);
		slot += bbr->r_ctl.rc_agg_early;
		bbr->r_ctl.rc_agg_early = 0;
		bbr->r_agg_early_set = 0;
	}
	/* Are we running a total debt that needs to be compensated for? */
	if (bbr->r_ctl.rc_hptsi_agg_delay) {
		if (slot > bbr->r_ctl.rc_hptsi_agg_delay) {
			/* We nuke the delay */
			slot -= bbr->r_ctl.rc_hptsi_agg_delay;
			bbr->r_ctl.rc_hptsi_agg_delay = 0;
		} else {
			/* We nuke some of the delay, put in a minimal 100usecs  */
			bbr->r_ctl.rc_hptsi_agg_delay -= slot;
			bbr->r_ctl.rc_last_delay_val = slot = 100;
		}
	}
	bbr->r_ctl.rc_last_delay_val = slot;
	hpts_timeout = bbr_timer_start(tp, bbr, cts);
	if (tp->t_flags & TF_DELACK) {
		if (bbr->rc_in_persist == 0) {
			delayed_ack = bbr_delack_time;
		} else {
			/*
			 * We are in persists and have
			 * gotten a new data element.
			 */
			if (hpts_timeout > bbr_delack_time) {
				/*
				 * Lets make the persists timer (which acks)
				 * be the smaller of hpts_timeout and bbr_delack_time.
				 */
				hpts_timeout = bbr_delack_time;
			}
		}
	}
	if (delayed_ack &&
	    ((hpts_timeout == 0) ||
	     (delayed_ack < hpts_timeout))) {
		/* We need a Delayed ack timer */
		bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
		hpts_timeout = delayed_ack;
	}
	if (slot) {
		/* Mark that we have a pacing timer up */
		BBR_STAT_INC(bbr_paced_segments);
		bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
	}
	/*
	 * If no timers are going to run and we will fall off the hptsi
	 * wheel, we resort to a keep-alive timer if its configured.
	 */
	if ((hpts_timeout == 0) &&
	    (slot == 0)) {
		if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
		    (tp->t_state <= TCPS_CLOSING)) {
			/*
			 * Ok we have no timer (persists, rack, tlp, rxt or
			 * del-ack), we don't have segments being paced. So
			 * all that is left is the keepalive timer.
			 */
			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
				hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
			} else {
				hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
			}
			bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
		}
	}
	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
	    (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
		/*
		 * RACK, TLP, persists and RXT timers all are restartable
		 * based on actions input .. i.e. we received a packet (ack
		 * or sack) and that changes things (rw, or snd_una etc).
		 * Thus we can restart them with a new value. For
		 * keep-alive, delayed_ack we keep track of what was left
		 * and restart the timer with a smaller value.
		 */
		if (left < hpts_timeout)
			hpts_timeout = left;
	}
	if (bbr->r_ctl.rc_incr_tmrs && slot &&
	    (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
		/*
		 * If configured to do so, and the timer is either
		 * the TLP or RXT timer, we need to increase the timeout
		 * by the pacing time. Consider the bottleneck at my
		 * machine as an example, we are sending something
		 * to start a TLP on. The last packet won't be emitted
		 * fully until the pacing time (the bottleneck will hold
		 * the data in place). Once the packet is emitted that
		 * is when we want to start waiting for the TLP. This
		 * is most evident with hardware pacing (where the nic
		 * is holding the packet(s) before emitting). But it
		 * can also show up in the network so we do it for all
		 * cases. Technically we would take off one packet from
		 * this extra delay but this is easier and being more
		 * conservative is probably better.
		 */
		hpts_timeout += slot;
	}
	if (hpts_timeout) {
		/*
		 * Hack alert for now we can't time-out over 2147 seconds (a
		 * bit more than 35min)
		 */
		if (hpts_timeout > 0x7ffffffe)
			hpts_timeout = 0x7ffffffe;
		bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
	} else
		bbr->r_ctl.rc_timer_exp = 0;
	if ((slot) &&
	    (bbr->rc_use_google ||
	     bbr->output_error_seen ||
	     (slot <= hpts_timeout))) {
		/*
		 * Tell LRO that it can queue packets while
		 * we pace.
		 */
		bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
		if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
		    (bbr->rc_cwnd_limited == 0)) {
			/*
			 * If we are not cwnd limited and we
			 * are running a rack timer we put on
			 * the do not disturb even for sack.
			 */
			inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
		} else
			inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
		bbr->rc_pacer_started = cts;

		(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot),
		    __LINE__, &diag);
		bbr->rc_timer_first = 0;
		bbr->bbr_timer_src = frm;
		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
		bbr_log_hpts_diag(bbr, cts, &diag);
	} else if (hpts_timeout) {
		(void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout),
		    __LINE__, &diag);
		/*
		 * We add the flag here as well if the slot is set,
		 * since hpts will call in to clear the queue first before
		 * calling the output routine (which does our timers).
		 * We don't want to set the flag if it is just a timer,
		 * else the arrival of data (that causes us to send more)
		 * might get delayed. Imagine being on a keep-alive timer
		 * and a request comes in for more data.
		 */
		if (slot)
			bbr->rc_pacer_started = cts;
		if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) &&
		    (bbr->rc_cwnd_limited == 0)) {
			/*
			 * For a rack timer, don't wake us even
			 * if a sack arrives as long as we are
			 * not cwnd limited.
			 */
			bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
			inp->inp_flags2 |= INP_DONT_SACK_QUEUE;
		} else {
			/* All other timers wake us up */
			bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
			inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
		}
		bbr->bbr_timer_src = frm;
		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0);
		bbr_log_hpts_diag(bbr, cts, &diag);
		bbr->rc_timer_first = 1;
	}
	bbr->rc_tmr_stopped = 0;
	bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay);
}

static void
bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb)
{
	/*
	 * We received an ack, and then did not call send or were bounced
	 * out because the hpts was running. Now a timer is up as well, is
	 * it the right timer?
	 */
	struct inpcb *inp;
	struct bbr_sendmap *rsm;
	uint32_t hpts_timeout;
	int tmr_up;

	tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
	if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
		return;
	rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap);
	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
	    (tmr_up == PACE_TMR_RXT)) {
		/* Should be an RXT */
		return;
	}
	inp = bbr->rc_inp;
	if (rsm == NULL) {
		/* Nothing outstanding? */
		if (tp->t_flags & TF_DELACK) {
			if (tmr_up == PACE_TMR_DELACK)
				/*
				 * We are supposed to have delayed ack up
				 * and we do
				 */
				return;
		} else if (sbavail(&inp->inp_socket->so_snd) &&
		    (tmr_up == PACE_TMR_RXT)) {
			/*
			 * if we hit enobufs then we would expect the
			 * possibility of nothing outstanding and the RXT up
			 * (and the hptsi timer).
			 */
			return;
		} else if (((V_tcp_always_keepalive ||
		    inp->inp_socket->so_options & SO_KEEPALIVE) &&
		    (tp->t_state <= TCPS_CLOSING)) &&
		    (tmr_up == PACE_TMR_KEEP) &&
		    (tp->snd_max == tp->snd_una)) {
			/* We should have keep alive up and we do */
			return;
		}
	}
	if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) {
		if ((tp->t_flags & TF_SENTFIN) &&
		    ((tp->snd_max - tp->snd_una) == 1) &&
		    (rsm->r_flags & BBR_HAS_FIN)) {
			/* needs to be a RXT */
			if (tmr_up == PACE_TMR_RXT)
				return;
			else
				goto wrong_timer;
		} else if (tmr_up == PACE_TMR_RACK)
			return;
		else
			goto wrong_timer;
	} else if (rsm && (tmr_up == PACE_TMR_RACK)) {
		/* Rack timer has priority if we have data out */
		return;
	} else if (SEQ_GT(tp->snd_max, tp->snd_una) &&
	    ((tmr_up == PACE_TMR_TLP) ||
	     (tmr_up == PACE_TMR_RXT))) {
		/*
		 * Either a TLP or RXT is fine if no sack-passed is in place
		 * and data is outstanding.
		 */
		return;
	} else if (tmr_up == PACE_TMR_DELACK) {
		/*
		 * If the delayed ack was going to go off before the
		 * rtx/tlp/rack timer were going to expire, then that would
		 * be the timer in control. Note we don't check the time
		 * here trusting the code is correct.
		 */
		return;
	}
	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
	    ((tmr_up == PACE_TMR_RXT) ||
	     (tmr_up == PACE_TMR_TLP) ||
	     (tmr_up == PACE_TMR_RACK))) {
		/*
		 * We have outstanding data and
		 * we *do* have a RACK, TLP or RXT
		 * timer running. We won't restart
		 * anything here since that's probably OK; we
		 * will get called with some timer here shortly.
		 */
		return;
	}
	/*
	 * Ok the timer originally started is not what we want now. We will
	 * force the hpts to be stopped if any, and restart with the slot
	 * set to what was in the saved slot.
	 */
wrong_timer:
	if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
		if (tcp_in_hpts(inp))
			tcp_hpts_remove(inp);
		bbr_timer_cancel(bbr, __LINE__, cts);
		bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val,
		    0);
	} else {
		/*
		 * Output is hptsi so we just need to switch the type of
		 * timer. We don't bother with keep-alive, since when we
		 * jump through the output, it will start the keep-alive if
		 * nothing is sent.
		 *
		 * We only need a delayed-ack added and or the hpts_timeout.
		 */
		hpts_timeout = bbr_timer_start(tp, bbr, cts);
		if (tp->t_flags & TF_DELACK) {
			if (hpts_timeout == 0) {
				hpts_timeout = bbr_delack_time;
				bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
			}
			else if (hpts_timeout > bbr_delack_time) {
				hpts_timeout = bbr_delack_time;
				bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK;
			}
		}
		if (hpts_timeout) {
			if (hpts_timeout > 0x7ffffffe)
				hpts_timeout = 0x7ffffffe;
			bbr->r_ctl.rc_timer_exp = cts + hpts_timeout;
		}
	}
}

int32_t bbr_clear_lost = 0;

/*
 * Considers the two time values now (cts) and earlier.
 * If cts is smaller than earlier, we could have
 * had a sequence wrap (our counter wraps every
 * 70 min or so) or it could be just clock skew
 * getting us two different time values. Clock skew
 * will show up within 10ms or so. So in such
 * a case (where cts is behind earlier time by
 * less than 10ms) we return 0. Otherwise we
 * return the true difference between them.
 */
static inline uint32_t
bbr_calc_time(uint32_t cts, uint32_t earlier_time) {
	/*
	 * Given two timestamps, the current time stamp cts, and some other
	 * time-stamp taken in theory earlier return the difference. The
	 * trick is here sometimes locking will get the other timestamp
	 * after the cts. If this occurs we need to return 0.
	 */
	if (TSTMP_GEQ(cts, earlier_time))
		return (cts - earlier_time);
	/*
	 * cts is behind earlier_time; if it is less than 10ms consider it 0.
	 * If it is more than 10ms difference then we had a time wrap. Else
	 * it is just the normal locking foo. I wonder if we should not go to
	 * 64bit TS and get rid of this issue.
	 */
	if (TSTMP_GEQ((cts + 10000), earlier_time))
		return (0);
	/*
	 * Ok the time must have wrapped. So we need to answer a large
	 * amount of time, which the normal subtraction should do.
	 */
	return (cts - earlier_time);
}
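
/*
 * Illustrative values for the cases above (not exhaustive): with
 * cts = 105000 and earlier_time = 100000 the plain difference of 5000 is
 * returned; with cts = 100000 and earlier_time = 105000 (earlier taken
 * "after" cts due to locking) the 10ms grace window applies and 0 is
 * returned; if earlier_time is far ahead of cts the counter is assumed to
 * have wrapped and the unsigned subtraction yields the large elapsed time.
 */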

static int
sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef BBR_INVARIANTS
		printf("Clearing BBR lost counters\n");
#endif
		COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT);
		COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT);
		COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT);
	} else if (stat == 2) {
#ifdef BBR_INVARIANTS
		printf("Clearing BBR option counters\n");
#endif
		COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE);
	} else if (stat == 3) {
#ifdef BBR_INVARIANTS
		printf("Clearing BBR stats counters\n");
#endif
		COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE);
	} else if (stat == 4) {
#ifdef BBR_INVARIANTS
		printf("Clearing BBR out-size counters\n");
#endif
		COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE);
	}
	bbr_clear_lost = 0;
	return (0);
}

static void
bbr_init_sysctls(void)
{
	struct sysctl_oid *bbr_probertt;
	struct sysctl_oid *bbr_hptsi;
	struct sysctl_oid *bbr_measure;
	struct sysctl_oid *bbr_cwnd;
	struct sysctl_oid *bbr_timeout;
	struct sysctl_oid *bbr_states;
	struct sysctl_oid *bbr_startup;
	struct sysctl_oid *bbr_policer;

	/* Probe rtt controls */
	bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_sysctl_root),
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "gain", CTLFLAG_RW,
	    &bbr_rttprobe_gain, 192,
	    "What is the filter gain drop in probe_rtt (0=disable)?");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "cwnd", CTLFLAG_RW,
	    &bbr_rtt_probe_cwndtarg, 4,
	    "How many mss's are outstanding during probe-rtt");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "int", CTLFLAG_RW,
	    &bbr_rtt_probe_limit, 4000000,
	    "If RTT has not shrank in this many micro-seconds enter probe-rtt");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "mintime", CTLFLAG_RW,
	    &bbr_rtt_probe_time, 200000,
	    "How many microseconds in probe-rtt");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "filter_len_sec", CTLFLAG_RW,
	    &bbr_filter_len_sec, 6,
	    "How long in seconds does the rttProp filter run?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "drain_rtt", CTLFLAG_RW,
	    &bbr_drain_rtt, BBR_SRTT,
	    "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "can_force", CTLFLAG_RW,
	    &bbr_can_force_probertt, 0,
	    "If we keep setting new low rtt's but delay going in probe-rtt can we force in??");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "enter_sets_force", CTLFLAG_RW,
	    &bbr_probertt_sets_rtt, 0,
	    "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "can_adjust", CTLFLAG_RW,
	    &bbr_can_adjust_probertt, 1,
	    "Can we dynamically adjust the probe-rtt limits and times?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "is_ratio", CTLFLAG_RW,
	    &bbr_is_ratio, 0,
	    "is the limit to filter a ratio?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "use_cwnd", CTLFLAG_RW,
	    &bbr_prtt_slam_cwnd, 0,
	    "Should we set/recover cwnd?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_probertt),
	    OID_AUTO, "can_use_ts", CTLFLAG_RW,
	    &bbr_can_use_ts_for_rtt, 1,
	    "Can we use the ms timestamp if available for retransmistted rtt calculations?");

	/* Pacing controls */
	bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_sysctl_root),
	    OID_AUTO,
	    "pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_hptsi),
	    OID_AUTO, "hw_pacing", CTLFLAG_RW,
	    &bbr_allow_hdwr_pacing, 1,
	    "Do we allow hardware pacing?");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_hptsi),
	    OID_AUTO, "hw_pacing_limit", CTLFLAG_RW,
	    &bbr_hardware_pacing_limit, 4000,
	    "Do we have a limited number of connections for pacing chelsio (0=no limit)?");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_hptsi),
	    OID_AUTO, "hw_pacing_adj", CTLFLAG_RW,
	    &bbr_hdwr_pace_adjust, 2,
	    "Multiplier to calculated tso size?");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_hptsi),
	    OID_AUTO, "hw_pacing_floor", CTLFLAG_RW,
	    &bbr_hdwr_pace_floor, 1,
	    "Do we invoke the hardware pacing floor?");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_hptsi),
	    OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW,
	    &bbr_hdwr_pacing_delay_cnt, 10,
	    "How many packets must be sent after hdwr pacing is enabled");
	SYSCTL_ADD_U32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_hptsi),
	    OID_AUTO, "bw_cross", CTLFLAG_RW,
	    &bbr_cross_over, 3000000,
	    "What is the point where we cross over to linux like TSO size set");
TSO size set"); 1286 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1287 SYSCTL_CHILDREN(bbr_hptsi), 1288 OID_AUTO, "seg_deltarg", CTLFLAG_RW, 1289 &bbr_hptsi_segments_delay_tar, 7000, 1290 "What is the worse case delay target for hptsi < 48Mbp connections"); 1291 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1292 SYSCTL_CHILDREN(bbr_hptsi), 1293 OID_AUTO, "enet_oh", CTLFLAG_RW, 1294 &bbr_include_enet_oh, 0, 1295 "Do we include the ethernet overhead in calculating pacing delay?"); 1296 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1297 SYSCTL_CHILDREN(bbr_hptsi), 1298 OID_AUTO, "ip_oh", CTLFLAG_RW, 1299 &bbr_include_ip_oh, 1, 1300 "Do we include the IP overhead in calculating pacing delay?"); 1301 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1302 SYSCTL_CHILDREN(bbr_hptsi), 1303 OID_AUTO, "tcp_oh", CTLFLAG_RW, 1304 &bbr_include_tcp_oh, 0, 1305 "Do we include the TCP overhead in calculating pacing delay?"); 1306 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1307 SYSCTL_CHILDREN(bbr_hptsi), 1308 OID_AUTO, "google_discount", CTLFLAG_RW, 1309 &bbr_google_discount, 10, 1310 "What is the default google discount percentage wise for pacing (11 = 1.1%%)?"); 1311 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1312 SYSCTL_CHILDREN(bbr_hptsi), 1313 OID_AUTO, "all_get_min", CTLFLAG_RW, 1314 &bbr_all_get_min, 0, 1315 "If you are less than a MSS do you just get the min?"); 1316 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1317 SYSCTL_CHILDREN(bbr_hptsi), 1318 OID_AUTO, "tso_min", CTLFLAG_RW, 1319 &bbr_hptsi_bytes_min, 1460, 1320 "For 0 -> 24Mbps what is floor number of segments for TSO"); 1321 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1322 SYSCTL_CHILDREN(bbr_hptsi), 1323 OID_AUTO, "seg_tso_max", CTLFLAG_RW, 1324 &bbr_hptsi_segments_max, 6, 1325 "For 0 -> 24Mbps what is top number of segments for TSO"); 1326 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1327 SYSCTL_CHILDREN(bbr_hptsi), 1328 OID_AUTO, "seg_floor", CTLFLAG_RW, 1329 &bbr_hptsi_segments_floor, 1, 1330 "Minimum TSO size we will fall too in segments"); 1331 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1332 SYSCTL_CHILDREN(bbr_hptsi), 1333 OID_AUTO, "utter_max", CTLFLAG_RW, 1334 &bbr_hptsi_utter_max, 0, 1335 "The absolute maximum that any pacing (outside of hardware) can be"); 1336 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1337 SYSCTL_CHILDREN(bbr_hptsi), 1338 OID_AUTO, "seg_divisor", CTLFLAG_RW, 1339 &bbr_hptsi_per_second, 100, 1340 "What is the divisor in our hptsi TSO calculation 512Mbps < X > 24Mbps "); 1341 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1342 SYSCTL_CHILDREN(bbr_hptsi), 1343 OID_AUTO, "srtt_mul", CTLFLAG_RW, 1344 &bbr_hptsi_max_mul, 1, 1345 "The multiplier for pace len max"); 1346 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1347 SYSCTL_CHILDREN(bbr_hptsi), 1348 OID_AUTO, "srtt_div", CTLFLAG_RW, 1349 &bbr_hptsi_max_div, 2, 1350 "The divisor for pace len max"); 1351 /* Measurement controls */ 1352 bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, 1353 SYSCTL_CHILDREN(bbr_sysctl_root), 1354 OID_AUTO, 1355 "measure", 1356 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1357 "Measurement controls"); 1358 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1359 SYSCTL_CHILDREN(bbr_measure), 1360 OID_AUTO, "min_i_bw", CTLFLAG_RW, 1361 &bbr_initial_bw_bps, 62500, 1362 "Minimum initial b/w in bytes per second"); 1363 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1364 SYSCTL_CHILDREN(bbr_measure), 1365 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1366 &bbr_sack_not_required, 0, 1367 "Do we allow bbr to run on connections not supporting SACK?"); 1368 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1369 SYSCTL_CHILDREN(bbr_measure), 1370 OID_AUTO, "use_google", CTLFLAG_RW, 1371 &bbr_use_google_algo, 0, 1372 "Use has close to google V1.0 has possible?"); 1373 
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "ts_limiting", CTLFLAG_RW,
	    &bbr_ts_limiting, 1,
	    "Do we attempt to use the peers timestamp to limit b/w caculations?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "ts_can_raise", CTLFLAG_RW,
	    &bbr_ts_can_raise, 0,
	    "Can we raise the b/w via timestamp b/w calculation?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "ts_delta", CTLFLAG_RW,
	    &bbr_min_usec_delta, 20000,
	    "How long in usec between ts of our sends in ts validation code?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "ts_peer_delta", CTLFLAG_RW,
	    &bbr_min_peer_delta, 20,
	    "What min numerical value should be between the peer deltas?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "ts_delta_percent", CTLFLAG_RW,
	    &bbr_delta_percent, 150,
	    "What percentage (150 = 15.0) do we allow variance for?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "min_measure_good_bw", CTLFLAG_RW,
	    &bbr_min_measurements_req, 1,
	    "What is the minimum measurement count we need before we switch to our b/w estimate");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "min_measure_before_pace", CTLFLAG_RW,
	    &bbr_no_pacing_until, 4,
	    "How many pkt-epoch's (0 is off) do we need before pacing is on?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "quanta", CTLFLAG_RW,
	    &bbr_quanta, 2,
	    "Extra quanta to add when calculating the target (ID section 4.2.3.2).");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_measure),
	    OID_AUTO, "noretran", CTLFLAG_RW,
	    &bbr_no_retran, 0,
	    "Should google mode not use retransmission measurements for the b/w estimation?");
	/* State controls */
	bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_sysctl_root),
	    OID_AUTO,
	    "states",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "State controls");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
	    OID_AUTO, "idle_restart", CTLFLAG_RW,
	    &bbr_uses_idle_restart, 0,
	    "Do we use a new special idle_restart state to ramp back up quickly?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
	    OID_AUTO, "idle_restart_threshold", CTLFLAG_RW,
	    &bbr_idle_restart_threshold, 100000,
	    "How long must we be idle before we restart??");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
	    OID_AUTO, "use_pkt_epoch", CTLFLAG_RW,
	    &bbr_state_is_pkt_epoch, 0,
	    "Do we use a pkt-epoch for substate if 0 rttProp?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
	    OID_AUTO, "startup_rtt_gain", CTLFLAG_RW,
	    &bbr_rtt_gain_thresh, 0,
	    "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
	    OID_AUTO, "drain_floor", CTLFLAG_RW,
	    &bbr_drain_floor, 88,
	    "What is the lowest we can drain (pg) too?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
	    OID_AUTO, "drain_2_target", CTLFLAG_RW,
	    &bbr_state_drain_2_tar, 1,
	    "Do we drain to target in drain substate?");
	SYSCTL_ADD_S32(&bbr_sysctl_ctx,
	    SYSCTL_CHILDREN(bbr_states),
"gain_2_target", CTLFLAG_RW, 1458 &bbr_gain_to_target, 1, 1459 "Does probe bw gain to target??"); 1460 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1461 SYSCTL_CHILDREN(bbr_states), 1462 OID_AUTO, "gain_extra_time", CTLFLAG_RW, 1463 &bbr_gain_gets_extra_too, 1, 1464 "Does probe bw gain get the extra time too?"); 1465 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1466 SYSCTL_CHILDREN(bbr_states), 1467 OID_AUTO, "ld_div", CTLFLAG_RW, 1468 &bbr_drain_drop_div, 5, 1469 "Long drain drop divider?"); 1470 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1471 SYSCTL_CHILDREN(bbr_states), 1472 OID_AUTO, "ld_mul", CTLFLAG_RW, 1473 &bbr_drain_drop_mul, 4, 1474 "Long drain drop multiplier?"); 1475 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1476 SYSCTL_CHILDREN(bbr_states), 1477 OID_AUTO, "rand_ot_disc", CTLFLAG_RW, 1478 &bbr_rand_ot, 50, 1479 "Random discount of the ot?"); 1480 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1481 SYSCTL_CHILDREN(bbr_states), 1482 OID_AUTO, "dr_filter_life", CTLFLAG_RW, 1483 &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT, 1484 "How many packet-epochs does the b/w delivery rate last?"); 1485 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1486 SYSCTL_CHILDREN(bbr_states), 1487 OID_AUTO, "subdrain_applimited", CTLFLAG_RW, 1488 &bbr_sub_drain_app_limit, 0, 1489 "Does our sub-state drain invoke app limited if its long?"); 1490 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1491 SYSCTL_CHILDREN(bbr_states), 1492 OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW, 1493 &bbr_sub_drain_slam_cwnd, 0, 1494 "Should we set/recover cwnd for sub-state drain?"); 1495 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1496 SYSCTL_CHILDREN(bbr_states), 1497 OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW, 1498 &bbr_slam_cwnd_in_main_drain, 0, 1499 "Should we set/recover cwnd for main-state drain?"); 1500 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1501 SYSCTL_CHILDREN(bbr_states), 1502 OID_AUTO, "google_gets_earlyout", CTLFLAG_RW, 1503 &google_allow_early_out, 1, 1504 "Should we allow google probe-bw/drain to exit early at flight target?"); 1505 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1506 SYSCTL_CHILDREN(bbr_states), 1507 OID_AUTO, "google_exit_loss", CTLFLAG_RW, 1508 &google_consider_lost, 1, 1509 "Should we have losses exit gain of probebw in google mode??"); 1510 /* Startup controls */ 1511 bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, 1512 SYSCTL_CHILDREN(bbr_sysctl_root), 1513 OID_AUTO, 1514 "startup", 1515 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1516 "Startup controls"); 1517 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1518 SYSCTL_CHILDREN(bbr_startup), 1519 OID_AUTO, "cheat_iwnd", CTLFLAG_RW, 1520 &bbr_sends_full_iwnd, 1, 1521 "Do we not pace but burst out initial windows has our TSO size?"); 1522 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1523 SYSCTL_CHILDREN(bbr_startup), 1524 OID_AUTO, "loss_threshold", CTLFLAG_RW, 1525 &bbr_startup_loss_thresh, 2000, 1526 "In startup what is the loss threshold in a pe that will exit us from startup?"); 1527 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1528 SYSCTL_CHILDREN(bbr_startup), 1529 OID_AUTO, "use_lowerpg", CTLFLAG_RW, 1530 &bbr_use_lower_gain_in_startup, 1, 1531 "Should we use a lower hptsi gain if we see loss in startup?"); 1532 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1533 SYSCTL_CHILDREN(bbr_startup), 1534 OID_AUTO, "gain", CTLFLAG_RW, 1535 &bbr_start_exit, 25, 1536 "What gain percent do we need to see to stay in startup??"); 1537 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1538 SYSCTL_CHILDREN(bbr_startup), 1539 OID_AUTO, "low_gain", CTLFLAG_RW, 1540 &bbr_low_start_exit, 15, 1541 "What gain percent do we need to see to stay in the lower gain startup??"); 1542 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1543 SYSCTL_CHILDREN(bbr_startup), 1544 
OID_AUTO, "loss_exit", CTLFLAG_RW, 1545 &bbr_exit_startup_at_loss, 1, 1546 "Should we exit startup at loss in an epoch if we are not gaining?"); 1547 /* CWND controls */ 1548 bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, 1549 SYSCTL_CHILDREN(bbr_sysctl_root), 1550 OID_AUTO, 1551 "cwnd", 1552 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1553 "Cwnd controls"); 1554 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1555 SYSCTL_CHILDREN(bbr_cwnd), 1556 OID_AUTO, "tar_rtt", CTLFLAG_RW, 1557 &bbr_cwndtarget_rtt_touse, 0, 1558 "Target cwnd rtt measurement to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?"); 1559 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1560 SYSCTL_CHILDREN(bbr_cwnd), 1561 OID_AUTO, "may_shrink", CTLFLAG_RW, 1562 &bbr_cwnd_may_shrink, 0, 1563 "Can the cwnd shrink if it would grow to more than the target?"); 1564 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1565 SYSCTL_CHILDREN(bbr_cwnd), 1566 OID_AUTO, "max_target_limit", CTLFLAG_RW, 1567 &bbr_target_cwnd_mult_limit, 8, 1568 "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink 0=no?"); 1569 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1570 SYSCTL_CHILDREN(bbr_cwnd), 1571 OID_AUTO, "highspeed_min", CTLFLAG_RW, 1572 &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS, 1573 "What is the high-speed min cwnd (rttProp under 1ms)"); 1574 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1575 SYSCTL_CHILDREN(bbr_cwnd), 1576 OID_AUTO, "lowspeed_min", CTLFLAG_RW, 1577 &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS, 1578 "What is the min cwnd (rttProp > 1ms)"); 1579 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1580 SYSCTL_CHILDREN(bbr_cwnd), 1581 OID_AUTO, "initwin", CTLFLAG_RW, 1582 &bbr_def_init_win, 10, 1583 "What is the BBR initial window, if 0 use tcp version"); 1584 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1585 SYSCTL_CHILDREN(bbr_cwnd), 1586 OID_AUTO, "do_loss_red", CTLFLAG_RW, 1587 &bbr_do_red, 600, 1588 "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?"); 1589 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1590 SYSCTL_CHILDREN(bbr_cwnd), 1591 OID_AUTO, "red_scale", CTLFLAG_RW, 1592 &bbr_red_scale, 20000, 1593 "What RTT do we scale with?"); 1594 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1595 SYSCTL_CHILDREN(bbr_cwnd), 1596 OID_AUTO, "red_growslow", CTLFLAG_RW, 1597 &bbr_red_growth_restrict, 1, 1598 "Do we restrict cwnd growth for whats in flight?"); 1599 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1600 SYSCTL_CHILDREN(bbr_cwnd), 1601 OID_AUTO, "red_div", CTLFLAG_RW, 1602 &bbr_red_div, 2, 1603 "If we reduce whats the divisor?"); 1604 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1605 SYSCTL_CHILDREN(bbr_cwnd), 1606 OID_AUTO, "red_mul", CTLFLAG_RW, 1607 &bbr_red_mul, 1, 1608 "If we reduce whats the mulitiplier?"); 1609 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1610 SYSCTL_CHILDREN(bbr_cwnd), 1611 OID_AUTO, "target_is_unit", CTLFLAG_RW, 1612 &bbr_target_is_bbunit, 0, 1613 "Is the state target the pacing_gain or BBR_UNIT?"); 1614 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1615 SYSCTL_CHILDREN(bbr_cwnd), 1616 OID_AUTO, "drop_limit", CTLFLAG_RW, 1617 &bbr_drop_limit, 0, 1618 "Number of segments limit for drop (0=use min_cwnd w/flight)?"); 1619 1620 /* Timeout controls */ 1621 bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, 1622 SYSCTL_CHILDREN(bbr_sysctl_root), 1623 OID_AUTO, 1624 "timeout", 1625 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1626 "Time out controls"); 1627 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1628 SYSCTL_CHILDREN(bbr_timeout), 1629 OID_AUTO, "delack", CTLFLAG_RW, 1630 &bbr_delack_time, 100000, 1631 "BBR's delayed ack time"); 1632 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1633 SYSCTL_CHILDREN(bbr_timeout), 1634 OID_AUTO, "tlp_uses", CTLFLAG_RW, 1635 &bbr_tlp_type_to_use, 3, 
1636 "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt"); 1637 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1638 SYSCTL_CHILDREN(bbr_timeout), 1639 OID_AUTO, "persmin", CTLFLAG_RW, 1640 &bbr_persist_min, 250000, 1641 "What is the minimum time in microseconds between persists"); 1642 SYSCTL_ADD_U32(&bbr_sysctl_ctx, 1643 SYSCTL_CHILDREN(bbr_timeout), 1644 OID_AUTO, "persmax", CTLFLAG_RW, 1645 &bbr_persist_max, 1000000, 1646 "What is the largest delay in microseconds between persists"); 1647 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1648 SYSCTL_CHILDREN(bbr_timeout), 1649 OID_AUTO, "tlp_minto", CTLFLAG_RW, 1650 &bbr_tlp_min, 10000, 1651 "TLP Min timeout in usecs"); 1652 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1653 SYSCTL_CHILDREN(bbr_timeout), 1654 OID_AUTO, "tlp_dack_time", CTLFLAG_RW, 1655 &bbr_delayed_ack_time, 200000, 1656 "TLP delayed ack compensation value"); 1657 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1658 SYSCTL_CHILDREN(bbr_sysctl_root), 1659 OID_AUTO, "minrto", CTLFLAG_RW, 1660 &bbr_rto_min_ms, 30, 1661 "Minimum RTO in ms"); 1662 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1663 SYSCTL_CHILDREN(bbr_timeout), 1664 OID_AUTO, "maxrto", CTLFLAG_RW, 1665 &bbr_rto_max_sec, 4, 1666 "Maximum RTO in seconds -- should be at least as large as min_rto"); 1667 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1668 SYSCTL_CHILDREN(bbr_timeout), 1669 OID_AUTO, "tlp_retry", CTLFLAG_RW, 1670 &bbr_tlp_max_resend, 2, 1671 "How many times does TLP retry a single segment or multiple with no ACK"); 1672 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1673 SYSCTL_CHILDREN(bbr_timeout), 1674 OID_AUTO, "minto", CTLFLAG_RW, 1675 &bbr_min_to, 1000, 1676 "Minimum rack timeout in useconds"); 1677 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1678 SYSCTL_CHILDREN(bbr_timeout), 1679 OID_AUTO, "pktdelay", CTLFLAG_RW, 1680 &bbr_pkt_delay, 1000, 1681 "Extra RACK time (in useconds) besides reordering thresh"); 1682 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1683 SYSCTL_CHILDREN(bbr_timeout), 1684 OID_AUTO, "incr_tmrs", CTLFLAG_RW, 1685 &bbr_incr_timers, 1, 1686 "Increase the RXT/TLP timer by the pacing time used?"); 1687 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1688 SYSCTL_CHILDREN(bbr_timeout), 1689 OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW, 1690 &bbr_marks_rxt_sack_passed, 0, 1691 "Mark sack passed on all those not ack'd when a RXT hits?"); 1692 /* Policer controls */ 1693 bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, 1694 SYSCTL_CHILDREN(bbr_sysctl_root), 1695 OID_AUTO, 1696 "policer", 1697 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1698 "Policer controls"); 1699 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1700 SYSCTL_CHILDREN(bbr_policer), 1701 OID_AUTO, "detect_enable", CTLFLAG_RW, 1702 &bbr_policer_detection_enabled, 1, 1703 "Is policer detection enabled??"); 1704 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1705 SYSCTL_CHILDREN(bbr_policer), 1706 OID_AUTO, "min_pes", CTLFLAG_RW, 1707 &bbr_lt_intvl_min_rtts, 4, 1708 "Minimum number of PE's?"); 1709 SYSCTL_ADD_U64(&bbr_sysctl_ctx, 1710 SYSCTL_CHILDREN(bbr_policer), 1711 OID_AUTO, "bwdiff", CTLFLAG_RW, 1712 &bbr_lt_bw_diff, (4000/8), 1713 "Minimal bw diff?"); 1714 SYSCTL_ADD_U64(&bbr_sysctl_ctx, 1715 SYSCTL_CHILDREN(bbr_policer), 1716 OID_AUTO, "bwratio", CTLFLAG_RW, 1717 &bbr_lt_bw_ratio, 8, 1718 "Minimal bw diff?"); 1719 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1720 SYSCTL_CHILDREN(bbr_policer), 1721 OID_AUTO, "from_rack_rxt", CTLFLAG_RW, 1722 &bbr_policer_call_from_rack_to, 0, 1723 "Do we call the policer detection code from a rack-timeout?"); 1724 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1725 SYSCTL_CHILDREN(bbr_policer), 1726 OID_AUTO, "false_postive", CTLFLAG_RW, 1727 &bbr_lt_intvl_fp, 0, 
1728 "What packet epoch do we do false-positive detection at (0=no)?"); 1729 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1730 SYSCTL_CHILDREN(bbr_policer), 1731 OID_AUTO, "loss_thresh", CTLFLAG_RW, 1732 &bbr_lt_loss_thresh, 196, 1733 "Loss threshold 196 = 19.6%?"); 1734 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1735 SYSCTL_CHILDREN(bbr_policer), 1736 OID_AUTO, "false_postive_thresh", CTLFLAG_RW, 1737 &bbr_lt_fd_thresh, 100, 1738 "What percentage is the false detection threshold (150=15.0)?"); 1739 /* All the rest */ 1740 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1741 SYSCTL_CHILDREN(bbr_sysctl_root), 1742 OID_AUTO, "cheat_rxt", CTLFLAG_RW, 1743 &bbr_use_rack_resend_cheat, 0, 1744 "Do we burst 1ms between sends on retransmissions (like rack)?"); 1745 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1746 SYSCTL_CHILDREN(bbr_sysctl_root), 1747 OID_AUTO, "error_paceout", CTLFLAG_RW, 1748 &bbr_error_base_paceout, 10000, 1749 "When we hit an error what is the min to pace out in usec's?"); 1750 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1751 SYSCTL_CHILDREN(bbr_sysctl_root), 1752 OID_AUTO, "kill_paceout", CTLFLAG_RW, 1753 &bbr_max_net_error_cnt, 10, 1754 "When we hit this many errors in a row, kill the session?"); 1755 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1756 SYSCTL_CHILDREN(bbr_sysctl_root), 1757 OID_AUTO, "data_after_close", CTLFLAG_RW, 1758 &bbr_ignore_data_after_close, 1, 1759 "Do we hold off sending a RST until all pending data is ack'd"); 1760 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1761 SYSCTL_CHILDREN(bbr_sysctl_root), 1762 OID_AUTO, "resend_use_tso", CTLFLAG_RW, 1763 &bbr_resends_use_tso, 0, 1764 "Can resends use TSO?"); 1765 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1766 SYSCTL_CHILDREN(bbr_sysctl_root), 1767 OID_AUTO, "sblklimit", CTLFLAG_RW, 1768 &bbr_sack_block_limit, 128, 1769 "When do we start ignoring small sack blocks"); 1770 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1771 SYSCTL_CHILDREN(bbr_sysctl_root), 1772 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1773 &bbr_verbose_logging, 0, 1774 "Should BBR black box logging be verbose"); 1775 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1776 SYSCTL_CHILDREN(bbr_sysctl_root), 1777 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1778 &bbr_reorder_thresh, 2, 1779 "What factor for rack will be added when seeing reordering (shift right)"); 1780 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1781 SYSCTL_CHILDREN(bbr_sysctl_root), 1782 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1783 &bbr_reorder_fade, 0, 1784 "Does reorder detection fade, if so how many ms (0 means never)"); 1785 SYSCTL_ADD_S32(&bbr_sysctl_ctx, 1786 SYSCTL_CHILDREN(bbr_sysctl_root), 1787 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1788 &bbr_tlp_thresh, 1, 1789 "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1790 /* Stats and counters */ 1791 /* The pacing counters for hdwr/software can't be in the array */ 1792 bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); 1793 bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); 1794 SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, 1795 SYSCTL_CHILDREN(bbr_sysctl_root), 1796 OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD, 1797 &bbr_hdwr_pacing_enobuf, 1798 "Total number of enobufs for hardware paced flows"); 1799 SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, 1800 SYSCTL_CHILDREN(bbr_sysctl_root), 1801 OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD, 1802 &bbr_nohdwr_pacing_enobuf, 1803 "Total number of enobufs for non-hardware paced flows"); 1804 1805 bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK); 1806 SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, 1807 SYSCTL_CHILDREN(bbr_sysctl_root), 1808 OID_AUTO, "hdwr_pacing", CTLFLAG_RD, 1809 &bbr_flows_whdwr_pacing, 1810 "Total 
number of hardware paced flows"); 1811 bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK); 1812 SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, 1813 SYSCTL_CHILDREN(bbr_sysctl_root), 1814 OID_AUTO, "software_pacing", CTLFLAG_RD, 1815 &bbr_flows_nohdwr_pacing, 1816 "Total number of software paced flows"); 1817 COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK); 1818 SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), 1819 OID_AUTO, "stats", CTLFLAG_RD, 1820 bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats"); 1821 COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK); 1822 SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), 1823 OID_AUTO, "opts", CTLFLAG_RD, 1824 bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats"); 1825 COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK); 1826 SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), 1827 OID_AUTO, "lost", CTLFLAG_RD, 1828 bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur"); 1829 COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK); 1830 SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), 1831 OID_AUTO, "stateresend", CTLFLAG_RD, 1832 bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend"); 1833 COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK); 1834 SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), 1835 OID_AUTO, "statetime", CTLFLAG_RD, 1836 bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states"); 1837 COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1838 SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), 1839 OID_AUTO, "outsize", CTLFLAG_RD, 1840 bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls"); 1841 SYSCTL_ADD_PROC(&bbr_sysctl_ctx, 1842 SYSCTL_CHILDREN(bbr_sysctl_root), 1843 OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1844 &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters"); 1845 } 1846 1847 static void 1848 bbr_counter_destroy(void) 1849 { 1850 COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE); 1851 COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE); 1852 COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE); 1853 COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT); 1854 COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT); 1855 COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT); 1856 counter_u64_free(bbr_nohdwr_pacing_enobuf); 1857 counter_u64_free(bbr_hdwr_pacing_enobuf); 1858 counter_u64_free(bbr_flows_whdwr_pacing); 1859 counter_u64_free(bbr_flows_nohdwr_pacing); 1860 1861 } 1862 1863 static __inline void 1864 bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts) 1865 { 1866 memset(l, 0, sizeof(union tcp_log_stackspecific)); 1867 l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate; 1868 l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate); 1869 l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop); 1870 l->bw_inuse = bbr_get_bw(bbr); 1871 l->inflight = ctf_flight_size(bbr->rc_tp, 1872 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 1873 l->applimited = bbr->r_ctl.r_app_limited_until; 1874 l->delivered = bbr->r_ctl.rc_delivered; 1875 l->timeStamp = cts; 1876 l->lost = bbr->r_ctl.rc_lost; 1877 l->bbr_state = bbr->rc_bbr_state; 1878 l->bbr_substate = bbr_state_val(bbr); 1879 l->epoch = bbr->r_ctl.rc_rtt_epoch; 1880 l->lt_epoch = bbr->r_ctl.rc_lt_epoch; 1881 l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain; 1882 l->cwnd_gain = 
bbr->r_ctl.rc_bbr_cwnd_gain; 1883 l->inhpts = tcp_in_hpts(bbr->rc_inp); 1884 l->use_lt_bw = bbr->rc_lt_use_bw; 1885 l->pkts_out = bbr->r_ctl.rc_flight_at_input; 1886 l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch; 1887 } 1888 1889 static void 1890 bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason) 1891 { 1892 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1893 union tcp_log_stackspecific log; 1894 1895 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 1896 log.u_bbr.flex1 = 0; 1897 log.u_bbr.flex2 = 0; 1898 log.u_bbr.flex5 = 0; 1899 log.u_bbr.flex3 = 0; 1900 log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate; 1901 log.u_bbr.flex7 = reason; 1902 log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt; 1903 log.u_bbr.flex8 = 0; 1904 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 1905 &bbr->rc_inp->inp_socket->so_rcv, 1906 &bbr->rc_inp->inp_socket->so_snd, 1907 BBR_LOG_BW_RED_EV, 0, 1908 0, &log, false, &bbr->rc_tv); 1909 } 1910 } 1911 1912 static void 1913 bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count) 1914 { 1915 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1916 union tcp_log_stackspecific log; 1917 1918 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 1919 log.u_bbr.flex1 = seq; 1920 log.u_bbr.flex2 = count; 1921 log.u_bbr.flex8 = mode; 1922 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 1923 &bbr->rc_inp->inp_socket->so_rcv, 1924 &bbr->rc_inp->inp_socket->so_snd, 1925 BBR_LOG_LOWGAIN, 0, 1926 0, &log, false, &bbr->rc_tv); 1927 } 1928 } 1929 1930 static void 1931 bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling, 1932 uint8_t reason, uint32_t p_maxseg, int len) 1933 { 1934 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1935 union tcp_log_stackspecific log; 1936 1937 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 1938 log.u_bbr.flex1 = p_maxseg; 1939 log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags; 1940 log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; 1941 log.u_bbr.flex4 = reason; 1942 log.u_bbr.flex5 = bbr->rc_in_persist; 1943 log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val; 1944 log.u_bbr.flex7 = p_maxseg; 1945 log.u_bbr.flex8 = bbr->rc_in_persist; 1946 log.u_bbr.pkts_out = 0; 1947 log.u_bbr.applimited = len; 1948 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 1949 &bbr->rc_inp->inp_socket->so_rcv, 1950 &bbr->rc_inp->inp_socket->so_snd, 1951 BBR_LOG_JUSTRET, 0, 1952 tlen, &log, false, &bbr->rc_tv); 1953 } 1954 } 1955 1956 static void 1957 bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq) 1958 { 1959 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1960 union tcp_log_stackspecific log; 1961 1962 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 1963 log.u_bbr.flex1 = seq; 1964 log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; 1965 log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start; 1966 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 1967 &bbr->rc_inp->inp_socket->so_rcv, 1968 &bbr->rc_inp->inp_socket->so_snd, 1969 BBR_LOG_ENTREC, 0, 1970 0, &log, false, &bbr->rc_tv); 1971 } 1972 } 1973 1974 static void 1975 bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts) 1976 { 1977 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1978 union tcp_log_stackspecific log; 1979 1980 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 1981 log.u_bbr.flex1 = tso; 1982 log.u_bbr.flex2 = maxseg; 1983 log.u_bbr.flex3 = mtu; 1984 log.u_bbr.flex4 = csum_flags; 1985 TCP_LOG_EVENTP(tp, NULL, 1986 &bbr->rc_inp->inp_socket->so_rcv, 1987 
&bbr->rc_inp->inp_socket->so_snd, 1988 BBR_LOG_MSGSIZE, 0, 1989 0, &log, false, &bbr->rc_tv); 1990 } 1991 } 1992 1993 static void 1994 bbr_log_flowend(struct tcp_bbr *bbr) 1995 { 1996 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1997 union tcp_log_stackspecific log; 1998 struct sockbuf *r, *s; 1999 struct timeval tv; 2000 2001 if (bbr->rc_inp->inp_socket) { 2002 r = &bbr->rc_inp->inp_socket->so_rcv; 2003 s = &bbr->rc_inp->inp_socket->so_snd; 2004 } else { 2005 r = s = NULL; 2006 } 2007 bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv)); 2008 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2009 r, s, 2010 TCP_LOG_FLOWEND, 0, 2011 0, &log, false, &tv); 2012 } 2013 } 2014 2015 static void 2016 bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, 2017 uint32_t lost, uint32_t del) 2018 { 2019 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2020 union tcp_log_stackspecific log; 2021 2022 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2023 log.u_bbr.flex1 = lost; 2024 log.u_bbr.flex2 = del; 2025 log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw; 2026 log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt; 2027 log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; 2028 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; 2029 log.u_bbr.flex7 = line; 2030 log.u_bbr.flex8 = 0; 2031 log.u_bbr.inflight = bbr->r_ctl.r_measurement_count; 2032 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2033 &bbr->rc_inp->inp_socket->so_rcv, 2034 &bbr->rc_inp->inp_socket->so_snd, 2035 BBR_LOG_PKT_EPOCH, 0, 2036 0, &log, false, &bbr->rc_tv); 2037 } 2038 } 2039 2040 static void 2041 bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time) 2042 { 2043 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2044 union tcp_log_stackspecific log; 2045 2046 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2047 log.u_bbr.flex1 = bbr->r_ctl.rc_lost; 2048 log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat; 2049 log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat; 2050 log.u_bbr.flex7 = line; 2051 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2052 &bbr->rc_inp->inp_socket->so_rcv, 2053 &bbr->rc_inp->inp_socket->so_snd, 2054 BBR_LOG_TIME_EPOCH, 0, 2055 0, &log, false, &bbr->rc_tv); 2056 } 2057 } 2058 2059 static void 2060 bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth) 2061 { 2062 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2063 union tcp_log_stackspecific log; 2064 2065 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2066 log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; 2067 log.u_bbr.flex2 = new_tar; 2068 log.u_bbr.flex3 = line; 2069 log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; 2070 log.u_bbr.flex5 = bbr_quanta; 2071 log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs; 2072 log.u_bbr.flex7 = bbr->rc_last_options; 2073 log.u_bbr.flex8 = meth; 2074 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2075 &bbr->rc_inp->inp_socket->so_rcv, 2076 &bbr->rc_inp->inp_socket->so_snd, 2077 BBR_LOG_STATE_TARGET, 0, 2078 0, &log, false, &bbr->rc_tv); 2079 } 2080 2081 } 2082 2083 static void 2084 bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line) 2085 { 2086 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2087 union tcp_log_stackspecific log; 2088 2089 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2090 log.u_bbr.flex1 = line; 2091 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; 2092 log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; 2093 if (bbr_state_is_pkt_epoch) 2094 log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); 2095 else 2096 
log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP); 2097 log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; 2098 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; 2099 log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000); 2100 log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra; 2101 log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state; 2102 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2103 &bbr->rc_inp->inp_socket->so_rcv, 2104 &bbr->rc_inp->inp_socket->so_snd, 2105 BBR_LOG_STATE, 0, 2106 0, &log, false, &bbr->rc_tv); 2107 } 2108 } 2109 2110 static void 2111 bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, 2112 uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond) 2113 { 2114 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2115 union tcp_log_stackspecific log; 2116 2117 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2118 log.u_bbr.flex1 = line; 2119 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; 2120 log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt; 2121 log.u_bbr.flex4 = applied; 2122 log.u_bbr.flex5 = rtt; 2123 log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; 2124 log.u_bbr.flex7 = cond; 2125 log.u_bbr.flex8 = reas; 2126 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2127 &bbr->rc_inp->inp_socket->so_rcv, 2128 &bbr->rc_inp->inp_socket->so_snd, 2129 BBR_LOG_RTT_SHRINKS, 0, 2130 0, &log, false, &bbr->rc_tv); 2131 } 2132 } 2133 2134 static void 2135 bbr_log_type_exit_rec(struct tcp_bbr *bbr) 2136 { 2137 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2138 union tcp_log_stackspecific log; 2139 2140 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2141 log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start; 2142 log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; 2143 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; 2144 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2145 &bbr->rc_inp->inp_socket->so_rcv, 2146 &bbr->rc_inp->inp_socket->so_snd, 2147 BBR_LOG_EXITREC, 0, 2148 0, &log, false, &bbr->rc_tv); 2149 } 2150 } 2151 2152 static void 2153 bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg, 2154 uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line) 2155 { 2156 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2157 union tcp_log_stackspecific log; 2158 2159 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2160 log.u_bbr.flex1 = line; 2161 log.u_bbr.flex2 = prev_acked; 2162 log.u_bbr.flex3 = bytes_this_ack; 2163 log.u_bbr.flex4 = chg; 2164 log.u_bbr.flex5 = th_ack; 2165 log.u_bbr.flex6 = target; 2166 log.u_bbr.flex8 = meth; 2167 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2168 &bbr->rc_inp->inp_socket->so_rcv, 2169 &bbr->rc_inp->inp_socket->so_snd, 2170 BBR_LOG_CWND, 0, 2171 0, &log, false, &bbr->rc_tv); 2172 } 2173 } 2174 2175 static void 2176 bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin) 2177 { 2178 /* 2179 * Log the rtt sample we are applying to the srtt algorithm in 2180 * useconds. 
2181 */ 2182 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2183 union tcp_log_stackspecific log; 2184 2185 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2186 log.u_bbr.flex1 = rtt; 2187 log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time; 2188 log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay; 2189 log.u_bbr.flex4 = bbr->rc_tp->ts_offset; 2190 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; 2191 log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv); 2192 log.u_bbr.flex6 = tsin; 2193 log.u_bbr.flex7 = 0; 2194 log.u_bbr.flex8 = bbr->rc_ack_was_delayed; 2195 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2196 &bbr->rc_inp->inp_socket->so_rcv, 2197 &bbr->rc_inp->inp_socket->so_snd, 2198 TCP_LOG_RTT, 0, 2199 0, &log, false, &bbr->rc_tv); 2200 } 2201 } 2202 2203 static void 2204 bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit) 2205 { 2206 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2207 union tcp_log_stackspecific log; 2208 2209 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2210 log.u_bbr.flex1 = time_in; 2211 log.u_bbr.flex2 = line; 2212 log.u_bbr.flex8 = enter_exit; 2213 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2214 &bbr->rc_inp->inp_socket->so_rcv, 2215 &bbr->rc_inp->inp_socket->so_snd, 2216 BBR_LOG_PERSIST, 0, 2217 0, &log, false, &bbr->rc_tv); 2218 } 2219 } 2220 static void 2221 bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts) 2222 { 2223 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2224 union tcp_log_stackspecific log; 2225 2226 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2227 log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age; 2228 log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; 2229 log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; 2230 log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time; 2231 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; 2232 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2233 &bbr->rc_inp->inp_socket->so_rcv, 2234 &bbr->rc_inp->inp_socket->so_snd, 2235 BBR_LOG_ACKCLEAR, 0, 2236 0, &log, false, &bbr->rc_tv); 2237 } 2238 } 2239 2240 static void 2241 bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen, 2242 uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m) 2243 { 2244 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2245 union tcp_log_stackspecific log; 2246 struct timeval tv; 2247 2248 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2249 log.u_bbr.flex1 = nsegs; 2250 log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes; 2251 if (m) { 2252 struct timespec ts; 2253 2254 log.u_bbr.flex3 = m->m_flags; 2255 if (m->m_flags & M_TSTMP) { 2256 mbuf_tstmp2timespec(m, &ts); 2257 tv.tv_sec = ts.tv_sec; 2258 tv.tv_usec = ts.tv_nsec / 1000; 2259 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv); 2260 } else { 2261 log.u_bbr.lt_epoch = 0; 2262 } 2263 if (m->m_flags & M_TSTMP_LRO) { 2264 mbuf_tstmp2timeval(m, &tv); 2265 log.u_bbr.flex5 = tcp_tv_to_usectick(&tv); 2266 } else { 2267 /* No arrival timestamp */ 2268 log.u_bbr.flex5 = 0; 2269 } 2270 2271 log.u_bbr.pkts_out = tcp_get_usecs(&tv); 2272 } else { 2273 log.u_bbr.flex3 = 0; 2274 log.u_bbr.flex5 = 0; 2275 log.u_bbr.flex6 = 0; 2276 log.u_bbr.pkts_out = 0; 2277 } 2278 log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state; 2279 log.u_bbr.flex7 = bbr->r_wanted_output; 2280 log.u_bbr.flex8 = bbr->rc_in_persist; 2281 TCP_LOG_EVENTP(bbr->rc_tp, th, 2282 &bbr->rc_inp->inp_socket->so_rcv, 2283 &bbr->rc_inp->inp_socket->so_snd, 2284 TCP_LOG_IN, 0, 2285 tlen, &log, true, &bbr->rc_tv); 2286 } 2287 } 2288 2289 static 
void 2290 bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out) 2291 { 2292 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2293 union tcp_log_stackspecific log; 2294 2295 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2296 log.u_bbr.flex1 = did_out; 2297 log.u_bbr.flex2 = nxt_pkt; 2298 log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val; 2299 log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; 2300 log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp; 2301 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes; 2302 log.u_bbr.flex7 = bbr->r_wanted_output; 2303 log.u_bbr.flex8 = bbr->rc_in_persist; 2304 log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay; 2305 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2306 &bbr->rc_inp->inp_socket->so_rcv, 2307 &bbr->rc_inp->inp_socket->so_snd, 2308 BBR_LOG_DOSEG_DONE, 0, 2309 0, &log, true, &bbr->rc_tv); 2310 } 2311 } 2312 2313 static void 2314 bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts, 2315 int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz) 2316 { 2317 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2318 union tcp_log_stackspecific log; 2319 2320 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2321 log.u_bbr.flex1 = line; 2322 log.u_bbr.flex2 = o_len; 2323 log.u_bbr.flex3 = segcnt; 2324 log.u_bbr.flex4 = segsiz; 2325 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2326 &bbr->rc_inp->inp_socket->so_rcv, 2327 &bbr->rc_inp->inp_socket->so_snd, 2328 BBR_LOG_ENOBUF_JMP, ENOBUFS, 2329 len, &log, true, &bbr->rc_tv); 2330 } 2331 } 2332 2333 static void 2334 bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling) 2335 { 2336 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2337 union tcp_log_stackspecific log; 2338 2339 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2340 log.u_bbr.flex1 = timers; 2341 log.u_bbr.flex2 = ret; 2342 log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; 2343 log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; 2344 log.u_bbr.flex5 = cts; 2345 log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; 2346 log.u_bbr.flex8 = hpts_calling; 2347 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2348 &bbr->rc_inp->inp_socket->so_rcv, 2349 &bbr->rc_inp->inp_socket->so_snd, 2350 BBR_LOG_TO_PROCESS, 0, 2351 0, &log, false, &bbr->rc_tv); 2352 } 2353 } 2354 2355 static void 2356 bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num) 2357 { 2358 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2359 union tcp_log_stackspecific log; 2360 uint64_t ar; 2361 2362 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2363 log.u_bbr.flex1 = bbr->bbr_timer_src; 2364 log.u_bbr.flex2 = 0; 2365 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; 2366 ar = (uint64_t)(bbr->r_ctl.rc_resend); 2367 ar >>= 32; 2368 ar &= 0x00000000ffffffff; 2369 log.u_bbr.flex4 = (uint32_t)ar; 2370 ar = (uint64_t)bbr->r_ctl.rc_resend; 2371 ar &= 0x00000000ffffffff; 2372 log.u_bbr.flex5 = (uint32_t)ar; 2373 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); 2374 log.u_bbr.flex8 = to_num; 2375 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2376 &bbr->rc_inp->inp_socket->so_rcv, 2377 &bbr->rc_inp->inp_socket->so_snd, 2378 BBR_LOG_RTO, 0, 2379 0, &log, false, &bbr->rc_tv); 2380 } 2381 } 2382 2383 static void 2384 bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason) 2385 { 2386 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2387 union tcp_log_stackspecific log; 2388 2389 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2390 log.u_bbr.flex1 = flex1; 2391 log.u_bbr.flex2 = flex2; 2392 log.u_bbr.flex3 
= flex3; 2393 log.u_bbr.flex4 = 0; 2394 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; 2395 log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; 2396 log.u_bbr.flex8 = reason; 2397 log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw; 2398 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2399 &bbr->rc_inp->inp_socket->so_rcv, 2400 &bbr->rc_inp->inp_socket->so_snd, 2401 BBR_LOG_REDUCE, 0, 2402 0, &log, false, &bbr->rc_tv); 2403 } 2404 } 2405 2406 static void 2407 bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) 2408 { 2409 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2410 union tcp_log_stackspecific log; 2411 2412 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2413 log.u_bbr.flex1 = diag->p_nxt_slot; 2414 log.u_bbr.flex2 = diag->p_cur_slot; 2415 log.u_bbr.flex3 = diag->slot_req; 2416 log.u_bbr.flex4 = diag->inp_hptsslot; 2417 log.u_bbr.flex5 = diag->slot_remaining; 2418 log.u_bbr.flex6 = diag->need_new_to; 2419 log.u_bbr.flex7 = diag->p_hpts_active; 2420 log.u_bbr.flex8 = diag->p_on_min_sleep; 2421 /* Hijack other fields as needed */ 2422 log.u_bbr.epoch = diag->have_slept; 2423 log.u_bbr.lt_epoch = diag->yet_to_sleep; 2424 log.u_bbr.pkts_out = diag->co_ret; 2425 log.u_bbr.applimited = diag->hpts_sleep_time; 2426 log.u_bbr.delivered = diag->p_prev_slot; 2427 log.u_bbr.inflight = diag->p_runningslot; 2428 log.u_bbr.bw_inuse = diag->wheel_slot; 2429 log.u_bbr.rttProp = diag->wheel_cts; 2430 log.u_bbr.delRate = diag->maxslots; 2431 log.u_bbr.cur_del_rate = diag->p_curtick; 2432 log.u_bbr.cur_del_rate <<= 32; 2433 log.u_bbr.cur_del_rate |= diag->p_lasttick; 2434 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2435 &bbr->rc_inp->inp_socket->so_rcv, 2436 &bbr->rc_inp->inp_socket->so_snd, 2437 BBR_LOG_HPTSDIAG, 0, 2438 0, &log, false, &bbr->rc_tv); 2439 } 2440 } 2441 2442 static void 2443 bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, 2444 uint32_t thresh, uint32_t to) 2445 { 2446 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2447 union tcp_log_stackspecific log; 2448 2449 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2450 log.u_bbr.flex1 = bbr->rc_tp->t_rttvar; 2451 log.u_bbr.flex2 = time_since_sent; 2452 log.u_bbr.flex3 = srtt; 2453 log.u_bbr.flex4 = thresh; 2454 log.u_bbr.flex5 = to; 2455 log.u_bbr.flex6 = bbr->rc_tp->t_srtt; 2456 log.u_bbr.flex8 = mode; 2457 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2458 &bbr->rc_inp->inp_socket->so_rcv, 2459 &bbr->rc_inp->inp_socket->so_snd, 2460 BBR_LOG_TIMERPREP, 0, 2461 0, &log, false, &bbr->rc_tv); 2462 } 2463 } 2464 2465 static void 2466 bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, 2467 uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod) 2468 { 2469 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2470 union tcp_log_stackspecific log; 2471 2472 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2473 log.u_bbr.flex1 = usecs; 2474 log.u_bbr.flex2 = len; 2475 log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff); 2476 log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff); 2477 if (override) 2478 log.u_bbr.flex5 = (1 << 2); 2479 else 2480 log.u_bbr.flex5 = 0; 2481 log.u_bbr.flex6 = override; 2482 log.u_bbr.flex7 = gain; 2483 log.u_bbr.flex8 = mod; 2484 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2485 &bbr->rc_inp->inp_socket->so_rcv, 2486 &bbr->rc_inp->inp_socket->so_snd, 2487 BBR_LOG_HPTSI_CALC, 0, 2488 len, &log, false, &bbr->rc_tv); 2489 } 2490 } 2491 2492 static void 2493 bbr_log_to_start(struct tcp_bbr 
*bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2494 { 2495 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2496 union tcp_log_stackspecific log; 2497 2498 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2499 2500 log.u_bbr.flex1 = bbr->bbr_timer_src; 2501 log.u_bbr.flex2 = to; 2502 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; 2503 log.u_bbr.flex4 = slot; 2504 log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot; 2505 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); 2506 log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2; 2507 log.u_bbr.flex8 = which; 2508 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2509 &bbr->rc_inp->inp_socket->so_rcv, 2510 &bbr->rc_inp->inp_socket->so_snd, 2511 BBR_LOG_TIMERSTAR, 0, 2512 0, &log, false, &bbr->rc_tv); 2513 } 2514 } 2515 2516 static void 2517 bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm) 2518 { 2519 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2520 union tcp_log_stackspecific log; 2521 2522 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2523 log.u_bbr.flex1 = thresh; 2524 log.u_bbr.flex2 = lro; 2525 log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts; 2526 log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 2527 log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); 2528 log.u_bbr.flex6 = srtt; 2529 log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift; 2530 log.u_bbr.flex8 = frm; 2531 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2532 &bbr->rc_inp->inp_socket->so_rcv, 2533 &bbr->rc_inp->inp_socket->so_snd, 2534 BBR_LOG_THRESH_CALC, 0, 2535 0, &log, false, &bbr->rc_tv); 2536 } 2537 } 2538 2539 static void 2540 bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed) 2541 { 2542 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2543 union tcp_log_stackspecific log; 2544 2545 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2546 log.u_bbr.flex1 = line; 2547 log.u_bbr.flex2 = bbr->bbr_timer_src; 2548 log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; 2549 log.u_bbr.flex4 = bbr->rc_in_persist; 2550 log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; 2551 log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); 2552 log.u_bbr.flex8 = hpts_removed; 2553 log.u_bbr.pkts_out = bbr->rc_pacer_started; 2554 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2555 &bbr->rc_inp->inp_socket->so_rcv, 2556 &bbr->rc_inp->inp_socket->so_snd, 2557 BBR_LOG_TIMERCANC, 0, 2558 0, &log, false, &bbr->rc_tv); 2559 } 2560 } 2561 2562 static void 2563 bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta) 2564 { 2565 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2566 union tcp_log_stackspecific log; 2567 2568 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2569 log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio; 2570 log.u_bbr.flex2 = (peer_delta >> 32); 2571 log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff); 2572 log.u_bbr.flex4 = (delta >> 32); 2573 log.u_bbr.flex5 = (delta & 0x00000000ffffffff); 2574 log.u_bbr.flex7 = bbr->rc_ts_clock_set; 2575 log.u_bbr.flex8 = bbr->rc_ts_cant_be_used; 2576 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2577 &bbr->rc_inp->inp_socket->so_rcv, 2578 &bbr->rc_inp->inp_socket->so_snd, 2579 BBR_LOG_TSTMP_VAL, 0, 2580 0, &log, false, &bbr->rc_tv); 2581 } 2582 } 2583 2584 static void 2585 bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr) 2586 { 2587 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2588 union tcp_log_stackspecific log; 2589 
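		/*
		 * This event records the TSO/pacing sizing decision: the new
		 * size (flex1), the previous size (flex5), the tls argument
		 * (flex2), the configured pacing minimums (flex3, flex4), the
		 * maxseg in use (flex6), and the google-mode/hardware-pacing
		 * state folded into flex8 (the high bit is set when the size
		 * was chosen for hardware pacing).
		 */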
2590 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2591 log.u_bbr.flex1 = tsosz; 2592 log.u_bbr.flex2 = tls; 2593 log.u_bbr.flex3 = tcp_min_hptsi_time; 2594 log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min; 2595 log.u_bbr.flex5 = old_val; 2596 log.u_bbr.flex6 = maxseg; 2597 log.u_bbr.flex7 = bbr->rc_no_pacing; 2598 log.u_bbr.flex7 <<= 1; 2599 log.u_bbr.flex7 |= bbr->rc_past_init_win; 2600 if (hdwr) 2601 log.u_bbr.flex8 = 0x80 | bbr->rc_use_google; 2602 else 2603 log.u_bbr.flex8 = bbr->rc_use_google; 2604 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2605 &bbr->rc_inp->inp_socket->so_rcv, 2606 &bbr->rc_inp->inp_socket->so_snd, 2607 BBR_LOG_BBRTSO, 0, 2608 0, &log, false, &bbr->rc_tv); 2609 } 2610 } 2611 2612 static void 2613 bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, 2614 uint32_t flags, uint32_t line) 2615 { 2616 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2617 union tcp_log_stackspecific log; 2618 2619 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2620 log.u_bbr.flex1 = line; 2621 log.u_bbr.flex2 = rsm->r_start; 2622 log.u_bbr.flex3 = rsm->r_end; 2623 log.u_bbr.flex4 = rsm->r_delivered; 2624 log.u_bbr.flex5 = rsm->r_rtr_cnt; 2625 log.u_bbr.flex6 = rsm->r_dupack; 2626 log.u_bbr.flex7 = rsm->r_tim_lastsent[0]; 2627 log.u_bbr.flex8 = rsm->r_flags; 2628 /* Hijack the pkts_out fids */ 2629 log.u_bbr.applimited = flags; 2630 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2631 &bbr->rc_inp->inp_socket->so_rcv, 2632 &bbr->rc_inp->inp_socket->so_snd, 2633 BBR_RSM_CLEARED, 0, 2634 0, &log, false, &bbr->rc_tv); 2635 } 2636 } 2637 2638 static void 2639 bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts, 2640 uint32_t flex3, uint32_t flex2, uint32_t flex5, 2641 uint32_t flex6, uint32_t pkts_out, int flex7, 2642 uint32_t flex4, uint32_t flex1) 2643 { 2644 2645 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2646 union tcp_log_stackspecific log; 2647 2648 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2649 log.u_bbr.flex1 = flex1; 2650 log.u_bbr.flex2 = flex2; 2651 log.u_bbr.flex3 = flex3; 2652 log.u_bbr.flex4 = flex4; 2653 log.u_bbr.flex5 = flex5; 2654 log.u_bbr.flex6 = flex6; 2655 log.u_bbr.flex7 = flex7; 2656 /* Hijack the pkts_out fids */ 2657 log.u_bbr.pkts_out = pkts_out; 2658 log.u_bbr.flex8 = flex8; 2659 if (bbr->rc_ack_was_delayed) 2660 log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay; 2661 else 2662 log.u_bbr.epoch = 0; 2663 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2664 &bbr->rc_inp->inp_socket->so_rcv, 2665 &bbr->rc_inp->inp_socket->so_snd, 2666 BBR_LOG_BBRUPD, 0, 2667 flex2, &log, false, &bbr->rc_tv); 2668 } 2669 } 2670 2671 static void 2672 bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason, 2673 uint32_t newbw, uint32_t obw, uint32_t diff, 2674 uint32_t tim) 2675 { 2676 if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2677 union tcp_log_stackspecific log; 2678 2679 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2680 log.u_bbr.flex1 = reason; 2681 log.u_bbr.flex2 = newbw; 2682 log.u_bbr.flex3 = obw; 2683 log.u_bbr.flex4 = diff; 2684 log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost; 2685 log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del; 2686 log.u_bbr.flex7 = bbr->rc_lt_is_sampling; 2687 log.u_bbr.pkts_out = tim; 2688 log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw; 2689 if (bbr->rc_lt_use_bw == 0) 2690 log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; 2691 else 2692 log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; 2693 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2694 &bbr->rc_inp->inp_socket->so_rcv, 
2695 &bbr->rc_inp->inp_socket->so_snd, 2696 BBR_LOG_BWSAMP, 0, 2697 0, &log, false, &bbr->rc_tv); 2698 } 2699 } 2700 2701 static inline void 2702 bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line) 2703 { 2704 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2705 union tcp_log_stackspecific log; 2706 2707 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2708 log.u_bbr.flex1 = line; 2709 log.u_bbr.flex2 = tick; 2710 log.u_bbr.flex3 = tp->t_maxunacktime; 2711 log.u_bbr.flex4 = tp->t_acktime; 2712 log.u_bbr.flex8 = event; 2713 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2714 &bbr->rc_inp->inp_socket->so_rcv, 2715 &bbr->rc_inp->inp_socket->so_snd, 2716 BBR_LOG_PROGRESS, 0, 2717 0, &log, false, &bbr->rc_tv); 2718 } 2719 } 2720 2721 static void 2722 bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, 2723 uint64_t rate, uint64_t hw_rate, int line, uint32_t cts, 2724 int error) 2725 { 2726 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2727 union tcp_log_stackspecific log; 2728 2729 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2730 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2731 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2732 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2733 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2734 log.u_bbr.bw_inuse = rate; 2735 log.u_bbr.flex5 = line; 2736 log.u_bbr.flex6 = error; 2737 log.u_bbr.flex8 = bbr->skip_gain; 2738 log.u_bbr.flex8 <<= 1; 2739 log.u_bbr.flex8 |= bbr->gain_is_limited; 2740 log.u_bbr.flex8 <<= 1; 2741 log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing; 2742 log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; 2743 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2744 &bbr->rc_inp->inp_socket->so_rcv, 2745 &bbr->rc_inp->inp_socket->so_snd, 2746 BBR_LOG_HDWR_PACE, 0, 2747 0, &log, false, &bbr->rc_tv); 2748 } 2749 } 2750 2751 static void 2752 bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) 2753 { 2754 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2755 union tcp_log_stackspecific log; 2756 2757 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2758 log.u_bbr.flex1 = slot; 2759 log.u_bbr.flex2 = del_by; 2760 log.u_bbr.flex3 = prev_delay; 2761 log.u_bbr.flex4 = line; 2762 log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val; 2763 log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay; 2764 log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags); 2765 log.u_bbr.flex8 = bbr->rc_in_persist; 2766 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2767 &bbr->rc_inp->inp_socket->so_rcv, 2768 &bbr->rc_inp->inp_socket->so_snd, 2769 BBR_LOG_BBRSND, 0, 2770 len, &log, false, &bbr->rc_tv); 2771 } 2772 } 2773 2774 static void 2775 bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags) 2776 { 2777 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2778 union tcp_log_stackspecific log; 2779 2780 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2781 log.u_bbr.flex1 = bbr->r_ctl.rc_delivered; 2782 log.u_bbr.flex2 = 0; 2783 log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt; 2784 log.u_bbr.flex4 = end; 2785 log.u_bbr.flex5 = seq; 2786 log.u_bbr.flex6 = t; 2787 log.u_bbr.flex7 = match; 2788 log.u_bbr.flex8 = flags; 2789 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2790 &bbr->rc_inp->inp_socket->so_rcv, 2791 &bbr->rc_inp->inp_socket->so_snd, 2792 BBR_LOG_BBRRTT, 0, 2793 0, &log, false, &bbr->rc_tv); 2794 } 2795 } 2796 2797 
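/*
 * A note on the logging helpers in this file: each BBR_LOG_* event reuses
 * the generic flex1..flex8 slots of union tcp_log_stackspecific for
 * event-specific values, and several helpers also hijack unrelated fields
 * (pkts_out, epoch, lt_epoch, applimited, inflight, and so on) when more
 * room is needed, as their inline comments indicate.  The meaning of any
 * given slot therefore depends on the event type it was logged with.
 */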
static void 2798 bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method) 2799 { 2800 if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2801 union tcp_log_stackspecific log; 2802 2803 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 2804 log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; 2805 log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options); 2806 log.u_bbr.flex3 = bbr->r_ctl.gain_epoch; 2807 log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; 2808 log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs; 2809 log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight; 2810 log.u_bbr.flex7 = 0; 2811 log.u_bbr.flex8 = entry_method; 2812 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2813 &bbr->rc_inp->inp_socket->so_rcv, 2814 &bbr->rc_inp->inp_socket->so_snd, 2815 BBR_LOG_EXIT_GAIN, 0, 2816 0, &log, false, &bbr->rc_tv); 2817 } 2818 } 2819 2820 static void 2821 bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired) 2822 { 2823 if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2824 union tcp_log_stackspecific log; 2825 2826 bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); 2827 /* R-HU */ 2828 log.u_bbr.flex1 = 0; 2829 log.u_bbr.flex2 = 0; 2830 log.u_bbr.flex3 = 0; 2831 log.u_bbr.flex4 = 0; 2832 log.u_bbr.flex7 = 0; 2833 log.u_bbr.flex8 = settings_desired; 2834 2835 TCP_LOG_EVENTP(bbr->rc_tp, NULL, 2836 &bbr->rc_inp->inp_socket->so_rcv, 2837 &bbr->rc_inp->inp_socket->so_snd, 2838 BBR_LOG_SETTINGS_CHG, 0, 2839 0, &log, false, &bbr->rc_tv); 2840 } 2841 } 2842 2843 /* 2844 * Returns the bw from the our filter. 2845 */ 2846 static inline uint64_t 2847 bbr_get_full_bw(struct tcp_bbr *bbr) 2848 { 2849 uint64_t bw; 2850 2851 bw = get_filter_value(&bbr->r_ctl.rc_delrate); 2852 2853 return (bw); 2854 } 2855 2856 static inline void 2857 bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) 2858 { 2859 uint64_t calclr; 2860 uint32_t lost, del; 2861 2862 if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch) 2863 lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch; 2864 else 2865 lost = 0; 2866 del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del; 2867 if (lost == 0) { 2868 calclr = 0; 2869 } else if (del) { 2870 calclr = lost; 2871 calclr *= (uint64_t)1000; 2872 calclr /= (uint64_t)del; 2873 } else { 2874 /* Nothing delivered? 
100.0% loss */ 2875 calclr = 1000; 2876 } 2877 bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr; 2878 if (IN_RECOVERY(bbr->rc_tp->t_flags)) 2879 bbr->r_ctl.recovery_lr += (uint32_t)calclr; 2880 bbr->r_ctl.rc_pkt_epoch++; 2881 if (bbr->rc_no_pacing && 2882 (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) { 2883 bbr->rc_no_pacing = 0; 2884 tcp_bbr_tso_size_check(bbr, cts); 2885 } 2886 bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time); 2887 bbr->r_ctl.rc_pkt_epoch_time = cts; 2888 /* What was our loss rate */ 2889 bbr_log_pkt_epoch(bbr, cts, line, lost, del); 2890 bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered; 2891 bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost; 2892 } 2893 2894 static inline void 2895 bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) 2896 { 2897 uint32_t epoch_time; 2898 2899 /* Tick the RTT clock */ 2900 bbr->r_ctl.rc_rtt_epoch++; 2901 epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start; 2902 bbr_log_time_epoch(bbr, cts, line, epoch_time); 2903 bbr->r_ctl.rc_rcv_epoch_start = cts; 2904 } 2905 2906 static inline void 2907 bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked) 2908 { 2909 if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) { 2910 bbr->rc_is_pkt_epoch_now = 1; 2911 } 2912 } 2913 2914 /* 2915 * Returns the bw from either the b/w filter 2916 * or from the lt_bw (if the connection is being 2917 * policed). 2918 */ 2919 static inline uint64_t 2920 __bbr_get_bw(struct tcp_bbr *bbr) 2921 { 2922 uint64_t bw, min_bw; 2923 uint64_t rtt; 2924 int gm_measure_cnt = 1; 2925 2926 /* 2927 * For startup we make, like google, a 2928 * minimum b/w. This is generated from the 2929 * IW and the rttProp. We do fall back to srtt 2930 * if for some reason (initial handshake) we don't 2931 * have a rttProp. We, in the worst case, fall back 2932 * to the configured min_bw (rc_initial_hptsi_bw). 2933 */ 2934 if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { 2935 /* Attempt first to use rttProp */ 2936 rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); 2937 if (rtt && (rtt < 0xffffffff)) { 2938 measure: 2939 min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * 2940 ((uint64_t)1000000); 2941 min_bw /= rtt; 2942 if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) { 2943 min_bw = bbr->r_ctl.rc_initial_hptsi_bw; 2944 } 2945 2946 } else if (bbr->rc_tp->t_srtt != 0) { 2947 /* No rttProp, use srtt? */ 2948 rtt = bbr_get_rtt(bbr, BBR_SRTT); 2949 goto measure; 2950 } else { 2951 min_bw = bbr->r_ctl.rc_initial_hptsi_bw; 2952 } 2953 } else 2954 min_bw = 0; 2955 2956 if ((bbr->rc_past_init_win == 0) && 2957 (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp))) 2958 bbr->rc_past_init_win = 1; 2959 if ((bbr->rc_use_google) && (bbr->r_ctl.r_measurement_count >= 1)) 2960 gm_measure_cnt = 0; 2961 if (gm_measure_cnt && 2962 ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) || 2963 (bbr->rc_past_init_win == 0))) { 2964 /* For google we use our guess rate until we get 1 measurement */ 2965 2966 use_initial_window: 2967 rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); 2968 if (rtt && (rtt < 0xffffffff)) { 2969 /* 2970 * We have an RTT measurement. Use that in 2971 * combination with our initial window to calculate 2972 * a b/w. 
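			 * As an illustrative example (values assumed, not taken
			 * from this code): an initial window of 14600 bytes and
			 * an rtt of 50000 usec give 14600 * 1000000 / 50000 =
			 * 292000 bytes/sec, subject to the rc_initial_hptsi_bw
			 * floor applied just below.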
2973 */ 2974 bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * 2975 ((uint64_t)1000000); 2976 bw /= rtt; 2977 if (bw < bbr->r_ctl.rc_initial_hptsi_bw) { 2978 bw = bbr->r_ctl.rc_initial_hptsi_bw; 2979 } 2980 } else { 2981 /* Drop back to the 40 and punt to a default */ 2982 bw = bbr->r_ctl.rc_initial_hptsi_bw; 2983 } 2984 if (bw < 1) 2985 /* Probably should panic */ 2986 bw = 1; 2987 if (bw > min_bw) 2988 return (bw); 2989 else 2990 return (min_bw); 2991 } 2992 if (bbr->rc_lt_use_bw) 2993 bw = bbr->r_ctl.rc_lt_bw; 2994 else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0)) 2995 bw = bbr->r_ctl.red_bw; 2996 else 2997 bw = get_filter_value(&bbr->r_ctl.rc_delrate); 2998 if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) { 2999 /* 3000 * Enforce user set rate limit, keep in mind that 3001 * t_peakrate_thr is in B/s already 3002 */ 3003 bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw); 3004 } 3005 if (bw == 0) { 3006 /* We should not be at 0, go to the initial window then */ 3007 goto use_initial_window; 3008 } 3009 if (bw < 1) 3010 /* Probably should panic */ 3011 bw = 1; 3012 if (bw < min_bw) 3013 bw = min_bw; 3014 return (bw); 3015 } 3016 3017 static inline uint64_t 3018 bbr_get_bw(struct tcp_bbr *bbr) 3019 { 3020 uint64_t bw; 3021 3022 bw = __bbr_get_bw(bbr); 3023 return (bw); 3024 } 3025 3026 static inline void 3027 bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts) 3028 { 3029 bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch; 3030 bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time; 3031 bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; 3032 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; 3033 } 3034 3035 static inline void 3036 bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts) 3037 { 3038 bbr->rc_lt_is_sampling = 0; 3039 bbr->rc_lt_use_bw = 0; 3040 bbr->r_ctl.rc_lt_bw = 0; 3041 bbr_reset_lt_bw_interval(bbr, cts); 3042 } 3043 3044 static inline void 3045 bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin) 3046 { 3047 uint64_t diff; 3048 3049 /* Do we have a previous sample? */ 3050 if (bbr->r_ctl.rc_lt_bw) { 3051 /* Get the diff in bytes per second */ 3052 if (bbr->r_ctl.rc_lt_bw > bw) 3053 diff = bbr->r_ctl.rc_lt_bw - bw; 3054 else 3055 diff = bw - bbr->r_ctl.rc_lt_bw; 3056 if ((diff <= bbr_lt_bw_diff) || 3057 (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) { 3058 /* Consider us policed */ 3059 uint32_t saved_bw; 3060 3061 saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw; 3062 bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2; /* average of two */ 3063 bbr->rc_lt_use_bw = 1; 3064 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; 3065 /* 3066 * Use pkt based epoch for measuring length of 3067 * policer up 3068 */ 3069 bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch; 3070 /* 3071 * reason 4 is we need to start consider being 3072 * policed 3073 */ 3074 bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin); 3075 return; 3076 } 3077 } 3078 bbr->r_ctl.rc_lt_bw = bw; 3079 bbr_reset_lt_bw_interval(bbr, cts); 3080 bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin); 3081 } 3082 3083 static void 3084 bbr_randomize_extra_state_time(struct tcp_bbr *bbr) 3085 { 3086 uint32_t ran, deduct; 3087 3088 ran = arc4random_uniform(bbr_rand_ot); 3089 if (ran) { 3090 deduct = bbr->r_ctl.rc_level_state_extra / ran; 3091 bbr->r_ctl.rc_level_state_extra -= deduct; 3092 } 3093 } 3094 /* 3095 * Return randomly the starting state 3096 * to use in probebw. 
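 * The draw below, arc4random_uniform(BBR_SUBSTATE_COUNT - 1), is uniform
 * over [0, BBR_SUBSTATE_COUNT - 2], so the value returned,
 * BBR_SUBSTATE_COUNT - 1 - ran, is uniform over 1 through
 * BBR_SUBSTATE_COUNT - 1; the comment inside the function explains why 0
 * is deliberately excluded.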
3097 */ 3098 static uint8_t 3099 bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts) 3100 { 3101 uint32_t ran; 3102 uint8_t ret_val; 3103 3104 /* Initialize the offset to 0 */ 3105 bbr->r_ctl.rc_exta_time_gd = 0; 3106 bbr->rc_hit_state_1 = 0; 3107 bbr->r_ctl.rc_level_state_extra = 0; 3108 ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1)); 3109 /* 3110 * The math works funny here :) the return value is used to set the 3111 * substate and then the state change is called which increments by 3112 * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when 3113 * we fully enter the state. Note that the (8 - 1 - ran) assures that 3114 * we return 1 - 7, so we dont return 0 and end up starting in 3115 * state 1 (DRAIN). 3116 */ 3117 ret_val = BBR_SUBSTATE_COUNT - 1 - ran; 3118 /* Set an epoch */ 3119 if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) 3120 bbr_set_epoch(bbr, cts, __LINE__); 3121 3122 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 3123 return (ret_val); 3124 } 3125 3126 static void 3127 bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected) 3128 { 3129 uint32_t diff, d_time; 3130 uint64_t del_time, bw, lost, delivered; 3131 3132 if (bbr->r_use_policer == 0) 3133 return; 3134 if (bbr->rc_lt_use_bw) { 3135 /* We are using lt bw do we stop yet? */ 3136 diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; 3137 if (diff > bbr_lt_bw_max_rtts) { 3138 /* Reset it all */ 3139 reset_all: 3140 bbr_reset_lt_bw_sampling(bbr, cts); 3141 if (bbr->rc_filled_pipe) { 3142 bbr_set_epoch(bbr, cts, __LINE__); 3143 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); 3144 bbr_substate_change(bbr, cts, __LINE__, 0); 3145 bbr->rc_bbr_state = BBR_STATE_PROBE_BW; 3146 bbr_log_type_statechange(bbr, cts, __LINE__); 3147 } else { 3148 /* 3149 * This should not happen really 3150 * unless we remove the startup/drain 3151 * restrictions above. 3152 */ 3153 bbr->rc_bbr_state = BBR_STATE_STARTUP; 3154 bbr_set_epoch(bbr, cts, __LINE__); 3155 bbr->r_ctl.rc_bbr_state_time = cts; 3156 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 3157 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; 3158 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; 3159 bbr_set_state_target(bbr, __LINE__); 3160 bbr_log_type_statechange(bbr, cts, __LINE__); 3161 } 3162 /* reason 0 is to stop using lt-bw */ 3163 bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0); 3164 return; 3165 } 3166 if (bbr_lt_intvl_fp == 0) { 3167 /* Not doing false-positive detection */ 3168 return; 3169 } 3170 /* False positive detection */ 3171 if (diff == bbr_lt_intvl_fp) { 3172 /* At bbr_lt_intvl_fp we record the lost */ 3173 bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; 3174 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; 3175 } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) { 3176 /* Now is our loss rate still high? 
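	 * The check below compares lost * 1000 / delivered (the loss rate in
	 * tenths of a percent) against bbr_lt_fd_thresh; on the scale used in
	 * its sysctl description (150 == 15.0%), a value of 100 means 10.0%.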
*/ 3177 lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; 3178 delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; 3179 if ((delivered == 0) || 3180 (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) { 3181 /* No, still below our threshold */ 3182 bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0); 3183 } else { 3184 /* Yikes, it's still high; it must be a false positive */ 3185 bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0); 3186 goto reset_all; 3187 } 3188 } 3189 return; 3190 } 3191 /* 3192 * Wait for the first loss before sampling, to let the policer 3193 * exhaust its tokens and estimate the steady-state rate allowed by 3194 * the policer. Starting samples earlier includes bursts that 3195 * over-estimate the bw. 3196 */ 3197 if (bbr->rc_lt_is_sampling == 0) { 3198 /* reason 1 is to begin doing the sampling */ 3199 if (loss_detected == 0) 3200 return; 3201 bbr_reset_lt_bw_interval(bbr, cts); 3202 bbr->rc_lt_is_sampling = 1; 3203 bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0); 3204 return; 3205 } 3206 /* Now, how long were we delivering over the long-term interval? */ 3207 if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time)) 3208 d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time; 3209 else 3210 d_time = 0; 3211 3212 /* To avoid underestimates, reset sampling if we run out of data. */ 3213 if (bbr->r_ctl.r_app_limited_until) { 3214 /* Cannot measure in an app-limited state */ 3215 bbr_reset_lt_bw_sampling(bbr, cts); 3216 /* reason 2 is to reset sampling due to app limits */ 3217 bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time); 3218 return; 3219 } 3220 diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; 3221 if (diff < bbr_lt_intvl_min_rtts) { 3222 /* 3223 * need more samples (we don't 3224 * start on a round like linux so 3225 * we need 1 more). 3226 */ 3227 /* 6 is not_enough time or no-loss */ 3228 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); 3229 return; 3230 } 3231 if (diff > (4 * bbr_lt_intvl_min_rtts)) { 3232 /* 3233 * For now, if we wait too long, reset all sampling. We need 3234 * to do some research here; it's possible that we should 3235 * base this on how much loss has occurred: something like, 3236 * if it's under 10% (or some threshold), reset all, otherwise 3237 * don't. That's for phase II, I guess. 3238 */ 3239 bbr_reset_lt_bw_sampling(bbr, cts); 3240 /* reason 3 is to reset sampling due too long of sampling */ 3241 bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); 3242 return; 3243 } 3244 /* 3245 * End sampling interval when a packet is lost, so we estimate the 3246 * policer tokens were exhausted. Stopping the sampling before the 3247 * tokens are exhausted under-estimates the policed rate. 3248 */ 3249 if (loss_detected == 0) { 3250 /* 6 is not_enough time or no-loss */ 3251 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); 3252 return; 3253 } 3254 /* Calculate packets lost and delivered in sampling interval. */ 3255 lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; 3256 delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; 3257 if ((delivered == 0) || 3258 (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) { 3259 bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time); 3260 return; 3261 } 3262 if (d_time < 1000) { 3263 /* Not enough time.
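			 * d_time is in microseconds, so an interval shorter than
			 * 1000 usec (1 ms) is too small to turn into a useful
			 * rate sample; we simply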
wait */ 3264 /* 6 is not_enough time or no-loss */ 3265 bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); 3266 return; 3267 } 3268 if (d_time >= (0xffffffff / USECS_IN_MSEC)) { 3269 /* Too long */ 3270 bbr_reset_lt_bw_sampling(bbr, cts); 3271 /* reason 3 is to reset sampling due too long of sampling */ 3272 bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); 3273 return; 3274 } 3275 del_time = d_time; 3276 bw = delivered; 3277 bw *= (uint64_t)USECS_IN_SECOND; 3278 bw /= del_time; 3279 bbr_lt_bw_samp_done(bbr, bw, cts, d_time); 3280 } 3281 3282 /* 3283 * Allocate a sendmap from our zone. 3284 */ 3285 static struct bbr_sendmap * 3286 bbr_alloc(struct tcp_bbr *bbr) 3287 { 3288 struct bbr_sendmap *rsm; 3289 3290 BBR_STAT_INC(bbr_to_alloc); 3291 rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO)); 3292 if (rsm) { 3293 bbr->r_ctl.rc_num_maps_alloced++; 3294 return (rsm); 3295 } 3296 if (bbr->r_ctl.rc_free_cnt) { 3297 BBR_STAT_INC(bbr_to_alloc_emerg); 3298 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); 3299 TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); 3300 bbr->r_ctl.rc_free_cnt--; 3301 return (rsm); 3302 } 3303 BBR_STAT_INC(bbr_to_alloc_failed); 3304 return (NULL); 3305 } 3306 3307 static struct bbr_sendmap * 3308 bbr_alloc_full_limit(struct tcp_bbr *bbr) 3309 { 3310 if ((V_tcp_map_entries_limit > 0) && 3311 (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3312 BBR_STAT_INC(bbr_alloc_limited); 3313 if (!bbr->alloc_limit_reported) { 3314 bbr->alloc_limit_reported = 1; 3315 BBR_STAT_INC(bbr_alloc_limited_conns); 3316 } 3317 return (NULL); 3318 } 3319 return (bbr_alloc(bbr)); 3320 } 3321 3322 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3323 static struct bbr_sendmap * 3324 bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type) 3325 { 3326 struct bbr_sendmap *rsm; 3327 3328 if (limit_type) { 3329 /* currently there is only one limit type */ 3330 if (V_tcp_map_split_limit > 0 && 3331 bbr->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 3332 BBR_STAT_INC(bbr_split_limited); 3333 if (!bbr->alloc_limit_reported) { 3334 bbr->alloc_limit_reported = 1; 3335 BBR_STAT_INC(bbr_alloc_limited_conns); 3336 } 3337 return (NULL); 3338 } 3339 } 3340 3341 /* allocate and mark in the limit type, if set */ 3342 rsm = bbr_alloc(bbr); 3343 if (rsm != NULL && limit_type) { 3344 rsm->r_limit_type = limit_type; 3345 bbr->r_ctl.rc_num_split_allocs++; 3346 } 3347 return (rsm); 3348 } 3349 3350 static void 3351 bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) 3352 { 3353 if (rsm->r_limit_type) { 3354 /* currently there is only one limit type */ 3355 bbr->r_ctl.rc_num_split_allocs--; 3356 } 3357 if (rsm->r_is_smallmap) 3358 bbr->r_ctl.rc_num_small_maps_alloced--; 3359 if (bbr->r_ctl.rc_tlp_send == rsm) 3360 bbr->r_ctl.rc_tlp_send = NULL; 3361 if (bbr->r_ctl.rc_resend == rsm) { 3362 bbr->r_ctl.rc_resend = NULL; 3363 } 3364 if (bbr->r_ctl.rc_next == rsm) 3365 bbr->r_ctl.rc_next = NULL; 3366 if (bbr->r_ctl.rc_sacklast == rsm) 3367 bbr->r_ctl.rc_sacklast = NULL; 3368 if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { 3369 memset(rsm, 0, sizeof(struct bbr_sendmap)); 3370 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); 3371 rsm->r_limit_type = 0; 3372 bbr->r_ctl.rc_free_cnt++; 3373 return; 3374 } 3375 bbr->r_ctl.rc_num_maps_alloced--; 3376 uma_zfree(bbr_zone, rsm); 3377 } 3378 3379 /* 3380 * Returns the BDP. 
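 * For example, at bw = 1,250,000 bytes/sec (10 Mbps) and rtt = 40,000 usec the product below works out to (40000 * 1250000) / 1000000 = 50,000 bytes per RTT.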
3381 */ 3382 static uint64_t 3383 bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw) { 3384 /* 3385 * Calculate the bytes in flight needed given the bw (in bytes per 3386 * second) and the specifyed rtt in useconds. We need to put out the 3387 * returned value per RTT to match that rate. Gain will normally 3388 * raise it up from there. 3389 * 3390 * This should not overflow as long as the bandwidth is below 1 3391 * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller 3392 * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30). 3393 */ 3394 uint64_t usec_per_sec; 3395 3396 usec_per_sec = USECS_IN_SECOND; 3397 return ((rtt * bw) / usec_per_sec); 3398 } 3399 3400 /* 3401 * Return the initial cwnd. 3402 */ 3403 static uint32_t 3404 bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp) 3405 { 3406 uint32_t i_cwnd; 3407 3408 if (bbr->rc_init_win) { 3409 i_cwnd = bbr->rc_init_win * tp->t_maxseg; 3410 } else if (V_tcp_initcwnd_segments) 3411 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), 3412 max(2 * tp->t_maxseg, 14600)); 3413 else if (V_tcp_do_rfc3390) 3414 i_cwnd = min(4 * tp->t_maxseg, 3415 max(2 * tp->t_maxseg, 4380)); 3416 else { 3417 /* Per RFC5681 Section 3.1 */ 3418 if (tp->t_maxseg > 2190) 3419 i_cwnd = 2 * tp->t_maxseg; 3420 else if (tp->t_maxseg > 1095) 3421 i_cwnd = 3 * tp->t_maxseg; 3422 else 3423 i_cwnd = 4 * tp->t_maxseg; 3424 } 3425 return (i_cwnd); 3426 } 3427 3428 /* 3429 * Given a specified gain, return the target 3430 * cwnd based on that gain. 3431 */ 3432 static uint32_t 3433 bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw) 3434 { 3435 uint64_t bdp, rtt; 3436 uint32_t cwnd; 3437 3438 if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) || 3439 (bbr_get_full_bw(bbr) == 0)) { 3440 /* No measurements yet */ 3441 return (bbr_initial_cwnd(bbr, bbr->rc_tp)); 3442 } 3443 /* 3444 * Get bytes per RTT needed (rttProp is normally in 3445 * bbr_cwndtarget_rtt_touse) 3446 */ 3447 rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse); 3448 /* Get the bdp from the two values */ 3449 bdp = bbr_get_bw_delay_prod(rtt, bw); 3450 /* Now apply the gain */ 3451 cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT)); 3452 3453 return (cwnd); 3454 } 3455 3456 static uint32_t 3457 bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain) 3458 { 3459 uint32_t cwnd, mss; 3460 3461 mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); 3462 /* Get the base cwnd with gain rounded to a mss */ 3463 cwnd = roundup(bbr_get_raw_target_cwnd(bbr, bw, gain), mss); 3464 /* 3465 * Add in N (2 default since we do not have a 3466 * fq layer to trap packets in) quanta's per the I-D 3467 * section 4.2.3.2 quanta adjust. 3468 */ 3469 cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs); 3470 if (bbr->rc_use_google) { 3471 if((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && 3472 (bbr_state_val(bbr) == BBR_SUB_GAIN)) { 3473 /* 3474 * The linux implementation adds 3475 * an extra 2 x mss in gain cycle which 3476 * is documented no-where except in the code. 3477 * so we add more for Neal undocumented feature 3478 */ 3479 cwnd += 2 * mss; 3480 } 3481 if ((cwnd / mss) & 0x1) { 3482 /* Round up for odd num mss */ 3483 cwnd += mss; 3484 } 3485 } 3486 /* Are we below the min cwnd? 
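 * (For reference, the raw target computed above is (bdp * gain) / BBR_UNIT rounded up, then rounded to an mss and padded with the quanta; e.g. a gain of 2 * BBR_UNIT on a 50,000 byte bdp gives roughly 100,000 bytes before those adjustments. Here we only enforce the floor.)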
*/ 3487 if (cwnd < get_min_cwnd(bbr)) 3488 return (get_min_cwnd(bbr)); 3489 return (cwnd); 3490 } 3491 3492 static uint16_t 3493 bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain) 3494 { 3495 if (gain < 1) 3496 gain = 1; 3497 return (gain); 3498 } 3499 3500 static uint32_t 3501 bbr_get_header_oh(struct tcp_bbr *bbr) 3502 { 3503 int seg_oh; 3504 3505 seg_oh = 0; 3506 if (bbr->r_ctl.rc_inc_tcp_oh) { 3507 /* Do we include TCP overhead? */ 3508 seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr)); 3509 } 3510 if (bbr->r_ctl.rc_inc_ip_oh) { 3511 /* Do we include IP overhead? */ 3512 #ifdef INET6 3513 if (bbr->r_is_v6) { 3514 seg_oh += sizeof(struct ip6_hdr); 3515 } else 3516 #endif 3517 { 3518 3519 #ifdef INET 3520 seg_oh += sizeof(struct ip); 3521 #endif 3522 } 3523 } 3524 if (bbr->r_ctl.rc_inc_enet_oh) { 3525 /* Do we include the ethernet overhead? */ 3526 seg_oh += sizeof(struct ether_header); 3527 } 3528 return(seg_oh); 3529 } 3530 3531 static uint32_t 3532 bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw) 3533 { 3534 uint64_t divor, res, tim; 3535 3536 if (useconds_time == 0) 3537 return (0); 3538 gain = bbr_gain_adjust(bbr, gain); 3539 divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT; 3540 tim = useconds_time; 3541 res = (tim * bw * gain) / divor; 3542 if (res == 0) 3543 res = 1; 3544 return ((uint32_t)res); 3545 } 3546 3547 /* 3548 * Given a gain and a length return the delay in useconds that 3549 * should be used to evenly space out packets 3550 * on the connection (based on the gain factor). 3551 */ 3552 static uint32_t 3553 bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog) 3554 { 3555 uint64_t bw, lentim, res; 3556 uint32_t usecs, srtt, over = 0; 3557 uint32_t seg_oh, num_segs, maxseg; 3558 3559 if (len == 0) 3560 return (0); 3561 3562 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; 3563 num_segs = (len + maxseg - 1) / maxseg; 3564 if (bbr->rc_use_google == 0) { 3565 seg_oh = bbr_get_header_oh(bbr); 3566 len += (num_segs * seg_oh); 3567 } 3568 gain = bbr_gain_adjust(bbr, gain); 3569 bw = bbr_get_bw(bbr); 3570 if (bbr->rc_use_google) { 3571 uint64_t cbw; 3572 3573 /* 3574 * Reduce the b/w by the google discount 3575 * factor 10 = 1%. 3576 */ 3577 cbw = bw * (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount); 3578 cbw /= (uint64_t)1000; 3579 /* We don't apply a discount if it results in 0 */ 3580 if (cbw > 0) 3581 bw = cbw; 3582 } 3583 lentim = ((uint64_t)len * 3584 (uint64_t)USECS_IN_SECOND * 3585 (uint64_t)BBR_UNIT); 3586 res = lentim / ((uint64_t)gain * bw); 3587 if (res == 0) 3588 res = 1; 3589 usecs = (uint32_t)res; 3590 srtt = bbr_get_rtt(bbr, BBR_SRTT); 3591 if (bbr_hptsi_max_mul && bbr_hptsi_max_div && 3592 (bbr->rc_use_google == 0) && 3593 (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) { 3594 /* 3595 * We cannot let the delay be more than 1/2 the srtt time. 3596 * Otherwise we cannot pace out or send properly. 
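 * With the default mul/div this caps the delay at half the srtt; e.g. for srtt = 10,000 usec the pacing delay is clamped to 5,000 usec even if len / (gain * bw) above came out larger.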
3597 */ 3598 over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div; 3599 BBR_STAT_INC(bbr_hpts_min_time); 3600 } 3601 if (!nolog) 3602 bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1); 3603 return (usecs); 3604 } 3605 3606 static void 3607 bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack, 3608 uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses) 3609 { 3610 INP_WLOCK_ASSERT(tp->t_inpcb); 3611 uint64_t bw; 3612 uint32_t cwnd, target_cwnd, saved_bytes, maxseg; 3613 int32_t meth; 3614 3615 #ifdef STATS 3616 if ((tp->t_flags & TF_GPUTINPROG) && 3617 SEQ_GEQ(th->th_ack, tp->gput_ack)) { 3618 /* 3619 * Stretch acks and compressed acks will cause this to 3620 * oscillate but we are doing it the same way as the main 3621 * stack so it will be comparable (though possibly not 3622 * ideal). 3623 */ 3624 int32_t cgput; 3625 int64_t gput, time_stamp; 3626 3627 gput = (int64_t) (th->th_ack - tp->gput_seq) * 8; 3628 time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000)); 3629 cgput = gput / time_stamp; 3630 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3631 cgput); 3632 if (tp->t_stats_gput_prev > 0) 3633 stats_voi_update_abs_s32(tp->t_stats, 3634 VOI_TCP_GPUT_ND, 3635 ((gput - tp->t_stats_gput_prev) * 100) / 3636 tp->t_stats_gput_prev); 3637 tp->t_flags &= ~TF_GPUTINPROG; 3638 tp->t_stats_gput_prev = cgput; 3639 } 3640 #endif 3641 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && 3642 ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { 3643 /* We don't change anything in probe-rtt */ 3644 return; 3645 } 3646 maxseg = tp->t_maxseg - bbr->rc_last_options; 3647 saved_bytes = bytes_this_ack; 3648 bytes_this_ack += sack_changed; 3649 if (bytes_this_ack > prev_acked) { 3650 bytes_this_ack -= prev_acked; 3651 /* 3652 * A byte ack'd gives us a full mss 3653 * to be like linux i.e. they count packets. 3654 */ 3655 if ((bytes_this_ack < maxseg) && bbr->rc_use_google) 3656 bytes_this_ack = maxseg; 3657 } else { 3658 /* Unlikely */ 3659 bytes_this_ack = 0; 3660 } 3661 cwnd = tp->snd_cwnd; 3662 bw = get_filter_value(&bbr->r_ctl.rc_delrate); 3663 if (bw) 3664 target_cwnd = bbr_get_target_cwnd(bbr, 3665 bw, 3666 (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain); 3667 else 3668 target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp); 3669 if (IN_RECOVERY(tp->t_flags) && 3670 (bbr->bbr_prev_in_rec == 0)) { 3671 /* 3672 * We are entering recovery and 3673 * thus packet conservation.
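 * While conservation is in effect (roughly one rttProp, per the check further down) cwnd is pinned to the current flight plus the bytes newly acked, so new data only goes out as old data leaves the network.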
3674 */ 3675 bbr->pkt_conservation = 1; 3676 bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime; 3677 cwnd = ctf_flight_size(tp, 3678 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + 3679 bytes_this_ack; 3680 } 3681 if (IN_RECOVERY(tp->t_flags)) { 3682 uint32_t flight; 3683 3684 bbr->bbr_prev_in_rec = 1; 3685 if (cwnd > losses) { 3686 cwnd -= losses; 3687 if (cwnd < maxseg) 3688 cwnd = maxseg; 3689 } else 3690 cwnd = maxseg; 3691 flight = ctf_flight_size(tp, 3692 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 3693 bbr_log_type_cwndupd(bbr, flight, 0, 3694 losses, 10, 0, 0, line); 3695 if (bbr->pkt_conservation) { 3696 uint32_t time_in; 3697 3698 if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start)) 3699 time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start; 3700 else 3701 time_in = 0; 3702 3703 if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { 3704 /* Clear packet conservation after an rttProp */ 3705 bbr->pkt_conservation = 0; 3706 } else { 3707 if ((flight + bytes_this_ack) > cwnd) 3708 cwnd = flight + bytes_this_ack; 3709 if (cwnd < get_min_cwnd(bbr)) 3710 cwnd = get_min_cwnd(bbr); 3711 tp->snd_cwnd = cwnd; 3712 bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, 3713 prev_acked, 1, target_cwnd, th->th_ack, line); 3714 return; 3715 } 3716 } 3717 } else 3718 bbr->bbr_prev_in_rec = 0; 3719 if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) { 3720 bbr->r_ctl.restrict_growth--; 3721 if (bytes_this_ack > maxseg) 3722 bytes_this_ack = maxseg; 3723 } 3724 if (bbr->rc_filled_pipe) { 3725 /* 3726 * Here we have exited startup and filled the pipe. We will 3727 * thus allow the cwnd to shrink to the target. We hit here 3728 * mostly. 3729 */ 3730 uint32_t s_cwnd; 3731 3732 meth = 2; 3733 s_cwnd = min((cwnd + bytes_this_ack), target_cwnd); 3734 if (s_cwnd > cwnd) 3735 cwnd = s_cwnd; 3736 else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing) 3737 cwnd = s_cwnd; 3738 } else { 3739 /* 3740 * Here we are still in startup, we increase cwnd by what 3741 * has been acked. 3742 */ 3743 if ((cwnd < target_cwnd) || 3744 (bbr->rc_past_init_win == 0)) { 3745 meth = 3; 3746 cwnd += bytes_this_ack; 3747 } else { 3748 /* 3749 * Method 4 means we are at target so no gain in 3750 * startup and past the initial window. 3751 */ 3752 meth = 4; 3753 } 3754 } 3755 tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr)); 3756 bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line); 3757 } 3758 3759 static void 3760 tcp_bbr_partialack(struct tcpcb *tp) 3761 { 3762 struct tcp_bbr *bbr; 3763 3764 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 3765 INP_WLOCK_ASSERT(tp->t_inpcb); 3766 if (ctf_flight_size(tp, 3767 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= 3768 tp->snd_cwnd) { 3769 bbr->r_wanted_output = 1; 3770 } 3771 } 3772 3773 static void 3774 bbr_post_recovery(struct tcpcb *tp) 3775 { 3776 struct tcp_bbr *bbr; 3777 uint32_t flight; 3778 3779 INP_WLOCK_ASSERT(tp->t_inpcb); 3780 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 3781 /* 3782 * Here we just exit recovery. 3783 */ 3784 EXIT_RECOVERY(tp->t_flags); 3785 /* Lock in our b/w reduction for the specified number of pkt-epochs */ 3786 bbr->r_recovery_bw = 0; 3787 tp->snd_recover = tp->snd_una; 3788 tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); 3789 bbr->pkt_conservation = 0; 3790 if (bbr->rc_use_google == 0) { 3791 /* 3792 * For non-google mode lets 3793 * go ahead and make sure we clear 3794 * the recovery state so if we 3795 * bounce back in to recovery we 3796 * will do PC. 
3797 */ 3798 bbr->bbr_prev_in_rec = 0; 3799 } 3800 bbr_log_type_exit_rec(bbr); 3801 if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { 3802 tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); 3803 bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__); 3804 } else { 3805 /* For probe-rtt case lets fix up its saved_cwnd */ 3806 if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) { 3807 bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent; 3808 bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__); 3809 } 3810 } 3811 flight = ctf_flight_size(tp, 3812 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 3813 if ((bbr->rc_use_google == 0) && 3814 bbr_do_red) { 3815 uint64_t val, lr2use; 3816 uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd; 3817 uint32_t *cwnd_p; 3818 3819 if (bbr_get_rtt(bbr, BBR_SRTT)) { 3820 val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000); 3821 val /= bbr_get_rtt(bbr, BBR_SRTT); 3822 ratio = (uint32_t)val; 3823 } else 3824 ratio = 1000; 3825 3826 bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, 3827 bbr->r_ctl.recovery_lr, 21, 3828 ratio, 3829 bbr->r_ctl.rc_red_cwnd_pe, 3830 __LINE__); 3831 if ((ratio < bbr_do_red) || (bbr_do_red == 0)) 3832 goto done; 3833 if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && 3834 bbr_prtt_slam_cwnd) || 3835 (bbr_sub_drain_slam_cwnd && 3836 (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && 3837 bbr->rc_hit_state_1 && 3838 (bbr_state_val(bbr) == BBR_SUB_DRAIN)) || 3839 ((bbr->rc_bbr_state == BBR_STATE_DRAIN) && 3840 bbr_slam_cwnd_in_main_drain)) { 3841 /* 3842 * Here we must poke at the saved cwnd 3843 * as well as the cwnd. 3844 */ 3845 cwnd = bbr->r_ctl.rc_saved_cwnd; 3846 cwnd_p = &bbr->r_ctl.rc_saved_cwnd; 3847 } else { 3848 cwnd = tp->snd_cwnd; 3849 cwnd_p = &tp->snd_cwnd; 3850 } 3851 maxseg = tp->t_maxseg - bbr->rc_last_options; 3852 /* Add the overall lr with the recovery lr */ 3853 if (bbr->r_ctl.rc_lost == 0) 3854 lr2use = 0; 3855 else if (bbr->r_ctl.rc_delivered == 0) 3856 lr2use = 1000; 3857 else { 3858 lr2use = bbr->r_ctl.rc_lost * 1000; 3859 lr2use /= bbr->r_ctl.rc_delivered; 3860 } 3861 lr2use += bbr->r_ctl.recovery_lr; 3862 acks_inflight = (flight / (maxseg * 2)); 3863 if (bbr_red_scale) { 3864 lr2use *= bbr_get_rtt(bbr, BBR_SRTT); 3865 lr2use /= bbr_red_scale; 3866 if ((bbr_red_growth_restrict) && 3867 ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1)) 3868 bbr->r_ctl.restrict_growth += acks_inflight; 3869 } 3870 if (lr2use) { 3871 val = (uint64_t)cwnd * lr2use; 3872 val /= 1000; 3873 if (cwnd > val) 3874 newcwnd = roundup((cwnd - val), maxseg); 3875 else 3876 newcwnd = maxseg; 3877 } else { 3878 val = (uint64_t)cwnd * (uint64_t)bbr_red_mul; 3879 val /= (uint64_t)bbr_red_div; 3880 newcwnd = roundup((uint32_t)val, maxseg); 3881 } 3882 /* with standard delayed acks how many acks can I expect? */ 3883 if (bbr_drop_limit == 0) { 3884 /* 3885 * Anticpate how much we will 3886 * raise the cwnd based on the acks. 
3887 */ 3888 if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) { 3889 /* We do enforce the min (with the acks) */ 3890 newcwnd = (get_min_cwnd(bbr) - acks_inflight); 3891 } 3892 } else { 3893 /* 3894 * A strict drop limit of N is inplace 3895 */ 3896 if (newcwnd < (bbr_drop_limit * maxseg)) { 3897 newcwnd = bbr_drop_limit * maxseg; 3898 } 3899 } 3900 /* For the next N acks do we restrict the growth */ 3901 *cwnd_p = newcwnd; 3902 if (tp->snd_cwnd > newcwnd) 3903 tp->snd_cwnd = newcwnd; 3904 bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22, 3905 (uint32_t)lr2use, 3906 bbr_get_rtt(bbr, BBR_SRTT), __LINE__); 3907 bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch; 3908 } 3909 done: 3910 bbr->r_ctl.recovery_lr = 0; 3911 if (flight <= tp->snd_cwnd) { 3912 bbr->r_wanted_output = 1; 3913 } 3914 tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); 3915 } 3916 3917 static void 3918 bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts) 3919 { 3920 bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate); 3921 /* Limit the drop in b/w to 1/2 our current filter. */ 3922 if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate) 3923 bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate; 3924 if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2)) 3925 bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2; 3926 tcp_bbr_tso_size_check(bbr, cts); 3927 } 3928 3929 static void 3930 bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm) 3931 { 3932 struct tcp_bbr *bbr; 3933 3934 INP_WLOCK_ASSERT(tp->t_inpcb); 3935 #ifdef STATS 3936 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 3937 #endif 3938 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 3939 switch (type) { 3940 case CC_NDUPACK: 3941 if (!IN_RECOVERY(tp->t_flags)) { 3942 tp->snd_recover = tp->snd_max; 3943 /* Start a new epoch */ 3944 bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__); 3945 if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) { 3946 /* 3947 * Move forward the lt epoch 3948 * so it won't count the truncated 3949 * epoch. 3950 */ 3951 bbr->r_ctl.rc_lt_epoch++; 3952 } 3953 if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { 3954 /* 3955 * Just like the policer detection code 3956 * if we are in startup we must push 3957 * forward the last startup epoch 3958 * to hide the truncated PE. 3959 */ 3960 bbr->r_ctl.rc_bbr_last_startup_epoch++; 3961 } 3962 bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd; 3963 ENTER_RECOVERY(tp->t_flags); 3964 bbr->rc_tlp_rtx_out = 0; 3965 bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate; 3966 tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); 3967 if (tcp_in_hpts(bbr->rc_inp) && 3968 ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) { 3969 /* 3970 * When we enter recovery, we need to restart 3971 * any timers. This may mean we gain an agg 3972 * early, which will be made up for at the last 3973 * rxt out. 3974 */ 3975 bbr->rc_timer_first = 1; 3976 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); 3977 } 3978 /* 3979 * Calculate a new cwnd based on to the current 3980 * delivery rate with no gain. We get the bdp 3981 * without gaining it up like we normally would and 3982 * we use the last cur_del_rate. 
3983 */ 3984 if ((bbr->rc_use_google == 0) && 3985 (bbr->r_ctl.bbr_rttprobe_gain_val || 3986 (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) { 3987 tp->snd_cwnd = ctf_flight_size(tp, 3988 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + 3989 (tp->t_maxseg - bbr->rc_last_options); 3990 if (tp->snd_cwnd < get_min_cwnd(bbr)) { 3991 /* We always gate to min cwnd */ 3992 tp->snd_cwnd = get_min_cwnd(bbr); 3993 } 3994 bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__); 3995 } 3996 bbr_log_type_enter_rec(bbr, rsm->r_start); 3997 } 3998 break; 3999 case CC_RTO_ERR: 4000 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4001 /* RTO was unnecessary, so reset everything. */ 4002 bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime); 4003 if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { 4004 tp->snd_cwnd = tp->snd_cwnd_prev; 4005 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4006 tp->snd_recover = tp->snd_recover_prev; 4007 tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); 4008 bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__); 4009 } 4010 tp->t_badrxtwin = 0; 4011 break; 4012 } 4013 } 4014 4015 /* 4016 * Indicate whether this ack should be delayed. We can delay the ack if 4017 * following conditions are met: 4018 * - There is no delayed ack timer in progress. 4019 * - Our last ack wasn't a 0-sized window. We never want to delay 4020 * the ack that opens up a 0-sized window. 4021 * - LRO wasn't used for this segment. We make sure by checking that the 4022 * segment size is not larger than the MSS. 4023 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4024 * connection. 4025 * - The data being acked is less than a full segment (a stretch ack 4026 * of more than a segment we should ack. 4027 * - nsegs is 1 (if its more than that we received more than 1 ack). 4028 */ 4029 #define DELAY_ACK(tp, bbr, nsegs) \ 4030 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4031 ((tp->t_flags & TF_DELACK) == 0) && \ 4032 ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) && \ 4033 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4034 4035 /* 4036 * Return the lowest RSM in the map of 4037 * packets still in flight that is not acked. 4038 * This should normally find on the first one 4039 * since we remove packets from the send 4040 * map after they are marked ACKED. 4041 */ 4042 static struct bbr_sendmap * 4043 bbr_find_lowest_rsm(struct tcp_bbr *bbr) 4044 { 4045 struct bbr_sendmap *rsm; 4046 4047 /* 4048 * Walk the time-order transmitted list looking for an rsm that is 4049 * not acked. This will be the one that was sent the longest time 4050 * ago that is still outstanding. 4051 */ 4052 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) { 4053 if (rsm->r_flags & BBR_ACKED) { 4054 continue; 4055 } 4056 goto finish; 4057 } 4058 finish: 4059 return (rsm); 4060 } 4061 4062 static struct bbr_sendmap * 4063 bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) 4064 { 4065 struct bbr_sendmap *prsm; 4066 4067 /* 4068 * Walk the sequence order list backward until we hit and arrive at 4069 * the highest seq not acked. In theory when this is called it 4070 * should be the last segment (which it was not). 4071 */ 4072 prsm = rsm; 4073 TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { 4074 if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) { 4075 continue; 4076 } 4077 return (prsm); 4078 } 4079 return (NULL); 4080 } 4081 4082 /* 4083 * Returns to the caller the number of microseconds that 4084 * the packet can be outstanding before we think we 4085 * should have had an ack returned. 
4086 */ 4087 static uint32_t 4088 bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm) 4089 { 4090 /* 4091 * lro is the flag we use to determine if we have seen reordering. 4092 * If it gets set we have seen reordering. The reorder logic 4093 * works in one of two ways: 4094 * 4095 * If reorder-fade is configured, then we track the last time we saw 4096 * re-ordering occur. If we reach the point where enough time has 4097 * passed we no longer consider reordering to be occurring. 4098 * 4099 * Or if reorder-fade is 0, then once we see reordering we consider 4100 * the connection to always be subject to reordering and just set lro 4101 * to 1. 4102 * 4103 * In the end if lro is non-zero we add the extra time for 4104 * reordering in. 4105 */ 4106 int32_t lro; 4107 uint32_t thresh, t_rxtcur; 4108 4109 if (srtt == 0) 4110 srtt = 1; 4111 if (bbr->r_ctl.rc_reorder_ts) { 4112 if (bbr->r_ctl.rc_reorder_fade) { 4113 if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) { 4114 lro = cts - bbr->r_ctl.rc_reorder_ts; 4115 if (lro == 0) { 4116 /* 4117 * No time has passed since the last 4118 * reorder, mark it as reordering. 4119 */ 4120 lro = 1; 4121 } 4122 } else { 4123 /* Negative time? */ 4124 lro = 0; 4125 } 4126 if (lro > bbr->r_ctl.rc_reorder_fade) { 4127 /* Turn off reordering seen too */ 4128 bbr->r_ctl.rc_reorder_ts = 0; 4129 lro = 0; 4130 } 4131 } else { 4132 /* Reordering does not fade */ 4133 lro = 1; 4134 } 4135 } else { 4136 lro = 0; 4137 } 4138 thresh = srtt + bbr->r_ctl.rc_pkt_delay; 4139 if (lro) { 4140 /* It must be set, if not you get 1/4 rtt */ 4141 if (bbr->r_ctl.rc_reorder_shift) 4142 thresh += (srtt >> bbr->r_ctl.rc_reorder_shift); 4143 else 4144 thresh += (srtt >> 2); 4145 } else { 4146 thresh += 1000; 4147 } 4148 /* We don't let the rack timeout be above a RTO */ 4149 if ((bbr->rc_tp)->t_srtt == 0) 4150 t_rxtcur = BBR_INITIAL_RTO; 4151 else 4152 t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); 4153 if (thresh > t_rxtcur) { 4154 thresh = t_rxtcur; 4155 } 4156 /* And we don't want it above the RTO max either */ 4157 if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { 4158 thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); 4159 } 4160 bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK); 4161 return (thresh); 4162 } 4163 4164 /* 4165 * Return to the caller the amount of time in microseconds 4166 * that should be used for the TLP timer from the last 4167 * send time of this packet. 4168 */ 4169 static uint32_t 4170 bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, 4171 struct bbr_sendmap *rsm, uint32_t srtt, 4172 uint32_t cts) 4173 { 4174 uint32_t thresh, len, maxseg, t_rxtcur; 4175 struct bbr_sendmap *prsm; 4176 4177 if (srtt == 0) 4178 srtt = 1; 4179 if (bbr->rc_tlp_threshold) 4180 thresh = srtt + (srtt / bbr->rc_tlp_threshold); 4181 else 4182 thresh = (srtt * 2); 4183 maxseg = tp->t_maxseg - bbr->rc_last_options; 4184 /* Get the previous sent packet, if any */ 4185 len = rsm->r_end - rsm->r_start; 4186 4187 /* 2.1 behavior */ 4188 prsm = TAILQ_PREV(rsm, bbr_head, r_tnext); 4189 if (prsm && (len <= maxseg)) { 4190 /* 4191 * Two packets outstanding, thresh should be (2*srtt) + 4192 * possible inter-packet delay (if any).
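 * e.g. with srtt = 50,000 usec and no tlp_threshold configured the base below is 100,000 usec, plus however long after the previous packet this one was sent.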
4193 */ 4194 uint32_t inter_gap = 0; 4195 int idx, nidx; 4196 4197 idx = rsm->r_rtr_cnt - 1; 4198 nidx = prsm->r_rtr_cnt - 1; 4199 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4200 /* Yes it was sent later (or at the same time) */ 4201 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4202 } 4203 thresh += inter_gap; 4204 } else if (len <= maxseg) { 4205 /* 4206 * Possibly compensate for delayed-ack. 4207 */ 4208 uint32_t alt_thresh; 4209 4210 alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time; 4211 if (alt_thresh > thresh) 4212 thresh = alt_thresh; 4213 } 4214 /* Not above the current RTO */ 4215 if (tp->t_srtt == 0) 4216 t_rxtcur = BBR_INITIAL_RTO; 4217 else 4218 t_rxtcur = TICKS_2_USEC(tp->t_rxtcur); 4219 4220 bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP); 4221 /* Not above an RTO */ 4222 if (thresh > t_rxtcur) { 4223 thresh = t_rxtcur; 4224 } 4225 /* Not above a RTO max */ 4226 if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { 4227 thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); 4228 } 4229 /* And now apply the user TLP min */ 4230 if (thresh < bbr_tlp_min) { 4231 thresh = bbr_tlp_min; 4232 } 4233 return (thresh); 4234 } 4235 4236 /* 4237 * Return one of three RTTs to use (in microseconds). 4238 */ 4239 static __inline uint32_t 4240 bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type) 4241 { 4242 uint32_t f_rtt; 4243 uint32_t srtt; 4244 4245 f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop); 4246 if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) { 4247 /* We have no rtt at all */ 4248 if (bbr->rc_tp->t_srtt == 0) 4249 f_rtt = BBR_INITIAL_RTO; 4250 else 4251 f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 4252 /* 4253 * Since we don't know how good the rtt is apply a 4254 * delayed-ack min 4255 */ 4256 if (f_rtt < bbr_delayed_ack_time) { 4257 f_rtt = bbr_delayed_ack_time; 4258 } 4259 } 4260 /* Take the filter version or last measured pkt-rtt */ 4261 if (rtt_type == BBR_RTT_PROP) { 4262 srtt = f_rtt; 4263 } else if (rtt_type == BBR_RTT_PKTRTT) { 4264 if (bbr->r_ctl.rc_pkt_epoch_rtt) { 4265 srtt = bbr->r_ctl.rc_pkt_epoch_rtt; 4266 } else { 4267 /* No pkt rtt yet */ 4268 srtt = f_rtt; 4269 } 4270 } else if (rtt_type == BBR_RTT_RACK) { 4271 srtt = bbr->r_ctl.rc_last_rtt; 4272 /* We need to add in any internal delay for our timer */ 4273 if (bbr->rc_ack_was_delayed) 4274 srtt += bbr->r_ctl.rc_ack_hdwr_delay; 4275 } else if (rtt_type == BBR_SRTT) { 4276 srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 4277 } else { 4278 /* TSNH */ 4279 srtt = f_rtt; 4280 #ifdef BBR_INVARIANTS 4281 panic("Unknown rtt request type %d", rtt_type); 4282 #endif 4283 } 4284 return (srtt); 4285 } 4286 4287 static int 4288 bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts) 4289 { 4290 uint32_t thresh; 4291 4292 thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK), 4293 cts, rsm); 4294 if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) { 4295 /* It is lost (past time) */ 4296 return (1); 4297 } 4298 return (0); 4299 } 4300 4301 /* 4302 * Return a sendmap if we need to retransmit something. 4303 */ 4304 static struct bbr_sendmap * 4305 bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4306 { 4307 /* 4308 * Check to see that we don't need to fall into recovery. We will 4309 * need to do so if our oldest transmit is past the time we should 4310 * have had an ack. 
4311 */ 4312 4313 struct bbr_sendmap *rsm; 4314 int32_t idx; 4315 4316 if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) { 4317 /* Nothing outstanding that we know of */ 4318 return (NULL); 4319 } 4320 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); 4321 if (rsm == NULL) { 4322 /* Nothing in the transmit map */ 4323 return (NULL); 4324 } 4325 if (tp->t_flags & TF_SENTFIN) { 4326 /* Fin restricted, don't find anything once a fin is sent */ 4327 return (NULL); 4328 } 4329 if (rsm->r_flags & BBR_ACKED) { 4330 /* 4331 * Ok the first one is acked (this really should not happen 4332 * since we remove them from the tmap once they are acked) 4333 */ 4334 rsm = bbr_find_lowest_rsm(bbr); 4335 if (rsm == NULL) 4336 return (NULL); 4337 } 4338 idx = rsm->r_rtr_cnt - 1; 4339 if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) { 4340 /* Send timestamp is the same or less? Can't be ready */ 4341 return (NULL); 4342 } 4343 /* Get our RTT time */ 4344 if (bbr_is_lost(bbr, rsm, cts) && 4345 ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 4346 (rsm->r_flags & BBR_SACK_PASSED))) { 4347 if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { 4348 rsm->r_flags |= BBR_MARKED_LOST; 4349 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; 4350 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; 4351 } 4352 bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm); 4353 #ifdef BBR_INVARIANTS 4354 if ((rsm->r_end - rsm->r_start) == 0) 4355 panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm); 4356 #endif 4357 return (rsm); 4358 } 4359 return (NULL); 4360 } 4361 4362 /* 4363 * RACK Timer, here we simply do logging and housekeeping. 4364 * The normal bbr_output_wtime() function will call the 4365 * appropriate thing to check if we need to do a RACK retransmit. 4366 * We return 1, saying don't proceed with bbr_output_wtime only 4367 * when all timers have been stopped (destroyed PCB?). 4368 */ 4369 static int 4370 bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4371 { 4372 /* 4373 * This timer simply provides an internal trigger to send out data. 4374 * The check_recovery_mode call will see if there are needed 4375 * retransmissions, if so we will enter fast-recovery. The output 4376 * call may or may not do the same thing depending on sysctl 4377 * settings.
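 * The steps below: note the current lost count, let bbr_check_recovery_mode() fill in rc_resend if a retransmit is due, optionally feed the outcome to the long-term b/w sampler (bbr_policer_call_from_rack_to), and clear the PACE_TMR_RACK flag.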
4378 */ 4379 uint32_t lost; 4380 4381 if (bbr->rc_all_timers_stopped) { 4382 return (1); 4383 } 4384 if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { 4385 /* Its not time yet */ 4386 return (0); 4387 } 4388 BBR_STAT_INC(bbr_to_tot); 4389 lost = bbr->r_ctl.rc_lost; 4390 if (bbr->r_state && (bbr->r_state != tp->t_state)) 4391 bbr_set_state(tp, bbr, 0); 4392 bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK); 4393 if (bbr->r_ctl.rc_resend == NULL) { 4394 /* Lets do the check here */ 4395 bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); 4396 } 4397 if (bbr_policer_call_from_rack_to) 4398 bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); 4399 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 4400 return (0); 4401 } 4402 4403 static __inline void 4404 bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start) 4405 { 4406 int idx; 4407 4408 nrsm->r_start = start; 4409 nrsm->r_end = rsm->r_end; 4410 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4411 nrsm-> r_rtt_not_allowed = rsm->r_rtt_not_allowed; 4412 nrsm->r_flags = rsm->r_flags; 4413 /* We don't transfer forward the SYN flag */ 4414 nrsm->r_flags &= ~BBR_HAS_SYN; 4415 /* We move forward the FIN flag, not that this should happen */ 4416 rsm->r_flags &= ~BBR_HAS_FIN; 4417 nrsm->r_dupack = rsm->r_dupack; 4418 nrsm->r_rtr_bytes = 0; 4419 nrsm->r_is_gain = rsm->r_is_gain; 4420 nrsm->r_is_drain = rsm->r_is_drain; 4421 nrsm->r_delivered = rsm->r_delivered; 4422 nrsm->r_ts_valid = rsm->r_ts_valid; 4423 nrsm->r_del_ack_ts = rsm->r_del_ack_ts; 4424 nrsm->r_del_time = rsm->r_del_time; 4425 nrsm->r_app_limited = rsm->r_app_limited; 4426 nrsm->r_first_sent_time = rsm->r_first_sent_time; 4427 nrsm->r_flight_at_send = rsm->r_flight_at_send; 4428 /* We split a piece the lower section looses any just_ret flag. */ 4429 nrsm->r_bbr_state = rsm->r_bbr_state; 4430 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4431 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4432 } 4433 rsm->r_end = nrsm->r_start; 4434 idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); 4435 idx /= 8; 4436 /* Check if we got too small */ 4437 if ((rsm->r_is_smallmap == 0) && 4438 ((rsm->r_end - rsm->r_start) <= idx)) { 4439 bbr->r_ctl.rc_num_small_maps_alloced++; 4440 rsm->r_is_smallmap = 1; 4441 } 4442 /* Check the new one as well */ 4443 if ((nrsm->r_end - nrsm->r_start) <= idx) { 4444 bbr->r_ctl.rc_num_small_maps_alloced++; 4445 nrsm->r_is_smallmap = 1; 4446 } 4447 } 4448 4449 static int 4450 bbr_sack_mergable(struct bbr_sendmap *at, 4451 uint32_t start, uint32_t end) 4452 { 4453 /* 4454 * Given a sack block defined by 4455 * start and end, and a current position 4456 * at. Return 1 if either side of at 4457 * would show that the block is mergable 4458 * to that side. A block to be mergable 4459 * must have overlap with the start/end 4460 * and be in the SACK'd state. 
4461 */ 4462 struct bbr_sendmap *l_rsm; 4463 struct bbr_sendmap *r_rsm; 4464 4465 /* first get the either side blocks */ 4466 l_rsm = TAILQ_PREV(at, bbr_head, r_next); 4467 r_rsm = TAILQ_NEXT(at, r_next); 4468 if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) { 4469 /* Potentially mergeable */ 4470 if ((l_rsm->r_end == start) || 4471 (SEQ_LT(start, l_rsm->r_end) && 4472 SEQ_GT(end, l_rsm->r_end))) { 4473 /* 4474 * map blk |------| 4475 * sack blk |------| 4476 * <or> 4477 * map blk |------| 4478 * sack blk |------| 4479 */ 4480 return (1); 4481 } 4482 } 4483 if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) { 4484 /* Potentially mergeable */ 4485 if ((r_rsm->r_start == end) || 4486 (SEQ_LT(start, r_rsm->r_start) && 4487 SEQ_GT(end, r_rsm->r_start))) { 4488 /* 4489 * map blk |---------| 4490 * sack blk |----| 4491 * <or> 4492 * map blk |---------| 4493 * sack blk |-------| 4494 */ 4495 return (1); 4496 } 4497 } 4498 return (0); 4499 } 4500 4501 static struct bbr_sendmap * 4502 bbr_merge_rsm(struct tcp_bbr *bbr, 4503 struct bbr_sendmap *l_rsm, 4504 struct bbr_sendmap *r_rsm) 4505 { 4506 /* 4507 * We are merging two ack'd RSM's, 4508 * the l_rsm is on the left (lower seq 4509 * values) and the r_rsm is on the right 4510 * (higher seq value). The simplest way 4511 * to merge these is to move the right 4512 * one into the left. I don't think there 4513 * is any reason we need to try to find 4514 * the oldest (or last oldest retransmitted). 4515 */ 4516 l_rsm->r_end = r_rsm->r_end; 4517 if (l_rsm->r_dupack < r_rsm->r_dupack) 4518 l_rsm->r_dupack = r_rsm->r_dupack; 4519 if (r_rsm->r_rtr_bytes) 4520 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 4521 if (r_rsm->r_in_tmap) { 4522 /* This really should not happen */ 4523 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext); 4524 } 4525 if (r_rsm->r_app_limited) 4526 l_rsm->r_app_limited = r_rsm->r_app_limited; 4527 /* Now the flags */ 4528 if (r_rsm->r_flags & BBR_HAS_FIN) 4529 l_rsm->r_flags |= BBR_HAS_FIN; 4530 if (r_rsm->r_flags & BBR_TLP) 4531 l_rsm->r_flags |= BBR_TLP; 4532 if (r_rsm->r_flags & BBR_RWND_COLLAPSED) 4533 l_rsm->r_flags |= BBR_RWND_COLLAPSED; 4534 if (r_rsm->r_flags & BBR_MARKED_LOST) { 4535 /* This really should not happen */ 4536 bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start; 4537 } 4538 TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next); 4539 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 4540 /* Transfer the split limit to the map we free */ 4541 r_rsm->r_limit_type = l_rsm->r_limit_type; 4542 l_rsm->r_limit_type = 0; 4543 } 4544 bbr_free(bbr, r_rsm); 4545 return(l_rsm); 4546 } 4547 4548 /* 4549 * TLP Timer, here we simply setup what segment we want to 4550 * have the TLP expire on, the normal bbr_output_wtime() will then 4551 * send it out. 4552 * 4553 * We return 1, saying don't proceed with bbr_output_wtime only 4554 * when all timers have been stopped (destroyed PCB?). 4555 */ 4556 static int 4557 bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4558 { 4559 /* 4560 * Tail Loss Probe. 
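 * In short: if new data fits in the peer's window we arrange to send one more segment of it; otherwise we pick the highest un-acked segment (splitting it if it is larger than one maxseg) for retransmit, and after bbr_tlp_max_resend attempts we leave it to the RXT timer.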
4561 */ 4562 struct bbr_sendmap *rsm = NULL; 4563 struct socket *so; 4564 uint32_t amm; 4565 uint32_t out, avail; 4566 uint32_t maxseg; 4567 int collapsed_win = 0; 4568 4569 if (bbr->rc_all_timers_stopped) { 4570 return (1); 4571 } 4572 if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { 4573 /* Its not time yet */ 4574 return (0); 4575 } 4576 if (ctf_progress_timeout_check(tp, true)) { 4577 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 4578 return (-ETIMEDOUT); /* tcp_drop() */ 4579 } 4580 /* Did we somehow get into persists? */ 4581 if (bbr->rc_in_persist) { 4582 return (0); 4583 } 4584 if (bbr->r_state && (bbr->r_state != tp->t_state)) 4585 bbr_set_state(tp, bbr, 0); 4586 BBR_STAT_INC(bbr_tlp_tot); 4587 maxseg = tp->t_maxseg - bbr->rc_last_options; 4588 /* 4589 * A TLP timer has expired. We have been idle for 2 rtts. So we now 4590 * need to figure out how to force a full MSS segment out. 4591 */ 4592 so = tp->t_inpcb->inp_socket; 4593 avail = sbavail(&so->so_snd); 4594 out = ctf_outstanding(tp); 4595 if (out > tp->snd_wnd) { 4596 /* special case, we need a retransmission */ 4597 collapsed_win = 1; 4598 goto need_retran; 4599 } 4600 if (avail > out) { 4601 /* New data is available */ 4602 amm = avail - out; 4603 if (amm > maxseg) { 4604 amm = maxseg; 4605 } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 4606 /* not enough to fill a MTU and no-delay is off */ 4607 goto need_retran; 4608 } 4609 /* Set the send-new override */ 4610 if ((out + amm) <= tp->snd_wnd) { 4611 bbr->rc_tlp_new_data = 1; 4612 } else { 4613 goto need_retran; 4614 } 4615 bbr->r_ctl.rc_tlp_seg_send_cnt = 0; 4616 bbr->r_ctl.rc_last_tlp_seq = tp->snd_max; 4617 bbr->r_ctl.rc_tlp_send = NULL; 4618 /* cap any slots */ 4619 BBR_STAT_INC(bbr_tlp_newdata); 4620 goto send; 4621 } 4622 need_retran: 4623 /* 4624 * Ok we need to arrange the last un-acked segment to be re-sent, or 4625 * optionally the first un-acked segment. 4626 */ 4627 if (collapsed_win == 0) { 4628 rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); 4629 if (rsm && (BBR_ACKED | BBR_HAS_FIN)) { 4630 rsm = bbr_find_high_nonack(bbr, rsm); 4631 } 4632 if (rsm == NULL) { 4633 goto restore; 4634 } 4635 } else { 4636 /* 4637 * We must find the last segment 4638 * that was acceptable by the client. 4639 */ 4640 TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { 4641 if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) { 4642 /* Found one */ 4643 break; 4644 } 4645 } 4646 if (rsm == NULL) { 4647 /* None? if so send the first */ 4648 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 4649 if (rsm == NULL) 4650 goto restore; 4651 } 4652 } 4653 if ((rsm->r_end - rsm->r_start) > maxseg) { 4654 /* 4655 * We need to split this the last segment in two. 4656 */ 4657 struct bbr_sendmap *nrsm; 4658 4659 nrsm = bbr_alloc_full_limit(bbr); 4660 if (nrsm == NULL) { 4661 /* 4662 * We can't get memory to split, we can either just 4663 * not split it. Or retransmit the whole piece, lets 4664 * do the large send (BTLP :-) ). 
4665 */ 4666 goto go_for_it; 4667 } 4668 bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg)); 4669 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); 4670 if (rsm->r_in_tmap) { 4671 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4672 nrsm->r_in_tmap = 1; 4673 } 4674 rsm->r_flags &= (~BBR_HAS_FIN); 4675 rsm = nrsm; 4676 } 4677 go_for_it: 4678 bbr->r_ctl.rc_tlp_send = rsm; 4679 bbr->rc_tlp_rtx_out = 1; 4680 if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) { 4681 bbr->r_ctl.rc_tlp_seg_send_cnt++; 4682 tp->t_rxtshift++; 4683 } else { 4684 bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; 4685 bbr->r_ctl.rc_tlp_seg_send_cnt = 1; 4686 } 4687 send: 4688 if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { 4689 /* 4690 * Can't [re]/transmit a segment we have retranmitted the 4691 * max times. We need the retransmit timer to take over. 4692 */ 4693 restore: 4694 bbr->rc_tlp_new_data = 0; 4695 bbr->r_ctl.rc_tlp_send = NULL; 4696 if (rsm) 4697 rsm->r_flags &= ~BBR_TLP; 4698 BBR_STAT_INC(bbr_tlp_retran_fail); 4699 return (0); 4700 } else if (rsm) { 4701 rsm->r_flags |= BBR_TLP; 4702 } 4703 if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) && 4704 (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) { 4705 /* 4706 * We have retransmitted to many times for TLP. Switch to 4707 * the regular RTO timer 4708 */ 4709 goto restore; 4710 } 4711 bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP); 4712 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 4713 return (0); 4714 } 4715 4716 /* 4717 * Delayed ack Timer, here we simply need to setup the 4718 * ACK_NOW flag and remove the DELACK flag. From there 4719 * the output routine will send the ack out. 4720 * 4721 * We only return 1, saying don't proceed, if all timers 4722 * are stopped (destroyed PCB?). 4723 */ 4724 static int 4725 bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4726 { 4727 if (bbr->rc_all_timers_stopped) { 4728 return (1); 4729 } 4730 bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK); 4731 tp->t_flags &= ~TF_DELACK; 4732 tp->t_flags |= TF_ACKNOW; 4733 KMOD_TCPSTAT_INC(tcps_delack); 4734 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4735 return (0); 4736 } 4737 4738 /* 4739 * Here we send a KEEP-ALIVE like probe to the 4740 * peer, we do not send data. 4741 * 4742 * We only return 1, saying don't proceed, if all timers 4743 * are stopped (destroyed PCB?). 4744 */ 4745 static int 4746 bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4747 { 4748 struct tcptemp *t_template; 4749 int32_t retval = 1; 4750 4751 if (bbr->rc_all_timers_stopped) { 4752 return (1); 4753 } 4754 if (bbr->rc_in_persist == 0) 4755 return (0); 4756 KASSERT(tp->t_inpcb != NULL, 4757 ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 4758 /* 4759 * Persistence timer into zero window. Force a byte to be output, if 4760 * possible. 4761 */ 4762 bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST); 4763 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 4764 KMOD_TCPSTAT_INC(tcps_persisttimeo); 4765 /* 4766 * Have we exceeded the user specified progress time? 4767 */ 4768 if (ctf_progress_timeout_check(tp, true)) { 4769 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 4770 return (-ETIMEDOUT); /* tcp_drop() */ 4771 } 4772 /* 4773 * Hack: if the peer is dead/unreachable, we do not time out if the 4774 * window is closed. After a full backoff, drop the connection if 4775 * the idle time (no responses to probes) reaches the maximum 4776 * backoff that we would use if retransmitting. 
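 * i.e. once t_rxtshift has reached TCP_MAXRXTSHIFT and the peer has been silent for tcp_maxpersistidle (or the fully backed-off rexmit interval), the check below drops the connection.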
4777 */ 4778 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 4779 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 4780 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 4781 KMOD_TCPSTAT_INC(tcps_persistdrop); 4782 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 4783 return (-ETIMEDOUT); /* tcp_drop() */ 4784 } 4785 if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) && 4786 tp->snd_una == tp->snd_max) { 4787 bbr_exit_persist(tp, bbr, cts, __LINE__); 4788 retval = 0; 4789 goto out; 4790 } 4791 /* 4792 * If the user has closed the socket then drop a persisting 4793 * connection after a much reduced timeout. 4794 */ 4795 if (tp->t_state > TCPS_CLOSE_WAIT && 4796 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 4797 KMOD_TCPSTAT_INC(tcps_persistdrop); 4798 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 4799 return (-ETIMEDOUT); /* tcp_drop() */ 4800 } 4801 t_template = tcpip_maketemplate(bbr->rc_inp); 4802 if (t_template) { 4803 tcp_respond(tp, t_template->tt_ipgen, 4804 &t_template->tt_t, (struct mbuf *)NULL, 4805 tp->rcv_nxt, tp->snd_una - 1, 0); 4806 /* This sends an ack */ 4807 if (tp->t_flags & TF_DELACK) 4808 tp->t_flags &= ~TF_DELACK; 4809 free(t_template, M_TEMP); 4810 } 4811 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 4812 tp->t_rxtshift++; 4813 bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0); 4814 out: 4815 return (retval); 4816 } 4817 4818 /* 4819 * If a keepalive goes off, we had no other timers 4820 * happening. We always return 1 here since this 4821 * routine either drops the connection or sends 4822 * out a segment with respond. 4823 */ 4824 static int 4825 bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4826 { 4827 struct tcptemp *t_template; 4828 struct inpcb *inp; 4829 4830 if (bbr->rc_all_timers_stopped) { 4831 return (1); 4832 } 4833 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 4834 inp = tp->t_inpcb; 4835 bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP); 4836 /* 4837 * Keep-alive timer went off; send something or drop connection if 4838 * idle for too long. 4839 */ 4840 KMOD_TCPSTAT_INC(tcps_keeptimeo); 4841 if (tp->t_state < TCPS_ESTABLISHED) 4842 goto dropit; 4843 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4844 tp->t_state <= TCPS_CLOSING) { 4845 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 4846 goto dropit; 4847 /* 4848 * Send a packet designed to force a response if the peer is 4849 * up and reachable: either an ACK if the connection is 4850 * still alive, or an RST if the peer has closed the 4851 * connection due to timeout or reboot. Using sequence 4852 * number tp->snd_una-1 causes the transmitted zero-length 4853 * segment to lie outside the receive window; by the 4854 * protocol spec, this requires the correspondent TCP to 4855 * respond. 4856 */ 4857 KMOD_TCPSTAT_INC(tcps_keepprobe); 4858 t_template = tcpip_maketemplate(inp); 4859 if (t_template) { 4860 tcp_respond(tp, t_template->tt_ipgen, 4861 &t_template->tt_t, (struct mbuf *)NULL, 4862 tp->rcv_nxt, tp->snd_una - 1, 0); 4863 free(t_template, M_TEMP); 4864 } 4865 } 4866 bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0); 4867 return (1); 4868 dropit: 4869 KMOD_TCPSTAT_INC(tcps_keepdrops); 4870 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 4871 return (-ETIMEDOUT); /* tcp_drop() */ 4872 } 4873 4874 /* 4875 * Retransmit helper function, clear up all the ack 4876 * flags and take care of important book keeping. 
4877 */ 4878 static void 4879 bbr_remxt_tmr(struct tcpcb *tp) 4880 { 4881 /* 4882 * The retransmit timer went off, all sack'd blocks must be 4883 * un-acked. 4884 */ 4885 struct bbr_sendmap *rsm, *trsm = NULL; 4886 struct tcp_bbr *bbr; 4887 uint32_t cts, lost; 4888 4889 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 4890 cts = tcp_get_usecs(&bbr->rc_tv); 4891 lost = bbr->r_ctl.rc_lost; 4892 if (bbr->r_state && (bbr->r_state != tp->t_state)) 4893 bbr_set_state(tp, bbr, 0); 4894 4895 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { 4896 if (rsm->r_flags & BBR_ACKED) { 4897 uint32_t old_flags; 4898 4899 rsm->r_dupack = 0; 4900 if (rsm->r_in_tmap == 0) { 4901 /* We must re-add it back to the tlist */ 4902 if (trsm == NULL) { 4903 TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 4904 } else { 4905 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext); 4906 } 4907 rsm->r_in_tmap = 1; 4908 } 4909 old_flags = rsm->r_flags; 4910 rsm->r_flags |= BBR_RXT_CLEARED; 4911 rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS); 4912 bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); 4913 } else { 4914 if ((tp->t_state < TCPS_ESTABLISHED) && 4915 (rsm->r_start == tp->snd_una)) { 4916 /* 4917 * Special case for TCP FO. Where 4918 * we sent more data beyond the snd_max. 4919 * We don't mark that as lost and stop here. 4920 */ 4921 break; 4922 } 4923 if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { 4924 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; 4925 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; 4926 } 4927 if (bbr_marks_rxt_sack_passed) { 4928 /* 4929 * With this option, we will rack out 4930 * in 1ms increments the rest of the packets. 4931 */ 4932 rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST; 4933 rsm->r_flags &= ~BBR_WAS_SACKPASS; 4934 } else { 4935 /* 4936 * With this option we only mark them lost 4937 * and remove all sack'd markings. We will run 4938 * another RXT or a TLP. This will cause 4939 * us to eventually send more based on what 4940 * ack's come in. 4941 */ 4942 rsm->r_flags |= BBR_MARKED_LOST; 4943 rsm->r_flags &= ~BBR_WAS_SACKPASS; 4944 rsm->r_flags &= ~BBR_SACK_PASSED; 4945 } 4946 } 4947 trsm = rsm; 4948 } 4949 bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map); 4950 /* Clear the count (we just un-acked them) */ 4951 bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR); 4952 bbr->rc_tlp_new_data = 0; 4953 bbr->r_ctl.rc_tlp_seg_send_cnt = 0; 4954 /* zap the behindness on a rxt */ 4955 bbr->r_ctl.rc_hptsi_agg_delay = 0; 4956 bbr->r_agg_early_set = 0; 4957 bbr->r_ctl.rc_agg_early = 0; 4958 bbr->rc_tlp_rtx_out = 0; 4959 bbr->r_ctl.rc_sacked = 0; 4960 bbr->r_ctl.rc_sacklast = NULL; 4961 bbr->r_timer_override = 1; 4962 bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); 4963 } 4964 4965 /* 4966 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 4967 * we will setup to retransmit the lowest seq number outstanding. 4968 */ 4969 static int 4970 bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 4971 { 4972 int32_t rexmt; 4973 int32_t retval = 0; 4974 bool isipv6; 4975 4976 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 4977 if (bbr->rc_all_timers_stopped) { 4978 return (1); 4979 } 4980 if (TCPS_HAVEESTABLISHED(tp->t_state) && 4981 (tp->snd_una == tp->snd_max)) { 4982 /* Nothing outstanding .. nothing to do */ 4983 return (0); 4984 } 4985 /* 4986 * Retransmission timer went off. Message has not been acked within 4987 * retransmit interval. Back off to a longer retransmit interval 4988 * and retransmit one segment. 
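 * The new interval below is TCP_REXMTVAL(tp) scaled by tcp_backoff[t_rxtshift] (roughly doubling each timeout), then clamped between the configured minimum RTO and rc_max_rto_sec.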
4989 */ 4990 if (ctf_progress_timeout_check(tp, true)) { 4991 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 4992 return (-ETIMEDOUT); /* tcp_drop() */ 4993 } 4994 bbr_remxt_tmr(tp); 4995 if ((bbr->r_ctl.rc_resend == NULL) || 4996 ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) { 4997 /* 4998 * If the rwnd collapsed on 4999 * the one we are retransmitting 5000 * it does not count against the 5001 * rxt count. 5002 */ 5003 tp->t_rxtshift++; 5004 } 5005 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5006 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5007 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5008 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5009 /* XXXGL: previously t_softerror was casted to uint16_t */ 5010 MPASS(tp->t_softerror >= 0); 5011 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 5012 return (retval); /* tcp_drop() */ 5013 } 5014 if (tp->t_state == TCPS_SYN_SENT) { 5015 /* 5016 * If the SYN was retransmitted, indicate CWND to be limited 5017 * to 1 segment in cc_conn_init(). 5018 */ 5019 tp->snd_cwnd = 1; 5020 } else if (tp->t_rxtshift == 1) { 5021 /* 5022 * first retransmit; record ssthresh and cwnd so they can be 5023 * recovered if this turns out to be a "bad" retransmit. A 5024 * retransmit is considered "bad" if an ACK for this segment 5025 * is received within RTT/2 interval; the assumption here is 5026 * that the ACK was already in flight. See "On Estimating 5027 * End-to-End Network Path Properties" by Allman and Paxson 5028 * for more details. 5029 */ 5030 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; 5031 if (!IN_RECOVERY(tp->t_flags)) { 5032 tp->snd_cwnd_prev = tp->snd_cwnd; 5033 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5034 tp->snd_recover_prev = tp->snd_recover; 5035 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5036 tp->t_flags |= TF_PREVVALID; 5037 } else { 5038 tp->t_flags &= ~TF_PREVVALID; 5039 } 5040 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; 5041 } else { 5042 tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; 5043 tp->t_flags &= ~TF_PREVVALID; 5044 } 5045 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5046 if ((tp->t_state == TCPS_SYN_SENT) || 5047 (tp->t_state == TCPS_SYN_RECEIVED)) 5048 rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift]; 5049 else 5050 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5051 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5052 MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), 5053 MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); 5054 /* 5055 * We enter the path for PLMTUD if connection is established or, if 5056 * connection is FIN_WAIT_1 status, reason for the last is that if 5057 * amount of data we send is very small, we could send it in couple 5058 * of packets and process straight to FIN. In that case we won't 5059 * catch ESTABLISHED state. 5060 */ 5061 #ifdef INET6 5062 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5063 #else 5064 isipv6 = false; 5065 #endif 5066 if (((V_tcp_pmtud_blackhole_detect == 1) || 5067 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5068 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5069 ((tp->t_state == TCPS_ESTABLISHED) || 5070 (tp->t_state == TCPS_FIN_WAIT_1))) { 5071 /* 5072 * Idea here is that at each stage of mtu probe (usually, 5073 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5074 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5075 * should take care of that. 
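 * Concretely, the clamp below typically fires on the 2nd retransmit (down to the blackhole MSS sysctl value) and again on the 4th (down to the default MSS with PMTUD disabled).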
5076 */ 5077 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5078 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5079 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5080 tp->t_rxtshift % 2 == 0)) { 5081 /* 5082 * Enter Path MTU Black-hole Detection mechanism: - 5083 * Disable Path MTU Discovery (IP "DF" bit). - 5084 * Reduce MTU to lower value than what we negotiated 5085 * with peer. 5086 */ 5087 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5088 /* 5089 * Record that we may have found a black 5090 * hole. 5091 */ 5092 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5093 /* Keep track of previous MSS. */ 5094 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5095 } 5096 /* 5097 * Reduce the MSS to blackhole value or to the 5098 * default in an attempt to retransmit. 5099 */ 5100 #ifdef INET6 5101 isipv6 = bbr->r_is_v6; 5102 if (isipv6 && 5103 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5104 /* Use the sysctl tuneable blackhole MSS. */ 5105 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5106 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5107 } else if (isipv6) { 5108 /* Use the default MSS. */ 5109 tp->t_maxseg = V_tcp_v6mssdflt; 5110 /* 5111 * Disable Path MTU Discovery when we switch 5112 * to minmss. 5113 */ 5114 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5115 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5116 } 5117 #endif 5118 #if defined(INET6) && defined(INET) 5119 else 5120 #endif 5121 #ifdef INET 5122 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5123 /* Use the sysctl tuneable blackhole MSS. */ 5124 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5125 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5126 } else { 5127 /* Use the default MSS. */ 5128 tp->t_maxseg = V_tcp_mssdflt; 5129 /* 5130 * Disable Path MTU Discovery when we switch 5131 * to minmss. 5132 */ 5133 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5134 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5135 } 5136 #endif 5137 } else { 5138 /* 5139 * If further retransmissions are still unsuccessful 5140 * with a lowered MTU, maybe this isn't a blackhole 5141 * and we restore the previous MSS and blackhole 5142 * detection flags. The limit '6' is determined by 5143 * giving each probe stage (1448, 1188, 524) 2 5144 * chances to recover. 5145 */ 5146 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5147 (tp->t_rxtshift >= 6)) { 5148 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5149 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5150 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5151 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5152 } 5153 } 5154 } 5155 /* 5156 * Disable RFC1323 and SACK if we haven't got any response to our 5157 * third SYN to work-around some broken terminal servers (most of 5158 * which have hopefully been retired) that have bad VJ header 5159 * compression code which trashes TCP segments containing 5160 * unknown-to-them TCP options. 5161 */ 5162 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 5163 (tp->t_rxtshift == 3)) 5164 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 5165 /* 5166 * If we backed off this far, our srtt estimate is probably bogus. 5167 * Clobber it so we'll take the next rtt measurement as our srtt; 5168 * move the current srtt into rttvar to keep the current retransmit 5169 * times until then. 
5170 */ 5171 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5172 #ifdef INET6 5173 if (bbr->r_is_v6) 5174 in6_losing(tp->t_inpcb); 5175 else 5176 #endif 5177 in_losing(tp->t_inpcb); 5178 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5179 tp->t_srtt = 0; 5180 } 5181 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); 5182 tp->snd_recover = tp->snd_max; 5183 tp->t_flags |= TF_ACKNOW; 5184 tp->t_rtttime = 0; 5185 5186 return (retval); 5187 } 5188 5189 static int 5190 bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling) 5191 { 5192 int32_t ret = 0; 5193 int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5194 5195 if (timers == 0) { 5196 return (0); 5197 } 5198 if (tp->t_state == TCPS_LISTEN) { 5199 /* no timers on listen sockets */ 5200 if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5201 return (0); 5202 return (1); 5203 } 5204 if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { 5205 uint32_t left; 5206 5207 if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5208 ret = -1; 5209 bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); 5210 return (0); 5211 } 5212 if (hpts_calling == 0) { 5213 ret = -2; 5214 bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); 5215 return (0); 5216 } 5217 /* 5218 * Ok our timer went off early and we are not paced false 5219 * alarm, go back to sleep. 5220 */ 5221 left = bbr->r_ctl.rc_timer_exp - cts; 5222 ret = -3; 5223 bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); 5224 tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left)); 5225 return (1); 5226 } 5227 bbr->rc_tmr_stopped = 0; 5228 bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5229 if (timers & PACE_TMR_DELACK) { 5230 ret = bbr_timeout_delack(tp, bbr, cts); 5231 } else if (timers & PACE_TMR_PERSIT) { 5232 ret = bbr_timeout_persist(tp, bbr, cts); 5233 } else if (timers & PACE_TMR_RACK) { 5234 bbr->r_ctl.rc_tlp_rxt_last_time = cts; 5235 ret = bbr_timeout_rack(tp, bbr, cts); 5236 } else if (timers & PACE_TMR_TLP) { 5237 bbr->r_ctl.rc_tlp_rxt_last_time = cts; 5238 ret = bbr_timeout_tlp(tp, bbr, cts); 5239 } else if (timers & PACE_TMR_RXT) { 5240 bbr->r_ctl.rc_tlp_rxt_last_time = cts; 5241 ret = bbr_timeout_rxt(tp, bbr, cts); 5242 } else if (timers & PACE_TMR_KEEP) { 5243 ret = bbr_timeout_keepalive(tp, bbr, cts); 5244 } 5245 bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling); 5246 return (ret); 5247 } 5248 5249 static void 5250 bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) 5251 { 5252 if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5253 uint8_t hpts_removed = 0; 5254 5255 if (tcp_in_hpts(bbr->rc_inp) && 5256 (bbr->rc_timer_first == 1)) { 5257 /* 5258 * If we are canceling timer's when we have the 5259 * timer ahead of the output being paced. We also 5260 * must remove ourselves from the hpts. 
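 * When we do pull ourselves off the hpts we also credit the pacer with
 * the time that has already elapsed since it was started, so the
 * remaining slot (rc_last_delay_val) is shortened rather than being
 * re-run in full when pacing resumes.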
5261 */ 5262 hpts_removed = 1; 5263 tcp_hpts_remove(bbr->rc_inp); 5264 if (bbr->r_ctl.rc_last_delay_val) { 5265 /* Update the last hptsi delay too */ 5266 uint32_t time_since_send; 5267 5268 if (TSTMP_GT(cts, bbr->rc_pacer_started)) 5269 time_since_send = cts - bbr->rc_pacer_started; 5270 else 5271 time_since_send = 0; 5272 if (bbr->r_ctl.rc_last_delay_val > time_since_send) { 5273 /* Cut down our slot time */ 5274 bbr->r_ctl.rc_last_delay_val -= time_since_send; 5275 } else { 5276 bbr->r_ctl.rc_last_delay_val = 0; 5277 } 5278 bbr->rc_pacer_started = cts; 5279 } 5280 } 5281 bbr->rc_timer_first = 0; 5282 bbr_log_to_cancel(bbr, line, cts, hpts_removed); 5283 bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5284 bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 5285 } 5286 } 5287 5288 static void 5289 bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type) 5290 { 5291 struct tcp_bbr *bbr; 5292 5293 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 5294 bbr->rc_all_timers_stopped = 1; 5295 return; 5296 } 5297 5298 /* 5299 * stop all timers always returning 0. 5300 */ 5301 static int 5302 bbr_stopall(struct tcpcb *tp) 5303 { 5304 return (0); 5305 } 5306 5307 static void 5308 bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 5309 { 5310 return; 5311 } 5312 5313 /* 5314 * return true if a bbr timer (rack or tlp) is active. 5315 */ 5316 static int 5317 bbr_timer_active(struct tcpcb *tp, uint32_t timer_type) 5318 { 5319 return (0); 5320 } 5321 5322 static uint32_t 5323 bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts) 5324 { 5325 struct bbr_sendmap *rsm; 5326 5327 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); 5328 if ((rsm == NULL) || (u_rsm == rsm)) 5329 return (cts); 5330 return(rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 5331 } 5332 5333 static void 5334 bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr, 5335 struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time) 5336 { 5337 int32_t idx; 5338 5339 rsm->r_rtr_cnt++; 5340 rsm->r_dupack = 0; 5341 if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) { 5342 rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS; 5343 rsm->r_flags |= BBR_OVERMAX; 5344 } 5345 if (rsm->r_flags & BBR_RWND_COLLAPSED) { 5346 /* Take off the collapsed flag at rxt */ 5347 rsm->r_flags &= ~BBR_RWND_COLLAPSED; 5348 } 5349 if (rsm->r_flags & BBR_MARKED_LOST) { 5350 /* We have retransmitted, its no longer lost */ 5351 rsm->r_flags &= ~BBR_MARKED_LOST; 5352 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; 5353 } 5354 if (rsm->r_flags & BBR_RXT_CLEARED) { 5355 /* 5356 * We hit a RXT timer on it and 5357 * we cleared the "acked" flag. 5358 * We now have it going back into 5359 * flight, we can remove the cleared 5360 * flag and possibly do accounting on 5361 * this piece. 
5362 */ 5363 rsm->r_flags &= ~BBR_RXT_CLEARED; 5364 } 5365 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) { 5366 bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 5367 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 5368 } 5369 idx = rsm->r_rtr_cnt - 1; 5370 rsm->r_tim_lastsent[idx] = cts; 5371 rsm->r_pacing_delay = pacing_time; 5372 rsm->r_delivered = bbr->r_ctl.rc_delivered; 5373 rsm->r_ts_valid = bbr->rc_ts_valid; 5374 if (bbr->rc_ts_valid) 5375 rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; 5376 if (bbr->r_ctl.r_app_limited_until) 5377 rsm->r_app_limited = 1; 5378 else 5379 rsm->r_app_limited = 0; 5380 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) 5381 rsm->r_bbr_state = bbr_state_val(bbr); 5382 else 5383 rsm->r_bbr_state = 8; 5384 if (rsm->r_flags & BBR_ACKED) { 5385 /* Problably MTU discovery messing with us */ 5386 uint32_t old_flags; 5387 5388 old_flags = rsm->r_flags; 5389 rsm->r_flags &= ~BBR_ACKED; 5390 bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); 5391 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 5392 if (bbr->r_ctl.rc_sacked == 0) 5393 bbr->r_ctl.rc_sacklast = NULL; 5394 } 5395 if (rsm->r_in_tmap) { 5396 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 5397 } 5398 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 5399 rsm->r_in_tmap = 1; 5400 if (rsm->r_flags & BBR_SACK_PASSED) { 5401 /* We have retransmitted due to the SACK pass */ 5402 rsm->r_flags &= ~BBR_SACK_PASSED; 5403 rsm->r_flags |= BBR_WAS_SACKPASS; 5404 } 5405 rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); 5406 rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, 5407 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 5408 bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 5409 if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { 5410 rsm->r_is_gain = 1; 5411 rsm->r_is_drain = 0; 5412 } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { 5413 rsm->r_is_drain = 1; 5414 rsm->r_is_gain = 0; 5415 } else { 5416 rsm->r_is_drain = 0; 5417 rsm->r_is_gain = 0; 5418 } 5419 rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */ 5420 } 5421 5422 /* 5423 * Returns 0, or the sequence where we stopped 5424 * updating. We also update the lenp to be the amount 5425 * of data left. 5426 */ 5427 5428 static uint32_t 5429 bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr, 5430 struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time) 5431 { 5432 /* 5433 * We (re-)transmitted starting at rsm->r_start for some length 5434 * (possibly less than r_end. 5435 */ 5436 struct bbr_sendmap *nrsm; 5437 uint32_t c_end; 5438 int32_t len; 5439 5440 len = *lenp; 5441 c_end = rsm->r_start + len; 5442 if (SEQ_GEQ(c_end, rsm->r_end)) { 5443 /* 5444 * We retransmitted the whole piece or more than the whole 5445 * slopping into the next rsm. 5446 */ 5447 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); 5448 if (c_end == rsm->r_end) { 5449 *lenp = 0; 5450 return (0); 5451 } else { 5452 int32_t act_len; 5453 5454 /* Hangs over the end return whats left */ 5455 act_len = rsm->r_end - rsm->r_start; 5456 *lenp = (len - act_len); 5457 return (rsm->r_end); 5458 } 5459 /* We don't get out of this block. */ 5460 } 5461 /* 5462 * Here we retransmitted less than the whole thing which means we 5463 * have to split this into what was transmitted and what was not. 5464 */ 5465 nrsm = bbr_alloc_full_limit(bbr); 5466 if (nrsm == NULL) { 5467 *lenp = 0; 5468 return (0); 5469 } 5470 /* 5471 * So here we are going to take the original rsm and make it what we 5472 * retransmitted. 
nrsm will be the tail portion we did not 5473 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 5474 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 5475 * 1, 6 and the new piece will be 6, 11. 5476 */ 5477 bbr_clone_rsm(bbr, nrsm, rsm, c_end); 5478 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); 5479 nrsm->r_dupack = 0; 5480 if (rsm->r_in_tmap) { 5481 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5482 nrsm->r_in_tmap = 1; 5483 } 5484 rsm->r_flags &= (~BBR_HAS_FIN); 5485 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); 5486 *lenp = 0; 5487 return (0); 5488 } 5489 5490 static uint64_t 5491 bbr_get_hardware_rate(struct tcp_bbr *bbr) 5492 { 5493 uint64_t bw; 5494 5495 bw = bbr_get_bw(bbr); 5496 bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]; 5497 bw /= (uint64_t)BBR_UNIT; 5498 return(bw); 5499 } 5500 5501 static void 5502 bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts, 5503 uint64_t act_rate, uint64_t rate_wanted) 5504 { 5505 /* 5506 * We could not get a full gains worth 5507 * of rate. 5508 */ 5509 if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) { 5510 /* we can't even get the real rate */ 5511 uint64_t red; 5512 5513 bbr->skip_gain = 1; 5514 bbr->gain_is_limited = 0; 5515 red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate; 5516 if (red) 5517 filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts); 5518 } else { 5519 /* We can use a lower gain */ 5520 bbr->skip_gain = 0; 5521 bbr->gain_is_limited = 1; 5522 } 5523 } 5524 5525 static void 5526 bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts) 5527 { 5528 const struct tcp_hwrate_limit_table *nrte; 5529 int error, rate = -1; 5530 5531 if (bbr->r_ctl.crte == NULL) 5532 return; 5533 if ((bbr->rc_inp->inp_route.ro_nh == NULL) || 5534 (bbr->rc_inp->inp_route.ro_nh->nh_ifp == NULL)) { 5535 /* Lost our routes? */ 5536 /* Clear the way for a re-attempt */ 5537 bbr->bbr_attempt_hdwr_pace = 0; 5538 lost_rate: 5539 bbr->gain_is_limited = 0; 5540 bbr->skip_gain = 0; 5541 bbr->bbr_hdrw_pacing = 0; 5542 counter_u64_add(bbr_flows_whdwr_pacing, -1); 5543 counter_u64_add(bbr_flows_nohdwr_pacing, 1); 5544 tcp_bbr_tso_size_check(bbr, cts); 5545 return; 5546 } 5547 rate = bbr_get_hardware_rate(bbr); 5548 nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte, 5549 bbr->rc_tp, 5550 bbr->rc_inp->inp_route.ro_nh->nh_ifp, 5551 rate, 5552 (RS_PACING_GEQ|RS_PACING_SUB_OK), 5553 &error, NULL); 5554 if (nrte == NULL) { 5555 goto lost_rate; 5556 } 5557 if (nrte != bbr->r_ctl.crte) { 5558 bbr->r_ctl.crte = nrte; 5559 if (error == 0) { 5560 BBR_STAT_INC(bbr_hdwr_rl_mod_ok); 5561 if (bbr->r_ctl.crte->rate < rate) { 5562 /* We have a problem */ 5563 bbr_setup_less_of_rate(bbr, cts, 5564 bbr->r_ctl.crte->rate, rate); 5565 } else { 5566 /* We are good */ 5567 bbr->gain_is_limited = 0; 5568 bbr->skip_gain = 0; 5569 } 5570 } else { 5571 /* A failure should release the tag */ 5572 BBR_STAT_INC(bbr_hdwr_rl_mod_fail); 5573 bbr->gain_is_limited = 0; 5574 bbr->skip_gain = 0; 5575 bbr->bbr_hdrw_pacing = 0; 5576 } 5577 bbr_type_log_hdwr_pacing(bbr, 5578 bbr->r_ctl.crte->ptbl->rs_ifp, 5579 rate, 5580 ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate), 5581 __LINE__, 5582 cts, 5583 error); 5584 } 5585 } 5586 5587 static void 5588 bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts) 5589 { 5590 /* 5591 * If we have hardware pacing support 5592 * we need to factor that in for our 5593 * TSO size. 
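 * Roughly: we compare the software pacing gap for a full TSO burst
 * (cur_delay) with the gap the hardware itself would create sending the
 * same burst one maxseg at a time (hdwr_delay). When the two are close,
 * we grow the burst so the software pacer only has to wake up about once
 * per hardware-paced train. For example (made-up numbers): a cur_delay
 * of 120us with the NIC spacing frames 10us apart works out to roughly
 * 13 segments, before the bbr_hdwr_pace_adjust multiplier and the
 * rs_min_seg floor are applied.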
5594 */ 5595 const struct tcp_hwrate_limit_table *rlp; 5596 uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay; 5597 5598 if ((bbr->bbr_hdrw_pacing == 0) || 5599 (IN_RECOVERY(bbr->rc_tp->t_flags)) || 5600 (bbr->r_ctl.crte == NULL)) 5601 return; 5602 if (bbr->hw_pacing_set == 0) { 5603 /* Not yet by the hdwr pacing count delay */ 5604 return; 5605 } 5606 if (bbr_hdwr_pace_adjust == 0) { 5607 /* No adjustment */ 5608 return; 5609 } 5610 rlp = bbr->r_ctl.crte; 5611 if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) 5612 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; 5613 else 5614 maxseg = BBR_MIN_SEG - bbr->rc_last_options; 5615 /* 5616 * So lets first get the 5617 * time we will take between 5618 * TSO sized sends currently without 5619 * hardware help. 5620 */ 5621 cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT, 5622 bbr->r_ctl.rc_pace_max_segs, cts, 1); 5623 hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg; 5624 hdwr_delay *= rlp->time_between; 5625 if (cur_delay > hdwr_delay) 5626 delta = cur_delay - hdwr_delay; 5627 else 5628 delta = 0; 5629 bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay, 5630 (bbr->r_ctl.rc_pace_max_segs / maxseg), 5631 1); 5632 if (delta && 5633 (delta < (max(rlp->time_between, 5634 bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) { 5635 /* 5636 * Now lets divide by the pacing 5637 * time between each segment the 5638 * hardware sends rounding up and 5639 * derive a bytes from that. We multiply 5640 * that by bbr_hdwr_pace_adjust to get 5641 * more bang for our buck. 5642 * 5643 * The goal is to have the software pacer 5644 * waiting no more than an additional 5645 * pacing delay if we can (without the 5646 * compensation i.e. x bbr_hdwr_pace_adjust). 5647 */ 5648 seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between), 5649 (bbr->r_ctl.rc_pace_max_segs/maxseg)); 5650 seg_sz *= bbr_hdwr_pace_adjust; 5651 if (bbr_hdwr_pace_floor && 5652 (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { 5653 /* Currently hardware paces 5654 * out rs_min_seg segments at a time. 5655 * We need to make sure we always send at least 5656 * a full burst of bbr_hdwr_pace_floor down. 5657 */ 5658 seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; 5659 } 5660 seg_sz *= maxseg; 5661 } else if (delta == 0) { 5662 /* 5663 * The highest pacing rate is 5664 * above our b/w gained. This means 5665 * we probably are going quite fast at 5666 * the hardware highest rate. Lets just multiply 5667 * the calculated TSO size by the 5668 * multiplier factor (its probably 5669 * 4 segments in the default config for 5670 * mlx). 5671 */ 5672 seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust; 5673 if (bbr_hdwr_pace_floor && 5674 (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { 5675 /* Currently hardware paces 5676 * out rs_min_seg segments at a time. 5677 * We need to make sure we always send at least 5678 * a full burst of bbr_hdwr_pace_floor down. 5679 */ 5680 seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; 5681 } 5682 } else { 5683 /* 5684 * The pacing time difference is so 5685 * big that the hardware will 5686 * pace out more rapidly then we 5687 * really want and then we 5688 * will have a long delay. Lets just keep 5689 * the same TSO size so its as if 5690 * we were not using hdwr pacing (we 5691 * just gain a bit of spacing from the 5692 * hardware if seg_sz > 1). 
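 * Whichever branch was taken, the value computed below never shrinks the
 * existing rc_pace_max_segs and is capped at PACE_MAX_IP_BYTES - maxseg.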
5693 */ 5694 seg_sz = bbr->r_ctl.rc_pace_max_segs; 5695 } 5696 if (seg_sz > bbr->r_ctl.rc_pace_max_segs) 5697 new_tso = seg_sz; 5698 else 5699 new_tso = bbr->r_ctl.rc_pace_max_segs; 5700 if (new_tso >= (PACE_MAX_IP_BYTES-maxseg)) 5701 new_tso = PACE_MAX_IP_BYTES - maxseg; 5702 5703 if (new_tso != bbr->r_ctl.rc_pace_max_segs) { 5704 bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0); 5705 bbr->r_ctl.rc_pace_max_segs = new_tso; 5706 } 5707 } 5708 5709 static void 5710 tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts) 5711 { 5712 uint64_t bw; 5713 uint32_t old_tso = 0, new_tso; 5714 uint32_t maxseg, bytes; 5715 uint32_t tls_seg=0; 5716 /* 5717 * Google/linux uses the following algorithm to determine 5718 * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18): 5719 * 5720 * bytes = bw_in_bytes_per_second / 1000 5721 * bytes = min(bytes, 64k) 5722 * tso_segs = bytes / MSS 5723 * if (bw < 1.2Mbs) 5724 * min_tso_segs = 1 5725 * else 5726 * min_tso_segs = 2 5727 * tso_segs = max(tso_segs, min_tso_segs) 5728 * 5729 * Note: apply a device-specific limit (we apply this in the 5730 * tcp_m_copym). 5731 * Note that before the initial measurement is made google bursts out 5732 * a full iwnd just like new-reno/cubic. 5733 * 5734 * We do not use this algorithm. Instead we 5735 * use a two-phased approach: 5736 * 5737 * if ( bw <= per-tcb-cross-over) 5738 * goal_tso = calculate how much with this bw we 5739 * can send in goal-time seconds. 5740 * if (goal_tso > mss) 5741 * seg = goal_tso / mss 5742 * tso = seg * mss 5743 * else 5744 * tso = mss 5745 * if (tso > per-tcb-max) 5746 * tso = per-tcb-max 5747 * else if ( bw > 512Mbps) 5748 * tso = max-tso (64k/mss) 5749 * else 5750 * goal_tso = bw / per-tcb-divisor 5751 * seg = (goal_tso + mss-1)/mss 5752 * tso = seg * mss 5753 * 5754 * if (tso < per-tcb-floor) 5755 * tso = per-tcb-floor 5756 * if (tso > per-tcb-utter_max) 5757 * tso = per-tcb-utter_max 5758 * 5759 * Note the default per-tcb-divisor is 1000 (same as google); 5760 * the goal cross-over is 30Mbps however. To recreate google's 5761 * algorithm you need to set: 5762 * 5763 * cross-over = 23,168,000 bps 5764 * goal-time = 18000 5765 * per-tcb-max = 2 5766 * per-tcb-divisor = 1000 5767 * per-tcb-floor = 1 5768 * 5769 * This will get you "google bbr" behavior with respect to tso size. 5770 * 5771 * Note we do not set any TSO size until we are past the initial 5772 * window. Before that we generally use either a single MSS 5773 * or we use the full IW size (so we burst an IW at a time). 5774 */ 5775 5776 if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) { 5777 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; 5778 } else { 5779 maxseg = BBR_MIN_SEG - bbr->rc_last_options; 5780 } 5781 old_tso = bbr->r_ctl.rc_pace_max_segs; 5782 if (bbr->rc_past_init_win == 0) { 5783 /* 5784 * Not enough data has been acknowledged to make a 5785 * judgement. Set up the initial TSO based on whether we 5786 * are sending a full IW at once or not.
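 * (Once we are past the initial window, the google-style branch further
 * below sizes TSO from the measured rate; e.g. with an illustrative
 * delivery rate of 12MB/s it computes 12000000/1024 ~= 11718 bytes,
 * i.e. 8 segments of a 1448-byte maxseg, subject to the 1 or 2 segment
 * minimum.)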
5787 */ 5788 if (bbr->rc_use_google) 5789 bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2); 5790 else if (bbr->bbr_init_win_cheat) 5791 bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp); 5792 else 5793 bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options; 5794 if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg) 5795 bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg; 5796 if (bbr->r_ctl.rc_pace_max_segs == 0) { 5797 bbr->r_ctl.rc_pace_max_segs = maxseg; 5798 } 5799 bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0); 5800 bbr_adjust_for_hw_pacing(bbr, cts); 5801 return; 5802 } 5803 /** 5804 * Now lets set the TSO goal based on our delivery rate in 5805 * bytes per second. Note we only do this if 5806 * we have acked at least the initial cwnd worth of data. 5807 */ 5808 bw = bbr_get_bw(bbr); 5809 if (IN_RECOVERY(bbr->rc_tp->t_flags) && 5810 (bbr->rc_use_google == 0)) { 5811 /* We clamp to one MSS in recovery */ 5812 new_tso = maxseg; 5813 } else if (bbr->rc_use_google) { 5814 int min_tso_segs; 5815 5816 /* Google considers the gain too */ 5817 if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) { 5818 bw *= bbr->r_ctl.rc_bbr_hptsi_gain; 5819 bw /= BBR_UNIT; 5820 } 5821 bytes = bw / 1024; 5822 if (bytes > (64 * 1024)) 5823 bytes = 64 * 1024; 5824 new_tso = bytes / maxseg; 5825 if (bw < ONE_POINT_TWO_MEG) 5826 min_tso_segs = 1; 5827 else 5828 min_tso_segs = 2; 5829 if (new_tso < min_tso_segs) 5830 new_tso = min_tso_segs; 5831 new_tso *= maxseg; 5832 } else if (bbr->rc_no_pacing) { 5833 new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg; 5834 } else if (bw <= bbr->r_ctl.bbr_cross_over) { 5835 /* 5836 * Calculate the worse case b/w TSO if we are inserting no 5837 * more than a delay_target number of TSO's. 5838 */ 5839 uint32_t tso_len, min_tso; 5840 5841 tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw); 5842 if (tso_len > maxseg) { 5843 new_tso = tso_len / maxseg; 5844 if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max) 5845 new_tso = bbr->r_ctl.bbr_hptsi_segments_max; 5846 new_tso *= maxseg; 5847 } else { 5848 /* 5849 * less than a full sized frame yikes.. long rtt or 5850 * low bw? 5851 */ 5852 min_tso = bbr_minseg(bbr); 5853 if ((tso_len > min_tso) && (bbr_all_get_min == 0)) 5854 new_tso = rounddown(tso_len, min_tso); 5855 else 5856 new_tso = min_tso; 5857 } 5858 } else if (bw > FIVETWELVE_MBPS) { 5859 /* 5860 * This guy is so fast b/w wise that we can TSO as large as 5861 * possible of segments that the NIC will allow. 5862 */ 5863 new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); 5864 } else { 5865 /* 5866 * This formula is based on attempting to send a segment or 5867 * more every bbr_hptsi_per_second. The default is 1000 5868 * which means you are targeting what you can send every 1ms 5869 * based on the peers bw. 5870 * 5871 * If the number drops to say 500, then you are looking more 5872 * at 2ms and you will raise how much we send in a single 5873 * TSO thus saving CPU (less bbr_output_wtime() calls). The 5874 * trade off of course is you will send more at once and 5875 * thus tend to clump up the sends into larger "bursts" 5876 * building a queue. 5877 */ 5878 bw /= bbr->r_ctl.bbr_hptsi_per_second; 5879 new_tso = roundup(bw, (uint64_t)maxseg); 5880 /* 5881 * Gate the floor to match what our lower than 48Mbps 5882 * algorithm does. The ceiling (bbr_hptsi_segments_max) thus 5883 * becomes the floor for this calculation. 
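 * As a worked example (illustrative numbers only): at roughly 100Mbit/s
 * the delivery rate is about 12.5MB/s, so the division above yields
 * ~12500 bytes per interval, which roundup() turns into 9 segments of a
 * 1448-byte maxseg before the floor and ceiling checks below are
 * applied.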
5884 */ 5885 if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg)) 5886 new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg); 5887 } 5888 if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor))) 5889 new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor; 5890 if (new_tso > PACE_MAX_IP_BYTES) 5891 new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); 5892 /* Enforce an utter maximum. */ 5893 if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) { 5894 new_tso = bbr->r_ctl.bbr_utter_max * maxseg; 5895 } 5896 if (old_tso != new_tso) { 5897 /* Only log changes */ 5898 bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0); 5899 bbr->r_ctl.rc_pace_max_segs = new_tso; 5900 } 5901 /* We have hardware pacing! */ 5902 bbr_adjust_for_hw_pacing(bbr, cts); 5903 } 5904 5905 static void 5906 bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len, 5907 uint32_t seq_out, uint16_t th_flags, int32_t err, uint32_t cts, 5908 struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc, 5909 struct sockbuf *sb) 5910 { 5911 5912 struct bbr_sendmap *rsm, *nrsm; 5913 register uint32_t snd_max, snd_una; 5914 uint32_t pacing_time; 5915 /* 5916 * Add to the RACK log of packets in flight or retransmitted. If 5917 * there is a TS option we will use the TS echoed, if not we will 5918 * grab a TS. 5919 * 5920 * Retransmissions will increment the count and move the ts to its 5921 * proper place. Note that if options do not include TS's then we 5922 * won't be able to effectively use the ACK for an RTT on a retran. 5923 * 5924 * Notes about r_start and r_end. Lets consider a send starting at 5925 * sequence 1 for 10 bytes. In such an example the r_start would be 5926 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 5927 * This means that r_end is actually the first sequence for the next 5928 * slot (11). 5929 * 5930 */ 5931 INP_WLOCK_ASSERT(tp->t_inpcb); 5932 if (err) { 5933 /* 5934 * We don't log errors -- we could but snd_max does not 5935 * advance in this case either. 5936 */ 5937 return; 5938 } 5939 if (th_flags & TH_RST) { 5940 /* 5941 * We don't log resets and we return immediately from 5942 * sending 5943 */ 5944 *abandon = 1; 5945 return; 5946 } 5947 snd_una = tp->snd_una; 5948 if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) { 5949 /* 5950 * The call to bbr_log_output is made before bumping 5951 * snd_max. This means we can record one extra byte on a SYN 5952 * or FIN if seq_out is adding more on and a FIN is present 5953 * (and we are not resending). 5954 */ 5955 if ((th_flags & TH_SYN) && (tp->iss == seq_out)) 5956 len++; 5957 if (th_flags & TH_FIN) 5958 len++; 5959 } 5960 if (SEQ_LEQ((seq_out + len), snd_una)) { 5961 /* Are sending an old segment to induce an ack (keep-alive)? */ 5962 return; 5963 } 5964 if (SEQ_LT(seq_out, snd_una)) { 5965 /* huh? should we panic? */ 5966 uint32_t end; 5967 5968 end = seq_out + len; 5969 seq_out = snd_una; 5970 len = end - seq_out; 5971 } 5972 snd_max = tp->snd_max; 5973 if (len == 0) { 5974 /* We don't log zero window probes */ 5975 return; 5976 } 5977 pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1); 5978 /* First question is it a retransmission? 
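 * A send that begins exactly at snd_max is new data and gets a fresh map
 * entry; anything else should overlap something we already sent, so we
 * go hunting for the matching rsm (splitting it if needed) further
 * below.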
*/ 5979 if (seq_out == snd_max) { 5980 again: 5981 rsm = bbr_alloc(bbr); 5982 if (rsm == NULL) { 5983 return; 5984 } 5985 rsm->r_flags = 0; 5986 if (th_flags & TH_SYN) 5987 rsm->r_flags |= BBR_HAS_SYN; 5988 if (th_flags & TH_FIN) 5989 rsm->r_flags |= BBR_HAS_FIN; 5990 rsm->r_tim_lastsent[0] = cts; 5991 rsm->r_rtr_cnt = 1; 5992 rsm->r_rtr_bytes = 0; 5993 rsm->r_start = seq_out; 5994 rsm->r_end = rsm->r_start + len; 5995 rsm->r_dupack = 0; 5996 rsm->r_delivered = bbr->r_ctl.rc_delivered; 5997 rsm->r_pacing_delay = pacing_time; 5998 rsm->r_ts_valid = bbr->rc_ts_valid; 5999 if (bbr->rc_ts_valid) 6000 rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; 6001 rsm->r_del_time = bbr->r_ctl.rc_del_time; 6002 if (bbr->r_ctl.r_app_limited_until) 6003 rsm->r_app_limited = 1; 6004 else 6005 rsm->r_app_limited = 0; 6006 rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); 6007 rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, 6008 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 6009 /* 6010 * Here we must also add in this rsm since snd_max 6011 * is updated after we return from a new send. 6012 */ 6013 rsm->r_flight_at_send += len; 6014 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); 6015 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 6016 rsm->r_in_tmap = 1; 6017 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) 6018 rsm->r_bbr_state = bbr_state_val(bbr); 6019 else 6020 rsm->r_bbr_state = 8; 6021 if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { 6022 rsm->r_is_gain = 1; 6023 rsm->r_is_drain = 0; 6024 } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { 6025 rsm->r_is_drain = 1; 6026 rsm->r_is_gain = 0; 6027 } else { 6028 rsm->r_is_drain = 0; 6029 rsm->r_is_gain = 0; 6030 } 6031 return; 6032 } 6033 /* 6034 * If we reach here its a retransmission and we need to find it. 6035 */ 6036 more: 6037 if (hintrsm && (hintrsm->r_start == seq_out)) { 6038 rsm = hintrsm; 6039 hintrsm = NULL; 6040 } else if (bbr->r_ctl.rc_next) { 6041 /* We have a hint from a previous run */ 6042 rsm = bbr->r_ctl.rc_next; 6043 } else { 6044 /* No hints sorry */ 6045 rsm = NULL; 6046 } 6047 if ((rsm) && (rsm->r_start == seq_out)) { 6048 /* 6049 * We used rc_next or hintrsm to retransmit, hopefully the 6050 * likely case. 6051 */ 6052 seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); 6053 if (len == 0) { 6054 return; 6055 } else { 6056 goto more; 6057 } 6058 } 6059 /* Ok it was not the last pointer go through it the hard way. */ 6060 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { 6061 if (rsm->r_start == seq_out) { 6062 seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); 6063 bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 6064 if (len == 0) { 6065 return; 6066 } else { 6067 continue; 6068 } 6069 } 6070 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6071 /* Transmitted within this piece */ 6072 /* 6073 * Ok we must split off the front and then let the 6074 * update do the rest 6075 */ 6076 nrsm = bbr_alloc_full_limit(bbr); 6077 if (nrsm == NULL) { 6078 bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); 6079 return; 6080 } 6081 /* 6082 * copy rsm to nrsm and then trim the front of rsm 6083 * to not include this part. 
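 * For example (mirroring the 1,11 example used for bbr_update_entry
 * earlier): if the map entry covers 1 - 11 and this transmission started
 * at sequence 6, the split leaves rsm covering 1 - 6 and creates nrsm
 * covering 6 - 11, and it is nrsm that bbr_update_entry() marks as
 * retransmitted below.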
6084 */ 6085 bbr_clone_rsm(bbr, nrsm, rsm, seq_out); 6086 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); 6087 if (rsm->r_in_tmap) { 6088 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6089 nrsm->r_in_tmap = 1; 6090 } 6091 rsm->r_flags &= (~BBR_HAS_FIN); 6092 seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time); 6093 if (len == 0) { 6094 return; 6095 } 6096 } 6097 } 6098 /* 6099 * Hmm not found in map did they retransmit both old and on into the 6100 * new? 6101 */ 6102 if (seq_out == tp->snd_max) { 6103 goto again; 6104 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6105 #ifdef BBR_INVARIANTS 6106 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6107 seq_out, len, tp->snd_una, tp->snd_max); 6108 printf("Starting Dump of all rack entries\n"); 6109 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { 6110 printf("rsm:%p start:%u end:%u\n", 6111 rsm, rsm->r_start, rsm->r_end); 6112 } 6113 printf("Dump complete\n"); 6114 panic("seq_out not found rack:%p tp:%p", 6115 bbr, tp); 6116 #endif 6117 } else { 6118 #ifdef BBR_INVARIANTS 6119 /* 6120 * Hmm beyond sndmax? (only if we are using the new rtt-pack 6121 * flag) 6122 */ 6123 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6124 seq_out, len, tp->snd_max, tp); 6125 #endif 6126 } 6127 } 6128 6129 static void 6130 bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt) 6131 { 6132 /* 6133 * Collapse timeout back the cum-ack moved. 6134 */ 6135 tp->t_rxtshift = 0; 6136 tp->t_softerror = 0; 6137 } 6138 6139 static void 6140 tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin) 6141 { 6142 bbr->rtt_valid = 1; 6143 bbr->r_ctl.cur_rtt = rtt_usecs; 6144 bbr->r_ctl.ts_in = tsin; 6145 if (rsm_send_time) 6146 bbr->r_ctl.cur_rtt_send_time = rsm_send_time; 6147 } 6148 6149 static void 6150 bbr_make_timestamp_determination(struct tcp_bbr *bbr) 6151 { 6152 /** 6153 * We have in our bbr control: 6154 * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp). 6155 * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.rsm->bbr_ts_check_our_cts). 6156 * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts) 6157 * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time) 6158 * 6159 * Now we can calculate the time between the sends by doing: 6160 * 6161 * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts 6162 * 6163 * And the peer's time between receiving them by doing: 6164 * 6165 * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp 6166 * 6167 * We want to figure out if the timestamp values are in msec, 10msec or usec. 6168 * We also may find that we can't use the timestamps if say we see 6169 * that the peer_delta indicates that though we may have taken 10ms to 6170 * pace out the data, it only saw 1ms between the two packets. This would 6171 * indicate that somewhere on the path is a batching entity that is giving 6172 * out time-slices of the actual b/w. This would mean we could not use 6173 * reliably the peers timestamps. 6174 * 6175 * We expect delta > peer_delta initially. Until we figure out the 6176 * timestamp difference which we will store in bbr->r_ctl.bbr_peer_tsratio. 6177 * If we place 1000 there then its a ms vs our usec. If we place 10000 there 6178 * then its 10ms vs our usec. If the peer is running a usec clock we would 6179 * put a 1 there. 
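 * (A quick illustrative example with made-up numbers: if our usec clock
 * says the two sends were 40000us apart but the peer's timestamps only
 * advanced by 40, the peer is almost certainly ticking in milliseconds
 * and bbr_peer_tsratio ends up as 1000.)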
If the value is faster than ours, we will disable the 6180 * use of timestamps (though we could revisit this later if we find it is not 6181 * just an isolated one or two flows). 6182 * 6183 * To detect the batching middle boxes we will come up with our compensation and, 6184 * if with it in place we find the peer is drastically off (by some margin) in 6185 * the smaller direction, we will assume the worst case and disable use of timestamps. 6186 * 6187 */ 6188 uint64_t delta, peer_delta, delta_up; 6189 6190 delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts; 6191 if (delta < bbr_min_usec_delta) { 6192 /* 6193 * Have not seen a min amount of time 6194 * between our send times, so we cannot 6195 * make a determination of the timestamp 6196 * yet. 6197 */ 6198 return; 6199 } 6200 peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp; 6201 if (peer_delta < bbr_min_peer_delta) { 6202 /* 6203 * We may have enough in the form of 6204 * our delta but the peer's number 6205 * has not changed that much. It could 6206 * be that its clock ratio is such that 6207 * we need more data (10ms tick) or 6208 * there may be other compression scenarios 6209 * going on. In any event we need the 6210 * spread to be larger. 6211 */ 6212 return; 6213 } 6214 /* Ok let's first see which way our delta is going */ 6215 if (peer_delta > delta) { 6216 /* Very unlikely, the peer without 6217 * compensation shows that it saw 6218 * the two sends arrive further apart 6219 * than we saw them in micro-seconds. 6220 */ 6221 if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) { 6222 /* Well it looks like the peer is using a micro-second clock. */ 6223 bbr->rc_ts_clock_set = 1; 6224 bbr->r_ctl.bbr_peer_tsratio = 1; 6225 } else { 6226 bbr->rc_ts_cant_be_used = 1; 6227 bbr->rc_ts_clock_set = 1; 6228 } 6229 return; 6230 } 6231 /* Ok we know that the peer_delta is smaller than our send distance */ 6232 bbr->rc_ts_clock_set = 1; 6233 /* First question: is it within the percentage that they are using usec time? */ 6234 delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent; 6235 if ((peer_delta + delta_up) >= delta) { 6236 /* It's a usec clock */ 6237 bbr->r_ctl.bbr_peer_tsratio = 1; 6238 bbr_log_tstmp_validation(bbr, peer_delta, delta); 6239 return; 6240 } 6241 /* Ok if not usec, what about 10usec (though unlikely)? */ 6242 delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent; 6243 if (((peer_delta * 10) + delta_up) >= delta) { 6244 bbr->r_ctl.bbr_peer_tsratio = 10; 6245 bbr_log_tstmp_validation(bbr, peer_delta, delta); 6246 return; 6247 } 6248 /* And what about 100usec (though again unlikely)? */ 6249 delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent; 6250 if (((peer_delta * 100) + delta_up) >= delta) { 6251 bbr->r_ctl.bbr_peer_tsratio = 100; 6252 bbr_log_tstmp_validation(bbr, peer_delta, delta); 6253 return; 6254 } 6255 /* And how about 1 msec (the most likely one)? */ 6256 delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent; 6257 if (((peer_delta * 1000) + delta_up) >= delta) { 6258 bbr->r_ctl.bbr_peer_tsratio = 1000; 6259 bbr_log_tstmp_validation(bbr, peer_delta, delta); 6260 return; 6261 } 6262 /* Ok if not msec could it be 10 msec?
*/ 6263 delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent; 6264 if (((peer_delta * 10000) + delta_up) >= delta) { 6265 bbr->r_ctl.bbr_peer_tsratio = 10000; 6266 return; 6267 } 6268 /* If we fall down here the clock tick so slowly we can't use it */ 6269 bbr->rc_ts_cant_be_used = 1; 6270 bbr->r_ctl.bbr_peer_tsratio = 0; 6271 bbr_log_tstmp_validation(bbr, peer_delta, delta); 6272 } 6273 6274 /* 6275 * Collect new round-trip time estimate 6276 * and update averages and current timeout. 6277 */ 6278 static void 6279 tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts) 6280 { 6281 int32_t delta; 6282 uint32_t rtt, tsin; 6283 int32_t rtt_ticks; 6284 6285 if (bbr->rtt_valid == 0) 6286 /* No valid sample */ 6287 return; 6288 6289 rtt = bbr->r_ctl.cur_rtt; 6290 tsin = bbr->r_ctl.ts_in; 6291 if (bbr->rc_prtt_set_ts) { 6292 /* 6293 * We are to force feed the rttProp filter due 6294 * to an entry into PROBE_RTT. This assures 6295 * that the times are sync'd between when we 6296 * go into PROBE_RTT and the filter expiration. 6297 * 6298 * Google does not use a true filter, so they do 6299 * this implicitly since they only keep one value 6300 * and when they enter probe-rtt they update the 6301 * value to the newest rtt. 6302 */ 6303 uint32_t rtt_prop; 6304 6305 bbr->rc_prtt_set_ts = 0; 6306 rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop); 6307 if (rtt > rtt_prop) 6308 filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts); 6309 else 6310 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); 6311 } 6312 if (bbr->rc_ack_was_delayed) 6313 rtt += bbr->r_ctl.rc_ack_hdwr_delay; 6314 6315 if (rtt < bbr->r_ctl.rc_lowest_rtt) 6316 bbr->r_ctl.rc_lowest_rtt = rtt; 6317 bbr_log_rtt_sample(bbr, rtt, tsin); 6318 if (bbr->r_init_rtt) { 6319 /* 6320 * The initial rtt is not-trusted, nuke it and lets get 6321 * our first valid measurement in. 6322 */ 6323 bbr->r_init_rtt = 0; 6324 tp->t_srtt = 0; 6325 } 6326 if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) { 6327 /* 6328 * So we have not yet figured out 6329 * what the peers TSTMP value is 6330 * in (most likely ms). We need a 6331 * series of cum-ack's to determine 6332 * this reliably. 6333 */ 6334 if (bbr->rc_ack_is_cumack) { 6335 if (bbr->rc_ts_data_set) { 6336 /* Lets attempt to determine the timestamp granularity. */ 6337 bbr_make_timestamp_determination(bbr); 6338 } else { 6339 bbr->rc_ts_data_set = 1; 6340 bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts; 6341 bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time; 6342 } 6343 } else { 6344 /* 6345 * We have to have consecutive acks 6346 * reset any "filled" state to none. 6347 */ 6348 bbr->rc_ts_data_set = 0; 6349 } 6350 } 6351 /* Round it up */ 6352 rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1))); 6353 if (rtt_ticks == 0) 6354 rtt_ticks = 1; 6355 if (tp->t_srtt != 0) { 6356 /* 6357 * srtt is stored as fixed point with 5 bits after the 6358 * binary point (i.e., scaled by 8). The following magic is 6359 * equivalent to the smoothing algorithm in rfc793 with an 6360 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 6361 * Adjust rtt to origin 0. 
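 * Put differently, the net effect of the delta arithmetic below is
 * roughly srtt += (measured_rtt - srtt) / 8 and, further down,
 * rttvar += (|measured_rtt - srtt| - rttvar) / 4, all carried out in the
 * scaled fixed-point representation so no floating point is needed.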
6362 */ 6363 6364 delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT) 6365 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 6366 6367 tp->t_srtt += delta; 6368 if (tp->t_srtt <= 0) 6369 tp->t_srtt = 1; 6370 6371 /* 6372 * We accumulate a smoothed rtt variance (actually, a 6373 * smoothed mean difference), then set the retransmit timer 6374 * to smoothed rtt + 4 times the smoothed variance. rttvar 6375 * is stored as fixed point with 4 bits after the binary 6376 * point (scaled by 16). The following is equivalent to 6377 * rfc793 smoothing with an alpha of .75 (rttvar = 6378 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 6379 * wired-in beta. 6380 */ 6381 if (delta < 0) 6382 delta = -delta; 6383 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 6384 tp->t_rttvar += delta; 6385 if (tp->t_rttvar <= 0) 6386 tp->t_rttvar = 1; 6387 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 6388 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6389 } else { 6390 /* 6391 * No rtt measurement yet - use the unsmoothed rtt. Set the 6392 * variance to half the rtt (so our first retransmit happens 6393 * at 3*rtt). 6394 */ 6395 tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT; 6396 tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1); 6397 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6398 } 6399 KMOD_TCPSTAT_INC(tcps_rttupdated); 6400 tp->t_rttupdated++; 6401 #ifdef STATS 6402 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks)); 6403 #endif 6404 /* 6405 * the retransmit should happen at rtt + 4 * rttvar. Because of the 6406 * way we do the smoothing, srtt and rttvar will each average +1/2 6407 * tick of bias. When we compute the retransmit timer, we want 1/2 6408 * tick of rounding and 1 extra tick because of +-1/2 tick 6409 * uncertainty in the firing of the timer. The bias will give us 6410 * exactly the 1.5 tick we need. But, because the bias is 6411 * statistical, we have to test that we don't drop below the minimum 6412 * feasible timer (which is 2 ticks). 6413 */ 6414 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 6415 max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2), 6416 MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); 6417 6418 /* 6419 * We received an ack for a packet that wasn't retransmitted; it is 6420 * probably safe to discard any error indications we've received 6421 * recently. This isn't quite right, but close enough for now (a 6422 * route might have failed after we sent a segment, and the return 6423 * path might not be symmetrical). 6424 */ 6425 tp->t_softerror = 0; 6426 rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 6427 if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt) 6428 bbr->r_ctl.bbr_smallest_srtt_this_state = rtt; 6429 } 6430 6431 static void 6432 bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line) 6433 { 6434 bbr->r_ctl.rc_rtt_shrinks = cts; 6435 if (bbr_can_force_probertt && 6436 (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && 6437 ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { 6438 /* 6439 * We should enter probe-rtt its been too long 6440 * since we have been there. 
6441 */ 6442 bbr_enter_probe_rtt(bbr, cts, __LINE__); 6443 } else 6444 bbr_check_probe_rtt_limits(bbr, cts); 6445 } 6446 6447 static void 6448 tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts) 6449 { 6450 uint64_t orig_bw; 6451 6452 if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) { 6453 /* We never apply a zero measurement */ 6454 bbr_log_type_bbrupd(bbr, 20, cts, 0, 0, 6455 0, 0, 0, 0, 0, 0); 6456 return; 6457 } 6458 if (bbr->r_ctl.r_measurement_count < 0xffffffff) 6459 bbr->r_ctl.r_measurement_count++; 6460 orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate); 6461 apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch); 6462 bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw, 6463 (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate), 6464 0, 0, 0, 0, 0, 0); 6465 if (orig_bw && 6466 (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) { 6467 if (bbr->bbr_hdrw_pacing) { 6468 /* 6469 * Apply a new rate to the hardware 6470 * possibly. 6471 */ 6472 bbr_update_hardware_pacing_rate(bbr, cts); 6473 } 6474 bbr_set_state_target(bbr, __LINE__); 6475 tcp_bbr_tso_size_check(bbr, cts); 6476 if (bbr->r_recovery_bw) { 6477 bbr_setup_red_bw(bbr, cts); 6478 bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW); 6479 } 6480 } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate)) 6481 tcp_bbr_tso_size_check(bbr, cts); 6482 } 6483 6484 static void 6485 bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) 6486 { 6487 if (bbr->rc_in_persist == 0) { 6488 /* We log only when not in persist */ 6489 /* Translate to a Bytes Per Second */ 6490 uint64_t tim, bw, ts_diff, ts_bw; 6491 uint32_t delivered; 6492 6493 if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) 6494 tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); 6495 else 6496 tim = 1; 6497 /* 6498 * Now that we have processed the tim (skipping the sample 6499 * or possibly updating the time, go ahead and 6500 * calculate the cdr. 6501 */ 6502 delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); 6503 bw = (uint64_t)delivered; 6504 bw *= (uint64_t)USECS_IN_SECOND; 6505 bw /= tim; 6506 if (bw == 0) { 6507 /* We must have a calculatable amount */ 6508 return; 6509 } 6510 /* 6511 * If we are using this b/w shove it in now so we 6512 * can see in the trace viewer if it gets over-ridden. 
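 * (Worked example with made-up numbers: 29000 bytes newly delivered over
 * a 10000us interval gives 29000 * USECS_IN_SECOND / 10000 = 2.9MB/s,
 * which is then cross-checked against the timestamp-based and send-rate
 * estimates below.)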
6513 */ 6514 if (rsm->r_ts_valid && 6515 bbr->rc_ts_valid && 6516 bbr->rc_ts_clock_set && 6517 (bbr->rc_ts_cant_be_used == 0) && 6518 bbr->rc_use_ts_limit) { 6519 ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1); 6520 ts_diff *= bbr->r_ctl.bbr_peer_tsratio; 6521 if ((delivered == 0) || 6522 (rtt < 1000)) { 6523 /* Can't use the ts */ 6524 bbr_log_type_bbrupd(bbr, 61, cts, 6525 ts_diff, 6526 bbr->r_ctl.last_inbound_ts, 6527 rsm->r_del_ack_ts, 0, 6528 0, 0, 0, delivered); 6529 } else { 6530 ts_bw = (uint64_t)delivered; 6531 ts_bw *= (uint64_t)USECS_IN_SECOND; 6532 ts_bw /= ts_diff; 6533 bbr_log_type_bbrupd(bbr, 62, cts, 6534 (ts_bw >> 32), 6535 (ts_bw & 0xffffffff), 0, 0, 6536 0, 0, ts_diff, delivered); 6537 if ((bbr->ts_can_raise) && 6538 (ts_bw > bw)) { 6539 bbr_log_type_bbrupd(bbr, 8, cts, 6540 delivered, 6541 ts_diff, 6542 (bw >> 32), 6543 (bw & 0x00000000ffffffff), 6544 0, 0, 0, 0); 6545 bw = ts_bw; 6546 } else if (ts_bw && (ts_bw < bw)) { 6547 bbr_log_type_bbrupd(bbr, 7, cts, 6548 delivered, 6549 ts_diff, 6550 (bw >> 32), 6551 (bw & 0x00000000ffffffff), 6552 0, 0, 0, 0); 6553 bw = ts_bw; 6554 } 6555 } 6556 } 6557 if (rsm->r_first_sent_time && 6558 TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { 6559 uint64_t sbw, sti; 6560 /* 6561 * We use what was in flight at the time of our 6562 * send and the size of this send to figure 6563 * out what we have been sending at (amount). 6564 * For the time we take from the time of 6565 * the send of the first send outstanding 6566 * until this send plus this sends pacing 6567 * time. This gives us a good calculation 6568 * as to the rate we have been sending at. 6569 */ 6570 6571 sbw = (uint64_t)(rsm->r_flight_at_send); 6572 sbw *= (uint64_t)USECS_IN_SECOND; 6573 sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; 6574 sti += rsm->r_pacing_delay; 6575 sbw /= sti; 6576 if (sbw < bw) { 6577 bbr_log_type_bbrupd(bbr, 6, cts, 6578 delivered, 6579 (uint32_t)sti, 6580 (bw >> 32), 6581 (uint32_t)bw, 6582 rsm->r_first_sent_time, 0, (sbw >> 32), 6583 (uint32_t)sbw); 6584 bw = sbw; 6585 } 6586 } 6587 /* Use the google algorithm for b/w measurements */ 6588 bbr->r_ctl.rc_bbr_cur_del_rate = bw; 6589 if ((rsm->r_app_limited == 0) || 6590 (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) { 6591 tcp_bbr_commit_bw(bbr, cts); 6592 bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, 6593 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); 6594 } 6595 } 6596 } 6597 6598 static void 6599 bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) 6600 { 6601 if (bbr->rc_in_persist == 0) { 6602 /* We log only when not in persist */ 6603 /* Translate to a Bytes Per Second */ 6604 uint64_t tim, bw; 6605 uint32_t delivered; 6606 int no_apply = 0; 6607 6608 if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) 6609 tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); 6610 else 6611 tim = 1; 6612 /* 6613 * Now that we have processed the tim (skipping the sample 6614 * or possibly updating the time, go ahead and 6615 * calculate the cdr. 
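 * One extra guard in this (google-style) path: if the delivery interval
 * is shorter than the lowest RTT we have observed, the sample is still
 * computed and logged but is initially flagged no_apply, so it is not
 * committed to the b/w filter unless the send-rate check below clears
 * that flag.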
6616 */ 6617 delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); 6618 bw = (uint64_t)delivered; 6619 bw *= (uint64_t)USECS_IN_SECOND; 6620 bw /= tim; 6621 if (tim < bbr->r_ctl.rc_lowest_rtt) { 6622 bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, 6623 tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); 6624 6625 no_apply = 1; 6626 } 6627 /* 6628 * If we are using this b/w shove it in now so we 6629 * can see in the trace viewer if it gets over-ridden. 6630 */ 6631 bbr->r_ctl.rc_bbr_cur_del_rate = bw; 6632 /* Gate by the sending rate */ 6633 if (rsm->r_first_sent_time && 6634 TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { 6635 uint64_t sbw, sti; 6636 /* 6637 * We use what was in flight at the time of our 6638 * send and the size of this send to figure 6639 * out what we have been sending at (amount). 6640 * For the time we take from the time of 6641 * the send of the first send outstanding 6642 * until this send plus this sends pacing 6643 * time. This gives us a good calculation 6644 * as to the rate we have been sending at. 6645 */ 6646 6647 sbw = (uint64_t)(rsm->r_flight_at_send); 6648 sbw *= (uint64_t)USECS_IN_SECOND; 6649 sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; 6650 sti += rsm->r_pacing_delay; 6651 sbw /= sti; 6652 if (sbw < bw) { 6653 bbr_log_type_bbrupd(bbr, 6, cts, 6654 delivered, 6655 (uint32_t)sti, 6656 (bw >> 32), 6657 (uint32_t)bw, 6658 rsm->r_first_sent_time, 0, (sbw >> 32), 6659 (uint32_t)sbw); 6660 bw = sbw; 6661 } 6662 if ((sti > tim) && 6663 (sti < bbr->r_ctl.rc_lowest_rtt)) { 6664 bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, 6665 (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); 6666 no_apply = 1; 6667 } else 6668 no_apply = 0; 6669 } 6670 bbr->r_ctl.rc_bbr_cur_del_rate = bw; 6671 if ((no_apply == 0) && 6672 ((rsm->r_app_limited == 0) || 6673 (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) { 6674 tcp_bbr_commit_bw(bbr, cts); 6675 bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, 6676 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); 6677 } 6678 } 6679 } 6680 6681 static void 6682 bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin, 6683 uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to) 6684 { 6685 uint64_t old_rttprop; 6686 6687 /* Update our delivery time and amount */ 6688 bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start); 6689 bbr->r_ctl.rc_del_time = cts; 6690 if (rtt == 0) { 6691 /* 6692 * 0 means its a retransmit, for now we don't use these for 6693 * the rest of BBR. 6694 */ 6695 return; 6696 } 6697 if ((bbr->rc_use_google == 0) && 6698 (match != BBR_RTT_BY_EXACTMATCH) && 6699 (match != BBR_RTT_BY_TIMESTAMP)){ 6700 /* 6701 * We get a lot of rtt updates, lets not pay attention to 6702 * any that are not an exact match. That way we don't have 6703 * to worry about timestamps and the whole nonsense of 6704 * unsure if its a retransmission etc (if we ever had the 6705 * timestamp fixed to always have the last thing sent this 6706 * would not be a issue). 6707 */ 6708 return; 6709 } 6710 if ((bbr_no_retran && bbr->rc_use_google) && 6711 (match != BBR_RTT_BY_EXACTMATCH) && 6712 (match != BBR_RTT_BY_TIMESTAMP)){ 6713 /* 6714 * We only do measurements in google mode 6715 * with bbr_no_retran on for sure things. 
6716 */ 6717 return; 6718 } 6719 /* Only update srtt if we know by exact match */ 6720 tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin); 6721 if (ack_type == BBR_CUM_ACKED) 6722 bbr->rc_ack_is_cumack = 1; 6723 else 6724 bbr->rc_ack_is_cumack = 0; 6725 old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP); 6726 /* 6727 * Note the following code differs to the original 6728 * BBR spec. It calls for <= not <. However after a 6729 * long discussion in email with Neal, he acknowledged 6730 * that it should be < than so that we will have flows 6731 * going into probe-rtt (we were seeing cases where that 6732 * did not happen and caused ugly things to occur). We 6733 * have added this agreed upon fix to our code base. 6734 */ 6735 if (rtt < old_rttprop) { 6736 /* Update when we last saw a rtt drop */ 6737 bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0); 6738 bbr_set_reduced_rtt(bbr, cts, __LINE__); 6739 } 6740 bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts, 6741 match, rsm->r_start, rsm->r_flags); 6742 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); 6743 if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) { 6744 /* 6745 * The RTT-prop moved, reset the target (may be a 6746 * nop for some states). 6747 */ 6748 bbr_set_state_target(bbr, __LINE__); 6749 if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) 6750 bbr_log_rtt_shrinks(bbr, cts, 0, 0, 6751 __LINE__, BBR_RTTS_NEW_TARGET, 0); 6752 else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP)) 6753 /* It went up */ 6754 bbr_check_probe_rtt_limits(bbr, cts); 6755 } 6756 if ((bbr->rc_use_google == 0) && 6757 (match == BBR_RTT_BY_TIMESTAMP)) { 6758 /* 6759 * We don't do b/w update with 6760 * these since they are not really 6761 * reliable. 6762 */ 6763 return; 6764 } 6765 if (bbr->r_ctl.r_app_limited_until && 6766 (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) { 6767 /* We are no longer app-limited */ 6768 bbr->r_ctl.r_app_limited_until = 0; 6769 } 6770 if (bbr->rc_use_google) { 6771 bbr_google_measurement(bbr, rsm, rtt, cts); 6772 } else { 6773 bbr_nf_measurement(bbr, rsm, rtt, cts); 6774 } 6775 } 6776 6777 /* 6778 * Convert a timestamp that the main stack 6779 * uses (milliseconds) into one that bbr uses 6780 * (microseconds). Return that converted timestamp. 6781 */ 6782 static uint32_t 6783 bbr_ts_convert(uint32_t cts) { 6784 uint32_t sec, msec; 6785 6786 sec = cts / MS_IN_USEC; 6787 msec = cts - (MS_IN_USEC * sec); 6788 return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC)); 6789 } 6790 6791 /* 6792 * Return 0 if we did not update the RTT time, return 6793 * 1 if we did. 6794 */ 6795 static int 6796 bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, 6797 struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack) 6798 { 6799 int32_t i; 6800 uint32_t t, uts = 0; 6801 6802 if ((rsm->r_flags & BBR_ACKED) || 6803 (rsm->r_flags & BBR_WAS_RENEGED) || 6804 (rsm->r_flags & BBR_RXT_CLEARED)) { 6805 /* Already done */ 6806 return (0); 6807 } 6808 if (rsm->r_rtt_not_allowed) { 6809 /* Not allowed */ 6810 return (0); 6811 } 6812 if (rsm->r_rtr_cnt == 1) { 6813 /* 6814 * Only one transmit. Hopefully the normal case. 
6815 */ 6816 if (TSTMP_GT(cts, rsm->r_tim_lastsent[0])) 6817 t = cts - rsm->r_tim_lastsent[0]; 6818 else 6819 t = 1; 6820 if ((int)t <= 0) 6821 t = 1; 6822 bbr->r_ctl.rc_last_rtt = t; 6823 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, 6824 BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to); 6825 return (1); 6826 } 6827 /* Convert to usecs */ 6828 if ((bbr_can_use_ts_for_rtt == 1) && 6829 (bbr->rc_use_google == 1) && 6830 (ack_type == BBR_CUM_ACKED) && 6831 (to->to_flags & TOF_TS) && 6832 (to->to_tsecr != 0)) { 6833 t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr; 6834 if (t < 1) 6835 t = 1; 6836 t *= MS_IN_USEC; 6837 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, 6838 BBR_RTT_BY_TIMESTAMP, 6839 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)], 6840 ack_type, to); 6841 return (1); 6842 } 6843 uts = bbr_ts_convert(to->to_tsecr); 6844 if ((to->to_flags & TOF_TS) && 6845 (to->to_tsecr != 0) && 6846 (ack_type == BBR_CUM_ACKED) && 6847 ((rsm->r_flags & BBR_OVERMAX) == 0)) { 6848 /* 6849 * Now which timestamp does it match? In this block the ACK 6850 * may be coming from a previous transmission. 6851 */ 6852 uint32_t fudge; 6853 6854 fudge = BBR_TIMER_FUDGE; 6855 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6856 if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) && 6857 (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) { 6858 if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) 6859 t = cts - rsm->r_tim_lastsent[i]; 6860 else 6861 t = 1; 6862 if ((int)t <= 0) 6863 t = 1; 6864 bbr->r_ctl.rc_last_rtt = t; 6865 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING, 6866 rsm->r_tim_lastsent[i], ack_type, to); 6867 if ((i + 1) < rsm->r_rtr_cnt) { 6868 /* Likely */ 6869 return (0); 6870 } else if (rsm->r_flags & BBR_TLP) { 6871 bbr->rc_tlp_rtx_out = 0; 6872 } 6873 return (1); 6874 } 6875 } 6876 /* Fall through if we can't find a matching timestamp */ 6877 } 6878 /* 6879 * OK, it's a SACK block that we retransmitted, or a Windows 6880 * machine without timestamps. We can tell nothing from the 6881 * timestamp since it's not there, or it is the time the peer last 6882 * received a segment that moved forward its cum-ack point. 6883 * 6884 * Let's look at the last retransmit and see what we can tell 6885 * (with BBR, for space, we only keep 2; note we have to keep 6886 * at least 2 so the map can not be condensed more). 6887 */ 6888 i = rsm->r_rtr_cnt - 1; 6889 if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) 6890 t = cts - rsm->r_tim_lastsent[i]; 6891 else 6892 goto not_sure; 6893 if (t < bbr->r_ctl.rc_lowest_rtt) { 6894 /* 6895 * We retransmitted and the ack came back in less 6896 * than the smallest rtt we have observed in the 6897 * windowed rtt. We most likely did an improper 6898 * retransmit as outlined in 4.2 Step 3 point 2 in 6899 * the rack-draft. 6900 * 6901 * Use the prior transmission to update all the 6902 * information as long as there is only one prior 6903 * transmission.
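* In practice this means falling back to r_tim_lastsent[r_rtr_cnt - 2], the next-to-last send, on the assumption that the ACK really belongs to that earlier transmission.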
6904 */ 6905 if ((rsm->r_flags & BBR_OVERMAX) == 0) { 6906 #ifdef BBR_INVARIANTS 6907 if (rsm->r_rtr_cnt == 1) 6908 panic("rsm:%p bbr:%p rsm has overmax and only 1 retranmit flags:%x?", rsm, bbr, rsm->r_flags); 6909 #endif 6910 i = rsm->r_rtr_cnt - 2; 6911 if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) 6912 t = cts - rsm->r_tim_lastsent[i]; 6913 else 6914 t = 1; 6915 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET, 6916 rsm->r_tim_lastsent[i], ack_type, to); 6917 return (0); 6918 } else { 6919 /* 6920 * Too many prior transmissions, just 6921 * updated BBR delivered 6922 */ 6923 not_sure: 6924 bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, 6925 BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); 6926 } 6927 } else { 6928 /* 6929 * We retransmitted it and the retransmit did the 6930 * job. 6931 */ 6932 if (rsm->r_flags & BBR_TLP) 6933 bbr->rc_tlp_rtx_out = 0; 6934 if ((rsm->r_flags & BBR_OVERMAX) == 0) 6935 bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, 6936 BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to); 6937 else 6938 bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, 6939 BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); 6940 return (1); 6941 } 6942 return (0); 6943 } 6944 6945 /* 6946 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 6947 */ 6948 static void 6949 bbr_log_sack_passed(struct tcpcb *tp, 6950 struct tcp_bbr *bbr, struct bbr_sendmap *rsm) 6951 { 6952 struct bbr_sendmap *nrsm; 6953 6954 nrsm = rsm; 6955 TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap, 6956 bbr_head, r_tnext) { 6957 if (nrsm == rsm) { 6958 /* Skip orginal segment he is acked */ 6959 continue; 6960 } 6961 if (nrsm->r_flags & BBR_ACKED) { 6962 /* Skip ack'd segments */ 6963 continue; 6964 } 6965 if (nrsm->r_flags & BBR_SACK_PASSED) { 6966 /* 6967 * We found one that is already marked 6968 * passed, we have been here before and 6969 * so all others below this are marked. 6970 */ 6971 break; 6972 } 6973 BBR_STAT_INC(bbr_sack_passed); 6974 nrsm->r_flags |= BBR_SACK_PASSED; 6975 if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) && 6976 bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) { 6977 bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start; 6978 bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start; 6979 nrsm->r_flags |= BBR_MARKED_LOST; 6980 } 6981 nrsm->r_flags &= ~BBR_WAS_SACKPASS; 6982 } 6983 } 6984 6985 /* 6986 * Returns the number of bytes that were 6987 * newly ack'd by sack blocks. 6988 */ 6989 static uint32_t 6990 bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack, 6991 struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts) 6992 { 6993 int32_t times = 0; 6994 uint32_t start, end, changed = 0; 6995 struct bbr_sendmap *rsm, *nrsm; 6996 int32_t used_ref = 1; 6997 uint8_t went_back = 0, went_fwd = 0; 6998 6999 start = sack->start; 7000 end = sack->end; 7001 rsm = *prsm; 7002 if (rsm == NULL) 7003 used_ref = 0; 7004 7005 /* Do we locate the block behind where we last were? 
*/ 7006 if (rsm && SEQ_LT(start, rsm->r_start)) { 7007 went_back = 1; 7008 TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { 7009 if (SEQ_GEQ(start, rsm->r_start) && 7010 SEQ_LT(start, rsm->r_end)) { 7011 goto do_rest_ofb; 7012 } 7013 } 7014 } 7015 start_at_beginning: 7016 went_fwd = 1; 7017 /* 7018 * Ok lets locate the block where this guy is fwd from rsm (if its 7019 * set) 7020 */ 7021 TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) { 7022 if (SEQ_GEQ(start, rsm->r_start) && 7023 SEQ_LT(start, rsm->r_end)) { 7024 break; 7025 } 7026 } 7027 do_rest_ofb: 7028 if (rsm == NULL) { 7029 /* 7030 * This happens when we get duplicate sack blocks with the 7031 * same end. For example SACK 4: 100 SACK 3: 100 The sort 7032 * will not change there location so we would just start at 7033 * the end of the first one and get lost. 7034 */ 7035 if (tp->t_flags & TF_SENTFIN) { 7036 /* 7037 * Check to see if we have not logged the FIN that 7038 * went out. 7039 */ 7040 nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); 7041 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 7042 /* 7043 * Ok we did not get the FIN logged. 7044 */ 7045 nrsm->r_end++; 7046 rsm = nrsm; 7047 goto do_rest_ofb; 7048 } 7049 } 7050 if (times == 1) { 7051 #ifdef BBR_INVARIANTS 7052 panic("tp:%p bbr:%p sack:%p to:%p prsm:%p", 7053 tp, bbr, sack, to, prsm); 7054 #else 7055 goto out; 7056 #endif 7057 } 7058 times++; 7059 BBR_STAT_INC(bbr_sack_proc_restart); 7060 rsm = NULL; 7061 goto start_at_beginning; 7062 } 7063 /* Ok we have an ACK for some piece of rsm */ 7064 if (rsm->r_start != start) { 7065 /* 7066 * Need to split this in two pieces the before and after. 7067 */ 7068 if (bbr_sack_mergable(rsm, start, end)) 7069 nrsm = bbr_alloc_full_limit(bbr); 7070 else 7071 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); 7072 if (nrsm == NULL) { 7073 /* We could not allocate ignore the sack */ 7074 struct sackblk blk; 7075 7076 blk.start = start; 7077 blk.end = end; 7078 sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); 7079 goto out; 7080 } 7081 bbr_clone_rsm(bbr, nrsm, rsm, start); 7082 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); 7083 if (rsm->r_in_tmap) { 7084 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7085 nrsm->r_in_tmap = 1; 7086 } 7087 rsm->r_flags &= (~BBR_HAS_FIN); 7088 rsm = nrsm; 7089 } 7090 if (SEQ_GEQ(end, rsm->r_end)) { 7091 /* 7092 * The end of this block is either beyond this guy or right 7093 * at this guy. 7094 */ 7095 if ((rsm->r_flags & BBR_ACKED) == 0) { 7096 bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); 7097 changed += (rsm->r_end - rsm->r_start); 7098 bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7099 bbr_log_sack_passed(tp, bbr, rsm); 7100 if (rsm->r_flags & BBR_MARKED_LOST) { 7101 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; 7102 } 7103 /* Is Reordering occuring? 
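* If this segment had already been passed over by SACKs of later data (BBR_SACK_PASSED) and is only now being SACKed, the ACK stream is arriving out of order: note the time and back out any loss accounting we did for it.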
*/ 7104 if (rsm->r_flags & BBR_SACK_PASSED) { 7105 BBR_STAT_INC(bbr_reorder_seen); 7106 bbr->r_ctl.rc_reorder_ts = cts; 7107 if (rsm->r_flags & BBR_MARKED_LOST) { 7108 bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; 7109 if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) 7110 /* LT sampling also needs adjustment */ 7111 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; 7112 } 7113 } 7114 rsm->r_flags |= BBR_ACKED; 7115 rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); 7116 if (rsm->r_in_tmap) { 7117 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 7118 rsm->r_in_tmap = 0; 7119 } 7120 } 7121 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); 7122 if (end == rsm->r_end) { 7123 /* This block only - done */ 7124 goto out; 7125 } 7126 /* There is more not coverend by this rsm move on */ 7127 start = rsm->r_end; 7128 nrsm = TAILQ_NEXT(rsm, r_next); 7129 rsm = nrsm; 7130 times = 0; 7131 goto do_rest_ofb; 7132 } 7133 if (rsm->r_flags & BBR_ACKED) { 7134 /* Been here done that */ 7135 goto out; 7136 } 7137 /* Ok we need to split off this one at the tail */ 7138 if (bbr_sack_mergable(rsm, start, end)) 7139 nrsm = bbr_alloc_full_limit(bbr); 7140 else 7141 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); 7142 if (nrsm == NULL) { 7143 /* failed XXXrrs what can we do but loose the sack info? */ 7144 struct sackblk blk; 7145 7146 blk.start = start; 7147 blk.end = end; 7148 sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); 7149 goto out; 7150 } 7151 /* Clone it */ 7152 bbr_clone_rsm(bbr, nrsm, rsm, end); 7153 /* The sack block does not cover this guy fully */ 7154 rsm->r_flags &= (~BBR_HAS_FIN); 7155 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); 7156 if (rsm->r_in_tmap) { 7157 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7158 nrsm->r_in_tmap = 1; 7159 } 7160 nrsm->r_dupack = 0; 7161 bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); 7162 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); 7163 changed += (rsm->r_end - rsm->r_start); 7164 bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7165 bbr_log_sack_passed(tp, bbr, rsm); 7166 /* Is Reordering occuring? */ 7167 if (rsm->r_flags & BBR_MARKED_LOST) { 7168 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; 7169 } 7170 if (rsm->r_flags & BBR_SACK_PASSED) { 7171 BBR_STAT_INC(bbr_reorder_seen); 7172 bbr->r_ctl.rc_reorder_ts = cts; 7173 if (rsm->r_flags & BBR_MARKED_LOST) { 7174 bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; 7175 if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) 7176 /* LT sampling also needs adjustment */ 7177 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; 7178 } 7179 } 7180 rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); 7181 rsm->r_flags |= BBR_ACKED; 7182 if (rsm->r_in_tmap) { 7183 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 7184 rsm->r_in_tmap = 0; 7185 } 7186 out: 7187 if (rsm && (rsm->r_flags & BBR_ACKED)) { 7188 /* 7189 * Now can we merge this newly acked 7190 * block with either the previous or 7191 * next block? 7192 */ 7193 nrsm = TAILQ_NEXT(rsm, r_next); 7194 if (nrsm && 7195 (nrsm->r_flags & BBR_ACKED)) { 7196 /* yep this and next can be merged */ 7197 rsm = bbr_merge_rsm(bbr, rsm, nrsm); 7198 } 7199 /* Now what about the previous? 
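* i.e. if the entry in front of this one is also fully ACKed, merge the two so the send map stays small.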
*/ 7200 nrsm = TAILQ_PREV(rsm, bbr_head, r_next); 7201 if (nrsm && 7202 (nrsm->r_flags & BBR_ACKED)) { 7203 /* yep the previous and this can be merged */ 7204 rsm = bbr_merge_rsm(bbr, nrsm, rsm); 7205 } 7206 } 7207 if (used_ref == 0) { 7208 BBR_STAT_INC(bbr_sack_proc_all); 7209 } else { 7210 BBR_STAT_INC(bbr_sack_proc_short); 7211 } 7212 if (went_fwd && went_back) { 7213 BBR_STAT_INC(bbr_sack_search_both); 7214 } else if (went_fwd) { 7215 BBR_STAT_INC(bbr_sack_search_fwd); 7216 } else if (went_back) { 7217 BBR_STAT_INC(bbr_sack_search_back); 7218 } 7219 /* Save off where the next seq is */ 7220 if (rsm) 7221 bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 7222 else 7223 bbr->r_ctl.rc_sacklast = NULL; 7224 *prsm = rsm; 7225 return (changed); 7226 } 7227 7228 static void inline 7229 bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack) 7230 { 7231 struct bbr_sendmap *tmap; 7232 7233 BBR_STAT_INC(bbr_reneges_seen); 7234 tmap = NULL; 7235 while (rsm && (rsm->r_flags & BBR_ACKED)) { 7236 /* Its no longer sacked, mark it so */ 7237 uint32_t oflags; 7238 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7239 #ifdef BBR_INVARIANTS 7240 if (rsm->r_in_tmap) { 7241 panic("bbr:%p rsm:%p flags:0x%x in tmap?", 7242 bbr, rsm, rsm->r_flags); 7243 } 7244 #endif 7245 oflags = rsm->r_flags; 7246 if (rsm->r_flags & BBR_MARKED_LOST) { 7247 bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; 7248 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; 7249 if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) 7250 /* LT sampling also needs adjustment */ 7251 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; 7252 } 7253 rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST); 7254 rsm->r_flags |= BBR_WAS_RENEGED; 7255 rsm->r_flags |= BBR_RXT_CLEARED; 7256 bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__); 7257 /* Rebuild it into our tmap */ 7258 if (tmap == NULL) { 7259 TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 7260 tmap = rsm; 7261 } else { 7262 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7263 tmap = rsm; 7264 } 7265 tmap->r_in_tmap = 1; 7266 /* 7267 * XXXrrs Delivered? Should we do anything here? 7268 * 7269 * Of course we don't on a rxt timeout so maybe its ok that 7270 * we don't? 7271 * 7272 * For now lets not. 7273 */ 7274 rsm = TAILQ_NEXT(rsm, r_next); 7275 } 7276 /* 7277 * Now lets possibly clear the sack filter so we start recognizing 7278 * sacks that cover this area. 7279 */ 7280 sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack); 7281 } 7282 7283 static void 7284 bbr_log_syn(struct tcpcb *tp, struct tcpopt *to) 7285 { 7286 struct tcp_bbr *bbr; 7287 struct bbr_sendmap *rsm; 7288 uint32_t cts; 7289 7290 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 7291 cts = bbr->r_ctl.rc_rcvtime; 7292 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 7293 if (rsm && (rsm->r_flags & BBR_HAS_SYN)) { 7294 if ((rsm->r_end - rsm->r_start) <= 1) { 7295 /* Log out the SYN completely */ 7296 bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7297 rsm->r_rtr_bytes = 0; 7298 TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); 7299 if (rsm->r_in_tmap) { 7300 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 7301 rsm->r_in_tmap = 0; 7302 } 7303 if (bbr->r_ctl.rc_next == rsm) { 7304 /* scoot along the marker */ 7305 bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); 7306 } 7307 if (to != NULL) 7308 bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0); 7309 bbr_free(bbr, rsm); 7310 } else { 7311 /* There is more (Fast open)? strip out SYN. 
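* With TFO the SYN entry can also cover data, so keep the map entry but advance r_start past the SYN's sequence space.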
*/ 7312 rsm->r_flags &= ~BBR_HAS_SYN; 7313 rsm->r_start++; 7314 } 7315 } 7316 } 7317 7318 /* 7319 * Returns the number of bytes that were 7320 * acknowledged by SACK blocks. 7321 */ 7322 7323 static uint32_t 7324 bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, 7325 uint32_t *prev_acked) 7326 { 7327 uint32_t changed, last_seq, entered_recovery = 0; 7328 struct tcp_bbr *bbr; 7329 struct bbr_sendmap *rsm; 7330 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7331 register uint32_t th_ack; 7332 int32_t i, j, k, new_sb, num_sack_blks = 0; 7333 uint32_t cts, acked, ack_point, sack_changed = 0; 7334 uint32_t p_maxseg, maxseg, p_acked = 0; 7335 7336 INP_WLOCK_ASSERT(tp->t_inpcb); 7337 if (tcp_get_flags(th) & TH_RST) { 7338 /* We don't log resets */ 7339 return (0); 7340 } 7341 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 7342 cts = bbr->r_ctl.rc_rcvtime; 7343 7344 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 7345 changed = 0; 7346 maxseg = tp->t_maxseg - bbr->rc_last_options; 7347 p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg); 7348 th_ack = th->th_ack; 7349 if (SEQ_GT(th_ack, tp->snd_una)) { 7350 acked = th_ack - tp->snd_una; 7351 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__); 7352 bbr->rc_tp->t_acktime = ticks; 7353 } else 7354 acked = 0; 7355 if (SEQ_LEQ(th_ack, tp->snd_una)) { 7356 /* Only sent here for sack processing */ 7357 goto proc_sack; 7358 } 7359 if (rsm && SEQ_GT(th_ack, rsm->r_start)) { 7360 changed = th_ack - rsm->r_start; 7361 } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) { 7362 /* 7363 * For the SYN incoming case we will not have called 7364 * tcp_output for the sending of the SYN, so there will be 7365 * no map. All other cases should probably be a panic. 7366 */ 7367 if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { 7368 /* 7369 * We have a timestamp that can be used to generate 7370 * an initial RTT. 7371 */ 7372 uint32_t ts, now, rtt; 7373 7374 ts = bbr_ts_convert(to->to_tsecr); 7375 now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv)); 7376 rtt = now - ts; 7377 if (rtt < 1) 7378 rtt = 1; 7379 bbr_log_type_bbrrttprop(bbr, rtt, 7380 tp->iss, 0, cts, 7381 BBR_RTT_BY_TIMESTAMP, tp->iss, 0); 7382 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); 7383 changed = 1; 7384 bbr->r_wanted_output = 1; 7385 goto out; 7386 } 7387 goto proc_sack; 7388 } else if (rsm == NULL) { 7389 goto out; 7390 } 7391 if (changed) { 7392 /* 7393 * The ACK point is advancing to th_ack, we must drop off 7394 * the packets in the rack log and calculate any eligble 7395 * RTT's. 7396 */ 7397 bbr->r_wanted_output = 1; 7398 more: 7399 if (rsm == NULL) { 7400 if (tp->t_flags & TF_SENTFIN) { 7401 /* if we send a FIN we will not hav a map */ 7402 goto proc_sack; 7403 } 7404 #ifdef BBR_INVARIANTS 7405 panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n", 7406 tp, 7407 th, tp->t_state, bbr, 7408 tp->snd_una, tp->snd_max, changed); 7409 #endif 7410 goto proc_sack; 7411 } 7412 } 7413 if (SEQ_LT(th_ack, rsm->r_start)) { 7414 /* Huh map is missing this */ 7415 #ifdef BBR_INVARIANTS 7416 printf("Rack map starts at r_start:%u for th_ack:%u huh? 
ts:%d rs:%d bbr:%p\n", 7417 rsm->r_start, 7418 th_ack, tp->t_state, 7419 bbr->r_state, bbr); 7420 panic("th-ack is bad bbr:%p tp:%p", bbr, tp); 7421 #endif 7422 goto proc_sack; 7423 } else if (th_ack == rsm->r_start) { 7424 /* None here to ack */ 7425 goto proc_sack; 7426 } 7427 /* 7428 * Clear the dup ack counter, it will 7429 * either be freed or if there is some 7430 * remaining we need to start it at zero. 7431 */ 7432 rsm->r_dupack = 0; 7433 /* Now do we consume the whole thing? */ 7434 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7435 /* Its all consumed. */ 7436 uint32_t left; 7437 7438 if (rsm->r_flags & BBR_ACKED) { 7439 /* 7440 * It was acked on the scoreboard -- remove it from 7441 * total 7442 */ 7443 p_acked += (rsm->r_end - rsm->r_start); 7444 bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7445 if (bbr->r_ctl.rc_sacked == 0) 7446 bbr->r_ctl.rc_sacklast = NULL; 7447 } else { 7448 bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack); 7449 if (rsm->r_flags & BBR_MARKED_LOST) { 7450 bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; 7451 } 7452 if (rsm->r_flags & BBR_SACK_PASSED) { 7453 /* 7454 * There are acked segments ACKED on the 7455 * scoreboard further up. We are seeing 7456 * reordering. 7457 */ 7458 BBR_STAT_INC(bbr_reorder_seen); 7459 bbr->r_ctl.rc_reorder_ts = cts; 7460 if (rsm->r_flags & BBR_MARKED_LOST) { 7461 bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; 7462 if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) 7463 /* LT sampling also needs adjustment */ 7464 bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; 7465 } 7466 } 7467 rsm->r_flags &= ~BBR_MARKED_LOST; 7468 } 7469 bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7470 rsm->r_rtr_bytes = 0; 7471 TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); 7472 if (rsm->r_in_tmap) { 7473 TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 7474 rsm->r_in_tmap = 0; 7475 } 7476 if (bbr->r_ctl.rc_next == rsm) { 7477 /* scoot along the marker */ 7478 bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); 7479 } 7480 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); 7481 /* Adjust the packet counts */ 7482 left = th_ack - rsm->r_end; 7483 /* Free back to zone */ 7484 bbr_free(bbr, rsm); 7485 if (left) { 7486 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 7487 goto more; 7488 } 7489 goto proc_sack; 7490 } 7491 if (rsm->r_flags & BBR_ACKED) { 7492 /* 7493 * It was acked on the scoreboard -- remove it from total 7494 * for the part being cum-acked. 7495 */ 7496 p_acked += (rsm->r_end - rsm->r_start); 7497 bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7498 if (bbr->r_ctl.rc_sacked == 0) 7499 bbr->r_ctl.rc_sacklast = NULL; 7500 } else { 7501 /* 7502 * It was acked up to th_ack point for the first time 7503 */ 7504 struct bbr_sendmap lrsm; 7505 7506 memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap)); 7507 lrsm.r_end = th_ack; 7508 bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack); 7509 } 7510 if ((rsm->r_flags & BBR_MARKED_LOST) && 7511 ((rsm->r_flags & BBR_ACKED) == 0)) { 7512 /* 7513 * It was marked lost and partly ack'd now 7514 * for the first time. We lower the rc_lost_bytes 7515 * and still leave it MARKED. 
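* As a rough example, a 3000 byte entry marked lost that has its first 1000 bytes cum-acked drops rc_lost_bytes by 1000 while the remaining 2000 bytes stay marked lost.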
7516 */ 7517 bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start; 7518 } 7519 bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); 7520 bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7521 rsm->r_rtr_bytes = 0; 7522 /* adjust packet count */ 7523 rsm->r_start = th_ack; 7524 proc_sack: 7525 /* Check for reneging */ 7526 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 7527 if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) { 7528 /* 7529 * The peer has moved snd_una up to the edge of this send, 7530 * i.e. one that it had previously acked. The only way that 7531 * can be true if the peer threw away data (space issues) 7532 * that it had previously sacked (else it would have given 7533 * us snd_una up to (rsm->r_end). We need to undo the acked 7534 * markings here. 7535 * 7536 * Note we have to look to make sure th_ack is our 7537 * rsm->r_start in case we get an old ack where th_ack is 7538 * behind snd_una. 7539 */ 7540 bbr_peer_reneges(bbr, rsm, th->th_ack); 7541 } 7542 if ((to->to_flags & TOF_SACK) == 0) { 7543 /* We are done nothing left to log */ 7544 goto out; 7545 } 7546 rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); 7547 if (rsm) { 7548 last_seq = rsm->r_end; 7549 } else { 7550 last_seq = tp->snd_max; 7551 } 7552 /* Sack block processing */ 7553 if (SEQ_GT(th_ack, tp->snd_una)) 7554 ack_point = th_ack; 7555 else 7556 ack_point = tp->snd_una; 7557 for (i = 0; i < to->to_nsacks; i++) { 7558 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7559 &sack, sizeof(sack)); 7560 sack.start = ntohl(sack.start); 7561 sack.end = ntohl(sack.end); 7562 if (SEQ_GT(sack.end, sack.start) && 7563 SEQ_GT(sack.start, ack_point) && 7564 SEQ_LT(sack.start, tp->snd_max) && 7565 SEQ_GT(sack.end, ack_point) && 7566 SEQ_LEQ(sack.end, tp->snd_max)) { 7567 if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) && 7568 (SEQ_LT(sack.end, last_seq)) && 7569 ((sack.end - sack.start) < (p_maxseg / 8))) { 7570 /* 7571 * Not the last piece and its smaller than 7572 * 1/8th of a p_maxseg. We ignore this. 7573 */ 7574 BBR_STAT_INC(bbr_runt_sacks); 7575 continue; 7576 } 7577 sack_blocks[num_sack_blks] = sack; 7578 num_sack_blks++; 7579 } else if (SEQ_LEQ(sack.start, th_ack) && 7580 SEQ_LEQ(sack.end, th_ack)) { 7581 /* 7582 * Its a D-SACK block. 7583 */ 7584 tcp_record_dsack(tp, sack.start, sack.end, 0); 7585 } 7586 } 7587 if (num_sack_blks == 0) 7588 goto out; 7589 /* 7590 * Sort the SACK blocks so we can update the rack scoreboard with 7591 * just one pass. 7592 */ 7593 new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks, 7594 num_sack_blks, th->th_ack); 7595 ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks); 7596 BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks); 7597 BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb)); 7598 num_sack_blks = new_sb; 7599 if (num_sack_blks < 2) { 7600 goto do_sack_work; 7601 } 7602 /* Sort the sacks */ 7603 for (i = 0; i < num_sack_blks; i++) { 7604 for (j = i + 1; j < num_sack_blks; j++) { 7605 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7606 sack = sack_blocks[i]; 7607 sack_blocks[i] = sack_blocks[j]; 7608 sack_blocks[j] = sack; 7609 } 7610 } 7611 } 7612 /* 7613 * Now are any of the sack block ends the same (yes some 7614 * implememtations send these)? 
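* When two blocks share an end we keep the smaller start (the block covering more), collapse the duplicate out of the array and rescan.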
7615 */ 7616 again: 7617 if (num_sack_blks > 1) { 7618 for (i = 0; i < num_sack_blks; i++) { 7619 for (j = i + 1; j < num_sack_blks; j++) { 7620 if (sack_blocks[i].end == sack_blocks[j].end) { 7621 /* 7622 * Ok these two have the same end we 7623 * want the smallest end and then 7624 * throw away the larger and start 7625 * again. 7626 */ 7627 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7628 /* 7629 * The second block covers 7630 * more area use that 7631 */ 7632 sack_blocks[i].start = sack_blocks[j].start; 7633 } 7634 /* 7635 * Now collapse out the dup-sack and 7636 * lower the count 7637 */ 7638 for (k = (j + 1); k < num_sack_blks; k++) { 7639 sack_blocks[j].start = sack_blocks[k].start; 7640 sack_blocks[j].end = sack_blocks[k].end; 7641 j++; 7642 } 7643 num_sack_blks--; 7644 goto again; 7645 } 7646 } 7647 } 7648 } 7649 do_sack_work: 7650 rsm = bbr->r_ctl.rc_sacklast; 7651 for (i = 0; i < num_sack_blks; i++) { 7652 acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts); 7653 if (acked) { 7654 bbr->r_wanted_output = 1; 7655 changed += acked; 7656 sack_changed += acked; 7657 } 7658 } 7659 out: 7660 *prev_acked = p_acked; 7661 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 7662 /* 7663 * Ok we have a high probability that we need to go in to 7664 * recovery since we have data sack'd 7665 */ 7666 struct bbr_sendmap *rsm; 7667 7668 rsm = bbr_check_recovery_mode(tp, bbr, cts); 7669 if (rsm) { 7670 /* Enter recovery */ 7671 entered_recovery = 1; 7672 bbr->r_wanted_output = 1; 7673 /* 7674 * When we enter recovery we need to assure we send 7675 * one packet. 7676 */ 7677 if (bbr->r_ctl.rc_resend == NULL) { 7678 bbr->r_ctl.rc_resend = rsm; 7679 } 7680 } 7681 } 7682 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 7683 /* 7684 * See if we need to rack-retransmit anything if so set it 7685 * up as the thing to resend assuming something else is not 7686 * already in that position. 7687 */ 7688 if (bbr->r_ctl.rc_resend == NULL) { 7689 bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); 7690 } 7691 } 7692 /* 7693 * We return the amount that changed via sack, this is used by the 7694 * ack-received code to augment what was changed between th_ack <-> 7695 * snd_una. 7696 */ 7697 return (sack_changed); 7698 } 7699 7700 static void 7701 bbr_strike_dupack(struct tcp_bbr *bbr) 7702 { 7703 struct bbr_sendmap *rsm; 7704 7705 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); 7706 if (rsm && (rsm->r_dupack < 0xff)) { 7707 rsm->r_dupack++; 7708 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) 7709 bbr->r_wanted_output = 1; 7710 } 7711 } 7712 7713 /* 7714 * Return value of 1, we do not need to call bbr_process_data(). 7715 * return value of 0, bbr_process_data can be called. 7716 * For ret_val if its 0 the TCB is locked and valid, if its non-zero 7717 * its unlocked and probably unsafe to touch the TCB. 
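* (Roughly, a return of 1 means the ACK path fully disposed of the input, e.g. the segment was dropped or the connection torn down, while 0 lets the caller go on to any data processing.)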
7718 */ 7719 static int 7720 bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 7721 struct tcpcb *tp, struct tcpopt *to, 7722 uint32_t tiwin, int32_t tlen, 7723 int32_t * ofia, int32_t thflags, int32_t * ret_val) 7724 { 7725 int32_t ourfinisacked = 0; 7726 int32_t acked_amount; 7727 uint16_t nsegs; 7728 int32_t acked; 7729 uint32_t lost, sack_changed = 0; 7730 struct mbuf *mfree; 7731 struct tcp_bbr *bbr; 7732 uint32_t prev_acked = 0; 7733 7734 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 7735 lost = bbr->r_ctl.rc_lost; 7736 nsegs = max(1, m->m_pkthdr.lro_nsegs); 7737 if (SEQ_GT(th->th_ack, tp->snd_max)) { 7738 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 7739 bbr->r_wanted_output = 1; 7740 return (1); 7741 } 7742 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 7743 /* Process the ack */ 7744 if (bbr->rc_in_persist) 7745 tp->t_rxtshift = 0; 7746 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 7747 bbr_strike_dupack(bbr); 7748 sack_changed = bbr_log_ack(tp, to, th, &prev_acked); 7749 } 7750 bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost)); 7751 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 7752 /* 7753 * Old ack, behind the last one rcv'd or a duplicate ack 7754 * with SACK info. 7755 */ 7756 if (th->th_ack == tp->snd_una) { 7757 bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0); 7758 if (bbr->r_state == TCPS_SYN_SENT) { 7759 /* 7760 * Special case on where we sent SYN. When 7761 * the SYN-ACK is processed in syn_sent 7762 * state it bumps the snd_una. This causes 7763 * us to hit here even though we did ack 1 7764 * byte. 7765 * 7766 * Go through the nothing left case so we 7767 * send data. 7768 */ 7769 goto nothing_left; 7770 } 7771 } 7772 return (0); 7773 } 7774 /* 7775 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 7776 * something we sent. 7777 */ 7778 if (tp->t_flags & TF_NEEDSYN) { 7779 /* 7780 * T/TCP: Connection was half-synchronized, and our SYN has 7781 * been ACK'd (so connection is now fully synchronized). Go 7782 * to non-starred state, increment snd_una for ACK of SYN, 7783 * and check if we can do window scaling. 7784 */ 7785 tp->t_flags &= ~TF_NEEDSYN; 7786 tp->snd_una++; 7787 /* Do window scaling? */ 7788 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 7789 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 7790 tp->rcv_scale = tp->request_r_scale; 7791 /* Send window already scaled. */ 7792 } 7793 } 7794 INP_WLOCK_ASSERT(tp->t_inpcb); 7795 7796 acked = BYTES_THIS_ACK(tp, th); 7797 KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); 7798 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 7799 7800 /* 7801 * If we just performed our first retransmit, and the ACK arrives 7802 * within our recovery window, then it was a mistake to do the 7803 * retransmit in the first place. Recover our original cwnd and 7804 * ssthresh, and proceed to transmit where we left off. 7805 */ 7806 if (tp->t_flags & TF_PREVVALID) { 7807 tp->t_flags &= ~TF_PREVVALID; 7808 if (tp->t_rxtshift == 1 && 7809 (int)(ticks - tp->t_badrxtwin) < 0) 7810 bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); 7811 } 7812 SOCKBUF_LOCK(&so->so_snd); 7813 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 7814 tp->snd_wnd -= acked_amount; 7815 mfree = sbcut_locked(&so->so_snd, acked_amount); 7816 /* NB: sowwakeup_locked() does an implicit unlock. 
*/ 7817 sowwakeup_locked(so); 7818 m_freem(mfree); 7819 if (SEQ_GT(th->th_ack, tp->snd_una)) { 7820 bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); 7821 } 7822 tp->snd_una = th->th_ack; 7823 bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost)); 7824 if (IN_RECOVERY(tp->t_flags)) { 7825 if (SEQ_LT(th->th_ack, tp->snd_recover) && 7826 (SEQ_LT(th->th_ack, tp->snd_max))) { 7827 tcp_bbr_partialack(tp); 7828 } else { 7829 bbr_post_recovery(tp); 7830 } 7831 } 7832 if (SEQ_GT(tp->snd_una, tp->snd_recover)) { 7833 tp->snd_recover = tp->snd_una; 7834 } 7835 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 7836 tp->snd_nxt = tp->snd_max; 7837 } 7838 if (tp->snd_una == tp->snd_max) { 7839 /* Nothing left outstanding */ 7840 nothing_left: 7841 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); 7842 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 7843 bbr->rc_tp->t_acktime = 0; 7844 if ((sbused(&so->so_snd) == 0) && 7845 (tp->t_flags & TF_SENTFIN)) { 7846 ourfinisacked = 1; 7847 } 7848 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); 7849 if (bbr->rc_in_persist == 0) { 7850 bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; 7851 } 7852 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); 7853 bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); 7854 /* 7855 * We invalidate the last ack here since we 7856 * don't want to transfer forward the time 7857 * for our sum's calculations. 7858 */ 7859 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 7860 (sbavail(&so->so_snd) == 0) && 7861 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 7862 /* 7863 * The socket was gone and the peer sent data, time 7864 * to reset him. 7865 */ 7866 *ret_val = 1; 7867 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 7868 /* tcp_close will kill the inp pre-log the Reset */ 7869 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 7870 tp = tcp_close(tp); 7871 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 7872 BBR_STAT_INC(bbr_dropped_af_data); 7873 return (1); 7874 } 7875 /* Set need output so persist might get set */ 7876 bbr->r_wanted_output = 1; 7877 } 7878 if (ofia) 7879 *ofia = ourfinisacked; 7880 return (0); 7881 } 7882 7883 static void 7884 bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) 7885 { 7886 if (bbr->rc_in_persist == 0) { 7887 bbr_timer_cancel(bbr, __LINE__, cts); 7888 bbr->r_ctl.rc_last_delay_val = 0; 7889 tp->t_rxtshift = 0; 7890 bbr->rc_in_persist = 1; 7891 bbr->r_ctl.rc_went_idle_time = cts; 7892 /* We should be capped when rw went to 0 but just in case */ 7893 bbr_log_type_pesist(bbr, cts, 0, line, 1); 7894 /* Time freezes for the state, so do the accounting now */ 7895 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { 7896 uint32_t time_in; 7897 7898 time_in = cts - bbr->r_ctl.rc_bbr_state_time; 7899 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { 7900 int32_t idx; 7901 7902 idx = bbr_state_val(bbr); 7903 counter_u64_add(bbr_state_time[(idx + 5)], time_in); 7904 } else { 7905 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); 7906 } 7907 } 7908 bbr->r_ctl.rc_bbr_state_time = cts; 7909 } 7910 } 7911 7912 static void 7913 bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time) 7914 { 7915 /* 7916 * Note that if idle time does not exceed our 7917 * threshold, we do nothing continuing the state 7918 * transitions we were last walking through. 
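* Past the threshold we either ramp back up through IDLE_EXIT using the startup gains (when idle restart is enabled) or, in probe_bw, simply pick a fresh substate.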
7919 */ 7920 if (idle_time >= bbr_idle_restart_threshold) { 7921 if (bbr->rc_use_idle_restart) { 7922 bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT; 7923 /* 7924 * Set our target using BBR_UNIT, so 7925 * we increase at a dramatic rate but 7926 * we stop when we get the pipe 7927 * full again for our current b/w estimate. 7928 */ 7929 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; 7930 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; 7931 bbr_set_state_target(bbr, __LINE__); 7932 /* Now setup our gains to ramp up */ 7933 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; 7934 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; 7935 bbr_log_type_statechange(bbr, cts, __LINE__); 7936 } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { 7937 bbr_substate_change(bbr, cts, __LINE__, 1); 7938 } 7939 } 7940 } 7941 7942 static void 7943 bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) 7944 { 7945 uint32_t idle_time; 7946 7947 if (bbr->rc_in_persist == 0) 7948 return; 7949 idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time); 7950 bbr->rc_in_persist = 0; 7951 bbr->rc_hit_state_1 = 0; 7952 bbr->r_ctl.rc_del_time = cts; 7953 /* 7954 * We invalidate the last ack here since we 7955 * don't want to transfer forward the time 7956 * for our sum's calculations. 7957 */ 7958 if (tcp_in_hpts(bbr->rc_inp)) { 7959 tcp_hpts_remove(bbr->rc_inp); 7960 bbr->rc_timer_first = 0; 7961 bbr->r_ctl.rc_hpts_flags = 0; 7962 bbr->r_ctl.rc_last_delay_val = 0; 7963 bbr->r_ctl.rc_hptsi_agg_delay = 0; 7964 bbr->r_agg_early_set = 0; 7965 bbr->r_ctl.rc_agg_early = 0; 7966 } 7967 bbr_log_type_pesist(bbr, cts, idle_time, line, 0); 7968 if (idle_time >= bbr_rtt_probe_time) { 7969 /* 7970 * This qualifies as a RTT_PROBE session since we drop the 7971 * data outstanding to nothing and waited more than 7972 * bbr_rtt_probe_time. 7973 */ 7974 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0); 7975 bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts; 7976 } 7977 tp->t_rxtshift = 0; 7978 /* 7979 * If in probeBW and we have persisted more than an RTT lets do 7980 * special handling. 7981 */ 7982 /* Force a time based epoch */ 7983 bbr_set_epoch(bbr, cts, __LINE__); 7984 /* 7985 * Setup the lost so we don't count anything against the guy 7986 * we have been stuck with during persists. 7987 */ 7988 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 7989 /* Time un-freezes for the state */ 7990 bbr->r_ctl.rc_bbr_state_time = cts; 7991 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) || 7992 (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) { 7993 /* 7994 * If we are going back to probe-bw 7995 * or probe_rtt, we may need to possibly 7996 * do a fast restart. 7997 */ 7998 bbr_restart_after_idle(bbr, cts, idle_time); 7999 } 8000 } 8001 8002 static void 8003 bbr_collapsed_window(struct tcp_bbr *bbr) 8004 { 8005 /* 8006 * Now we must walk the 8007 * send map and divide the 8008 * ones left stranded. These 8009 * guys can't cause us to abort 8010 * the connection and are really 8011 * "unsent". However if a buggy 8012 * client actually did keep some 8013 * of the data i.e. collapsed the win 8014 * and refused to ack and then opened 8015 * the win and acked that data. We would 8016 * get into an ack war, the simplier 8017 * method then of just pretending we 8018 * did not send those segments something 8019 * won't work. 
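* So instead we mark everything from the new window edge onward BBR_RWND_COLLAPSED, leaving it in the map so it is effectively treated as unsent until the peer reopens the window.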
8020 */ 8021 struct bbr_sendmap *rsm, *nrsm; 8022 tcp_seq max_seq; 8023 uint32_t maxseg; 8024 int can_split = 0; 8025 int fnd = 0; 8026 8027 maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; 8028 max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd; 8029 bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0); 8030 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { 8031 /* Find the first seq past or at maxseq */ 8032 if (rsm->r_flags & BBR_RWND_COLLAPSED) 8033 rsm->r_flags &= ~BBR_RWND_COLLAPSED; 8034 if (SEQ_GEQ(max_seq, rsm->r_start) && 8035 SEQ_GEQ(rsm->r_end, max_seq)) { 8036 fnd = 1; 8037 break; 8038 } 8039 } 8040 bbr->rc_has_collapsed = 0; 8041 if (!fnd) { 8042 /* Nothing to do strange */ 8043 return; 8044 } 8045 /* 8046 * Now can we split? 8047 * 8048 * We don't want to split if splitting 8049 * would generate too many small segments 8050 * less we let an attacker fragment our 8051 * send_map and leave us out of memory. 8052 */ 8053 if ((max_seq != rsm->r_start) && 8054 (max_seq != rsm->r_end)){ 8055 /* can we split? */ 8056 int res1, res2; 8057 8058 res1 = max_seq - rsm->r_start; 8059 res2 = rsm->r_end - max_seq; 8060 if ((res1 >= (maxseg/8)) && 8061 (res2 >= (maxseg/8))) { 8062 /* No small pieces here */ 8063 can_split = 1; 8064 } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) { 8065 /* We are under the limit */ 8066 can_split = 1; 8067 } 8068 } 8069 /* Ok do we need to split this rsm? */ 8070 if (max_seq == rsm->r_start) { 8071 /* It's this guy no split required */ 8072 nrsm = rsm; 8073 } else if (max_seq == rsm->r_end) { 8074 /* It's the next one no split required. */ 8075 nrsm = TAILQ_NEXT(rsm, r_next); 8076 if (nrsm == NULL) { 8077 /* Huh? */ 8078 return; 8079 } 8080 } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) { 8081 /* yep we need to split it */ 8082 nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); 8083 if (nrsm == NULL) { 8084 /* failed XXXrrs what can we do mark the whole? */ 8085 nrsm = rsm; 8086 goto no_split; 8087 } 8088 /* Clone it */ 8089 bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0); 8090 bbr_clone_rsm(bbr, nrsm, rsm, max_seq); 8091 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); 8092 if (rsm->r_in_tmap) { 8093 TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8094 nrsm->r_in_tmap = 1; 8095 } 8096 } else { 8097 /* 8098 * Split not allowed just start here just 8099 * use this guy. 8100 */ 8101 nrsm = rsm; 8102 } 8103 no_split: 8104 BBR_STAT_INC(bbr_collapsed_win); 8105 /* reuse fnd as a count */ 8106 fnd = 0; 8107 TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) { 8108 nrsm->r_flags |= BBR_RWND_COLLAPSED; 8109 fnd++; 8110 bbr->rc_has_collapsed = 1; 8111 } 8112 bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd); 8113 } 8114 8115 static void 8116 bbr_un_collapse_window(struct tcp_bbr *bbr) 8117 { 8118 struct bbr_sendmap *rsm; 8119 int cleared = 0; 8120 8121 TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { 8122 if (rsm->r_flags & BBR_RWND_COLLAPSED) { 8123 /* Clear the flag */ 8124 rsm->r_flags &= ~BBR_RWND_COLLAPSED; 8125 cleared++; 8126 } else 8127 break; 8128 } 8129 bbr_log_type_rwnd_collapse(bbr, 8130 (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared); 8131 bbr->rc_has_collapsed = 0; 8132 } 8133 8134 /* 8135 * Return value of 1, the TCB is unlocked and most 8136 * likely gone, return value of 0, the TCB is still 8137 * locked. 
8138 */ 8139 static int 8140 bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8141 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8142 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8143 { 8144 /* 8145 * Update window information. Don't look at window if no ACK: TAC's 8146 * send garbage on first SYN. 8147 */ 8148 uint16_t nsegs; 8149 int32_t tfo_syn; 8150 struct tcp_bbr *bbr; 8151 8152 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 8153 INP_WLOCK_ASSERT(tp->t_inpcb); 8154 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8155 if ((thflags & TH_ACK) && 8156 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8157 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8158 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8159 /* keep track of pure window updates */ 8160 if (tlen == 0 && 8161 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8162 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8163 tp->snd_wnd = tiwin; 8164 tp->snd_wl1 = th->th_seq; 8165 tp->snd_wl2 = th->th_ack; 8166 if (tp->snd_wnd > tp->max_sndwnd) 8167 tp->max_sndwnd = tp->snd_wnd; 8168 bbr->r_wanted_output = 1; 8169 } else if (thflags & TH_ACK) { 8170 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8171 tp->snd_wnd = tiwin; 8172 tp->snd_wl1 = th->th_seq; 8173 tp->snd_wl2 = th->th_ack; 8174 } 8175 } 8176 if (tp->snd_wnd < ctf_outstanding(tp)) 8177 /* The peer collapsed its window on us */ 8178 bbr_collapsed_window(bbr); 8179 else if (bbr->rc_has_collapsed) 8180 bbr_un_collapse_window(bbr); 8181 /* Was persist timer active and now we have window space? */ 8182 if ((bbr->rc_in_persist != 0) && 8183 (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), 8184 bbr_minseg(bbr)))) { 8185 /* 8186 * Make the rate persist at end of persist mode if idle long 8187 * enough 8188 */ 8189 bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); 8190 8191 /* Make sure we output to start the timer */ 8192 bbr->r_wanted_output = 1; 8193 } 8194 /* Do we need to enter persist? */ 8195 if ((bbr->rc_in_persist == 0) && 8196 (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && 8197 TCPS_HAVEESTABLISHED(tp->t_state) && 8198 (tp->snd_max == tp->snd_una) && 8199 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8200 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8201 /* No send window.. we must enter persist */ 8202 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); 8203 } 8204 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8205 m_freem(m); 8206 return (0); 8207 } 8208 /* 8209 * We don't support urgent data but 8210 * drag along the up just to make sure 8211 * if there is a stack switch no one 8212 * is surprised. 8213 */ 8214 tp->rcv_up = tp->rcv_nxt; 8215 INP_WLOCK_ASSERT(tp->t_inpcb); 8216 8217 /* 8218 * Process the segment text, merging it into the TCP sequencing 8219 * queue, and arranging for acknowledgment of receipt if necessary. 8220 * This process logically involves adjusting tp->rcv_wnd as data is 8221 * presented to the user (this happens in tcp_usrreq.c, case 8222 * PRU_RCVD). If a FIN has already been received on this connection 8223 * then we just ignore the text. 
8224 */ 8225 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8226 IS_FASTOPEN(tp->t_flags)); 8227 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8228 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8229 tcp_seq save_start = th->th_seq; 8230 tcp_seq save_rnxt = tp->rcv_nxt; 8231 int save_tlen = tlen; 8232 8233 m_adj(m, drop_hdrlen); /* delayed header drop */ 8234 /* 8235 * Insert segment which includes th into TCP reassembly 8236 * queue with control block tp. Set thflags to whether 8237 * reassembly now includes a segment with FIN. This handles 8238 * the common case inline (segment is the next to be 8239 * received on an established connection, and the queue is 8240 * empty), avoiding linkage into and removal from the queue 8241 * and repetition of various conversions. Set DELACK for 8242 * segments received in order, but ack immediately when 8243 * segments are out of order (so fast retransmit can work). 8244 */ 8245 if (th->th_seq == tp->rcv_nxt && 8246 SEGQ_EMPTY(tp) && 8247 (TCPS_HAVEESTABLISHED(tp->t_state) || 8248 tfo_syn)) { 8249 #ifdef NETFLIX_SB_LIMITS 8250 u_int mcnt, appended; 8251 8252 if (so->so_rcv.sb_shlim) { 8253 mcnt = m_memcnt(m); 8254 appended = 0; 8255 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8256 CFO_NOSLEEP, NULL) == false) { 8257 counter_u64_add(tcp_sb_shlim_fails, 1); 8258 m_freem(m); 8259 return (0); 8260 } 8261 } 8262 8263 #endif 8264 if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) { 8265 bbr->bbr_segs_rcvd += max(1, nsegs); 8266 tp->t_flags |= TF_DELACK; 8267 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); 8268 } else { 8269 bbr->r_wanted_output = 1; 8270 tp->t_flags |= TF_ACKNOW; 8271 } 8272 tp->rcv_nxt += tlen; 8273 if (tlen && 8274 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8275 (tp->t_fbyte_in == 0)) { 8276 tp->t_fbyte_in = ticks; 8277 if (tp->t_fbyte_in == 0) 8278 tp->t_fbyte_in = 1; 8279 if (tp->t_fbyte_out && tp->t_fbyte_in) 8280 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8281 } 8282 thflags = tcp_get_flags(th) & TH_FIN; 8283 KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); 8284 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8285 SOCKBUF_LOCK(&so->so_rcv); 8286 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 8287 m_freem(m); 8288 else 8289 #ifdef NETFLIX_SB_LIMITS 8290 appended = 8291 #endif 8292 sbappendstream_locked(&so->so_rcv, m, 0); 8293 /* NB: sorwakeup_locked() does an implicit unlock. */ 8294 sorwakeup_locked(so); 8295 #ifdef NETFLIX_SB_LIMITS 8296 if (so->so_rcv.sb_shlim && appended != mcnt) 8297 counter_fo_release(so->so_rcv.sb_shlim, 8298 mcnt - appended); 8299 #endif 8300 8301 } else { 8302 /* 8303 * XXX: Due to the header drop above "th" is 8304 * theoretically invalid by now. Fortunately 8305 * m_adj() doesn't actually frees any mbufs when 8306 * trimming from the head. 8307 */ 8308 tcp_seq temp = save_start; 8309 8310 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8311 tp->t_flags |= TF_ACKNOW; 8312 if (tp->t_flags & TF_WAKESOR) { 8313 tp->t_flags &= ~TF_WAKESOR; 8314 /* NB: sorwakeup_locked() does an implicit unlock. */ 8315 sorwakeup_locked(so); 8316 } 8317 } 8318 if ((tp->t_flags & TF_SACK_PERMIT) && 8319 (save_tlen > 0) && 8320 TCPS_HAVEESTABLISHED(tp->t_state)) { 8321 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8322 /* 8323 * DSACK actually handled in the fastpath 8324 * above. 
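* (A tlen of 0 with save_start before save_rnxt indicates the segment was entirely old, already-received data, hence a duplicate.)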
8325 */ 8326 tcp_update_sack_list(tp, save_start, 8327 save_start + save_tlen); 8328 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8329 if ((tp->rcv_numsacks >= 1) && 8330 (tp->sackblks[0].end == save_start)) { 8331 /* 8332 * Partial overlap, recorded at todrop 8333 * above. 8334 */ 8335 tcp_update_sack_list(tp, 8336 tp->sackblks[0].start, 8337 tp->sackblks[0].end); 8338 } else { 8339 tcp_update_dsack_list(tp, save_start, 8340 save_start + save_tlen); 8341 } 8342 } else if (tlen >= save_tlen) { 8343 /* Update of sackblks. */ 8344 tcp_update_dsack_list(tp, save_start, 8345 save_start + save_tlen); 8346 } else if (tlen > 0) { 8347 tcp_update_dsack_list(tp, save_start, 8348 save_start + tlen); 8349 } 8350 } 8351 } else { 8352 m_freem(m); 8353 thflags &= ~TH_FIN; 8354 } 8355 8356 /* 8357 * If FIN is received ACK the FIN and let the user know that the 8358 * connection is closing. 8359 */ 8360 if (thflags & TH_FIN) { 8361 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8362 /* The socket upcall is handled by socantrcvmore. */ 8363 socantrcvmore(so); 8364 /* 8365 * If connection is half-synchronized (ie NEEDSYN 8366 * flag on) then delay ACK, so it may be piggybacked 8367 * when SYN is sent. Otherwise, since we received a 8368 * FIN then no more input can be expected, send ACK 8369 * now. 8370 */ 8371 if (tp->t_flags & TF_NEEDSYN) { 8372 tp->t_flags |= TF_DELACK; 8373 bbr_timer_cancel(bbr, 8374 __LINE__, bbr->r_ctl.rc_rcvtime); 8375 } else { 8376 tp->t_flags |= TF_ACKNOW; 8377 } 8378 tp->rcv_nxt++; 8379 } 8380 switch (tp->t_state) { 8381 /* 8382 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8383 * CLOSE_WAIT state. 8384 */ 8385 case TCPS_SYN_RECEIVED: 8386 tp->t_starttime = ticks; 8387 /* FALLTHROUGH */ 8388 case TCPS_ESTABLISHED: 8389 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8390 break; 8391 8392 /* 8393 * If still in FIN_WAIT_1 STATE FIN has not been 8394 * acked so enter the CLOSING state. 8395 */ 8396 case TCPS_FIN_WAIT_1: 8397 tcp_state_change(tp, TCPS_CLOSING); 8398 break; 8399 8400 /* 8401 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8402 * starting the time-wait timer, turning off the 8403 * other standard timers. 8404 */ 8405 case TCPS_FIN_WAIT_2: 8406 bbr->rc_timer_first = 1; 8407 bbr_timer_cancel(bbr, 8408 __LINE__, bbr->r_ctl.rc_rcvtime); 8409 INP_WLOCK_ASSERT(tp->t_inpcb); 8410 tcp_twstart(tp); 8411 return (1); 8412 } 8413 } 8414 /* 8415 * Return any desired output. 8416 */ 8417 if ((tp->t_flags & TF_ACKNOW) || 8418 (sbavail(&so->so_snd) > ctf_outstanding(tp))) { 8419 bbr->r_wanted_output = 1; 8420 } 8421 INP_WLOCK_ASSERT(tp->t_inpcb); 8422 return (0); 8423 } 8424 8425 /* 8426 * Here nothing is really faster, its just that we 8427 * have broken out the fast-data path also just like 8428 * the fast-ack. Return 1 if we processed the packet 8429 * return 0 if you need to take the "slow-path". 8430 */ 8431 static int 8432 bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8433 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8434 uint32_t tiwin, int32_t nxt_pkt) 8435 { 8436 uint16_t nsegs; 8437 int32_t newsize = 0; /* automatic sockbuf scaling */ 8438 struct tcp_bbr *bbr; 8439 #ifdef NETFLIX_SB_LIMITS 8440 u_int mcnt, appended; 8441 #endif 8442 #ifdef TCPDEBUG 8443 /* 8444 * The size of tcp_saveipgen must be the size of the max ip header, 8445 * now IPv6. 
8446 */ 8447 u_char tcp_saveipgen[IP6_HDR_LEN]; 8448 struct tcphdr tcp_savetcp; 8449 short ostate = 0; 8450 8451 #endif 8452 /* On the hpts and we would have called output */ 8453 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 8454 8455 /* 8456 * If last ACK falls within this segment's sequence numbers, record 8457 * the timestamp. NOTE that the test is modified according to the 8458 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 8459 */ 8460 if (bbr->r_ctl.rc_resend != NULL) { 8461 return (0); 8462 } 8463 if (tiwin && tiwin != tp->snd_wnd) { 8464 return (0); 8465 } 8466 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8467 return (0); 8468 } 8469 if (__predict_false((to->to_flags & TOF_TS) && 8470 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8471 return (0); 8472 } 8473 if (__predict_false((th->th_ack != tp->snd_una))) { 8474 return (0); 8475 } 8476 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8477 return (0); 8478 } 8479 if ((to->to_flags & TOF_TS) != 0 && 8480 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8481 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 8482 tp->ts_recent = to->to_tsval; 8483 } 8484 /* 8485 * This is a pure, in-sequence data packet with nothing on the 8486 * reassembly queue and we have enough buffer space to take it. 8487 */ 8488 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8489 8490 #ifdef NETFLIX_SB_LIMITS 8491 if (so->so_rcv.sb_shlim) { 8492 mcnt = m_memcnt(m); 8493 appended = 0; 8494 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8495 CFO_NOSLEEP, NULL) == false) { 8496 counter_u64_add(tcp_sb_shlim_fails, 1); 8497 m_freem(m); 8498 return (1); 8499 } 8500 } 8501 #endif 8502 /* Clean receiver SACK report if present */ 8503 if (tp->rcv_numsacks) 8504 tcp_clean_sackreport(tp); 8505 KMOD_TCPSTAT_INC(tcps_preddat); 8506 tp->rcv_nxt += tlen; 8507 if (tlen && 8508 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8509 (tp->t_fbyte_in == 0)) { 8510 tp->t_fbyte_in = ticks; 8511 if (tp->t_fbyte_in == 0) 8512 tp->t_fbyte_in = 1; 8513 if (tp->t_fbyte_out && tp->t_fbyte_in) 8514 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8515 } 8516 /* 8517 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 8518 */ 8519 tp->snd_wl1 = th->th_seq; 8520 /* 8521 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 8522 */ 8523 tp->rcv_up = tp->rcv_nxt; 8524 KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); 8525 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8526 #ifdef TCPDEBUG 8527 if (so->so_options & SO_DEBUG) 8528 tcp_trace(TA_INPUT, ostate, tp, 8529 (void *)tcp_saveipgen, &tcp_savetcp, 0); 8530 #endif 8531 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 8532 8533 /* Add data to socket buffer. */ 8534 SOCKBUF_LOCK(&so->so_rcv); 8535 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8536 m_freem(m); 8537 } else { 8538 /* 8539 * Set new socket buffer size. Give up when limit is 8540 * reached. 8541 */ 8542 if (newsize) 8543 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 8544 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 8545 m_adj(m, drop_hdrlen); /* delayed header drop */ 8546 8547 #ifdef NETFLIX_SB_LIMITS 8548 appended = 8549 #endif 8550 sbappendstream_locked(&so->so_rcv, m, 0); 8551 ctf_calc_rwin(so, tp); 8552 } 8553 /* NB: sorwakeup_locked() does an implicit unlock. 
*/ 8554 sorwakeup_locked(so); 8555 #ifdef NETFLIX_SB_LIMITS 8556 if (so->so_rcv.sb_shlim && mcnt != appended) 8557 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 8558 #endif 8559 if (DELAY_ACK(tp, bbr, nsegs)) { 8560 bbr->bbr_segs_rcvd += max(1, nsegs); 8561 tp->t_flags |= TF_DELACK; 8562 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); 8563 } else { 8564 bbr->r_wanted_output = 1; 8565 tp->t_flags |= TF_ACKNOW; 8566 } 8567 return (1); 8568 } 8569 8570 /* 8571 * This subfunction is used to try to highly optimize the 8572 * fast path. We again allow window updates that are 8573 * in sequence to remain in the fast-path. We also add 8574 * in the __predict's to attempt to help the compiler. 8575 * Note that if we return a 0, then we can *not* process 8576 * it and the caller should push the packet into the 8577 * slow-path. If we return 1, then all is well and 8578 * the packet is fully processed. 8579 */ 8580 static int 8581 bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8582 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8583 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8584 { 8585 int32_t acked; 8586 uint16_t nsegs; 8587 uint32_t sack_changed; 8588 #ifdef TCPDEBUG 8589 /* 8590 * The size of tcp_saveipgen must be the size of the max ip header, 8591 * now IPv6. 8592 */ 8593 u_char tcp_saveipgen[IP6_HDR_LEN]; 8594 struct tcphdr tcp_savetcp; 8595 short ostate = 0; 8596 8597 #endif 8598 uint32_t prev_acked = 0; 8599 struct tcp_bbr *bbr; 8600 8601 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8602 /* Old ack, behind (or duplicate to) the last one rcv'd */ 8603 return (0); 8604 } 8605 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 8606 /* Above what we have sent? */ 8607 return (0); 8608 } 8609 if (__predict_false(tiwin == 0)) { 8610 /* zero window */ 8611 return (0); 8612 } 8613 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 8614 /* We need a SYN or a FIN, unlikely.. */ 8615 return (0); 8616 } 8617 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 8618 /* Timestamp is behind .. old ack with seq wrap? */ 8619 return (0); 8620 } 8621 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 8622 /* Still recovering */ 8623 return (0); 8624 } 8625 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 8626 if (__predict_false(bbr->r_ctl.rc_resend != NULL)) { 8627 /* We are retransmitting */ 8628 return (0); 8629 } 8630 if (__predict_false(bbr->rc_in_persist != 0)) { 8631 /* In persist mode */ 8632 return (0); 8633 } 8634 if (bbr->r_ctl.rc_sacked) { 8635 /* We have sack holes on our scoreboard */ 8636 return (0); 8637 } 8638 /* Ok if we reach here, we can process a fast-ack */ 8639 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8640 sack_changed = bbr_log_ack(tp, to, th, &prev_acked); 8641 /* 8642 * We never detect loss in fast ack [we can't 8643 * have a sack and can't be in recovery so 8644 * we always pass 0 (nothing detected)]. 8645 */ 8646 bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0); 8647 /* Did the window get updated? */ 8648 if (tiwin != tp->snd_wnd) { 8649 tp->snd_wnd = tiwin; 8650 tp->snd_wl1 = th->th_seq; 8651 if (tp->snd_wnd > tp->max_sndwnd) 8652 tp->max_sndwnd = tp->snd_wnd; 8653 } 8654 /* Do we need to exit persists? */ 8655 if ((bbr->rc_in_persist != 0) && 8656 (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), 8657 bbr_minseg(bbr)))) { 8658 bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); 8659 bbr->r_wanted_output = 1; 8660 } 8661 /* Do we need to enter persists? 
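* We only enter persist when everything sent has been acked, more data is queued than the peer's window allows, and that window is below both half the largest window we have seen and a minimum segment.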
*/ 8662 if ((bbr->rc_in_persist == 0) && 8663 (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && 8664 TCPS_HAVEESTABLISHED(tp->t_state) && 8665 (tp->snd_max == tp->snd_una) && 8666 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8667 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8668 /* No send window.. we must enter persist */ 8669 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); 8670 } 8671 /* 8672 * If last ACK falls within this segment's sequence numbers, record 8673 * the timestamp. NOTE that the test is modified according to the 8674 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 8675 */ 8676 if ((to->to_flags & TOF_TS) != 0 && 8677 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8678 tp->ts_recent_age = bbr->r_ctl.rc_rcvtime; 8679 tp->ts_recent = to->to_tsval; 8680 } 8681 /* 8682 * This is a pure ack for outstanding data. 8683 */ 8684 KMOD_TCPSTAT_INC(tcps_predack); 8685 8686 /* 8687 * "bad retransmit" recovery. 8688 */ 8689 if (tp->t_flags & TF_PREVVALID) { 8690 tp->t_flags &= ~TF_PREVVALID; 8691 if (tp->t_rxtshift == 1 && 8692 (int)(ticks - tp->t_badrxtwin) < 0) 8693 bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); 8694 } 8695 /* 8696 * Recalculate the transmit timer / rtt. 8697 * 8698 * Some boxes send broken timestamp replies during the SYN+ACK 8699 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8700 * and blow up the retransmit timer. 8701 */ 8702 acked = BYTES_THIS_ACK(tp, th); 8703 8704 #ifdef TCP_HHOOK 8705 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 8706 hhook_run_tcp_est_in(tp, th, to); 8707 #endif 8708 8709 KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); 8710 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8711 sbdrop(&so->so_snd, acked); 8712 8713 if (SEQ_GT(th->th_ack, tp->snd_una)) 8714 bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); 8715 tp->snd_una = th->th_ack; 8716 if (tp->snd_wnd < ctf_outstanding(tp)) 8717 /* The peer collapsed its window on us */ 8718 bbr_collapsed_window(bbr); 8719 else if (bbr->rc_has_collapsed) 8720 bbr_un_collapse_window(bbr); 8721 8722 if (SEQ_GT(tp->snd_una, tp->snd_recover)) { 8723 tp->snd_recover = tp->snd_una; 8724 } 8725 bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0); 8726 /* 8727 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 8728 */ 8729 tp->snd_wl2 = th->th_ack; 8730 m_freem(m); 8731 /* 8732 * If all outstanding data are acked, stop retransmit timer, 8733 * otherwise restart timer using current (possibly backed-off) 8734 * value. If process is waiting for space, wakeup/selwakeup/signal. 8735 * If data are ready to send, let tcp_output decide between more 8736 * output or persist. 
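 * In this fast path that means cancelling the hpts timer and clearing the
 * SACK filter once snd_una catches up with snd_max, and simply flagging
 * r_wanted_output when more data is waiting to be sent.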
8737 */ 8738 #ifdef TCPDEBUG 8739 if (so->so_options & SO_DEBUG) 8740 tcp_trace(TA_INPUT, ostate, tp, 8741 (void *)tcp_saveipgen, 8742 &tcp_savetcp, 0); 8743 #endif 8744 /* Wake up the socket if we have room to write more */ 8745 sowwakeup(so); 8746 if (tp->snd_una == tp->snd_max) { 8747 /* Nothing left outstanding */ 8748 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); 8749 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8750 bbr->rc_tp->t_acktime = 0; 8751 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); 8752 if (bbr->rc_in_persist == 0) { 8753 bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; 8754 } 8755 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); 8756 bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); 8757 /* 8758 * We invalidate the last ack here since we 8759 * don't want to transfer forward the time 8760 * for our sum's calculations. 8761 */ 8762 bbr->r_wanted_output = 1; 8763 } 8764 if (sbavail(&so->so_snd)) { 8765 bbr->r_wanted_output = 1; 8766 } 8767 return (1); 8768 } 8769 8770 /* 8771 * Return value of 1, the TCB is unlocked and most 8772 * likely gone, return value of 0, the TCB is still 8773 * locked. 8774 */ 8775 static int 8776 bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 8777 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8778 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 8779 { 8780 int32_t todrop; 8781 int32_t ourfinisacked = 0; 8782 struct tcp_bbr *bbr; 8783 int32_t ret_val = 0; 8784 8785 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 8786 ctf_calc_rwin(so, tp); 8787 /* 8788 * If the state is SYN_SENT: if seg contains an ACK, but not for our 8789 * SYN, drop the input. if seg contains a RST, then drop the 8790 * connection. if seg does not contain SYN, then drop it. Otherwise 8791 * this is an acceptable SYN segment initialize tp->rcv_nxt and 8792 * tp->irs if seg contains ack then advance tp->snd_una. BRR does 8793 * not support ECN so we will not say we are capable. if SYN has 8794 * been acked change to ESTABLISHED else SYN_RCVD state arrange for 8795 * segment to be acked (eventually) continue processing rest of 8796 * data/controls, beginning with URG 8797 */ 8798 if ((thflags & TH_ACK) && 8799 (SEQ_LEQ(th->th_ack, tp->iss) || 8800 SEQ_GT(th->th_ack, tp->snd_max))) { 8801 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 8802 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 8803 return (1); 8804 } 8805 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 8806 TCP_PROBE5(connect__refused, NULL, tp, 8807 mtod(m, const char *), tp, th); 8808 tp = tcp_drop(tp, ECONNREFUSED); 8809 ctf_do_drop(m, tp); 8810 return (1); 8811 } 8812 if (thflags & TH_RST) { 8813 ctf_do_drop(m, tp); 8814 return (1); 8815 } 8816 if (!(thflags & TH_SYN)) { 8817 ctf_do_drop(m, tp); 8818 return (1); 8819 } 8820 tp->irs = th->th_seq; 8821 tcp_rcvseqinit(tp); 8822 if (thflags & TH_ACK) { 8823 int tfo_partial = 0; 8824 8825 KMOD_TCPSTAT_INC(tcps_connects); 8826 soisconnected(so); 8827 #ifdef MAC 8828 mac_socketpeer_set_from_mbuf(m, so); 8829 #endif 8830 /* Do window scaling on this connection? */ 8831 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8832 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8833 tp->rcv_scale = tp->request_r_scale; 8834 } 8835 tp->rcv_adv += min(tp->rcv_wnd, 8836 TCP_MAXWIN << tp->rcv_scale); 8837 /* 8838 * If not all the data that was sent in the TFO SYN 8839 * has been acked, resend the remainder right away. 
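 * We do this by pulling snd_nxt back to th_ack and marking the exchange
 * as a partial TFO ack so the delayed-ACK decision below is bypassed.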
8840 */ 8841 if (IS_FASTOPEN(tp->t_flags) && 8842 (tp->snd_una != tp->snd_max)) { 8843 tp->snd_nxt = th->th_ack; 8844 tfo_partial = 1; 8845 } 8846 /* 8847 * If there's data, delay ACK; if there's also a FIN ACKNOW 8848 * will be turned on later. 8849 */ 8850 if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && !tfo_partial) { 8851 bbr->bbr_segs_rcvd += 1; 8852 tp->t_flags |= TF_DELACK; 8853 bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); 8854 } else { 8855 bbr->r_wanted_output = 1; 8856 tp->t_flags |= TF_ACKNOW; 8857 } 8858 if (SEQ_GT(th->th_ack, tp->iss)) { 8859 /* 8860 * The SYN is acked 8861 * handle it specially. 8862 */ 8863 bbr_log_syn(tp, to); 8864 } 8865 if (SEQ_GT(th->th_ack, tp->snd_una)) { 8866 /* 8867 * We advance snd_una for the 8868 * fast open case. If th_ack is 8869 * acknowledging data beyond 8870 * snd_una we can't just call 8871 * ack-processing since the 8872 * data stream in our send-map 8873 * will start at snd_una + 1 (one 8874 * beyond the SYN). If its just 8875 * equal we don't need to do that 8876 * and there is no send_map. 8877 */ 8878 tp->snd_una++; 8879 } 8880 /* 8881 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 8882 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 8883 */ 8884 tp->t_starttime = ticks; 8885 if (tp->t_flags & TF_NEEDFIN) { 8886 tcp_state_change(tp, TCPS_FIN_WAIT_1); 8887 tp->t_flags &= ~TF_NEEDFIN; 8888 thflags &= ~TH_SYN; 8889 } else { 8890 tcp_state_change(tp, TCPS_ESTABLISHED); 8891 TCP_PROBE5(connect__established, NULL, tp, 8892 mtod(m, const char *), tp, th); 8893 cc_conn_init(tp); 8894 } 8895 } else { 8896 /* 8897 * Received initial SYN in SYN-SENT[*] state => simultaneous 8898 * open. If segment contains CC option and there is a 8899 * cached CC, apply TAO test. If it succeeds, connection is * 8900 * half-synchronized. Otherwise, do 3-way handshake: 8901 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 8902 * there was no CC option, clear cached CC value. 8903 */ 8904 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 8905 tcp_state_change(tp, TCPS_SYN_RECEIVED); 8906 } 8907 INP_WLOCK_ASSERT(tp->t_inpcb); 8908 /* 8909 * Advance th->th_seq to correspond to first data byte. If data, 8910 * trim to stay within window, dropping FIN if necessary. 8911 */ 8912 th->th_seq++; 8913 if (tlen > tp->rcv_wnd) { 8914 todrop = tlen - tp->rcv_wnd; 8915 m_adj(m, -todrop); 8916 tlen = tp->rcv_wnd; 8917 thflags &= ~TH_FIN; 8918 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 8919 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 8920 } 8921 tp->snd_wl1 = th->th_seq - 1; 8922 tp->rcv_up = th->th_seq; 8923 /* 8924 * Client side of transaction: already sent SYN and data. If the 8925 * remote host used T/TCP to validate the SYN, our data will be 8926 * ACK'd; if so, enter normal data segment processing in the middle 8927 * of step 5, ack processing. Otherwise, goto step 6. 
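 * Before the ACK is handed to bbr_process_ack() below, a valid timestamp
 * echo is used to seed both the SRTT machinery and BBR's rttprop filter.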
8928 */ 8929 if (thflags & TH_ACK) { 8930 if ((to->to_flags & TOF_TS) != 0) { 8931 uint32_t t, rtt; 8932 8933 t = tcp_tv_to_mssectick(&bbr->rc_tv); 8934 if (TSTMP_GEQ(t, to->to_tsecr)) { 8935 rtt = t - to->to_tsecr; 8936 if (rtt == 0) { 8937 rtt = 1; 8938 } 8939 rtt *= MS_IN_USEC; 8940 tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); 8941 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, 8942 rtt, bbr->r_ctl.rc_rcvtime); 8943 } 8944 } 8945 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 8946 return (ret_val); 8947 /* We may have changed to FIN_WAIT_1 above */ 8948 if (tp->t_state == TCPS_FIN_WAIT_1) { 8949 /* 8950 * In FIN_WAIT_1 STATE in addition to the processing 8951 * for the ESTABLISHED state if our FIN is now 8952 * acknowledged then enter FIN_WAIT_2. 8953 */ 8954 if (ourfinisacked) { 8955 /* 8956 * If we can't receive any more data, then 8957 * closing user can proceed. Starting the 8958 * timer is contrary to the specification, 8959 * but if we don't get a FIN we'll hang 8960 * forever. 8961 * 8962 * XXXjl: we should release the tp also, and 8963 * use a compressed state. 8964 */ 8965 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8966 soisdisconnected(so); 8967 tcp_timer_activate(tp, TT_2MSL, 8968 (tcp_fast_finwait2_recycle ? 8969 tcp_finwait2_timeout : 8970 TP_MAXIDLE(tp))); 8971 } 8972 tcp_state_change(tp, TCPS_FIN_WAIT_2); 8973 } 8974 } 8975 } 8976 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 8977 tiwin, thflags, nxt_pkt)); 8978 } 8979 8980 /* 8981 * Return value of 1, the TCB is unlocked and most 8982 * likely gone, return value of 0, the TCB is still 8983 * locked. 8984 */ 8985 static int 8986 bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 8987 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8988 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 8989 { 8990 int32_t ourfinisacked = 0; 8991 int32_t ret_val; 8992 struct tcp_bbr *bbr; 8993 8994 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 8995 ctf_calc_rwin(so, tp); 8996 if ((thflags & TH_ACK) && 8997 (SEQ_LEQ(th->th_ack, tp->snd_una) || 8998 SEQ_GT(th->th_ack, tp->snd_max))) { 8999 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9000 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9001 return (1); 9002 } 9003 if (IS_FASTOPEN(tp->t_flags)) { 9004 /* 9005 * When a TFO connection is in SYN_RECEIVED, the only valid 9006 * packets are the initial SYN, a retransmit/copy of the 9007 * initial SYN (possibly with a subset of the original 9008 * data), a valid ACK, a FIN, or a RST. 9009 */ 9010 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9011 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9012 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9013 return (1); 9014 } else if (thflags & TH_SYN) { 9015 /* non-initial SYN is ignored */ 9016 if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9017 (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9018 (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9019 ctf_do_drop(m, NULL); 9020 return (0); 9021 } 9022 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9023 ctf_do_drop(m, NULL); 9024 return (0); 9025 } 9026 } 9027 if ((thflags & TH_RST) || 9028 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9029 return (ctf_process_rst(m, th, so, tp)); 9030 /* 9031 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9032 * it's less than ts_recent, drop it. 
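 * ctf_ts_check() makes the final call; only when it returns zero does the
 * segment continue through the normal checks below.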
9033 */ 9034 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9035 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9036 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9037 return (ret_val); 9038 } 9039 /* 9040 * In the SYN-RECEIVED state, validate that the packet belongs to 9041 * this connection before trimming the data to fit the receive 9042 * window. Check the sequence number versus IRS since we know the 9043 * sequence numbers haven't wrapped. This is a partial fix for the 9044 * "LAND" DoS attack. 9045 */ 9046 if (SEQ_LT(th->th_seq, tp->irs)) { 9047 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9048 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9049 return (1); 9050 } 9051 INP_WLOCK_ASSERT(tp->t_inpcb); 9052 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9053 return (ret_val); 9054 } 9055 /* 9056 * If last ACK falls within this segment's sequence numbers, record 9057 * its timestamp. NOTE: 1) That the test incorporates suggestions 9058 * from the latest proposal of the tcplw@cray.com list (Braden 9059 * 1993/04/26). 2) That updating only on newer timestamps interferes 9060 * with our earlier PAWS tests, so this check should be solely 9061 * predicated on the sequence space of this segment. 3) That we 9062 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9063 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9064 * SEG.Len, This modified check allows us to overcome RFC1323's 9065 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9066 * p.869. In such cases, we can still calculate the RTT correctly 9067 * when RCV.NXT == Last.ACK.Sent. 9068 */ 9069 if ((to->to_flags & TOF_TS) != 0 && 9070 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9071 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9072 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9073 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9074 tp->ts_recent = to->to_tsval; 9075 } 9076 tp->snd_wnd = tiwin; 9077 /* 9078 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9079 * is on (half-synchronized state), then queue data for later 9080 * processing; else drop segment and return. 9081 */ 9082 if ((thflags & TH_ACK) == 0) { 9083 if (IS_FASTOPEN(tp->t_flags)) { 9084 cc_conn_init(tp); 9085 } 9086 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9087 tiwin, thflags, nxt_pkt)); 9088 } 9089 KMOD_TCPSTAT_INC(tcps_connects); 9090 if (tp->t_flags & TF_SONOTCONN) { 9091 tp->t_flags &= ~TF_SONOTCONN; 9092 soisconnected(so); 9093 } 9094 /* Do window scaling? */ 9095 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9096 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9097 tp->rcv_scale = tp->request_r_scale; 9098 } 9099 /* 9100 * ok for the first time in lets see if we can use the ts to figure 9101 * out what the initial RTT was. 
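 * The echoed value is in milliseconds, so it is scaled to microseconds
 * before being fed to the SRTT machinery and the rttprop min filter.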
9102 */ 9103 if ((to->to_flags & TOF_TS) != 0) { 9104 uint32_t t, rtt; 9105 9106 t = tcp_tv_to_mssectick(&bbr->rc_tv); 9107 if (TSTMP_GEQ(t, to->to_tsecr)) { 9108 rtt = t - to->to_tsecr; 9109 if (rtt == 0) { 9110 rtt = 1; 9111 } 9112 rtt *= MS_IN_USEC; 9113 tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); 9114 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime); 9115 } 9116 } 9117 /* Drop off any SYN in the send map (probably not there) */ 9118 if (thflags & TH_ACK) 9119 bbr_log_syn(tp, to); 9120 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9121 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9122 tp->t_tfo_pending = NULL; 9123 } 9124 /* 9125 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9126 * FIN-WAIT-1 9127 */ 9128 tp->t_starttime = ticks; 9129 if (tp->t_flags & TF_NEEDFIN) { 9130 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9131 tp->t_flags &= ~TF_NEEDFIN; 9132 } else { 9133 tcp_state_change(tp, TCPS_ESTABLISHED); 9134 TCP_PROBE5(accept__established, NULL, tp, 9135 mtod(m, const char *), tp, th); 9136 /* 9137 * TFO connections call cc_conn_init() during SYN 9138 * processing. Calling it again here for such connections 9139 * is not harmless as it would undo the snd_cwnd reduction 9140 * that occurs when a TFO SYN|ACK is retransmitted. 9141 */ 9142 if (!IS_FASTOPEN(tp->t_flags)) 9143 cc_conn_init(tp); 9144 } 9145 /* 9146 * Account for the ACK of our SYN prior to 9147 * regular ACK processing below, except for 9148 * simultaneous SYN, which is handled later. 9149 */ 9150 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9151 tp->snd_una++; 9152 /* 9153 * If segment contains data or ACK, will call tcp_reass() later; if 9154 * not, do so now to pass queued data to user. 9155 */ 9156 if (tlen == 0 && (thflags & TH_FIN) == 0) { 9157 (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9158 (struct mbuf *)0); 9159 if (tp->t_flags & TF_WAKESOR) { 9160 tp->t_flags &= ~TF_WAKESOR; 9161 /* NB: sorwakeup_locked() does an implicit unlock. */ 9162 sorwakeup_locked(so); 9163 } 9164 } 9165 tp->snd_wl1 = th->th_seq - 1; 9166 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9167 return (ret_val); 9168 } 9169 if (tp->t_state == TCPS_FIN_WAIT_1) { 9170 /* We could have went to FIN_WAIT_1 (or EST) above */ 9171 /* 9172 * In FIN_WAIT_1 STATE in addition to the processing for the 9173 * ESTABLISHED state if our FIN is now acknowledged then 9174 * enter FIN_WAIT_2. 9175 */ 9176 if (ourfinisacked) { 9177 /* 9178 * If we can't receive any more data, then closing 9179 * user can proceed. Starting the timer is contrary 9180 * to the specification, but if we don't get a FIN 9181 * we'll hang forever. 9182 * 9183 * XXXjl: we should release the tp also, and use a 9184 * compressed state. 9185 */ 9186 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9187 soisdisconnected(so); 9188 tcp_timer_activate(tp, TT_2MSL, 9189 (tcp_fast_finwait2_recycle ? 9190 tcp_finwait2_timeout : 9191 TP_MAXIDLE(tp))); 9192 } 9193 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9194 } 9195 } 9196 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9197 tiwin, thflags, nxt_pkt)); 9198 } 9199 9200 /* 9201 * Return value of 1, the TCB is unlocked and most 9202 * likely gone, return value of 0, the TCB is still 9203 * locked. 
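 * bbr_do_established() also owns the header-prediction fast paths
 * (bbr_fastack() and bbr_do_fastnewdata()); segments that fail those
 * checks fall through to the slow-path handling shared with the other
 * state handlers.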
9204 */ 9205 static int 9206 bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9207 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9208 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9209 { 9210 struct tcp_bbr *bbr; 9211 int32_t ret_val; 9212 9213 /* 9214 * Header prediction: check for the two common cases of a 9215 * uni-directional data xfer. If the packet has no control flags, 9216 * is in-sequence, the window didn't change and we're not 9217 * retransmitting, it's a candidate. If the length is zero and the 9218 * ack moved forward, we're the sender side of the xfer. Just free 9219 * the data acked & wake any higher level process that was blocked 9220 * waiting for space. If the length is non-zero and the ack didn't 9221 * move, we're the receiver side. If we're getting packets in-order 9222 * (the reassembly queue is empty), add the data to the socket 9223 * buffer and note that we need a delayed ack. Make sure that the 9224 * hidden state-flags are also off. Since we check for 9225 * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN. 9226 */ 9227 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9228 if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) { 9229 /* 9230 * If we have delivered under 4 segments, increase the initial 9231 * window if raised by the peer. We use this to determine 9232 * dynamic and static rwnd's at the end of a connection. 9233 */ 9234 bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd); 9235 } 9236 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9237 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 9238 __predict_true(SEGQ_EMPTY(tp)) && 9239 __predict_true(th->th_seq == tp->rcv_nxt)) { 9240 if (tlen == 0) { 9241 if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9242 tiwin, nxt_pkt, iptos)) { 9243 return (0); 9244 } 9245 } else { 9246 if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9247 tiwin, nxt_pkt)) { 9248 return (0); 9249 } 9250 } 9251 } 9252 ctf_calc_rwin(so, tp); 9253 9254 if ((thflags & TH_RST) || 9255 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9256 return (ctf_process_rst(m, th, so, tp)); 9257 /* 9258 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9259 * synchronized state. 9260 */ 9261 if (thflags & TH_SYN) { 9262 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 9263 return (ret_val); 9264 } 9265 /* 9266 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9267 * it's less than ts_recent, drop it. 9268 */ 9269 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9270 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9271 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9272 return (ret_val); 9273 } 9274 INP_WLOCK_ASSERT(tp->t_inpcb); 9275 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9276 return (ret_val); 9277 } 9278 /* 9279 * If last ACK falls within this segment's sequence numbers, record 9280 * its timestamp. NOTE: 1) That the test incorporates suggestions 9281 * from the latest proposal of the tcplw@cray.com list (Braden 9282 * 1993/04/26). 2) That updating only on newer timestamps interferes 9283 * with our earlier PAWS tests, so this check should be solely 9284 * predicated on the sequence space of this segment. 3) That we 9285 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9286 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9287 * SEG.Len, This modified check allows us to overcome RFC1323's 9288 * limitations as described in Stevens TCP/IP Illustrated Vol.
2 9289 * p.869. In such cases, we can still calculate the RTT correctly 9290 * when RCV.NXT == Last.ACK.Sent. 9291 */ 9292 if ((to->to_flags & TOF_TS) != 0 && 9293 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9294 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9295 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9296 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9297 tp->ts_recent = to->to_tsval; 9298 } 9299 /* 9300 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9301 * is on (half-synchronized state), then queue data for later 9302 * processing; else drop segment and return. 9303 */ 9304 if ((thflags & TH_ACK) == 0) { 9305 if (tp->t_flags & TF_NEEDSYN) { 9306 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9307 tiwin, thflags, nxt_pkt)); 9308 } else if (tp->t_flags & TF_ACKNOW) { 9309 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9310 bbr->r_wanted_output = 1; 9311 return (ret_val); 9312 } else { 9313 ctf_do_drop(m, NULL); 9314 return (0); 9315 } 9316 } 9317 /* 9318 * Ack processing. 9319 */ 9320 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9321 return (ret_val); 9322 } 9323 if (sbavail(&so->so_snd)) { 9324 if (ctf_progress_timeout_check(tp, true)) { 9325 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 9326 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9327 return (1); 9328 } 9329 } 9330 /* State changes only happen in bbr_process_data() */ 9331 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9332 tiwin, thflags, nxt_pkt)); 9333 } 9334 9335 /* 9336 * Return value of 1, the TCB is unlocked and most 9337 * likely gone, return value of 0, the TCB is still 9338 * locked. 9339 */ 9340 static int 9341 bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9342 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9343 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9344 { 9345 struct tcp_bbr *bbr; 9346 int32_t ret_val; 9347 9348 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9349 ctf_calc_rwin(so, tp); 9350 if ((thflags & TH_RST) || 9351 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9352 return (ctf_process_rst(m, th, so, tp)); 9353 /* 9354 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9355 * synchronized state. 9356 */ 9357 if (thflags & TH_SYN) { 9358 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 9359 return (ret_val); 9360 } 9361 /* 9362 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9363 * it's less than ts_recent, drop it. 9364 */ 9365 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9366 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9367 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9368 return (ret_val); 9369 } 9370 INP_WLOCK_ASSERT(tp->t_inpcb); 9371 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9372 return (ret_val); 9373 } 9374 /* 9375 * If last ACK falls within this segment's sequence numbers, record 9376 * its timestamp. NOTE: 1) That the test incorporates suggestions 9377 * from the latest proposal of the tcplw@cray.com list (Braden 9378 * 1993/04/26). 2) That updating only on newer timestamps interferes 9379 * with our earlier PAWS tests, so this check should be solely 9380 * predicated on the sequence space of this segment. 
3) That we 9381 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9382 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9383 * SEG.Len, This modified check allows us to overcome RFC1323's 9384 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9385 * p.869. In such cases, we can still calculate the RTT correctly 9386 * when RCV.NXT == Last.ACK.Sent. 9387 */ 9388 if ((to->to_flags & TOF_TS) != 0 && 9389 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9390 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9391 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9392 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9393 tp->ts_recent = to->to_tsval; 9394 } 9395 /* 9396 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9397 * is on (half-synchronized state), then queue data for later 9398 * processing; else drop segment and return. 9399 */ 9400 if ((thflags & TH_ACK) == 0) { 9401 if (tp->t_flags & TF_NEEDSYN) { 9402 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9403 tiwin, thflags, nxt_pkt)); 9404 } else if (tp->t_flags & TF_ACKNOW) { 9405 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9406 bbr->r_wanted_output = 1; 9407 return (ret_val); 9408 } else { 9409 ctf_do_drop(m, NULL); 9410 return (0); 9411 } 9412 } 9413 /* 9414 * Ack processing. 9415 */ 9416 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9417 return (ret_val); 9418 } 9419 if (sbavail(&so->so_snd)) { 9420 if (ctf_progress_timeout_check(tp, true)) { 9421 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 9422 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9423 return (1); 9424 } 9425 } 9426 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9427 tiwin, thflags, nxt_pkt)); 9428 } 9429 9430 static int 9431 bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr, 9432 struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so) 9433 { 9434 9435 if (bbr->rc_allow_data_af_clo == 0) { 9436 close_now: 9437 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9438 /* tcp_close will kill the inp pre-log the Reset */ 9439 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9440 tp = tcp_close(tp); 9441 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9442 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9443 return (1); 9444 } 9445 if (sbavail(&so->so_snd) == 0) 9446 goto close_now; 9447 /* Ok we allow data that is ignored and a followup reset */ 9448 tp->rcv_nxt = th->th_seq + *tlen; 9449 tp->t_flags2 |= TF2_DROP_AF_DATA; 9450 bbr->r_wanted_output = 1; 9451 *tlen = 0; 9452 return (0); 9453 } 9454 9455 /* 9456 * Return value of 1, the TCB is unlocked and most 9457 * likely gone, return value of 0, the TCB is still 9458 * locked. 9459 */ 9460 static int 9461 bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9462 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9463 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9464 { 9465 int32_t ourfinisacked = 0; 9466 int32_t ret_val; 9467 struct tcp_bbr *bbr; 9468 9469 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9470 ctf_calc_rwin(so, tp); 9471 if ((thflags & TH_RST) || 9472 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9473 return (ctf_process_rst(m, th, so, tp)); 9474 /* 9475 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9476 * synchronized state. 
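 * The reply is generated by ctf_challenge_ack() below rather than by
 * resetting the connection.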
9477 */ 9478 if (thflags & TH_SYN) { 9479 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 9480 return (ret_val); 9481 } 9482 /* 9483 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9484 * it's less than ts_recent, drop it. 9485 */ 9486 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9487 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9488 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9489 return (ret_val); 9490 } 9491 INP_WLOCK_ASSERT(tp->t_inpcb); 9492 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9493 return (ret_val); 9494 } 9495 /* 9496 * If new data are received on a connection after the user processes 9497 * are gone, then RST the other end. 9498 * We call a new function now so we might continue and setup 9499 * to reset at all data being ack'd. 9500 */ 9501 if ((tp->t_flags & TF_CLOSED) && tlen && 9502 bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) 9503 return (1); 9504 /* 9505 * If last ACK falls within this segment's sequence numbers, record 9506 * its timestamp. NOTE: 1) That the test incorporates suggestions 9507 * from the latest proposal of the tcplw@cray.com list (Braden 9508 * 1993/04/26). 2) That updating only on newer timestamps interferes 9509 * with our earlier PAWS tests, so this check should be solely 9510 * predicated on the sequence space of this segment. 3) That we 9511 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9512 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9513 * SEG.Len, This modified check allows us to overcome RFC1323's 9514 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9515 * p.869. In such cases, we can still calculate the RTT correctly 9516 * when RCV.NXT == Last.ACK.Sent. 9517 */ 9518 if ((to->to_flags & TOF_TS) != 0 && 9519 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9520 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9521 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9522 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9523 tp->ts_recent = to->to_tsval; 9524 } 9525 /* 9526 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9527 * is on (half-synchronized state), then queue data for later 9528 * processing; else drop segment and return. 9529 */ 9530 if ((thflags & TH_ACK) == 0) { 9531 if (tp->t_flags & TF_NEEDSYN) { 9532 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9533 tiwin, thflags, nxt_pkt)); 9534 } else if (tp->t_flags & TF_ACKNOW) { 9535 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9536 bbr->r_wanted_output = 1; 9537 return (ret_val); 9538 } else { 9539 ctf_do_drop(m, NULL); 9540 return (0); 9541 } 9542 } 9543 /* 9544 * Ack processing. 9545 */ 9546 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9547 return (ret_val); 9548 } 9549 if (ourfinisacked) { 9550 /* 9551 * If we can't receive any more data, then closing user can 9552 * proceed. Starting the timer is contrary to the 9553 * specification, but if we don't get a FIN we'll hang 9554 * forever. 9555 * 9556 * XXXjl: we should release the tp also, and use a 9557 * compressed state. 9558 */ 9559 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9560 soisdisconnected(so); 9561 tcp_timer_activate(tp, TT_2MSL, 9562 (tcp_fast_finwait2_recycle ? 
9563 tcp_finwait2_timeout : 9564 TP_MAXIDLE(tp))); 9565 } 9566 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9567 } 9568 if (sbavail(&so->so_snd)) { 9569 if (ctf_progress_timeout_check(tp, true)) { 9570 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 9571 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9572 return (1); 9573 } 9574 } 9575 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9576 tiwin, thflags, nxt_pkt)); 9577 } 9578 9579 /* 9580 * Return value of 1, the TCB is unlocked and most 9581 * likely gone, return value of 0, the TCB is still 9582 * locked. 9583 */ 9584 static int 9585 bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 9586 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9587 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9588 { 9589 int32_t ourfinisacked = 0; 9590 int32_t ret_val; 9591 struct tcp_bbr *bbr; 9592 9593 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9594 ctf_calc_rwin(so, tp); 9595 if ((thflags & TH_RST) || 9596 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9597 return (ctf_process_rst(m, th, so, tp)); 9598 /* 9599 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9600 * synchronized state. 9601 */ 9602 if (thflags & TH_SYN) { 9603 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 9604 return (ret_val); 9605 } 9606 /* 9607 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9608 * it's less than ts_recent, drop it. 9609 */ 9610 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9611 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9612 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9613 return (ret_val); 9614 } 9615 INP_WLOCK_ASSERT(tp->t_inpcb); 9616 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9617 return (ret_val); 9618 } 9619 /* 9620 * If new data are received on a connection after the user processes 9621 * are gone, then RST the other end. 9622 * We call a new function now so we might continue and setup 9623 * to reset at all data being ack'd. 9624 */ 9625 if ((tp->t_flags & TF_CLOSED) && tlen && 9626 bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) 9627 return (1); 9628 /* 9629 * If last ACK falls within this segment's sequence numbers, record 9630 * its timestamp. NOTE: 1) That the test incorporates suggestions 9631 * from the latest proposal of the tcplw@cray.com list (Braden 9632 * 1993/04/26). 2) That updating only on newer timestamps interferes 9633 * with our earlier PAWS tests, so this check should be solely 9634 * predicated on the sequence space of this segment. 3) That we 9635 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9636 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9637 * SEG.Len, This modified check allows us to overcome RFC1323's 9638 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9639 * p.869. In such cases, we can still calculate the RTT correctly 9640 * when RCV.NXT == Last.ACK.Sent. 9641 */ 9642 if ((to->to_flags & TOF_TS) != 0 && 9643 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9644 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9645 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9646 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9647 tp->ts_recent = to->to_tsval; 9648 } 9649 /* 9650 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9651 * is on (half-synchronized state), then queue data for later 9652 * processing; else drop segment and return. 
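 * The three arms below hand the segment to bbr_process_data() when
 * TF_NEEDSYN is set, ACK and drop it when an ACK is already owed
 * (TF_ACKNOW), and otherwise drop it silently.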
9653 */ 9654 if ((thflags & TH_ACK) == 0) { 9655 if (tp->t_flags & TF_NEEDSYN) { 9656 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9657 tiwin, thflags, nxt_pkt)); 9658 } else if (tp->t_flags & TF_ACKNOW) { 9659 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9660 bbr->r_wanted_output = 1; 9661 return (ret_val); 9662 } else { 9663 ctf_do_drop(m, NULL); 9664 return (0); 9665 } 9666 } 9667 /* 9668 * Ack processing. 9669 */ 9670 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9671 return (ret_val); 9672 } 9673 if (ourfinisacked) { 9674 tcp_twstart(tp); 9675 m_freem(m); 9676 return (1); 9677 } 9678 if (sbavail(&so->so_snd)) { 9679 if (ctf_progress_timeout_check(tp, true)) { 9680 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 9681 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9682 return (1); 9683 } 9684 } 9685 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9686 tiwin, thflags, nxt_pkt)); 9687 } 9688 9689 /* 9690 * Return value of 1, the TCB is unlocked and most 9691 * likely gone, return value of 0, the TCB is still 9692 * locked. 9693 */ 9694 static int 9695 bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 9696 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9697 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9698 { 9699 int32_t ourfinisacked = 0; 9700 int32_t ret_val; 9701 struct tcp_bbr *bbr; 9702 9703 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9704 ctf_calc_rwin(so, tp); 9705 if ((thflags & TH_RST) || 9706 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9707 return (ctf_process_rst(m, th, so, tp)); 9708 /* 9709 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9710 * synchronized state. 9711 */ 9712 if (thflags & TH_SYN) { 9713 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 9714 return (ret_val); 9715 } 9716 /* 9717 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9718 * it's less than ts_recent, drop it. 9719 */ 9720 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9721 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9722 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9723 return (ret_val); 9724 } 9725 INP_WLOCK_ASSERT(tp->t_inpcb); 9726 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9727 return (ret_val); 9728 } 9729 /* 9730 * If new data are received on a connection after the user processes 9731 * are gone, then RST the other end. 9732 * We call a new function now so we might continue and setup 9733 * to reset at all data being ack'd. 9734 */ 9735 if ((tp->t_flags & TF_CLOSED) && tlen && 9736 bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) 9737 return (1); 9738 /* 9739 * If last ACK falls within this segment's sequence numbers, record 9740 * its timestamp. NOTE: 1) That the test incorporates suggestions 9741 * from the latest proposal of the tcplw@cray.com list (Braden 9742 * 1993/04/26). 2) That updating only on newer timestamps interferes 9743 * with our earlier PAWS tests, so this check should be solely 9744 * predicated on the sequence space of this segment. 3) That we 9745 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9746 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9747 * SEG.Len, This modified check allows us to overcome RFC1323's 9748 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9749 * p.869. In such cases, we can still calculate the RTT correctly 9750 * when RCV.NXT == Last.ACK.Sent. 
9751 */ 9752 if ((to->to_flags & TOF_TS) != 0 && 9753 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9754 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9755 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9756 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9757 tp->ts_recent = to->to_tsval; 9758 } 9759 /* 9760 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9761 * is on (half-synchronized state), then queue data for later 9762 * processing; else drop segment and return. 9763 */ 9764 if ((thflags & TH_ACK) == 0) { 9765 if (tp->t_flags & TF_NEEDSYN) { 9766 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9767 tiwin, thflags, nxt_pkt)); 9768 } else if (tp->t_flags & TF_ACKNOW) { 9769 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9770 bbr->r_wanted_output = 1; 9771 return (ret_val); 9772 } else { 9773 ctf_do_drop(m, NULL); 9774 return (0); 9775 } 9776 } 9777 /* 9778 * case TCPS_LAST_ACK: Ack processing. 9779 */ 9780 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9781 return (ret_val); 9782 } 9783 if (ourfinisacked) { 9784 tp = tcp_close(tp); 9785 ctf_do_drop(m, tp); 9786 return (1); 9787 } 9788 if (sbavail(&so->so_snd)) { 9789 if (ctf_progress_timeout_check(tp, true)) { 9790 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 9791 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9792 return (1); 9793 } 9794 } 9795 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9796 tiwin, thflags, nxt_pkt)); 9797 } 9798 9799 /* 9800 * Return value of 1, the TCB is unlocked and most 9801 * likely gone, return value of 0, the TCB is still 9802 * locked. 9803 */ 9804 static int 9805 bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 9806 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9807 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9808 { 9809 int32_t ourfinisacked = 0; 9810 int32_t ret_val; 9811 struct tcp_bbr *bbr; 9812 9813 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9814 ctf_calc_rwin(so, tp); 9815 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 9816 if ((thflags & TH_RST) || 9817 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9818 return (ctf_process_rst(m, th, so, tp)); 9819 9820 /* 9821 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9822 * synchronized state. 9823 */ 9824 if (thflags & TH_SYN) { 9825 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 9826 return (ret_val); 9827 } 9828 INP_WLOCK_ASSERT(tp->t_inpcb); 9829 /* 9830 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9831 * it's less than ts_recent, drop it. 9832 */ 9833 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9834 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9835 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9836 return (ret_val); 9837 } 9838 INP_WLOCK_ASSERT(tp->t_inpcb); 9839 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9840 return (ret_val); 9841 } 9842 /* 9843 * If new data are received on a connection after the user processes 9844 * are gone, then we may RST the other end depending on the outcome 9845 * of bbr_check_data_after_close. 9846 * We call a new function now so we might continue and setup 9847 * to reset at all data being ack'd. 9848 */ 9849 if ((tp->t_flags & TF_CLOSED) && tlen && 9850 bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) 9851 return (1); 9852 /* 9853 * If last ACK falls within this segment's sequence numbers, record 9854 * its timestamp. 
NOTE: 1) That the test incorporates suggestions 9855 * from the latest proposal of the tcplw@cray.com list (Braden 9856 * 1993/04/26). 2) That updating only on newer timestamps interferes 9857 * with our earlier PAWS tests, so this check should be solely 9858 * predicated on the sequence space of this segment. 3) That we 9859 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9860 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9861 * SEG.Len, This modified check allows us to overcome RFC1323's 9862 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9863 * p.869. In such cases, we can still calculate the RTT correctly 9864 * when RCV.NXT == Last.ACK.Sent. 9865 */ 9866 INP_WLOCK_ASSERT(tp->t_inpcb); 9867 if ((to->to_flags & TOF_TS) != 0 && 9868 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9869 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9870 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9871 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 9872 tp->ts_recent = to->to_tsval; 9873 } 9874 /* 9875 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9876 * is on (half-synchronized state), then queue data for later 9877 * processing; else drop segment and return. 9878 */ 9879 if ((thflags & TH_ACK) == 0) { 9880 if (tp->t_flags & TF_NEEDSYN) { 9881 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9882 tiwin, thflags, nxt_pkt)); 9883 } else if (tp->t_flags & TF_ACKNOW) { 9884 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9885 bbr->r_wanted_output = 1; 9886 return (ret_val); 9887 } else { 9888 ctf_do_drop(m, NULL); 9889 return (0); 9890 } 9891 } 9892 /* 9893 * Ack processing. 9894 */ 9895 INP_WLOCK_ASSERT(tp->t_inpcb); 9896 if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9897 return (ret_val); 9898 } 9899 if (sbavail(&so->so_snd)) { 9900 if (ctf_progress_timeout_check(tp, true)) { 9901 bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); 9902 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9903 return (1); 9904 } 9905 } 9906 INP_WLOCK_ASSERT(tp->t_inpcb); 9907 return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, 9908 tiwin, thflags, nxt_pkt)); 9909 } 9910 9911 static void 9912 bbr_stop_all_timers(struct tcpcb *tp) 9913 { 9914 struct tcp_bbr *bbr; 9915 9916 /* 9917 * Assure no timers are running. 
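 * BBR drives its timers from the hpts wheel, so the stock TCP callouts
 * are suspended here; if the persist timer was already armed, that state
 * is recorded in rc_in_persist before the suspension.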
9918 */ 9919 if (tcp_timer_active(tp, TT_PERSIST)) { 9920 /* We enter in persists, set the flag appropriately */ 9921 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 9922 bbr->rc_in_persist = 1; 9923 } 9924 tcp_timer_suspend(tp, TT_PERSIST); 9925 tcp_timer_suspend(tp, TT_REXMT); 9926 tcp_timer_suspend(tp, TT_KEEP); 9927 tcp_timer_suspend(tp, TT_DELACK); 9928 } 9929 9930 static void 9931 bbr_google_mode_on(struct tcp_bbr *bbr) 9932 { 9933 bbr->rc_use_google = 1; 9934 bbr->rc_no_pacing = 0; 9935 bbr->r_ctl.bbr_google_discount = bbr_google_discount; 9936 bbr->r_use_policer = bbr_policer_detection_enabled; 9937 bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); 9938 bbr->bbr_use_rack_cheat = 0; 9939 bbr->r_ctl.rc_incr_tmrs = 0; 9940 bbr->r_ctl.rc_inc_tcp_oh = 0; 9941 bbr->r_ctl.rc_inc_ip_oh = 0; 9942 bbr->r_ctl.rc_inc_enet_oh = 0; 9943 reset_time(&bbr->r_ctl.rc_delrate, 9944 BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); 9945 reset_time_small(&bbr->r_ctl.rc_rttprop, 9946 (11 * USECS_IN_SECOND)); 9947 tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); 9948 } 9949 9950 static void 9951 bbr_google_mode_off(struct tcp_bbr *bbr) 9952 { 9953 bbr->rc_use_google = 0; 9954 bbr->r_ctl.bbr_google_discount = 0; 9955 bbr->no_pacing_until = bbr_no_pacing_until; 9956 bbr->r_use_policer = 0; 9957 if (bbr->no_pacing_until) 9958 bbr->rc_no_pacing = 1; 9959 else 9960 bbr->rc_no_pacing = 0; 9961 if (bbr_use_rack_resend_cheat) 9962 bbr->bbr_use_rack_cheat = 1; 9963 else 9964 bbr->bbr_use_rack_cheat = 0; 9965 if (bbr_incr_timers) 9966 bbr->r_ctl.rc_incr_tmrs = 1; 9967 else 9968 bbr->r_ctl.rc_incr_tmrs = 0; 9969 if (bbr_include_tcp_oh) 9970 bbr->r_ctl.rc_inc_tcp_oh = 1; 9971 else 9972 bbr->r_ctl.rc_inc_tcp_oh = 0; 9973 if (bbr_include_ip_oh) 9974 bbr->r_ctl.rc_inc_ip_oh = 1; 9975 else 9976 bbr->r_ctl.rc_inc_ip_oh = 0; 9977 if (bbr_include_enet_oh) 9978 bbr->r_ctl.rc_inc_enet_oh = 1; 9979 else 9980 bbr->r_ctl.rc_inc_enet_oh = 0; 9981 bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; 9982 reset_time(&bbr->r_ctl.rc_delrate, 9983 bbr_num_pktepo_for_del_limit); 9984 reset_time_small(&bbr->r_ctl.rc_rttprop, 9985 (bbr_filter_len_sec * USECS_IN_SECOND)); 9986 tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); 9987 } 9988 /* 9989 * Return 0 on success, non-zero on failure 9990 * which indicates the error (usually no memory). 9991 */ 9992 static int 9993 bbr_init(struct tcpcb *tp) 9994 { 9995 struct tcp_bbr *bbr = NULL; 9996 struct inpcb *inp; 9997 uint32_t cts; 9998 9999 tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO)); 10000 if (tp->t_fb_ptr == NULL) { 10001 /* 10002 * We need to allocate memory but cant. The INP and INP_INFO 10003 * locks and they are recursive (happens during setup. 
So a 10004 * scheme to drop the locks fails :( 10005 * 10006 */ 10007 return (ENOMEM); 10008 } 10009 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 10010 bbr->rtt_valid = 0; 10011 inp = tp->t_inpcb; 10012 inp->inp_flags2 |= INP_CANNOT_DO_ECN; 10013 inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 10014 TAILQ_INIT(&bbr->r_ctl.rc_map); 10015 TAILQ_INIT(&bbr->r_ctl.rc_free); 10016 TAILQ_INIT(&bbr->r_ctl.rc_tmap); 10017 bbr->rc_tp = tp; 10018 if (tp->t_inpcb) { 10019 bbr->rc_inp = tp->t_inpcb; 10020 } 10021 cts = tcp_get_usecs(&bbr->rc_tv); 10022 tp->t_acktime = 0; 10023 bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close; 10024 bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade; 10025 bbr->rc_tlp_threshold = bbr_tlp_thresh; 10026 bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh; 10027 bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay; 10028 bbr->r_ctl.rc_min_to = bbr_min_to; 10029 bbr->rc_bbr_state = BBR_STATE_STARTUP; 10030 bbr->r_ctl.bbr_lost_at_state = 0; 10031 bbr->r_ctl.rc_lost_at_startup = 0; 10032 bbr->rc_all_timers_stopped = 0; 10033 bbr->r_ctl.rc_bbr_lastbtlbw = 0; 10034 bbr->r_ctl.rc_pkt_epoch_del = 0; 10035 bbr->r_ctl.rc_pkt_epoch = 0; 10036 bbr->r_ctl.rc_lowest_rtt = 0xffffffff; 10037 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain; 10038 bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; 10039 bbr->r_ctl.rc_went_idle_time = cts; 10040 bbr->rc_pacer_started = cts; 10041 bbr->r_ctl.rc_pkt_epoch_time = cts; 10042 bbr->r_ctl.rc_rcvtime = cts; 10043 bbr->r_ctl.rc_bbr_state_time = cts; 10044 bbr->r_ctl.rc_del_time = cts; 10045 bbr->r_ctl.rc_tlp_rxt_last_time = cts; 10046 bbr->r_ctl.last_in_probertt = cts; 10047 bbr->skip_gain = 0; 10048 bbr->gain_is_limited = 0; 10049 bbr->no_pacing_until = bbr_no_pacing_until; 10050 if (bbr->no_pacing_until) 10051 bbr->rc_no_pacing = 1; 10052 if (bbr_use_google_algo) { 10053 bbr->rc_no_pacing = 0; 10054 bbr->rc_use_google = 1; 10055 bbr->r_ctl.bbr_google_discount = bbr_google_discount; 10056 bbr->r_use_policer = bbr_policer_detection_enabled; 10057 } else { 10058 bbr->rc_use_google = 0; 10059 bbr->r_ctl.bbr_google_discount = 0; 10060 bbr->r_use_policer = 0; 10061 } 10062 if (bbr_ts_limiting) 10063 bbr->rc_use_ts_limit = 1; 10064 else 10065 bbr->rc_use_ts_limit = 0; 10066 if (bbr_ts_can_raise) 10067 bbr->ts_can_raise = 1; 10068 else 10069 bbr->ts_can_raise = 0; 10070 if (V_tcp_delack_enabled == 1) 10071 tp->t_delayed_ack = 2; 10072 else if (V_tcp_delack_enabled == 0) 10073 tp->t_delayed_ack = 0; 10074 else if (V_tcp_delack_enabled < 100) 10075 tp->t_delayed_ack = V_tcp_delack_enabled; 10076 else 10077 tp->t_delayed_ack = 2; 10078 if (bbr->rc_use_google == 0) 10079 bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; 10080 else 10081 bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); 10082 bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms; 10083 bbr->rc_max_rto_sec = bbr_rto_max_sec; 10084 bbr->rc_init_win = bbr_def_init_win; 10085 if (tp->t_flags & TF_REQ_TSTMP) 10086 bbr->rc_last_options = TCP_TS_OVERHEAD; 10087 bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options; 10088 bbr->r_ctl.rc_high_rwnd = tp->snd_wnd; 10089 bbr->r_init_rtt = 1; 10090 10091 counter_u64_add(bbr_flows_nohdwr_pacing, 1); 10092 if (bbr_allow_hdwr_pacing) 10093 bbr->bbr_hdw_pace_ena = 1; 10094 else 10095 bbr->bbr_hdw_pace_ena = 0; 10096 if (bbr_sends_full_iwnd) 10097 bbr->bbr_init_win_cheat = 1; 10098 else 10099 bbr->bbr_init_win_cheat = 0; 10100 bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max; 10101 bbr->r_ctl.rc_drain_pg = bbr_drain_gain; 10102 bbr->r_ctl.rc_startup_pg = bbr_high_gain; 10103 bbr->rc_loss_exit = 
bbr_exit_startup_at_loss; 10104 bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain; 10105 bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second; 10106 bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar; 10107 bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max; 10108 bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor; 10109 bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min; 10110 bbr->r_ctl.bbr_cross_over = bbr_cross_over; 10111 bbr->r_ctl.rc_rtt_shrinks = cts; 10112 if (bbr->rc_use_google) { 10113 setup_time_filter(&bbr->r_ctl.rc_delrate, 10114 FILTER_TYPE_MAX, 10115 BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); 10116 setup_time_filter_small(&bbr->r_ctl.rc_rttprop, 10117 FILTER_TYPE_MIN, (11 * USECS_IN_SECOND)); 10118 } else { 10119 setup_time_filter(&bbr->r_ctl.rc_delrate, 10120 FILTER_TYPE_MAX, 10121 bbr_num_pktepo_for_del_limit); 10122 setup_time_filter_small(&bbr->r_ctl.rc_rttprop, 10123 FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND)); 10124 } 10125 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0); 10126 if (bbr_uses_idle_restart) 10127 bbr->rc_use_idle_restart = 1; 10128 else 10129 bbr->rc_use_idle_restart = 0; 10130 bbr->r_ctl.rc_bbr_cur_del_rate = 0; 10131 bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps; 10132 if (bbr_resends_use_tso) 10133 bbr->rc_resends_use_tso = 1; 10134 #ifdef NETFLIX_PEAKRATE 10135 tp->t_peakrate_thr = tp->t_maxpeakrate; 10136 #endif 10137 if (tp->snd_una != tp->snd_max) { 10138 /* Create a send map for the current outstanding data */ 10139 struct bbr_sendmap *rsm; 10140 10141 rsm = bbr_alloc(bbr); 10142 if (rsm == NULL) { 10143 uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); 10144 tp->t_fb_ptr = NULL; 10145 return (ENOMEM); 10146 } 10147 rsm->r_rtt_not_allowed = 1; 10148 rsm->r_tim_lastsent[0] = cts; 10149 rsm->r_rtr_cnt = 1; 10150 rsm->r_rtr_bytes = 0; 10151 rsm->r_start = tp->snd_una; 10152 rsm->r_end = tp->snd_max; 10153 rsm->r_dupack = 0; 10154 rsm->r_delivered = bbr->r_ctl.rc_delivered; 10155 rsm->r_ts_valid = 0; 10156 rsm->r_del_ack_ts = tp->ts_recent; 10157 rsm->r_del_time = cts; 10158 if (bbr->r_ctl.r_app_limited_until) 10159 rsm->r_app_limited = 1; 10160 else 10161 rsm->r_app_limited = 0; 10162 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); 10163 TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); 10164 rsm->r_in_tmap = 1; 10165 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) 10166 rsm->r_bbr_state = bbr_state_val(bbr); 10167 else 10168 rsm->r_bbr_state = 8; 10169 } 10170 if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0)) 10171 bbr->bbr_use_rack_cheat = 1; 10172 if (bbr_incr_timers && (bbr->rc_use_google == 0)) 10173 bbr->r_ctl.rc_incr_tmrs = 1; 10174 if (bbr_include_tcp_oh && (bbr->rc_use_google == 0)) 10175 bbr->r_ctl.rc_inc_tcp_oh = 1; 10176 if (bbr_include_ip_oh && (bbr->rc_use_google == 0)) 10177 bbr->r_ctl.rc_inc_ip_oh = 1; 10178 if (bbr_include_enet_oh && (bbr->rc_use_google == 0)) 10179 bbr->r_ctl.rc_inc_enet_oh = 1; 10180 10181 bbr_log_type_statechange(bbr, cts, __LINE__); 10182 if (TCPS_HAVEESTABLISHED(tp->t_state) && 10183 (tp->t_srtt)) { 10184 uint32_t rtt; 10185 10186 rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 10187 apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); 10188 } 10189 /* announce the settings and state */ 10190 bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT); 10191 tcp_bbr_tso_size_check(bbr, cts); 10192 /* 10193 * Now call the generic function to start a timer. 
This will place 10194 * the TCB on the hptsi wheel if a timer is needed with appropriate 10195 * flags. 10196 */ 10197 bbr_stop_all_timers(tp); 10198 bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); 10199 return (0); 10200 } 10201 10202 /* 10203 * Return 0 if we can accept the connection. Return 10204 * non-zero if we can't handle the connection. A EAGAIN 10205 * means you need to wait until the connection is up. 10206 * a EADDRNOTAVAIL means we can never handle the connection 10207 * (no SACK). 10208 */ 10209 static int 10210 bbr_handoff_ok(struct tcpcb *tp) 10211 { 10212 if ((tp->t_state == TCPS_CLOSED) || 10213 (tp->t_state == TCPS_LISTEN)) { 10214 /* Sure no problem though it may not stick */ 10215 return (0); 10216 } 10217 if ((tp->t_state == TCPS_SYN_SENT) || 10218 (tp->t_state == TCPS_SYN_RECEIVED)) { 10219 /* 10220 * We really don't know you have to get to ESTAB or beyond 10221 * to tell. 10222 */ 10223 return (EAGAIN); 10224 } 10225 if (tp->t_flags & TF_SENTFIN) 10226 return (EINVAL); 10227 if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) { 10228 return (0); 10229 } 10230 /* 10231 * If we reach here we don't do SACK on this connection so we can 10232 * never do rack. 10233 */ 10234 return (EINVAL); 10235 } 10236 10237 static void 10238 bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10239 { 10240 if (tp->t_fb_ptr) { 10241 uint32_t calc; 10242 struct tcp_bbr *bbr; 10243 struct bbr_sendmap *rsm; 10244 10245 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 10246 if (bbr->r_ctl.crte) 10247 tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); 10248 bbr_log_flowend(bbr); 10249 bbr->rc_tp = NULL; 10250 if (tp->t_inpcb) { 10251 /* Backout any flags2 we applied */ 10252 tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN; 10253 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10254 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10255 } 10256 if (bbr->bbr_hdrw_pacing) 10257 counter_u64_add(bbr_flows_whdwr_pacing, -1); 10258 else 10259 counter_u64_add(bbr_flows_nohdwr_pacing, -1); 10260 if (bbr->r_ctl.crte != NULL) { 10261 tcp_rel_pacing_rate(bbr->r_ctl.crte, tp); 10262 bbr->r_ctl.crte = NULL; 10263 } 10264 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 10265 while (rsm) { 10266 TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); 10267 uma_zfree(bbr_zone, rsm); 10268 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 10269 } 10270 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); 10271 while (rsm) { 10272 TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); 10273 uma_zfree(bbr_zone, rsm); 10274 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); 10275 } 10276 calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd; 10277 if (calc > (bbr->r_ctl.rc_init_rwnd / 10)) 10278 BBR_STAT_INC(bbr_dynamic_rwnd); 10279 else 10280 BBR_STAT_INC(bbr_static_rwnd); 10281 bbr->r_ctl.rc_free_cnt = 0; 10282 uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); 10283 tp->t_fb_ptr = NULL; 10284 } 10285 /* Make sure snd_nxt is correctly set */ 10286 tp->snd_nxt = tp->snd_max; 10287 } 10288 10289 static void 10290 bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win) 10291 { 10292 switch (tp->t_state) { 10293 case TCPS_SYN_SENT: 10294 bbr->r_state = TCPS_SYN_SENT; 10295 bbr->r_substate = bbr_do_syn_sent; 10296 break; 10297 case TCPS_SYN_RECEIVED: 10298 bbr->r_state = TCPS_SYN_RECEIVED; 10299 bbr->r_substate = bbr_do_syn_recv; 10300 break; 10301 case TCPS_ESTABLISHED: 10302 bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd); 10303 bbr->r_state = TCPS_ESTABLISHED; 10304 bbr->r_substate = bbr_do_established; 10305 break; 10306 case TCPS_CLOSE_WAIT: 10307 bbr->r_state = 
TCPS_CLOSE_WAIT; 10308 bbr->r_substate = bbr_do_close_wait; 10309 break; 10310 case TCPS_FIN_WAIT_1: 10311 bbr->r_state = TCPS_FIN_WAIT_1; 10312 bbr->r_substate = bbr_do_fin_wait_1; 10313 break; 10314 case TCPS_CLOSING: 10315 bbr->r_state = TCPS_CLOSING; 10316 bbr->r_substate = bbr_do_closing; 10317 break; 10318 case TCPS_LAST_ACK: 10319 bbr->r_state = TCPS_LAST_ACK; 10320 bbr->r_substate = bbr_do_lastack; 10321 break; 10322 case TCPS_FIN_WAIT_2: 10323 bbr->r_state = TCPS_FIN_WAIT_2; 10324 bbr->r_substate = bbr_do_fin_wait_2; 10325 break; 10326 case TCPS_LISTEN: 10327 case TCPS_CLOSED: 10328 case TCPS_TIME_WAIT: 10329 default: 10330 break; 10331 }; 10332 } 10333 10334 static void 10335 bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog) 10336 { 10337 /* 10338 * Now what state are we going into now? Is there adjustments 10339 * needed? 10340 */ 10341 int32_t old_state; 10342 10343 old_state = bbr_state_val(bbr); 10344 if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) { 10345 /* Save the lowest srtt we saw in our end of the sub-state */ 10346 bbr->rc_hit_state_1 = 0; 10347 if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff) 10348 bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state; 10349 } 10350 bbr->rc_bbr_substate++; 10351 if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) { 10352 /* Cycle back to first state-> gain */ 10353 bbr->rc_bbr_substate = 0; 10354 } 10355 if (bbr_state_val(bbr) == BBR_SUB_GAIN) { 10356 /* 10357 * We enter the gain(5/4) cycle (possibly less if 10358 * shallow buffer detection is enabled) 10359 */ 10360 if (bbr->skip_gain) { 10361 /* 10362 * Hardware pacing has set our rate to 10363 * the max and limited our b/w just 10364 * do level i.e. no gain. 10365 */ 10366 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1]; 10367 } else if (bbr->gain_is_limited && 10368 bbr->bbr_hdrw_pacing && 10369 bbr->r_ctl.crte) { 10370 /* 10371 * We can't gain above the hardware pacing 10372 * rate which is less than our rate + the gain 10373 * calculate the gain needed to reach the hardware 10374 * pacing rate.. 
10375 */ 10376 uint64_t bw, rate, gain_calc; 10377 10378 bw = bbr_get_bw(bbr); 10379 rate = bbr->r_ctl.crte->rate; 10380 if ((rate > bw) && 10381 (((bw * (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) { 10382 gain_calc = (rate * BBR_UNIT) / bw; 10383 if (gain_calc < BBR_UNIT) 10384 gain_calc = BBR_UNIT; 10385 bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc; 10386 } else { 10387 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; 10388 } 10389 } else 10390 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; 10391 if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) { 10392 bbr->r_ctl.rc_bbr_state_atflight = cts; 10393 } else 10394 bbr->r_ctl.rc_bbr_state_atflight = 0; 10395 } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { 10396 bbr->rc_hit_state_1 = 1; 10397 bbr->r_ctl.rc_exta_time_gd = 0; 10398 bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, 10399 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 10400 if (bbr_state_drain_2_tar) { 10401 bbr->r_ctl.rc_bbr_state_atflight = 0; 10402 } else 10403 bbr->r_ctl.rc_bbr_state_atflight = cts; 10404 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN]; 10405 } else { 10406 /* All other cycles hit here 2-7 */ 10407 if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) { 10408 if (bbr_sub_drain_slam_cwnd && 10409 (bbr->rc_use_google == 0) && 10410 (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { 10411 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; 10412 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10413 } 10414 if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP)) 10415 bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) - 10416 bbr_get_rtt(bbr, BBR_RTT_PROP)); 10417 else 10418 bbr->r_ctl.rc_exta_time_gd = 0; 10419 if (bbr->r_ctl.rc_exta_time_gd) { 10420 bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd; 10421 /* Now chop up the time for each state (div by 7) */ 10422 bbr->r_ctl.rc_level_state_extra /= 7; 10423 if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) { 10424 /* Add a randomization */ 10425 bbr_randomize_extra_state_time(bbr); 10426 } 10427 } 10428 } 10429 bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); 10430 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)]; 10431 } 10432 if (bbr->rc_use_google) { 10433 bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); 10434 } 10435 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 10436 bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; 10437 if (dolog) 10438 bbr_log_type_statechange(bbr, cts, line); 10439 10440 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { 10441 uint32_t time_in; 10442 10443 time_in = cts - bbr->r_ctl.rc_bbr_state_time; 10444 if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { 10445 counter_u64_add(bbr_state_time[(old_state + 5)], time_in); 10446 } else { 10447 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); 10448 } 10449 } 10450 bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; 10451 bbr_set_state_target(bbr, __LINE__); 10452 if (bbr_sub_drain_slam_cwnd && 10453 (bbr->rc_use_google == 0) && 10454 (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { 10455 /* Slam down the cwnd */ 10456 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; 10457 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; 10458 if (bbr_sub_drain_app_limit) { 10459 /* Go app limited if we are on a long drain */ 10460 bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + 10461 ctf_flight_size(bbr->rc_tp, 10462 (bbr->r_ctl.rc_sacked + 10463 
bbr->r_ctl.rc_lost_bytes))); 10464 } 10465 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10466 } 10467 if (bbr->rc_lt_use_bw) { 10468 /* In policed mode we clamp pacing_gain to BBR_UNIT */ 10469 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; 10470 } 10471 /* Google changes TSO size every cycle */ 10472 if (bbr->rc_use_google) 10473 tcp_bbr_tso_size_check(bbr, cts); 10474 bbr->r_ctl.gain_epoch = cts; 10475 bbr->r_ctl.rc_bbr_state_time = cts; 10476 bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch; 10477 } 10478 10479 static void 10480 bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) 10481 { 10482 if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) && 10483 (google_allow_early_out == 1) && 10484 (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) { 10485 /* We have reached out target flight size possibly early */ 10486 goto change_state; 10487 } 10488 if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) { 10489 return; 10490 } 10491 if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) { 10492 /* 10493 * Must be a rttProp movement forward before 10494 * we can change states. 10495 */ 10496 return; 10497 } 10498 if (bbr_state_val(bbr) == BBR_SUB_GAIN) { 10499 /* 10500 * The needed time has passed but for 10501 * the gain cycle extra rules apply: 10502 * 1) If we have seen loss, we exit 10503 * 2) If we have not reached the target 10504 * we stay in GAIN (gain-to-target). 10505 */ 10506 if (google_consider_lost && losses) 10507 goto change_state; 10508 if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) { 10509 return; 10510 } 10511 } 10512 change_state: 10513 /* For gain we must reach our target, all others last 1 rttProp */ 10514 bbr_substate_change(bbr, cts, __LINE__, 1); 10515 } 10516 10517 static void 10518 bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) 10519 { 10520 uint32_t flight, bbr_cur_cycle_time; 10521 10522 if (bbr->rc_use_google) { 10523 bbr_set_probebw_google_gains(bbr, cts, losses); 10524 return; 10525 } 10526 if (cts == 0) { 10527 /* 10528 * Never alow cts to be 0 we 10529 * do this so we can judge if 10530 * we have set a timestamp. 10531 */ 10532 cts = 1; 10533 } 10534 if (bbr_state_is_pkt_epoch) 10535 bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); 10536 else 10537 bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP); 10538 10539 if (bbr->r_ctl.rc_bbr_state_atflight == 0) { 10540 if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { 10541 flight = ctf_flight_size(bbr->rc_tp, 10542 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 10543 if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) { 10544 /* Keep it slam down */ 10545 if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) { 10546 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; 10547 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10548 } 10549 if (bbr_sub_drain_app_limit) { 10550 /* Go app limited if we are on a long drain */ 10551 bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight); 10552 } 10553 } 10554 if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) && 10555 (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) || 10556 (flight >= bbr->r_ctl.flightsize_at_drain))) { 10557 /* 10558 * Still here after the same time as 10559 * the gain. We need to drain harder 10560 * for the next srtt. Reduce by a set amount 10561 * the gain drop is capped at DRAIN states 10562 * value (88). 
10563 */ 10564 bbr->r_ctl.flightsize_at_drain = flight; 10565 if (bbr_drain_drop_mul && 10566 bbr_drain_drop_div && 10567 (bbr_drain_drop_mul < bbr_drain_drop_div)) { 10568 /* Use your specific drop value (def 4/5 = 20%) */ 10569 bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul; 10570 bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div; 10571 } else { 10572 /* You get drop of 20% */ 10573 bbr->r_ctl.rc_bbr_hptsi_gain *= 4; 10574 bbr->r_ctl.rc_bbr_hptsi_gain /= 5; 10575 } 10576 if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) { 10577 /* Reduce our gain again to the bottom */ 10578 bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); 10579 } 10580 bbr_log_exit_gain(bbr, cts, 4); 10581 /* 10582 * Extend out so we wait another 10583 * epoch before dropping again. 10584 */ 10585 bbr->r_ctl.gain_epoch = cts; 10586 } 10587 if (flight <= bbr->r_ctl.rc_target_at_state) { 10588 if (bbr_sub_drain_slam_cwnd && 10589 (bbr->rc_use_google == 0) && 10590 (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { 10591 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; 10592 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10593 } 10594 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); 10595 bbr_log_exit_gain(bbr, cts, 3); 10596 } 10597 } else { 10598 /* Its a gain */ 10599 if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) { 10600 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); 10601 goto change_state; 10602 } 10603 if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) || 10604 ((ctf_outstanding(bbr->rc_tp) + bbr->rc_tp->t_maxseg - 1) >= 10605 bbr->rc_tp->snd_wnd)) { 10606 bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); 10607 bbr_log_exit_gain(bbr, cts, 2); 10608 } 10609 } 10610 /** 10611 * We fall through and return always one of two things has 10612 * occurred. 10613 * 1) We are still not at target 10614 * <or> 10615 * 2) We reached the target and set rc_bbr_state_atflight 10616 * which means we no longer hit this block 10617 * next time we are called. 
10618 */ 10619 return; 10620 } 10621 change_state: 10622 if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) 10623 return; 10624 if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) { 10625 /* Less than a full time-period has passed */ 10626 return; 10627 } 10628 if (bbr->r_ctl.rc_level_state_extra && 10629 (bbr_state_val(bbr) > BBR_SUB_DRAIN) && 10630 ((cts - bbr->r_ctl.rc_bbr_state_time) < 10631 (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { 10632 /* Less than a full time-period + extra has passed */ 10633 return; 10634 } 10635 if (bbr_gain_gets_extra_too && 10636 bbr->r_ctl.rc_level_state_extra && 10637 (bbr_state_val(bbr) == BBR_SUB_GAIN) && 10638 ((cts - bbr->r_ctl.rc_bbr_state_time) < 10639 (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { 10640 /* Less than a full time-period + extra has passed */ 10641 return; 10642 } 10643 bbr_substate_change(bbr, cts, __LINE__, 1); 10644 } 10645 10646 static uint32_t 10647 bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain) 10648 { 10649 uint32_t mss, tar; 10650 10651 if (bbr->rc_use_google) { 10652 /* Google just uses the cwnd target */ 10653 tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain); 10654 } else { 10655 mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), 10656 bbr->r_ctl.rc_pace_max_segs); 10657 /* Get the base cwnd with gain rounded to a mss */ 10658 tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr), 10659 gain), mss); 10660 /* Make sure it is within our min */ 10661 if (tar < get_min_cwnd(bbr)) 10662 return (get_min_cwnd(bbr)); 10663 } 10664 return (tar); 10665 } 10666 10667 static void 10668 bbr_set_state_target(struct tcp_bbr *bbr, int line) 10669 { 10670 uint32_t tar, meth; 10671 10672 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && 10673 ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { 10674 /* Special case using old probe-rtt method */ 10675 tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); 10676 meth = 1; 10677 } else { 10678 /* Non-probe-rtt case and reduced probe-rtt */ 10679 if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && 10680 (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) { 10681 /* For gain cycle we use the hptsi gain */ 10682 tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); 10683 meth = 2; 10684 } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) { 10685 /* 10686 * If configured, or for google all other states 10687 * get BBR_UNIT. 10688 */ 10689 tar = bbr_get_a_state_target(bbr, BBR_UNIT); 10690 meth = 3; 10691 } else { 10692 /* 10693 * Or we set a target based on the pacing gain 10694 * for non-google mode and default (non-configured). 10695 * Note we don't set a target goal below drain (192). 
10696 */ 10697 if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN]) { 10698 tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]); 10699 meth = 4; 10700 } else { 10701 tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); 10702 meth = 5; 10703 } 10704 } 10705 } 10706 bbr_log_set_of_state_target(bbr, tar, line, meth); 10707 bbr->r_ctl.rc_target_at_state = tar; 10708 } 10709 10710 static void 10711 bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line) 10712 { 10713 /* Change to probe_rtt */ 10714 uint32_t time_in; 10715 10716 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 10717 bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, 10718 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 10719 bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain 10720 + bbr->r_ctl.rc_delivered); 10721 /* Setup so we force feed the filter */ 10722 if (bbr->rc_use_google || bbr_probertt_sets_rtt) 10723 bbr->rc_prtt_set_ts = 1; 10724 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { 10725 time_in = cts - bbr->r_ctl.rc_bbr_state_time; 10726 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); 10727 } 10728 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0); 10729 bbr->r_ctl.rc_rtt_shrinks = cts; 10730 bbr->r_ctl.last_in_probertt = cts; 10731 bbr->r_ctl.rc_probertt_srttchktim = cts; 10732 bbr->r_ctl.rc_bbr_state_time = cts; 10733 bbr->rc_bbr_state = BBR_STATE_PROBE_RTT; 10734 /* We need to force the filter to update */ 10735 10736 if ((bbr_sub_drain_slam_cwnd) && 10737 bbr->rc_hit_state_1 && 10738 (bbr->rc_use_google == 0) && 10739 (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { 10740 if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd) 10741 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; 10742 } else 10743 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; 10744 /* Update the lost */ 10745 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 10746 if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){ 10747 /* Set to the non-configurable default of 4 (PROBE_RTT_MIN) */ 10748 bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); 10749 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10750 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; 10751 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; 10752 bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6); 10753 bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd; 10754 } else { 10755 /* 10756 * We bring it down slowly by using a hptsi gain that is 10757 * probably 75%. This will slowly float down our outstanding 10758 * without tampering with the cwnd. 
10759 */ 10760 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; 10761 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; 10762 bbr_set_state_target(bbr, __LINE__); 10763 if (bbr_prtt_slam_cwnd && 10764 (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { 10765 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; 10766 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10767 } 10768 } 10769 if (ctf_flight_size(bbr->rc_tp, 10770 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= 10771 bbr->r_ctl.rc_target_at_state) { 10772 /* We are at target */ 10773 bbr->r_ctl.rc_bbr_enters_probertt = cts; 10774 } else { 10775 /* We need to come down to reach target before our time begins */ 10776 bbr->r_ctl.rc_bbr_enters_probertt = 0; 10777 } 10778 bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch; 10779 BBR_STAT_INC(bbr_enter_probertt); 10780 bbr_log_exit_gain(bbr, cts, 0); 10781 bbr_log_type_statechange(bbr, cts, line); 10782 } 10783 10784 static void 10785 bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts) 10786 { 10787 /* 10788 * Sanity check on probe-rtt intervals. 10789 * In crazy situations where we are competing 10790 * against new-reno flows with huge buffers 10791 * our rtt-prop interval could come to dominate 10792 * things if we can't get through a full set 10793 * of cycles, we need to adjust it. 10794 */ 10795 if (bbr_can_adjust_probertt && 10796 (bbr->rc_use_google == 0)) { 10797 uint16_t val = 0; 10798 uint32_t cur_rttp, fval, newval, baseval; 10799 10800 /* Are we to small and go into probe-rtt to often? */ 10801 baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1)); 10802 cur_rttp = roundup(baseval, USECS_IN_SECOND); 10803 fval = bbr_filter_len_sec * USECS_IN_SECOND; 10804 if (bbr_is_ratio == 0) { 10805 if (fval > bbr_rtt_probe_limit) 10806 newval = cur_rttp + (fval - bbr_rtt_probe_limit); 10807 else 10808 newval = cur_rttp; 10809 } else { 10810 int mul; 10811 10812 mul = fval / bbr_rtt_probe_limit; 10813 newval = cur_rttp * mul; 10814 } 10815 if (cur_rttp > bbr->r_ctl.rc_probertt_int) { 10816 bbr->r_ctl.rc_probertt_int = cur_rttp; 10817 reset_time_small(&bbr->r_ctl.rc_rttprop, newval); 10818 val = 1; 10819 } else { 10820 /* 10821 * No adjustments were made 10822 * do we need to shrink it? 10823 */ 10824 if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) { 10825 if (cur_rttp <= bbr_rtt_probe_limit) { 10826 /* 10827 * Things have calmed down lets 10828 * shrink all the way to default 10829 */ 10830 bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; 10831 reset_time_small(&bbr->r_ctl.rc_rttprop, 10832 (bbr_filter_len_sec * USECS_IN_SECOND)); 10833 cur_rttp = bbr_rtt_probe_limit; 10834 newval = (bbr_filter_len_sec * USECS_IN_SECOND); 10835 val = 2; 10836 } else { 10837 /* 10838 * Well does some adjustment make sense? 
10839 */ 10840 if (cur_rttp < bbr->r_ctl.rc_probertt_int) { 10841 /* We can reduce interval time some */ 10842 bbr->r_ctl.rc_probertt_int = cur_rttp; 10843 reset_time_small(&bbr->r_ctl.rc_rttprop, newval); 10844 val = 3; 10845 } 10846 } 10847 } 10848 } 10849 if (val) 10850 bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val); 10851 } 10852 } 10853 10854 static void 10855 bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) 10856 { 10857 /* Exit probe-rtt */ 10858 10859 if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) { 10860 tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; 10861 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 10862 } 10863 bbr_log_exit_gain(bbr, cts, 1); 10864 bbr->rc_hit_state_1 = 0; 10865 bbr->r_ctl.rc_rtt_shrinks = cts; 10866 bbr->r_ctl.last_in_probertt = cts; 10867 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0); 10868 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 10869 bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, 10870 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + 10871 bbr->r_ctl.rc_delivered); 10872 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { 10873 uint32_t time_in; 10874 10875 time_in = cts - bbr->r_ctl.rc_bbr_state_time; 10876 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); 10877 } 10878 if (bbr->rc_filled_pipe) { 10879 /* Switch to probe_bw */ 10880 bbr->rc_bbr_state = BBR_STATE_PROBE_BW; 10881 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); 10882 bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; 10883 bbr_substate_change(bbr, cts, __LINE__, 0); 10884 bbr_log_type_statechange(bbr, cts, __LINE__); 10885 } else { 10886 /* Back to startup */ 10887 bbr->rc_bbr_state = BBR_STATE_STARTUP; 10888 bbr->r_ctl.rc_bbr_state_time = cts; 10889 /* 10890 * We don't want to give a complete free 3 10891 * measurements until we exit, so we use 10892 * the number of pe's we were in probe-rtt 10893 * to add to the startup_epoch. That way 10894 * we will still retain the old state. 
10895 */ 10896 bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt); 10897 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 10898 /* Make sure to use the lower pg when shifting back in */ 10899 if (bbr->r_ctl.rc_lost && 10900 bbr_use_lower_gain_in_startup && 10901 (bbr->rc_use_google == 0)) 10902 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; 10903 else 10904 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; 10905 bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; 10906 /* Probably not needed but set it anyway */ 10907 bbr_set_state_target(bbr, __LINE__); 10908 bbr_log_type_statechange(bbr, cts, __LINE__); 10909 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 10910 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0); 10911 } 10912 bbr_check_probe_rtt_limits(bbr, cts); 10913 } 10914 10915 static int32_t inline 10916 bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts) 10917 { 10918 if ((bbr->rc_past_init_win == 1) && 10919 (bbr->rc_in_persist == 0) && 10920 (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) { 10921 return (1); 10922 } 10923 if (bbr_can_force_probertt && 10924 (bbr->rc_in_persist == 0) && 10925 (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && 10926 ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { 10927 return (1); 10928 } 10929 return (0); 10930 } 10931 10932 static int32_t 10933 bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t pkt_epoch) 10934 { 10935 uint64_t btlbw, gain; 10936 if (pkt_epoch == 0) { 10937 /* 10938 * Need to be on a pkt-epoch to continue. 10939 */ 10940 return (0); 10941 } 10942 btlbw = bbr_get_full_bw(bbr); 10943 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * 10944 (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; 10945 if (btlbw >= gain) { 10946 bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; 10947 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 10948 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); 10949 bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; 10950 } 10951 if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) 10952 return (1); 10953 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 10954 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); 10955 return(0); 10956 } 10957 10958 static int32_t inline 10959 bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch) 10960 { 10961 /* Have we gained 25% in the last 3 packet based epoch's? */ 10962 uint64_t btlbw, gain; 10963 int do_exit; 10964 int delta, rtt_gain; 10965 10966 if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && 10967 (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { 10968 /* 10969 * This qualifies as a RTT_PROBE session since we drop the 10970 * data outstanding to nothing and waited more than 10971 * bbr_rtt_probe_time. 
10972 */ 10973 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); 10974 bbr_set_reduced_rtt(bbr, cts, __LINE__); 10975 } 10976 if (bbr_should_enter_probe_rtt(bbr, cts)) { 10977 bbr_enter_probe_rtt(bbr, cts, __LINE__); 10978 return (0); 10979 } 10980 if (bbr->rc_use_google) 10981 return (bbr_google_startup(bbr, cts, pkt_epoch)); 10982 10983 if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && 10984 (bbr_use_lower_gain_in_startup)) { 10985 /* Drop to a lower gain 1.5 x since we saw loss */ 10986 bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; 10987 } 10988 if (pkt_epoch == 0) { 10989 /* 10990 * Need to be on a pkt-epoch to continue. 10991 */ 10992 return (0); 10993 } 10994 if (bbr_rtt_gain_thresh) { 10995 /* 10996 * Do we allow a flow to stay 10997 * in startup with no loss and no 10998 * gain in rtt over a set threshold? 10999 */ 11000 if (bbr->r_ctl.rc_pkt_epoch_rtt && 11001 bbr->r_ctl.startup_last_srtt && 11002 (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) { 11003 delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt; 11004 rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt; 11005 } else 11006 rtt_gain = 0; 11007 if ((bbr->r_ctl.startup_last_srtt == 0) || 11008 (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt)) 11009 /* First time or new lower value */ 11010 bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt; 11011 11012 if ((bbr->r_ctl.rc_lost == 0) && 11013 (rtt_gain < bbr_rtt_gain_thresh)) { 11014 /* 11015 * No loss, and we are under 11016 * our gain threhold for 11017 * increasing RTT. 11018 */ 11019 if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) 11020 bbr->r_ctl.rc_bbr_last_startup_epoch++; 11021 bbr_log_startup_event(bbr, cts, rtt_gain, 11022 delta, bbr->r_ctl.startup_last_srtt, 10); 11023 return (0); 11024 } 11025 } 11026 if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) && 11027 (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) && 11028 (!IN_RECOVERY(bbr->rc_tp->t_flags))) { 11029 /* 11030 * We only assess if we have a new measurement when 11031 * we have no loss and are not in recovery. 11032 * Drag up by one our last_startup epoch so we will hold 11033 * the number of non-gain we have already accumulated. 
11034 */ 11035 if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) 11036 bbr->r_ctl.rc_bbr_last_startup_epoch++; 11037 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 11038 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9); 11039 return (0); 11040 } 11041 /* Case where we reduced the lost (bad retransmit) */ 11042 if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost) 11043 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 11044 bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count; 11045 btlbw = bbr_get_full_bw(bbr); 11046 if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower) 11047 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * 11048 (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; 11049 else 11050 gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * 11051 (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; 11052 do_exit = 0; 11053 if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw) 11054 bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; 11055 if (btlbw >= gain) { 11056 bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; 11057 /* Update the lost so we won't exit in next set of tests */ 11058 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 11059 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 11060 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); 11061 } 11062 if ((bbr->rc_loss_exit && 11063 (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && 11064 (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) && 11065 ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) { 11066 /* 11067 * If we had no gain, we had loss and that loss was above 11068 * our threshould, the rwnd is not constrained, and we have 11069 * had at least 3 packet epochs exit. Note that this is 11070 * switched off by sysctl. Google does not do this by the 11071 * way. 11072 */ 11073 if ((ctf_flight_size(bbr->rc_tp, 11074 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + 11075 (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) { 11076 do_exit = 1; 11077 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 11078 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4); 11079 } else { 11080 /* Just record an updated loss value */ 11081 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 11082 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 11083 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5); 11084 } 11085 } else 11086 bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; 11087 if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) || 11088 do_exit) { 11089 /* Return 1 to exit the startup state. */ 11090 return (1); 11091 } 11092 /* Stay in startup */ 11093 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 11094 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); 11095 return (0); 11096 } 11097 11098 static void 11099 bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses) 11100 { 11101 /* 11102 * A tick occurred in the rtt epoch do we need to do anything? 11103 */ 11104 #ifdef BBR_INVARIANTS 11105 if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) && 11106 (bbr->rc_bbr_state != BBR_STATE_DRAIN) && 11107 (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) && 11108 (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && 11109 (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) { 11110 /* Debug code? 
*/ 11111 panic("Unknown BBR state %d?\n", bbr->rc_bbr_state); 11112 } 11113 #endif 11114 if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { 11115 /* Do we exit the startup state? */ 11116 if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) { 11117 uint32_t time_in; 11118 11119 bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, 11120 bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6); 11121 bbr->rc_filled_pipe = 1; 11122 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 11123 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { 11124 time_in = cts - bbr->r_ctl.rc_bbr_state_time; 11125 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); 11126 } else 11127 time_in = 0; 11128 if (bbr->rc_no_pacing) 11129 bbr->rc_no_pacing = 0; 11130 bbr->r_ctl.rc_bbr_state_time = cts; 11131 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg; 11132 bbr->rc_bbr_state = BBR_STATE_DRAIN; 11133 bbr_set_state_target(bbr, __LINE__); 11134 if ((bbr->rc_use_google == 0) && 11135 bbr_slam_cwnd_in_main_drain) { 11136 /* Here we don't have to worry about probe-rtt */ 11137 bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; 11138 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; 11139 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 11140 } 11141 bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; 11142 bbr_log_type_statechange(bbr, cts, __LINE__); 11143 if (ctf_flight_size(bbr->rc_tp, 11144 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= 11145 bbr->r_ctl.rc_target_at_state) { 11146 /* 11147 * Switch to probe_bw if we are already 11148 * there 11149 */ 11150 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); 11151 bbr_substate_change(bbr, cts, __LINE__, 0); 11152 bbr->rc_bbr_state = BBR_STATE_PROBE_BW; 11153 bbr_log_type_statechange(bbr, cts, __LINE__); 11154 } 11155 } 11156 } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) { 11157 uint32_t inflight; 11158 struct tcpcb *tp; 11159 11160 tp = bbr->rc_tp; 11161 inflight = ctf_flight_size(tp, 11162 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 11163 if (inflight >= bbr->r_ctl.rc_target_at_state) { 11164 /* We have reached a flight of the cwnd target */ 11165 bbr->rc_bbr_state = BBR_STATE_PROBE_BW; 11166 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; 11167 bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; 11168 bbr_set_state_target(bbr, __LINE__); 11169 /* 11170 * Rig it so we don't do anything crazy and 11171 * start fresh with a new randomization. 11172 */ 11173 bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; 11174 bbr->rc_bbr_substate = BBR_SUB_LEVEL6; 11175 bbr_substate_change(bbr, cts, __LINE__, 1); 11176 } 11177 } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) { 11178 /* Has in-flight reached the bdp (or less)? */ 11179 uint32_t inflight; 11180 struct tcpcb *tp; 11181 11182 tp = bbr->rc_tp; 11183 inflight = ctf_flight_size(tp, 11184 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 11185 if ((bbr->rc_use_google == 0) && 11186 bbr_slam_cwnd_in_main_drain && 11187 (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { 11188 /* 11189 * Here we don't have to worry about probe-rtt 11190 * re-slam it, but keep it slammed down. 
11191 */ 11192 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; 11193 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 11194 } 11195 if (inflight <= bbr->r_ctl.rc_target_at_state) { 11196 /* We have drained */ 11197 bbr->rc_bbr_state = BBR_STATE_PROBE_BW; 11198 bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; 11199 if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { 11200 uint32_t time_in; 11201 11202 time_in = cts - bbr->r_ctl.rc_bbr_state_time; 11203 counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); 11204 } 11205 if ((bbr->rc_use_google == 0) && 11206 bbr_slam_cwnd_in_main_drain && 11207 (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { 11208 /* Restore the cwnd */ 11209 tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; 11210 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 11211 } 11212 /* Setup probe-rtt has being done now RRS-HERE */ 11213 bbr->r_ctl.rc_rtt_shrinks = cts; 11214 bbr->r_ctl.last_in_probertt = cts; 11215 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0); 11216 /* Randomly pick a sub-state */ 11217 bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); 11218 bbr_substate_change(bbr, cts, __LINE__, 0); 11219 bbr_log_type_statechange(bbr, cts, __LINE__); 11220 } 11221 } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) { 11222 uint32_t flight; 11223 11224 flight = ctf_flight_size(bbr->rc_tp, 11225 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 11226 bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered); 11227 if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) && 11228 (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { 11229 /* 11230 * We must keep cwnd at the desired MSS. 11231 */ 11232 bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); 11233 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 11234 } else if ((bbr_prtt_slam_cwnd) && 11235 (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { 11236 /* Re-slam it */ 11237 bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; 11238 bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); 11239 } 11240 if (bbr->r_ctl.rc_bbr_enters_probertt == 0) { 11241 /* Has outstanding reached our target? */ 11242 if (flight <= bbr->r_ctl.rc_target_at_state) { 11243 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0); 11244 bbr->r_ctl.rc_bbr_enters_probertt = cts; 11245 /* If time is exactly 0, be 1usec off */ 11246 if (bbr->r_ctl.rc_bbr_enters_probertt == 0) 11247 bbr->r_ctl.rc_bbr_enters_probertt = 1; 11248 if (bbr->rc_use_google == 0) { 11249 /* 11250 * Restore any lowering that as occurred to 11251 * reach here 11252 */ 11253 if (bbr->r_ctl.bbr_rttprobe_gain_val) 11254 bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; 11255 else 11256 bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; 11257 } 11258 } 11259 if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) && 11260 (bbr->rc_use_google == 0) && 11261 bbr->r_ctl.bbr_rttprobe_gain_val && 11262 (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) || 11263 (flight >= bbr->r_ctl.flightsize_at_drain))) { 11264 /* 11265 * We have doddled with our current hptsi 11266 * gain an srtt and have still not made it 11267 * to target, or we have increased our flight. 
11268 * Lets reduce the gain by xx% 11269 * flooring the reduce at DRAIN (based on 11270 * mul/div) 11271 */ 11272 int red; 11273 11274 bbr->r_ctl.flightsize_at_drain = flight; 11275 bbr->r_ctl.rc_probertt_srttchktim = cts; 11276 red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1); 11277 if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) { 11278 /* Reduce our gain again */ 11279 bbr->r_ctl.rc_bbr_hptsi_gain -= red; 11280 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0); 11281 } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) { 11282 /* one more chance before we give up */ 11283 bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); 11284 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0); 11285 } else { 11286 /* At the very bottom */ 11287 bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1); 11288 } 11289 } 11290 } 11291 if (bbr->r_ctl.rc_bbr_enters_probertt && 11292 (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) && 11293 ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) { 11294 /* Time to exit probe RTT normally */ 11295 bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts); 11296 } 11297 } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { 11298 if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && 11299 (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { 11300 /* 11301 * This qualifies as a RTT_PROBE session since we 11302 * drop the data outstanding to nothing and waited 11303 * more than bbr_rtt_probe_time. 11304 */ 11305 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); 11306 bbr_set_reduced_rtt(bbr, cts, __LINE__); 11307 } 11308 if (bbr_should_enter_probe_rtt(bbr, cts)) { 11309 bbr_enter_probe_rtt(bbr, cts, __LINE__); 11310 } else { 11311 bbr_set_probebw_gains(bbr, cts, losses); 11312 } 11313 } 11314 } 11315 11316 static void 11317 bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses) 11318 { 11319 int32_t epoch = 0; 11320 11321 if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { 11322 bbr_set_epoch(bbr, cts, line); 11323 /* At each epoch doe lt bw sampling */ 11324 epoch = 1; 11325 } 11326 bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses); 11327 } 11328 11329 static int 11330 bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 11331 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 11332 int32_t nxt_pkt, struct timeval *tv) 11333 { 11334 int32_t thflags, retval; 11335 uint32_t cts, lcts; 11336 uint32_t tiwin; 11337 struct tcpopt to; 11338 struct tcp_bbr *bbr; 11339 struct bbr_sendmap *rsm; 11340 struct timeval ltv; 11341 int32_t did_out = 0; 11342 uint16_t nsegs; 11343 int32_t prev_state; 11344 uint32_t lost; 11345 11346 nsegs = max(1, m->m_pkthdr.lro_nsegs); 11347 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 11348 /* add in our stats */ 11349 kern_prefetch(bbr, &prev_state); 11350 prev_state = 0; 11351 thflags = tcp_get_flags(th); 11352 /* 11353 * If this is either a state-changing packet or current state isn't 11354 * established, we require a write lock on tcbinfo. Otherwise, we 11355 * allow the tcbinfo to be in either alocked or unlocked, as the 11356 * caller may have unnecessarily acquired a write lock due to a 11357 * race. 
11358 */ 11359 INP_WLOCK_ASSERT(tp->t_inpcb); 11360 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 11361 __func__)); 11362 11363 tp->t_rcvtime = ticks; 11364 /* 11365 * Unscale the window into a 32-bit value. For the SYN_SENT state 11366 * the scale is zero. 11367 */ 11368 tiwin = th->th_win << tp->snd_scale; 11369 #ifdef STATS 11370 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 11371 #endif 11372 11373 if (m->m_flags & M_TSTMP) { 11374 /* Prefer the hardware timestamp if present */ 11375 struct timespec ts; 11376 11377 mbuf_tstmp2timespec(m, &ts); 11378 bbr->rc_tv.tv_sec = ts.tv_sec; 11379 bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; 11380 bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); 11381 } else if (m->m_flags & M_TSTMP_LRO) { 11382 /* Next the arrival timestamp */ 11383 struct timespec ts; 11384 11385 mbuf_tstmp2timespec(m, &ts); 11386 bbr->rc_tv.tv_sec = ts.tv_sec; 11387 bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; 11388 bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); 11389 } else { 11390 /* 11391 * Ok just get the current time. 11392 */ 11393 bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv); 11394 } 11395 /* 11396 * Parse options on any incoming segment. 11397 */ 11398 tcp_dooptions(&to, (u_char *)(th + 1), 11399 (th->th_off << 2) - sizeof(struct tcphdr), 11400 (thflags & TH_SYN) ? TO_SYN : 0); 11401 11402 /* 11403 * If timestamps were negotiated during SYN/ACK and a 11404 * segment without a timestamp is received, silently drop 11405 * the segment, unless it is a RST segment or missing timestamps are 11406 * tolerated. 11407 * See section 3.2 of RFC 7323. 11408 */ 11409 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 11410 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 11411 retval = 0; 11412 m_freem(m); 11413 goto done_with_input; 11414 } 11415 /* 11416 * If echoed timestamp is later than the current time, fall back to 11417 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 11418 * were used when this connection was established. 11419 */ 11420 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 11421 to.to_tsecr -= tp->ts_offset; 11422 if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv))) 11423 to.to_tsecr = 0; 11424 } 11425 /* 11426 * If its the first time in we need to take care of options and 11427 * verify we can do SACK for rack! 11428 */ 11429 if (bbr->r_state == 0) { 11430 /* 11431 * Process options only when we get SYN/ACK back. The SYN 11432 * case for incoming connections is handled in tcp_syncache. 11433 * According to RFC1323 the window field in a SYN (i.e., a 11434 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 11435 * this is traditional behavior, may need to be cleaned up. 11436 */ 11437 if (bbr->rc_inp == NULL) { 11438 bbr->rc_inp = tp->t_inpcb; 11439 } 11440 /* 11441 * We need to init rc_inp here since its not init'd when 11442 * bbr_init is called 11443 */ 11444 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 11445 if ((to.to_flags & TOF_SCALE) && 11446 (tp->t_flags & TF_REQ_SCALE)) { 11447 tp->t_flags |= TF_RCVD_SCALE; 11448 tp->snd_scale = to.to_wscale; 11449 } else 11450 tp->t_flags &= ~TF_REQ_SCALE; 11451 /* 11452 * Initial send window. It will be updated with the 11453 * next incoming segment to the scaled value. 
11454 */ 11455 tp->snd_wnd = th->th_win; 11456 if ((to.to_flags & TOF_TS) && 11457 (tp->t_flags & TF_REQ_TSTMP)) { 11458 tp->t_flags |= TF_RCVD_TSTMP; 11459 tp->ts_recent = to.to_tsval; 11460 tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); 11461 } else 11462 tp->t_flags &= ~TF_REQ_TSTMP; 11463 if (to.to_flags & TOF_MSS) 11464 tcp_mss(tp, to.to_mss); 11465 if ((tp->t_flags & TF_SACK_PERMIT) && 11466 (to.to_flags & TOF_SACKPERM) == 0) 11467 tp->t_flags &= ~TF_SACK_PERMIT; 11468 if (IS_FASTOPEN(tp->t_flags)) { 11469 if (to.to_flags & TOF_FASTOPEN) { 11470 uint16_t mss; 11471 11472 if (to.to_flags & TOF_MSS) 11473 mss = to.to_mss; 11474 else 11475 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 11476 mss = TCP6_MSS; 11477 else 11478 mss = TCP_MSS; 11479 tcp_fastopen_update_cache(tp, mss, 11480 to.to_tfo_len, to.to_tfo_cookie); 11481 } else 11482 tcp_fastopen_disable_path(tp); 11483 } 11484 } 11485 /* 11486 * At this point we are at the initial call. Here we decide 11487 * if we are doing RACK or not. We do this by seeing if 11488 * TF_SACK_PERMIT is set, if not rack is *not* possible and 11489 * we switch to the default code. 11490 */ 11491 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 11492 /* Bail */ 11493 tcp_switch_back_to_default(tp); 11494 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11495 tlen, iptos); 11496 return (1); 11497 } 11498 /* Set the flag */ 11499 bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11500 tcp_set_hpts(tp->t_inpcb); 11501 sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack); 11502 } 11503 if (thflags & TH_ACK) { 11504 /* Track ack types */ 11505 if (to.to_flags & TOF_SACK) 11506 BBR_STAT_INC(bbr_acks_with_sacks); 11507 else 11508 BBR_STAT_INC(bbr_plain_acks); 11509 } 11510 /* 11511 * This is the one exception case where we set the rack state 11512 * always. All other times (timers etc) we must have a rack-state 11513 * set (so we assure we have done the checks above for SACK). 11514 */ 11515 if (thflags & TH_FIN) 11516 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11517 if (bbr->r_state != tp->t_state) 11518 bbr_set_state(tp, bbr, tiwin); 11519 11520 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL) 11521 kern_prefetch(rsm, &prev_state); 11522 prev_state = bbr->r_state; 11523 bbr->rc_ack_was_delayed = 0; 11524 lost = bbr->r_ctl.rc_lost; 11525 bbr->rc_is_pkt_epoch_now = 0; 11526 if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) { 11527 /* Get the real time into lcts and figure the real delay */ 11528 lcts = tcp_get_usecs(<v); 11529 if (TSTMP_GT(lcts, cts)) { 11530 bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts; 11531 bbr->rc_ack_was_delayed = 1; 11532 if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay, 11533 bbr->r_ctl.highest_hdwr_delay)) 11534 bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay; 11535 } else { 11536 bbr->r_ctl.rc_ack_hdwr_delay = 0; 11537 bbr->rc_ack_was_delayed = 0; 11538 } 11539 } else { 11540 bbr->r_ctl.rc_ack_hdwr_delay = 0; 11541 bbr->rc_ack_was_delayed = 0; 11542 } 11543 bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m); 11544 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 11545 retval = 0; 11546 m_freem(m); 11547 goto done_with_input; 11548 } 11549 /* 11550 * If a segment with the ACK-bit set arrives in the SYN-SENT state 11551 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 
11552 */ 11553 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 11554 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 11555 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11556 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11557 return (1); 11558 } 11559 if (tiwin > bbr->r_ctl.rc_high_rwnd) 11560 bbr->r_ctl.rc_high_rwnd = tiwin; 11561 #ifdef BBR_INVARIANTS 11562 if ((tp->t_inpcb->inp_flags & INP_DROPPED) || 11563 (tp->t_inpcb->inp_flags2 & INP_FREED)) { 11564 panic("tp:%p bbr:%p given a dropped inp:%p", 11565 tp, bbr, tp->t_inpcb); 11566 } 11567 #endif 11568 bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp, 11569 (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 11570 bbr->rtt_valid = 0; 11571 if (to.to_flags & TOF_TS) { 11572 bbr->rc_ts_valid = 1; 11573 bbr->r_ctl.last_inbound_ts = to.to_tsval; 11574 } else { 11575 bbr->rc_ts_valid = 0; 11576 bbr->r_ctl.last_inbound_ts = 0; 11577 } 11578 retval = (*bbr->r_substate) (m, th, so, 11579 tp, &to, drop_hdrlen, 11580 tlen, tiwin, thflags, nxt_pkt, iptos); 11581 #ifdef BBR_INVARIANTS 11582 if ((retval == 0) && 11583 (tp->t_inpcb == NULL)) { 11584 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11585 retval, tp, prev_state); 11586 } 11587 #endif 11588 if (nxt_pkt == 0) 11589 BBR_STAT_INC(bbr_rlock_left_ret0); 11590 else 11591 BBR_STAT_INC(bbr_rlock_left_ret1); 11592 if (retval == 0) { 11593 /* 11594 * If retval is 1 the tcb is unlocked and most likely the tp 11595 * is gone. 11596 */ 11597 INP_WLOCK_ASSERT(tp->t_inpcb); 11598 tcp_bbr_xmit_timer_commit(bbr, tp, cts); 11599 if (bbr->rc_is_pkt_epoch_now) 11600 bbr_set_pktepoch(bbr, cts, __LINE__); 11601 bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); 11602 if (nxt_pkt == 0) { 11603 if (bbr->r_wanted_output != 0) { 11604 bbr->rc_output_starts_timer = 0; 11605 did_out = 1; 11606 if (tcp_output(tp) < 0) 11607 return (1); 11608 } else 11609 bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0); 11610 } 11611 if ((nxt_pkt == 0) && 11612 ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11613 (SEQ_GT(tp->snd_max, tp->snd_una) || 11614 (tp->t_flags & TF_DELACK) || 11615 ((V_tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11616 (tp->t_state <= TCPS_CLOSING)))) { 11617 /* 11618 * We could not send (probably in the hpts but 11619 * stopped the timer)? 11620 */ 11621 if ((tp->snd_max == tp->snd_una) && 11622 ((tp->t_flags & TF_DELACK) == 0) && 11623 (tcp_in_hpts(bbr->rc_inp)) && 11624 (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11625 /* 11626 * keep alive not needed if we are hptsi 11627 * output yet 11628 */ 11629 ; 11630 } else { 11631 if (tcp_in_hpts(bbr->rc_inp)) { 11632 tcp_hpts_remove(bbr->rc_inp); 11633 if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 11634 (TSTMP_GT(lcts, bbr->rc_pacer_started))) { 11635 uint32_t del; 11636 11637 del = lcts - bbr->rc_pacer_started; 11638 if (bbr->r_ctl.rc_last_delay_val > del) { 11639 BBR_STAT_INC(bbr_force_timer_start); 11640 bbr->r_ctl.rc_last_delay_val -= del; 11641 bbr->rc_pacer_started = lcts; 11642 } else { 11643 /* We are late */ 11644 bbr->r_ctl.rc_last_delay_val = 0; 11645 BBR_STAT_INC(bbr_force_output); 11646 if (tcp_output(tp) < 0) 11647 return (1); 11648 } 11649 } 11650 } 11651 bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val, 11652 0); 11653 } 11654 } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) { 11655 /* Do we have the correct timer running? 
*/ 11656 bbr_timer_audit(tp, bbr, lcts, &so->so_snd); 11657 } 11658 /* Do we have a new state */ 11659 if (bbr->r_state != tp->t_state) 11660 bbr_set_state(tp, bbr, tiwin); 11661 done_with_input: 11662 bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out); 11663 if (did_out) 11664 bbr->r_wanted_output = 0; 11665 #ifdef BBR_INVARIANTS 11666 if (tp->t_inpcb == NULL) { 11667 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11668 did_out, 11669 retval, tp, prev_state); 11670 } 11671 #endif 11672 } 11673 return (retval); 11674 } 11675 11676 static void 11677 bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11678 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11679 { 11680 struct timeval tv; 11681 int retval; 11682 11683 /* First lets see if we have old packets */ 11684 if (tp->t_in_pkt) { 11685 if (ctf_do_queued_segments(so, tp, 1)) { 11686 m_freem(m); 11687 return; 11688 } 11689 } 11690 if (m->m_flags & M_TSTMP_LRO) { 11691 mbuf_tstmp2timeval(m, &tv); 11692 } else { 11693 /* Should not be should we kassert instead? */ 11694 tcp_get_usecs(&tv); 11695 } 11696 retval = bbr_do_segment_nounlock(m, th, so, tp, 11697 drop_hdrlen, tlen, iptos, 0, &tv); 11698 if (retval == 0) { 11699 INP_WUNLOCK(tp->t_inpcb); 11700 } 11701 } 11702 11703 /* 11704 * Return how much data can be sent without violating the 11705 * cwnd or rwnd. 11706 */ 11707 11708 static inline uint32_t 11709 bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin, 11710 uint32_t avail, int32_t sb_offset, uint32_t cts) 11711 { 11712 uint32_t len; 11713 11714 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11715 /* We never want to go over our peers rcv-window */ 11716 len = 0; 11717 } else { 11718 uint32_t flight; 11719 11720 flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); 11721 if (flight >= sendwin) { 11722 /* 11723 * We have in flight what we are allowed by cwnd (if 11724 * it was rwnd blocking it would have hit above out 11725 * >= tp->snd_wnd). 11726 */ 11727 return (0); 11728 } 11729 len = sendwin - flight; 11730 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11731 /* We would send too much (beyond the rwnd) */ 11732 len = tp->snd_wnd - ctf_outstanding(tp); 11733 } 11734 if ((len + sb_offset) > avail) { 11735 /* 11736 * We don't have that much in the SB, how much is 11737 * there? 
11738 */ 11739 len = avail - sb_offset; 11740 } 11741 } 11742 return (len); 11743 } 11744 11745 static inline void 11746 bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) 11747 { 11748 #ifdef NETFLIX_STATS 11749 KMOD_TCPSTAT_INC(tcps_sndpack_error); 11750 KMOD_TCPSTAT_ADD(tcps_sndbyte_error, len); 11751 #endif 11752 } 11753 11754 static inline void 11755 bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) 11756 { 11757 if (error) { 11758 bbr_do_error_accounting(tp, bbr, rsm, len, error); 11759 return; 11760 } 11761 if (rsm) { 11762 if (rsm->r_flags & BBR_TLP) { 11763 /* 11764 * TLP should not count in retran count, but in its 11765 * own bin 11766 */ 11767 #ifdef NETFLIX_STATS 11768 KMOD_TCPSTAT_INC(tcps_tlpresends); 11769 KMOD_TCPSTAT_ADD(tcps_tlpresend_bytes, len); 11770 #endif 11771 } else { 11772 /* Retransmit */ 11773 tp->t_sndrexmitpack++; 11774 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 11775 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 11776 #ifdef STATS 11777 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 11778 len); 11779 #endif 11780 } 11781 /* 11782 * Logs in 0 - 8, 8 is all non probe_bw states 0-7 is 11783 * sub-state 11784 */ 11785 counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len); 11786 if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) { 11787 /* Non probe_bw log in 1, 2, or 4. */ 11788 counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len); 11789 } else { 11790 /* 11791 * Log our probe state 3, and log also 5-13 to show 11792 * us the recovery sub-state for the send. This 11793 * means that 3 == (5+6+7+8+9+10+11+12+13) 11794 */ 11795 counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len); 11796 counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len); 11797 } 11798 /* Place in both 16's the totals of retransmitted */ 11799 counter_u64_add(bbr_state_lost[16], len); 11800 counter_u64_add(bbr_state_resend[16], len); 11801 /* Place in 17's the total sent */ 11802 counter_u64_add(bbr_state_resend[17], len); 11803 counter_u64_add(bbr_state_lost[17], len); 11804 11805 } else { 11806 /* New sends */ 11807 KMOD_TCPSTAT_INC(tcps_sndpack); 11808 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 11809 /* Place in 17's the total sent */ 11810 counter_u64_add(bbr_state_resend[17], len); 11811 counter_u64_add(bbr_state_lost[17], len); 11812 #ifdef STATS 11813 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 11814 len); 11815 #endif 11816 } 11817 } 11818 11819 static void 11820 bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level) 11821 { 11822 if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) { 11823 /* 11824 * Limit the cwnd to not be above N x the target plus whats 11825 * is outstanding. The target is based on the current b/w 11826 * estimate. 11827 */ 11828 uint32_t target; 11829 11830 target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT); 11831 target += ctf_outstanding(tp); 11832 target *= bbr_target_cwnd_mult_limit; 11833 if (tp->snd_cwnd > target) 11834 tp->snd_cwnd = target; 11835 bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__); 11836 } 11837 } 11838 11839 static int 11840 bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg) 11841 { 11842 /* 11843 * "adv" is the amount we could increase the window, taking into 11844 * account that we are limited by TCP_MAXWIN << tp->rcv_scale. 
11845 */ 11846 int32_t adv; 11847 int32_t oldwin; 11848 11849 adv = recwin; 11850 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 11851 oldwin = (tp->rcv_adv - tp->rcv_nxt); 11852 if (adv > oldwin) 11853 adv -= oldwin; 11854 else { 11855 /* We can't increase the window */ 11856 adv = 0; 11857 } 11858 } else 11859 oldwin = 0; 11860 11861 /* 11862 * If the new window size ends up being the same as or less 11863 * than the old size when it is scaled, then don't force 11864 * a window update. 11865 */ 11866 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 11867 return (0); 11868 11869 if (adv >= (2 * maxseg) && 11870 (adv >= (so->so_rcv.sb_hiwat / 4) || 11871 recwin <= (so->so_rcv.sb_hiwat / 8) || 11872 so->so_rcv.sb_hiwat <= 8 * maxseg)) { 11873 return (1); 11874 } 11875 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 11876 return (1); 11877 return (0); 11878 } 11879 11880 /* 11881 * Return 0 on success and a errno on failure to send. 11882 * Note that a 0 return may not mean we sent anything 11883 * if the TCB was on the hpts. A non-zero return 11884 * does indicate the error we got from ip[6]_output. 11885 */ 11886 static int 11887 bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) 11888 { 11889 struct socket *so; 11890 int32_t len; 11891 uint32_t cts; 11892 uint32_t recwin, sendwin; 11893 int32_t sb_offset; 11894 int32_t flags, abandon, error = 0; 11895 struct tcp_log_buffer *lgb = NULL; 11896 struct mbuf *m; 11897 struct mbuf *mb; 11898 uint32_t if_hw_tsomaxsegcount = 0; 11899 uint32_t if_hw_tsomaxsegsize = 0; 11900 uint32_t if_hw_tsomax = 0; 11901 struct ip *ip = NULL; 11902 #ifdef TCPDEBUG 11903 struct ipovly *ipov = NULL; 11904 #endif 11905 struct tcp_bbr *bbr; 11906 struct tcphdr *th; 11907 struct udphdr *udp = NULL; 11908 u_char opt[TCP_MAXOLEN]; 11909 unsigned ipoptlen, optlen, hdrlen; 11910 unsigned ulen; 11911 uint32_t bbr_seq; 11912 uint32_t delay_calc=0; 11913 uint8_t doing_tlp = 0; 11914 uint8_t local_options; 11915 #ifdef BBR_INVARIANTS 11916 uint8_t doing_retran_from = 0; 11917 uint8_t picked_up_retran = 0; 11918 #endif 11919 uint8_t wanted_cookie = 0; 11920 uint8_t more_to_rxt=0; 11921 int32_t prefetch_so_done = 0; 11922 int32_t prefetch_rsm = 0; 11923 uint32_t tot_len = 0; 11924 uint32_t maxseg, pace_max_segs, p_maxseg; 11925 int32_t csum_flags = 0; 11926 int32_t hw_tls; 11927 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 11928 unsigned ipsec_optlen = 0; 11929 11930 #endif 11931 volatile int32_t sack_rxmit; 11932 struct bbr_sendmap *rsm = NULL; 11933 int32_t tso, mtu; 11934 struct tcpopt to; 11935 int32_t slot = 0; 11936 struct inpcb *inp; 11937 struct sockbuf *sb; 11938 uint32_t hpts_calling; 11939 #ifdef INET6 11940 struct ip6_hdr *ip6 = NULL; 11941 int32_t isipv6; 11942 #endif 11943 uint8_t app_limited = BBR_JR_SENT_DATA; 11944 uint8_t filled_all = 0; 11945 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 11946 /* We take a cache hit here */ 11947 memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); 11948 cts = tcp_tv_to_usectick(&bbr->rc_tv); 11949 inp = bbr->rc_inp; 11950 so = inp->inp_socket; 11951 sb = &so->so_snd; 11952 if (sb->sb_flags & SB_TLS_IFNET) 11953 hw_tls = 1; 11954 else 11955 hw_tls = 0; 11956 kern_prefetch(sb, &maxseg); 11957 maxseg = tp->t_maxseg - bbr->rc_last_options; 11958 if (bbr_minseg(bbr) < maxseg) { 11959 tcp_bbr_tso_size_check(bbr, cts); 11960 } 11961 /* Remove any flags that indicate we are pacing on the inp */ 11962 pace_max_segs = bbr->r_ctl.rc_pace_max_segs; 11963 p_maxseg = min(maxseg, pace_max_segs); 11964 INP_WLOCK_ASSERT(inp); 11965 
#ifdef TCP_OFFLOAD 11966 if (tp->t_flags & TF_TOE) 11967 return (tcp_offload_output(tp)); 11968 #endif 11969 11970 #ifdef INET6 11971 if (bbr->r_state) { 11972 /* Use the cache line loaded if possible */ 11973 isipv6 = bbr->r_is_v6; 11974 } else { 11975 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 11976 } 11977 #endif 11978 if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 11979 tcp_in_hpts(inp)) { 11980 /* 11981 * We are on the hpts for some timer but not hptsi output. 11982 * Possibly remove from the hpts so we can send/recv etc. 11983 */ 11984 if ((tp->t_flags & TF_ACKNOW) == 0) { 11985 /* 11986 * No immediate demand right now to send an ack, but 11987 * the user may have read, making room for new data 11988 * (a window update). If so we may want to cancel 11989 * whatever timer is running (KEEP/DEL-ACK?) and 11990 * continue to send out a window update. Or we may 11991 * have gotten more data into the socket buffer to 11992 * send. 11993 */ 11994 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 11995 (long)TCP_MAXWIN << tp->rcv_scale); 11996 if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) && 11997 ((tcp_outflags[tp->t_state] & TH_RST) == 0) && 11998 ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <= 11999 (tp->snd_max - tp->snd_una))) { 12000 /* 12001 * Nothing new to send and no window update 12002 * is needed to send. Lets just return and 12003 * let the timer-run off. 12004 */ 12005 return (0); 12006 } 12007 } 12008 tcp_hpts_remove(inp); 12009 bbr_timer_cancel(bbr, __LINE__, cts); 12010 } 12011 if (bbr->r_ctl.rc_last_delay_val) { 12012 /* Calculate a rough delay for early escape to sending */ 12013 if (SEQ_GT(cts, bbr->rc_pacer_started)) 12014 delay_calc = cts - bbr->rc_pacer_started; 12015 if (delay_calc >= bbr->r_ctl.rc_last_delay_val) 12016 delay_calc -= bbr->r_ctl.rc_last_delay_val; 12017 else 12018 delay_calc = 0; 12019 } 12020 /* Mark that we have called bbr_output(). */ 12021 if ((bbr->r_timer_override) || 12022 (tp->t_state < TCPS_ESTABLISHED)) { 12023 /* Timeouts or early states are exempt */ 12024 if (tcp_in_hpts(inp)) 12025 tcp_hpts_remove(inp); 12026 } else if (tcp_in_hpts(inp)) { 12027 if ((bbr->r_ctl.rc_last_delay_val) && 12028 (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12029 delay_calc) { 12030 /* 12031 * We were being paced for output and the delay has 12032 * already exceeded when we were supposed to be 12033 * called, lets go ahead and pull out of the hpts 12034 * and call output. 12035 */ 12036 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1); 12037 bbr->r_ctl.rc_last_delay_val = 0; 12038 tcp_hpts_remove(inp); 12039 } else if (tp->t_state == TCPS_CLOSED) { 12040 bbr->r_ctl.rc_last_delay_val = 0; 12041 tcp_hpts_remove(inp); 12042 } else { 12043 /* 12044 * On the hpts, you shall not pass! even if ACKNOW 12045 * is on, we will when the hpts fires, unless of 12046 * course we are overdue. 12047 */ 12048 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1); 12049 return (0); 12050 } 12051 } 12052 bbr->rc_cwnd_limited = 0; 12053 if (bbr->r_ctl.rc_last_delay_val) { 12054 /* recalculate the real delay and deal with over/under */ 12055 if (SEQ_GT(cts, bbr->rc_pacer_started)) 12056 delay_calc = cts - bbr->rc_pacer_started; 12057 else 12058 delay_calc = 0; 12059 if (delay_calc >= bbr->r_ctl.rc_last_delay_val) 12060 /* Setup the delay which will be added in */ 12061 delay_calc -= bbr->r_ctl.rc_last_delay_val; 12062 else { 12063 /* 12064 * We are early setup to adjust 12065 * our slot time. 
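			 * The pacer fired before the requested pacing delay
			 * had fully elapsed; record how early we are in
			 * rc_agg_early and net it against any accumulated
			 * lateness in rc_hptsi_agg_delay.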
12066 */ 12067 uint64_t merged_val; 12068 12069 bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc); 12070 bbr->r_agg_early_set = 1; 12071 if (bbr->r_ctl.rc_hptsi_agg_delay) { 12072 if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) { 12073 /* Nope our previous late cancels out the early */ 12074 bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early; 12075 bbr->r_agg_early_set = 0; 12076 bbr->r_ctl.rc_agg_early = 0; 12077 } else { 12078 bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay; 12079 bbr->r_ctl.rc_hptsi_agg_delay = 0; 12080 } 12081 } 12082 merged_val = bbr->rc_pacer_started; 12083 merged_val <<= 32; 12084 merged_val |= bbr->r_ctl.rc_last_delay_val; 12085 bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls, 12086 bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val, 12087 bbr->r_agg_early_set, 3); 12088 bbr->r_ctl.rc_last_delay_val = 0; 12089 BBR_STAT_INC(bbr_early); 12090 delay_calc = 0; 12091 } 12092 } else { 12093 /* We were not delayed due to hptsi */ 12094 if (bbr->r_agg_early_set) 12095 bbr->r_ctl.rc_agg_early = 0; 12096 bbr->r_agg_early_set = 0; 12097 delay_calc = 0; 12098 } 12099 if (delay_calc) { 12100 /* 12101 * We had a hptsi delay which means we are falling behind on 12102 * sending at the expected rate. Calculate an extra amount 12103 * of data we can send, if any, to put us back on track. 12104 */ 12105 if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay) 12106 bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff; 12107 else 12108 bbr->r_ctl.rc_hptsi_agg_delay += delay_calc; 12109 } 12110 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 12111 if ((tp->snd_una == tp->snd_max) && 12112 (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && 12113 (sbavail(sb))) { 12114 /* 12115 * Ok we have been idle with nothing outstanding 12116 * we possibly need to start fresh with either a new 12117 * suite of states or a fast-ramp up. 12118 */ 12119 bbr_restart_after_idle(bbr, 12120 cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time)); 12121 } 12122 /* 12123 * Now was there a hptsi delay where we are behind? We only count 12124 * being behind if: a) We are not in recovery. b) There was a delay. 12125 * <and> c) We had room to send something. 12126 * 12127 */ 12128 hpts_calling = inp->inp_hpts_calls; 12129 inp->inp_hpts_calls = 0; 12130 if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 12131 int retval; 12132 12133 retval = bbr_process_timers(tp, bbr, cts, hpts_calling); 12134 if (retval != 0) { 12135 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1); 12136 /* 12137 * If timers want tcp_drop(), then pass error out, 12138 * otherwise suppress it. 12139 */ 12140 return (retval < 0 ? retval : 0); 12141 } 12142 } 12143 bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 12144 if (hpts_calling && 12145 (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12146 bbr->r_ctl.rc_last_delay_val = 0; 12147 } 12148 bbr->r_timer_override = 0; 12149 bbr->r_wanted_output = 0; 12150 /* 12151 * For TFO connections in SYN_RECEIVED, only allow the initial 12152 * SYN|ACK and those sent by the retransmit timer. 12153 */ 12154 if (IS_FASTOPEN(tp->t_flags) && 12155 ((tp->t_state == TCPS_SYN_RECEIVED) || 12156 (tp->t_state == TCPS_SYN_SENT)) && 12157 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12158 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12159 len = 0; 12160 goto just_return_nolock; 12161 } 12162 /* 12163 * Before sending anything check for a state update. For hpts 12164 * calling without input this is important. 
	 * If it's input calling
	 * then this was already done.
	 */
	if (bbr->rc_use_google == 0)
		bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
again:
	/*
	 * If we've recently taken a timeout, snd_max will be greater than
	 * snd_nxt. BBR in general does not pay much attention to snd_nxt;
	 * for historic reasons the persist timer still uses it. This means
	 * we have to look at it. All retransmissions that are not persists
	 * use the rsm that needs to be sent so snd_nxt is ignored. At the
	 * end of this routine we pull snd_nxt always up to snd_max.
	 */
	doing_tlp = 0;
#ifdef BBR_INVARIANTS
	doing_retran_from = picked_up_retran = 0;
#endif
	error = 0;
	tso = 0;
	slot = 0;
	mtu = 0;
	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
	sb_offset = tp->snd_max - tp->snd_una;
	flags = tcp_outflags[tp->t_state];
	sack_rxmit = 0;
	len = 0;
	rsm = NULL;
	if (flags & TH_RST) {
		SOCKBUF_LOCK(sb);
		goto send;
	}
recheck_resend:
	while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) {
		/* We need to always have one in reserve */
		rsm = bbr_alloc(bbr);
		if (rsm == NULL) {
			error = ENOMEM;
			/* Lie to get on the hpts */
			tot_len = tp->t_maxseg;
			if (hpts_calling)
				/* Retry in a ms */
				slot = 1001;
			goto just_return_nolock;
		}
		TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next);
		bbr->r_ctl.rc_free_cnt++;
		rsm = NULL;
	}
	/* What do we send, a resend? */
	if (bbr->r_ctl.rc_resend == NULL) {
		/* Check for rack timeout */
		bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts);
		if (bbr->r_ctl.rc_resend) {
#ifdef BBR_INVARIANTS
			picked_up_retran = 1;
#endif
			bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend);
		}
	}
	if (bbr->r_ctl.rc_resend) {
		rsm = bbr->r_ctl.rc_resend;
#ifdef BBR_INVARIANTS
		doing_retran_from = 1;
#endif
		/* Remove any TLP flags; it's a RACK or T-O retransmit */
		rsm->r_flags &= ~BBR_TLP;
		bbr->r_ctl.rc_resend = NULL;
		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
#ifdef BBR_INVARIANTS
			panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n",
			    tp, bbr, rsm, rsm->r_start, tp->snd_una);
			goto recheck_resend;
#else
			/* TSNH */
			rsm = NULL;
			goto recheck_resend;
#endif
		}
		if (rsm->r_flags & BBR_HAS_SYN) {
			/* Only retransmit a SYN by itself */
			len = 0;
			if ((flags & TH_SYN) == 0) {
				/* Huh, something is wrong */
				rsm->r_start++;
				if (rsm->r_start == rsm->r_end) {
					/* Clean it up, somehow we missed the ack? */
					bbr_log_syn(tp, NULL);
				} else {
					/* TFO with data?
*/ 12254 rsm->r_flags &= ~BBR_HAS_SYN; 12255 len = rsm->r_end - rsm->r_start; 12256 } 12257 } else { 12258 /* Retransmitting SYN */ 12259 rsm = NULL; 12260 SOCKBUF_LOCK(sb); 12261 goto send; 12262 } 12263 } else 12264 len = rsm->r_end - rsm->r_start; 12265 if ((bbr->rc_resends_use_tso == 0) && 12266 (len > maxseg)) { 12267 len = maxseg; 12268 more_to_rxt = 1; 12269 } 12270 sb_offset = rsm->r_start - tp->snd_una; 12271 if (len > 0) { 12272 sack_rxmit = 1; 12273 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12274 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12275 min(len, maxseg)); 12276 } else { 12277 /* I dont think this can happen */ 12278 rsm = NULL; 12279 goto recheck_resend; 12280 } 12281 BBR_STAT_INC(bbr_resends_set); 12282 } else if (bbr->r_ctl.rc_tlp_send) { 12283 /* 12284 * Tail loss probe 12285 */ 12286 doing_tlp = 1; 12287 rsm = bbr->r_ctl.rc_tlp_send; 12288 bbr->r_ctl.rc_tlp_send = NULL; 12289 sack_rxmit = 1; 12290 len = rsm->r_end - rsm->r_start; 12291 if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) 12292 len = maxseg; 12293 12294 if (SEQ_GT(tp->snd_una, rsm->r_start)) { 12295 #ifdef BBR_INVARIANTS 12296 panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u", 12297 tp, bbr, tp->snd_una, rsm, rsm->r_start); 12298 #else 12299 /* TSNH */ 12300 rsm = NULL; 12301 goto recheck_resend; 12302 #endif 12303 } 12304 sb_offset = rsm->r_start - tp->snd_una; 12305 BBR_STAT_INC(bbr_tlp_set); 12306 } 12307 /* 12308 * Enforce a connection sendmap count limit if set 12309 * as long as we are not retransmiting. 12310 */ 12311 if ((rsm == NULL) && 12312 (V_tcp_map_entries_limit > 0) && 12313 (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12314 BBR_STAT_INC(bbr_alloc_limited); 12315 if (!bbr->alloc_limit_reported) { 12316 bbr->alloc_limit_reported = 1; 12317 BBR_STAT_INC(bbr_alloc_limited_conns); 12318 } 12319 goto just_return_nolock; 12320 } 12321 #ifdef BBR_INVARIANTS 12322 if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) { 12323 panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u", 12324 tp, bbr, rsm, sb_offset, len); 12325 } 12326 #endif 12327 /* 12328 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12329 * state flags. 12330 */ 12331 if (tp->t_flags & TF_NEEDFIN && (rsm == NULL)) 12332 flags |= TH_FIN; 12333 if (tp->t_flags & TF_NEEDSYN) 12334 flags |= TH_SYN; 12335 12336 if (rsm && (rsm->r_flags & BBR_HAS_FIN)) { 12337 /* we are retransmitting the fin */ 12338 len--; 12339 if (len) { 12340 /* 12341 * When retransmitting data do *not* include the 12342 * FIN. This could happen from a TLP probe if we 12343 * allowed data with a FIN. 12344 */ 12345 flags &= ~TH_FIN; 12346 } 12347 } else if (rsm) { 12348 if (flags & TH_FIN) 12349 flags &= ~TH_FIN; 12350 } 12351 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12352 void *end_rsm; 12353 12354 end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); 12355 if (end_rsm) 12356 kern_prefetch(end_rsm, &prefetch_rsm); 12357 prefetch_rsm = 1; 12358 } 12359 SOCKBUF_LOCK(sb); 12360 /* 12361 * If snd_nxt == snd_max and we have transmitted a FIN, the 12362 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12363 * negative length. This can also occur when TCP opens up its 12364 * congestion window while receiving additional duplicate acks after 12365 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12366 * the fast-retransmit. 12367 * 12368 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12369 * set to snd_una, the sb_offset will be 0, and the length may wind 12370 * up 0. 
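	 *
	 * Below, when no scoreboard retransmit is pending, len is derived
	 * from the socket buffer: a TLP forces out up to one new segment,
	 * bbr_what_can_we_send() bounds ordinary sends, and in persist
	 * state we either resend the first map entry or probe with a
	 * single byte.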
12371 * 12372 * If sack_rxmit is true we are retransmitting from the scoreboard 12373 * in which case len is already set. 12374 */ 12375 if (sack_rxmit == 0) { 12376 uint32_t avail; 12377 12378 avail = sbavail(sb); 12379 if (SEQ_GT(tp->snd_max, tp->snd_una)) 12380 sb_offset = tp->snd_max - tp->snd_una; 12381 else 12382 sb_offset = 0; 12383 if (bbr->rc_tlp_new_data) { 12384 /* TLP is forcing out new data */ 12385 uint32_t tlplen; 12386 12387 doing_tlp = 1; 12388 tlplen = maxseg; 12389 12390 if (tlplen > (uint32_t)(avail - sb_offset)) { 12391 tlplen = (uint32_t)(avail - sb_offset); 12392 } 12393 if (tlplen > tp->snd_wnd) { 12394 len = tp->snd_wnd; 12395 } else { 12396 len = tlplen; 12397 } 12398 bbr->rc_tlp_new_data = 0; 12399 } else { 12400 len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts); 12401 if ((len < p_maxseg) && 12402 (bbr->rc_in_persist == 0) && 12403 (ctf_outstanding(tp) >= (2 * p_maxseg)) && 12404 ((avail - sb_offset) >= p_maxseg)) { 12405 /* 12406 * We are not completing whats in the socket 12407 * buffer (i.e. there is at least a segment 12408 * waiting to send) and we have 2 or more 12409 * segments outstanding. There is no sense 12410 * of sending a little piece. Lets defer and 12411 * and wait until we can send a whole 12412 * segment. 12413 */ 12414 len = 0; 12415 } 12416 if (bbr->rc_in_persist) { 12417 /* 12418 * We are in persists, figure out if 12419 * a retransmit is available (maybe the previous 12420 * persists we sent) or if we have to send new 12421 * data. 12422 */ 12423 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 12424 if (rsm) { 12425 len = rsm->r_end - rsm->r_start; 12426 if (rsm->r_flags & BBR_HAS_FIN) 12427 len--; 12428 if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) 12429 len = maxseg; 12430 if (len > 1) 12431 BBR_STAT_INC(bbr_persist_reneg); 12432 /* 12433 * XXXrrs we could force the len to 12434 * 1 byte here to cause the chunk to 12435 * split apart.. but that would then 12436 * mean we always retransmit it as 12437 * one byte even after the window 12438 * opens. 12439 */ 12440 sack_rxmit = 1; 12441 sb_offset = rsm->r_start - tp->snd_una; 12442 } else { 12443 /* 12444 * First time through in persists or peer 12445 * acked our one byte. Though we do have 12446 * to have something in the sb. 12447 */ 12448 len = 1; 12449 sb_offset = 0; 12450 if (avail == 0) 12451 len = 0; 12452 } 12453 } 12454 } 12455 } 12456 if (prefetch_so_done == 0) { 12457 kern_prefetch(so, &prefetch_so_done); 12458 prefetch_so_done = 1; 12459 } 12460 /* 12461 * Lop off SYN bit if it has already been sent. However, if this is 12462 * SYN-SENT state and if segment contains data and if we don't know 12463 * that foreign host supports TAO, suppress sending segment. 12464 */ 12465 if ((flags & TH_SYN) && (rsm == NULL) && 12466 SEQ_GT(tp->snd_max, tp->snd_una)) { 12467 if (tp->t_state != TCPS_SYN_RECEIVED) 12468 flags &= ~TH_SYN; 12469 /* 12470 * When sending additional segments following a TFO SYN|ACK, 12471 * do not include the SYN bit. 12472 */ 12473 if (IS_FASTOPEN(tp->t_flags) && 12474 (tp->t_state == TCPS_SYN_RECEIVED)) 12475 flags &= ~TH_SYN; 12476 sb_offset--, len++; 12477 if (sbavail(sb) == 0) 12478 len = 0; 12479 } else if ((flags & TH_SYN) && rsm) { 12480 /* 12481 * Subtract one from the len for the SYN being 12482 * retransmitted. 12483 */ 12484 len--; 12485 } 12486 /* 12487 * Be careful not to send data and/or FIN on SYN segments. This 12488 * measure is needed to prevent interoperability problems with not 12489 * fully conformant TCP implementations. 
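	 * (Connections with TF_NOOPT set therefore carry no payload on a
	 * SYN at all; this is enforced immediately below.)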
12490 */ 12491 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12492 len = 0; 12493 flags &= ~TH_FIN; 12494 } 12495 /* 12496 * On TFO sockets, ensure no data is sent in the following cases: 12497 * 12498 * - When retransmitting SYN|ACK on a passively-created socket 12499 * - When retransmitting SYN on an actively created socket 12500 * - When sending a zero-length cookie (cookie request) on an 12501 * actively created socket 12502 * - When the socket is in the CLOSED state (RST is being sent) 12503 */ 12504 if (IS_FASTOPEN(tp->t_flags) && 12505 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12506 ((tp->t_state == TCPS_SYN_SENT) && 12507 (tp->t_tfo_client_cookie_len == 0)) || 12508 (flags & TH_RST))) { 12509 len = 0; 12510 sack_rxmit = 0; 12511 rsm = NULL; 12512 } 12513 /* Without fast-open there should never be data sent on a SYN */ 12514 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) 12515 len = 0; 12516 if (len <= 0) { 12517 /* 12518 * If FIN has been sent but not acked, but we haven't been 12519 * called to retransmit, len will be < 0. Otherwise, window 12520 * shrank after we sent into it. If window shrank to 0, 12521 * cancel pending retransmit, pull snd_nxt back to (closed) 12522 * window, and set the persist timer if it isn't already 12523 * going. If the window didn't close completely, just wait 12524 * for an ACK. 12525 * 12526 * We also do a general check here to ensure that we will 12527 * set the persist timer when we have data to send, but a 12528 * 0-byte window. This makes sure the persist timer is set 12529 * even if the packet hits one of the "goto send" lines 12530 * below. 12531 */ 12532 len = 0; 12533 if ((tp->snd_wnd == 0) && 12534 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12535 (tp->snd_una == tp->snd_max) && 12536 (sb_offset < (int)sbavail(sb))) { 12537 /* 12538 * Not enough room in the rwnd to send 12539 * a paced segment out. 12540 */ 12541 bbr_enter_persist(tp, bbr, cts, __LINE__); 12542 } 12543 } else if ((rsm == NULL) && 12544 (doing_tlp == 0) && 12545 (len < bbr->r_ctl.rc_pace_max_segs)) { 12546 /* 12547 * We are not sending a full segment for 12548 * some reason. Should we not send anything (think 12549 * sws or persists)? 12550 */ 12551 if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && 12552 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12553 (len < (int)(sbavail(sb) - sb_offset))) { 12554 /* 12555 * Here the rwnd is less than 12556 * the pacing size, this is not a retransmit, 12557 * we are established and 12558 * the send is not the last in the socket buffer 12559 * lets not send, and possibly enter persists. 12560 */ 12561 len = 0; 12562 if (tp->snd_max == tp->snd_una) 12563 bbr_enter_persist(tp, bbr, cts, __LINE__); 12564 } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) && 12565 (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + 12566 bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && 12567 (len < (int)(sbavail(sb) - sb_offset)) && 12568 (len < bbr_minseg(bbr))) { 12569 /* 12570 * Here we are not retransmitting, and 12571 * the cwnd is not so small that we could 12572 * not send at least a min size (rxt timer 12573 * not having gone off), We have 2 segments or 12574 * more already in flight, its not the tail end 12575 * of the socket buffer and the cwnd is blocking 12576 * us from sending out minimum pacing segment size. 12577 * Lets not send anything. 
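			 * Mark the connection cwnd-limited below so the
			 * state machine knows why output was held back.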
			 */
			bbr->rc_cwnd_limited = 1;
			len = 0;
		} else if (((tp->snd_wnd - ctf_outstanding(tp)) <
		    min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) &&
		    (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked +
		      bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) &&
		    (len < (int)(sbavail(sb) - sb_offset)) &&
		    (TCPS_HAVEESTABLISHED(tp->t_state))) {
			/*
			 * Here we have a send window but we have
			 * filled it up and we can't send another pacing segment.
			 * We also have in flight more than 2 segments
			 * and we are not completing the sb, i.e. we allow
			 * the last bytes of the sb to go out even if
			 * it's not a full pacing segment.
			 */
			len = 0;
		}
	}
	/* len will be >= 0 after this point. */
	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
	tcp_sndbuf_autoscale(tp, so, sendwin);
	/*
	 *
	 */
	if (bbr->rc_in_persist &&
	    len &&
	    (rsm == NULL) &&
	    (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) {
		/*
		 * We are in persist, not doing a retransmit, and don't yet
		 * have enough data to send a full TSO-sized burst. If what
		 * remains is the tail end of the socket buffer we send it;
		 * otherwise zero len and don't send.
		 */
		int sbleft;
		if (sbavail(sb) > sb_offset)
			sbleft = sbavail(sb) - sb_offset;
		else
			sbleft = 0;
		if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) {
			/* Not at the end of the sb, let's not send */
			len = 0;
		}
	}
	/*
	 * Decide if we can use TCP Segmentation Offloading (if supported by
	 * hardware).
	 *
	 * TSO may only be used if we are in a pure bulk sending state. The
	 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
	 * options prevent using TSO. With TSO the TCP header is the same
	 * (except for the sequence number) for all generated packets. This
	 * makes it impossible to transmit any options which vary per
	 * generated segment or packet.
	 *
	 * IPv4 handling has a clear separation of ip options and ip header
	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen()
	 * does the right thing below to provide length of just ip options
	 * and thus checking for ipoptlen is enough to decide if ip options
	 * are present.
	 */
#ifdef INET6
	if (isipv6)
		ipoptlen = ip6_optlen(inp);
	else
#endif
		if (inp->inp_options)
			ipoptlen = inp->inp_options->m_len -
			    offsetof(struct ipoption, ipopt_list);
		else
			ipoptlen = 0;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	/*
	 * Pre-calculate here as we save another lookup into the darknesses
	 * of IPsec that way and can actually decide if TSO is ok.
12654 */ 12655 #ifdef INET6 12656 if (isipv6 && IPSEC_ENABLED(ipv6)) 12657 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 12658 #ifdef INET 12659 else 12660 #endif 12661 #endif /* INET6 */ 12662 #ifdef INET 12663 if (IPSEC_ENABLED(ipv4)) 12664 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 12665 #endif /* INET */ 12666 #endif /* IPSEC */ 12667 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12668 ipoptlen += ipsec_optlen; 12669 #endif 12670 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && 12671 (len > maxseg) && 12672 (tp->t_port == 0) && 12673 ((tp->t_flags & TF_SIGNATURE) == 0) && 12674 tp->rcv_numsacks == 0 && 12675 ipoptlen == 0) 12676 tso = 1; 12677 12678 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 12679 (long)TCP_MAXWIN << tp->rcv_scale); 12680 /* 12681 * Sender silly window avoidance. We transmit under the following 12682 * conditions when len is non-zero: 12683 * 12684 * - We have a full segment (or more with TSO) - This is the last 12685 * buffer in a write()/send() and we are either idle or running 12686 * NODELAY - we've timed out (e.g. persist timer) - we have more 12687 * then 1/2 the maximum send window's worth of data (receiver may be 12688 * limited the window size) - we need to retransmit 12689 */ 12690 if (rsm) 12691 goto send; 12692 if (len) { 12693 if (sack_rxmit) 12694 goto send; 12695 if (len >= p_maxseg) 12696 goto send; 12697 /* 12698 * NOTE! on localhost connections an 'ack' from the remote 12699 * end may occur synchronously with the output and cause us 12700 * to flush a buffer queued with moretocome. XXX 12701 * 12702 */ 12703 if (((tp->t_flags & TF_MORETOCOME) == 0) && /* normal case */ 12704 ((tp->t_flags & TF_NODELAY) || 12705 ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) && 12706 (tp->t_flags & TF_NOPUSH) == 0) { 12707 goto send; 12708 } 12709 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 12710 goto send; 12711 } 12712 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 12713 goto send; 12714 } 12715 } 12716 /* 12717 * Sending of standalone window updates. 12718 * 12719 * Window updates are important when we close our window due to a 12720 * full socket buffer and are opening it again after the application 12721 * reads data from it. Once the window has opened again and the 12722 * remote end starts to send again the ACK clock takes over and 12723 * provides the most current window information. 12724 * 12725 * We must avoid the silly window syndrome whereas every read from 12726 * the receive buffer, no matter how small, causes a window update 12727 * to be sent. We also should avoid sending a flurry of window 12728 * updates when the socket buffer had queued a lot of data and the 12729 * application is doing small reads. 12730 * 12731 * Prevent a flurry of pointless window updates by only sending an 12732 * update when we can increase the advertized window by more than 12733 * 1/4th of the socket buffer capacity. When the buffer is getting 12734 * full or is very small be more aggressive and send an update 12735 * whenever we can increase by two mss sized segments. In all other 12736 * situations the ACK's to new incoming data will carry further 12737 * window increases. 12738 * 12739 * Don't send an independent window update if a delayed ACK is 12740 * pending (it will get piggy-backed on it) or the remote side 12741 * already has done a half-close and won't send more data. Skip 12742 * this if the connection is in T/TCP half-open state. 
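	 *
	 * The actual size thresholds for a forced update are evaluated in
	 * bbr_window_update_needed() above.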
12743 */ 12744 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 12745 !(tp->t_flags & TF_DELACK) && 12746 !TCPS_HAVERCVDFIN(tp->t_state)) { 12747 /* Check to see if we should do a window update */ 12748 if (bbr_window_update_needed(tp, so, recwin, maxseg)) 12749 goto send; 12750 } 12751 /* 12752 * Send if we owe the peer an ACK, RST, SYN. ACKNOW 12753 * is also a catch-all for the retransmit timer timeout case. 12754 */ 12755 if (tp->t_flags & TF_ACKNOW) { 12756 goto send; 12757 } 12758 if (flags & TH_RST) { 12759 /* Always send a RST if one is due */ 12760 goto send; 12761 } 12762 if ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0) { 12763 goto send; 12764 } 12765 /* 12766 * If our state indicates that FIN should be sent and we have not 12767 * yet done so, then we need to send. 12768 */ 12769 if (flags & TH_FIN && 12770 ((tp->t_flags & TF_SENTFIN) == 0)) { 12771 goto send; 12772 } 12773 /* 12774 * No reason to send a segment, just return. 12775 */ 12776 just_return: 12777 SOCKBUF_UNLOCK(sb); 12778 just_return_nolock: 12779 if (tot_len) 12780 slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); 12781 if (bbr->rc_no_pacing) 12782 slot = 0; 12783 if (tot_len == 0) { 12784 if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= 12785 tp->snd_wnd) { 12786 BBR_STAT_INC(bbr_rwnd_limited); 12787 app_limited = BBR_JR_RWND_LIMITED; 12788 bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); 12789 if ((bbr->rc_in_persist == 0) && 12790 TCPS_HAVEESTABLISHED(tp->t_state) && 12791 (tp->snd_max == tp->snd_una) && 12792 sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 12793 /* No send window.. we must enter persist */ 12794 bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); 12795 } 12796 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12797 BBR_STAT_INC(bbr_app_limited); 12798 app_limited = BBR_JR_APP_LIMITED; 12799 bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); 12800 } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + 12801 bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) { 12802 BBR_STAT_INC(bbr_cwnd_limited); 12803 app_limited = BBR_JR_CWND_LIMITED; 12804 bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + 12805 bbr->r_ctl.rc_lost_bytes))); 12806 bbr->rc_cwnd_limited = 1; 12807 } else { 12808 BBR_STAT_INC(bbr_app_limited); 12809 app_limited = BBR_JR_APP_LIMITED; 12810 bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); 12811 } 12812 bbr->r_ctl.rc_hptsi_agg_delay = 0; 12813 bbr->r_agg_early_set = 0; 12814 bbr->r_ctl.rc_agg_early = 0; 12815 bbr->r_ctl.rc_last_delay_val = 0; 12816 } else if (bbr->rc_use_google == 0) 12817 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); 12818 /* Are we app limited? */ 12819 if ((app_limited == BBR_JR_APP_LIMITED) || 12820 (app_limited == BBR_JR_RWND_LIMITED)) { 12821 /** 12822 * We are application limited. 
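		 * Record where the app-limited period ends, so that later
		 * bandwidth samples can be treated as application limited
		 * until rc_delivered catches up with the current flight
		 * plus what has already been delivered.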
12823 */ 12824 bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + 12825 bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered); 12826 } 12827 if (tot_len == 0) 12828 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12829 /* Dont update the time if we did not send */ 12830 bbr->r_ctl.rc_last_delay_val = 0; 12831 bbr->rc_output_starts_timer = 1; 12832 bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); 12833 bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); 12834 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 12835 /* Make sure snd_nxt is drug up */ 12836 tp->snd_nxt = tp->snd_max; 12837 } 12838 return (error); 12839 12840 send: 12841 if (doing_tlp == 0) { 12842 /* 12843 * Data not a TLP, and its not the rxt firing. If it is the 12844 * rxt firing, we want to leave the tlp_in_progress flag on 12845 * so we don't send another TLP. It has to be a rack timer 12846 * or normal send (response to acked data) to clear the tlp 12847 * in progress flag. 12848 */ 12849 bbr->rc_tlp_in_progress = 0; 12850 bbr->rc_tlp_rtx_out = 0; 12851 } else { 12852 /* 12853 * Its a TLP. 12854 */ 12855 bbr->rc_tlp_in_progress = 1; 12856 } 12857 bbr_timer_cancel(bbr, __LINE__, cts); 12858 if (rsm == NULL) { 12859 if (sbused(sb) > 0) { 12860 /* 12861 * This is sub-optimal. We only send a stand alone 12862 * FIN on its own segment. 12863 */ 12864 if (flags & TH_FIN) { 12865 flags &= ~TH_FIN; 12866 if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { 12867 /* Lets not send this */ 12868 slot = 0; 12869 goto just_return; 12870 } 12871 } 12872 } 12873 } else { 12874 /* 12875 * We do *not* send a FIN on a retransmit if it has data. 12876 * The if clause here where len > 1 should never come true. 12877 */ 12878 if ((len > 0) && 12879 (((rsm->r_flags & BBR_HAS_FIN) == 0) && 12880 (flags & TH_FIN))) { 12881 flags &= ~TH_FIN; 12882 len--; 12883 } 12884 } 12885 SOCKBUF_LOCK_ASSERT(sb); 12886 if (len > 0) { 12887 if ((tp->snd_una == tp->snd_max) && 12888 (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { 12889 /* 12890 * This qualifies as a RTT_PROBE session since we 12891 * drop the data outstanding to nothing and waited 12892 * more than bbr_rtt_probe_time. 12893 */ 12894 bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); 12895 bbr_set_reduced_rtt(bbr, cts, __LINE__); 12896 } 12897 if (len >= maxseg) 12898 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 12899 else 12900 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 12901 } 12902 /* 12903 * Before ESTABLISHED, force sending of initial options unless TCP 12904 * set not to do any options. NOTE: we assume that the IP/TCP header 12905 * plus TCP options always fit in a single mbuf, leaving room for a 12906 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 12907 * + optlen <= MCLBYTES 12908 */ 12909 optlen = 0; 12910 #ifdef INET6 12911 if (isipv6) 12912 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12913 else 12914 #endif 12915 hdrlen = sizeof(struct tcpiphdr); 12916 12917 /* 12918 * Compute options for segment. We only have to care about SYN and 12919 * established connection segments. Options for SYN-ACK segments 12920 * are handled in TCP syncache. 12921 */ 12922 to.to_flags = 0; 12923 local_options = 0; 12924 if ((tp->t_flags & TF_NOOPT) == 0) { 12925 /* Maximum segment size. 
*/ 12926 if (flags & TH_SYN) { 12927 to.to_mss = tcp_mssopt(&inp->inp_inc); 12928 if (tp->t_port) 12929 to.to_mss -= V_tcp_udp_tunneling_overhead; 12930 to.to_flags |= TOF_MSS; 12931 /* 12932 * On SYN or SYN|ACK transmits on TFO connections, 12933 * only include the TFO option if it is not a 12934 * retransmit, as the presence of the TFO option may 12935 * have caused the original SYN or SYN|ACK to have 12936 * been dropped by a middlebox. 12937 */ 12938 if (IS_FASTOPEN(tp->t_flags) && 12939 (tp->t_rxtshift == 0)) { 12940 if (tp->t_state == TCPS_SYN_RECEIVED) { 12941 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 12942 to.to_tfo_cookie = 12943 (u_int8_t *)&tp->t_tfo_cookie.server; 12944 to.to_flags |= TOF_FASTOPEN; 12945 wanted_cookie = 1; 12946 } else if (tp->t_state == TCPS_SYN_SENT) { 12947 to.to_tfo_len = 12948 tp->t_tfo_client_cookie_len; 12949 to.to_tfo_cookie = 12950 tp->t_tfo_cookie.client; 12951 to.to_flags |= TOF_FASTOPEN; 12952 wanted_cookie = 1; 12953 } 12954 } 12955 } 12956 /* Window scaling. */ 12957 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 12958 to.to_wscale = tp->request_r_scale; 12959 to.to_flags |= TOF_SCALE; 12960 } 12961 /* Timestamps. */ 12962 if ((tp->t_flags & TF_RCVD_TSTMP) || 12963 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 12964 to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset; 12965 to.to_tsecr = tp->ts_recent; 12966 to.to_flags |= TOF_TS; 12967 local_options += TCPOLEN_TIMESTAMP + 2; 12968 } 12969 /* Set receive buffer autosizing timestamp. */ 12970 if (tp->rfbuf_ts == 0 && 12971 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 12972 tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv); 12973 /* Selective ACK's. */ 12974 if (flags & TH_SYN) 12975 to.to_flags |= TOF_SACKPERM; 12976 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 12977 tp->rcv_numsacks > 0) { 12978 to.to_flags |= TOF_SACK; 12979 to.to_nsacks = tp->rcv_numsacks; 12980 to.to_sacks = (u_char *)tp->sackblks; 12981 } 12982 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 12983 /* TCP-MD5 (RFC2385). */ 12984 if (tp->t_flags & TF_SIGNATURE) 12985 to.to_flags |= TOF_SIGNATURE; 12986 #endif /* TCP_SIGNATURE */ 12987 12988 /* Processing the options. */ 12989 hdrlen += (optlen = tcp_addoptions(&to, opt)); 12990 /* 12991 * If we wanted a TFO option to be added, but it was unable 12992 * to fit, ensure no data is sent. 12993 */ 12994 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 12995 !(to.to_flags & TOF_FASTOPEN)) 12996 len = 0; 12997 } 12998 if (tp->t_port) { 12999 if (V_tcp_udp_tunneling_port == 0) { 13000 /* The port was removed?? */ 13001 SOCKBUF_UNLOCK(&so->so_snd); 13002 return (EHOSTUNREACH); 13003 } 13004 hdrlen += sizeof(struct udphdr); 13005 } 13006 #ifdef INET6 13007 if (isipv6) 13008 ipoptlen = ip6_optlen(tp->t_inpcb); 13009 else 13010 #endif 13011 if (tp->t_inpcb->inp_options) 13012 ipoptlen = tp->t_inpcb->inp_options->m_len - 13013 offsetof(struct ipoption, ipopt_list); 13014 else 13015 ipoptlen = 0; 13016 ipoptlen = 0; 13017 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13018 ipoptlen += ipsec_optlen; 13019 #endif 13020 if (bbr->rc_last_options != local_options) { 13021 /* 13022 * Cache the options length this generally does not change 13023 * on a connection. We use this to calculate TSO. 13024 */ 13025 bbr->rc_last_options = local_options; 13026 } 13027 maxseg = tp->t_maxseg - (ipoptlen + optlen); 13028 p_maxseg = min(maxseg, pace_max_segs); 13029 /* 13030 * Adjust data length if insertion of options will bump the packet 13031 * length beyond the t_maxseg length. 
Clear the FIN bit because we 13032 * cut off the tail of the segment. 13033 */ 13034 if (len > maxseg) { 13035 if (len != 0 && (flags & TH_FIN)) { 13036 flags &= ~TH_FIN; 13037 } 13038 if (tso) { 13039 uint32_t moff; 13040 int32_t max_len; 13041 13042 /* extract TSO information */ 13043 if_hw_tsomax = tp->t_tsomax; 13044 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13045 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13046 KASSERT(ipoptlen == 0, 13047 ("%s: TSO can't do IP options", __func__)); 13048 13049 /* 13050 * Check if we should limit by maximum payload 13051 * length: 13052 */ 13053 if (if_hw_tsomax != 0) { 13054 /* compute maximum TSO length */ 13055 max_len = (if_hw_tsomax - hdrlen - 13056 max_linkhdr); 13057 if (max_len <= 0) { 13058 len = 0; 13059 } else if (len > max_len) { 13060 len = max_len; 13061 } 13062 } 13063 /* 13064 * Prevent the last segment from being fractional 13065 * unless the send sockbuf can be emptied: 13066 */ 13067 if ((sb_offset + len) < sbavail(sb)) { 13068 moff = len % (uint32_t)maxseg; 13069 if (moff != 0) { 13070 len -= moff; 13071 } 13072 } 13073 /* 13074 * In case there are too many small fragments don't 13075 * use TSO: 13076 */ 13077 if (len <= maxseg) { 13078 len = maxseg; 13079 tso = 0; 13080 } 13081 } else { 13082 /* Not doing TSO */ 13083 if (optlen + ipoptlen >= tp->t_maxseg) { 13084 /* 13085 * Since we don't have enough space to put 13086 * the IP header chain and the TCP header in 13087 * one packet as required by RFC 7112, don't 13088 * send it. Also ensure that at least one 13089 * byte of the payload can be put into the 13090 * TCP segment. 13091 */ 13092 SOCKBUF_UNLOCK(&so->so_snd); 13093 error = EMSGSIZE; 13094 sack_rxmit = 0; 13095 goto out; 13096 } 13097 len = maxseg; 13098 } 13099 } else { 13100 /* Not doing TSO */ 13101 if_hw_tsomaxsegcount = 0; 13102 tso = 0; 13103 } 13104 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13105 ("%s: len > IP_MAXPACKET", __func__)); 13106 #ifdef DIAGNOSTIC 13107 #ifdef INET6 13108 if (max_linkhdr + hdrlen > MCLBYTES) 13109 #else 13110 if (max_linkhdr + hdrlen > MHLEN) 13111 #endif 13112 panic("tcphdr too big"); 13113 #endif 13114 /* 13115 * This KASSERT is here to catch edge cases at a well defined place. 13116 * Before, those had triggered (random) panic conditions further 13117 * down. 13118 */ 13119 #ifdef BBR_INVARIANTS 13120 if (sack_rxmit) { 13121 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 13122 panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u", 13123 rsm, tp, bbr, rsm->r_start, tp->snd_una); 13124 } 13125 } 13126 #endif 13127 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13128 if ((len == 0) && 13129 (flags & TH_FIN) && 13130 (sbused(sb))) { 13131 /* 13132 * We have outstanding data, don't send a fin by itself!. 13133 */ 13134 slot = 0; 13135 goto just_return; 13136 } 13137 /* 13138 * Grab a header mbuf, attaching a copy of data to be transmitted, 13139 * and initialize the header from the template for sends on this 13140 * connection. 13141 */ 13142 if (len) { 13143 uint32_t moff; 13144 13145 /* 13146 * We place a limit on sending with hptsi. 
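		 * New (non-retransmit) sends are clipped to the pacing burst
		 * size (pace_max_segs); anything that then fits in a single
		 * segment no longer needs TSO.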
13147 */ 13148 if ((rsm == NULL) && len > pace_max_segs) 13149 len = pace_max_segs; 13150 if (len <= maxseg) 13151 tso = 0; 13152 #ifdef INET6 13153 if (MHLEN < hdrlen + max_linkhdr) 13154 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13155 else 13156 #endif 13157 m = m_gethdr(M_NOWAIT, MT_DATA); 13158 13159 if (m == NULL) { 13160 BBR_STAT_INC(bbr_failed_mbuf_aloc); 13161 bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); 13162 SOCKBUF_UNLOCK(sb); 13163 error = ENOBUFS; 13164 sack_rxmit = 0; 13165 goto out; 13166 } 13167 m->m_data += max_linkhdr; 13168 m->m_len = hdrlen; 13169 /* 13170 * Start the m_copy functions from the closest mbuf to the 13171 * sb_offset in the socket buffer chain. 13172 */ 13173 if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) { 13174 #ifdef BBR_INVARIANTS 13175 if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) 13176 panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u", 13177 tp, bbr, len, sb_offset, sbavail(sb), rsm, 13178 doing_retran_from, 13179 picked_up_retran, 13180 doing_tlp); 13181 13182 #endif 13183 /* 13184 * In this messed up situation we have two choices, 13185 * a) pretend the send worked, and just start timers 13186 * and what not (not good since that may lead us 13187 * back here a lot). <or> b) Send the lowest segment 13188 * in the map. <or> c) Drop the connection. Lets do 13189 * <b> which if it continues to happen will lead to 13190 * <c> via timeouts. 13191 */ 13192 BBR_STAT_INC(bbr_offset_recovery); 13193 rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); 13194 sb_offset = 0; 13195 if (rsm == NULL) { 13196 sack_rxmit = 0; 13197 len = sbavail(sb); 13198 } else { 13199 sack_rxmit = 1; 13200 if (rsm->r_start != tp->snd_una) { 13201 /* 13202 * Things are really messed up, <c> 13203 * is the only thing to do. 13204 */ 13205 BBR_STAT_INC(bbr_offset_drop); 13206 SOCKBUF_UNLOCK(sb); 13207 (void)m_free(m); 13208 return (-EFAULT); /* tcp_drop() */ 13209 } 13210 len = rsm->r_end - rsm->r_start; 13211 } 13212 if (len > sbavail(sb)) 13213 len = sbavail(sb); 13214 if (len > maxseg) 13215 len = maxseg; 13216 } 13217 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13218 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13219 m_copydata(mb, moff, (int)len, 13220 mtod(m, caddr_t)+hdrlen); 13221 if (rsm == NULL) 13222 sbsndptr_adv(sb, mb, len); 13223 m->m_len += len; 13224 } else { 13225 struct sockbuf *msb; 13226 13227 if (rsm) 13228 msb = NULL; 13229 else 13230 msb = sb; 13231 #ifdef BBR_INVARIANTS 13232 if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) { 13233 if (rsm) { 13234 panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ", 13235 tp, bbr, len, moff, 13236 sbavail(sb), rsm, 13237 tp->snd_una, rsm->r_flags, rsm->r_start, 13238 doing_retran_from, 13239 picked_up_retran, 13240 doing_tlp, sack_rxmit); 13241 } else { 13242 panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u", 13243 tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una); 13244 } 13245 } 13246 #endif 13247 m->m_next = tcp_m_copym( 13248 mb, moff, &len, 13249 if_hw_tsomaxsegcount, 13250 if_hw_tsomaxsegsize, msb, 13251 ((rsm == NULL) ? hw_tls : 0) 13252 #ifdef NETFLIX_COPY_ARGS 13253 , &filled_all 13254 #endif 13255 ); 13256 if (len <= maxseg) { 13257 /* 13258 * Must have ran out of mbufs for the copy 13259 * shorten it to no longer need tso. Lets 13260 * not put on sendalot since we are low on 13261 * mbufs. 
13262 */ 13263 tso = 0; 13264 } 13265 if (m->m_next == NULL) { 13266 SOCKBUF_UNLOCK(sb); 13267 (void)m_free(m); 13268 error = ENOBUFS; 13269 sack_rxmit = 0; 13270 goto out; 13271 } 13272 } 13273 #ifdef BBR_INVARIANTS 13274 if (tso && len < maxseg) { 13275 panic("tp:%p tso on, but len:%d < maxseg:%d", 13276 tp, len, maxseg); 13277 } 13278 if (tso && if_hw_tsomaxsegcount) { 13279 int32_t seg_cnt = 0; 13280 struct mbuf *foo; 13281 13282 foo = m; 13283 while (foo) { 13284 seg_cnt++; 13285 foo = foo->m_next; 13286 } 13287 if (seg_cnt > if_hw_tsomaxsegcount) { 13288 panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount); 13289 } 13290 } 13291 #endif 13292 /* 13293 * If we're sending everything we've got, set PUSH. (This 13294 * will keep happy those implementations which only give 13295 * data to the user when a buffer fills or a PUSH comes in.) 13296 */ 13297 if (sb_offset + len == sbused(sb) && 13298 sbused(sb) && 13299 !(flags & TH_SYN)) { 13300 flags |= TH_PUSH; 13301 } 13302 SOCKBUF_UNLOCK(sb); 13303 } else { 13304 SOCKBUF_UNLOCK(sb); 13305 if (tp->t_flags & TF_ACKNOW) 13306 KMOD_TCPSTAT_INC(tcps_sndacks); 13307 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13308 KMOD_TCPSTAT_INC(tcps_sndctrl); 13309 else 13310 KMOD_TCPSTAT_INC(tcps_sndwinup); 13311 13312 m = m_gethdr(M_NOWAIT, MT_DATA); 13313 if (m == NULL) { 13314 BBR_STAT_INC(bbr_failed_mbuf_aloc); 13315 bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); 13316 error = ENOBUFS; 13317 /* Fudge the send time since we could not send */ 13318 sack_rxmit = 0; 13319 goto out; 13320 } 13321 #ifdef INET6 13322 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13323 MHLEN >= hdrlen) { 13324 M_ALIGN(m, hdrlen); 13325 } else 13326 #endif 13327 m->m_data += max_linkhdr; 13328 m->m_len = hdrlen; 13329 } 13330 SOCKBUF_UNLOCK_ASSERT(sb); 13331 m->m_pkthdr.rcvif = (struct ifnet *)0; 13332 #ifdef MAC 13333 mac_inpcb_create_mbuf(inp, m); 13334 #endif 13335 #ifdef INET6 13336 if (isipv6) { 13337 ip6 = mtod(m, struct ip6_hdr *); 13338 if (tp->t_port) { 13339 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 13340 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13341 udp->uh_dport = tp->t_port; 13342 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13343 udp->uh_ulen = htons(ulen); 13344 th = (struct tcphdr *)(udp + 1); 13345 } else { 13346 th = (struct tcphdr *)(ip6 + 1); 13347 } 13348 tcpip_fillheaders(inp, tp->t_port, ip6, th); 13349 } else 13350 #endif /* INET6 */ 13351 { 13352 ip = mtod(m, struct ip *); 13353 #ifdef TCPDEBUG 13354 ipov = (struct ipovly *)ip; 13355 #endif 13356 if (tp->t_port) { 13357 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 13358 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13359 udp->uh_dport = tp->t_port; 13360 ulen = hdrlen + len - sizeof(struct ip); 13361 udp->uh_ulen = htons(ulen); 13362 th = (struct tcphdr *)(udp + 1); 13363 } else { 13364 th = (struct tcphdr *)(ip + 1); 13365 } 13366 tcpip_fillheaders(inp, tp->t_port, ip, th); 13367 } 13368 /* 13369 * If we are doing retransmissions, then snd_nxt will not reflect 13370 * the first unsent octet. For ACK only packets, we do not want the 13371 * sequence number of the retransmitted packet, we want the sequence 13372 * number of the next unsent octet. So, if there is no data (and no 13373 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13374 * ti_seq. 
But if we are in persist state, snd_max might reflect 13375 * one byte beyond the right edge of the window, so use snd_nxt in 13376 * that case, since we know we aren't doing a retransmission. 13377 * (retransmit and persist are mutually exclusive...) 13378 */ 13379 if (sack_rxmit == 0) { 13380 if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) { 13381 /* New data (including new persists) */ 13382 th->th_seq = htonl(tp->snd_max); 13383 bbr_seq = tp->snd_max; 13384 } else if (flags & TH_SYN) { 13385 /* Syn's always send from iss */ 13386 th->th_seq = htonl(tp->iss); 13387 bbr_seq = tp->iss; 13388 } else if (flags & TH_FIN) { 13389 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) { 13390 /* 13391 * If we sent the fin already its 1 minus 13392 * snd_max 13393 */ 13394 th->th_seq = (htonl(tp->snd_max - 1)); 13395 bbr_seq = (tp->snd_max - 1); 13396 } else { 13397 /* First time FIN use snd_max */ 13398 th->th_seq = htonl(tp->snd_max); 13399 bbr_seq = tp->snd_max; 13400 } 13401 } else { 13402 /* 13403 * len == 0 and not persist we use snd_max, sending 13404 * an ack unless we have sent the fin then its 1 13405 * minus. 13406 */ 13407 /* 13408 * XXXRRS Question if we are in persists and we have 13409 * nothing outstanding to send and we have not sent 13410 * a FIN, we will send an ACK. In such a case it 13411 * might be better to send (tp->snd_una - 1) which 13412 * would force the peer to ack. 13413 */ 13414 if (tp->t_flags & TF_SENTFIN) { 13415 th->th_seq = htonl(tp->snd_max - 1); 13416 bbr_seq = (tp->snd_max - 1); 13417 } else { 13418 th->th_seq = htonl(tp->snd_max); 13419 bbr_seq = tp->snd_max; 13420 } 13421 } 13422 } else { 13423 /* All retransmits use the rsm to guide the send */ 13424 th->th_seq = htonl(rsm->r_start); 13425 bbr_seq = rsm->r_start; 13426 } 13427 th->th_ack = htonl(tp->rcv_nxt); 13428 if (optlen) { 13429 bcopy(opt, th + 1, optlen); 13430 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13431 } 13432 tcp_set_flags(th, flags); 13433 /* 13434 * Calculate receive window. Don't shrink window, but avoid silly 13435 * window syndrome. 13436 */ 13437 if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) && 13438 recwin < maxseg))) 13439 recwin = 0; 13440 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13441 recwin < (tp->rcv_adv - tp->rcv_nxt)) 13442 recwin = (tp->rcv_adv - tp->rcv_nxt); 13443 if (recwin > TCP_MAXWIN << tp->rcv_scale) 13444 recwin = TCP_MAXWIN << tp->rcv_scale; 13445 13446 /* 13447 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13448 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13449 * handled in syncache. 13450 */ 13451 if (flags & TH_SYN) 13452 th->th_win = htons((u_short) 13453 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13454 else { 13455 /* Avoid shrinking window with window scaling. */ 13456 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13457 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13458 } 13459 /* 13460 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13461 * window. This may cause the remote transmitter to stall. This 13462 * flag tells soreceive() to disable delayed acknowledgements when 13463 * draining the buffer. This can occur if the receiver is 13464 * attempting to read more data than can be buffered prior to 13465 * transmitting on the connection. 
13466 */ 13467 if (th->th_win == 0) { 13468 tp->t_sndzerowin++; 13469 tp->t_flags |= TF_RXWIN0SENT; 13470 } else 13471 tp->t_flags &= ~TF_RXWIN0SENT; 13472 /* 13473 * We don't support urgent data, but drag along 13474 * the pointer in case of a stack switch. 13475 */ 13476 tp->snd_up = tp->snd_una; 13477 13478 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13479 if (to.to_flags & TOF_SIGNATURE) { 13480 /* 13481 * Calculate MD5 signature and put it into the place 13482 * determined before. NOTE: since TCP options buffer doesn't 13483 * point into mbuf's data, calculate offset and use it. 13484 */ 13485 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13486 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13487 /* 13488 * Do not send segment if the calculation of MD5 13489 * digest has failed. 13490 */ 13491 goto out; 13492 } 13493 } 13494 #endif 13495 13496 /* 13497 * Put TCP length in extended header, and then checksum extended 13498 * header and data. 13499 */ 13500 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13501 #ifdef INET6 13502 if (isipv6) { 13503 /* 13504 * ip6_plen is not need to be filled now, and will be filled 13505 * in ip6_output. 13506 */ 13507 if (tp->t_port) { 13508 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13509 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13510 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13511 th->th_sum = htons(0); 13512 UDPSTAT_INC(udps_opackets); 13513 } else { 13514 csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13515 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13516 th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + 13517 optlen + len, IPPROTO_TCP, 0); 13518 } 13519 } 13520 #endif 13521 #if defined(INET6) && defined(INET) 13522 else 13523 #endif 13524 #ifdef INET 13525 { 13526 if (tp->t_port) { 13527 m->m_pkthdr.csum_flags = CSUM_UDP; 13528 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13529 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13530 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13531 th->th_sum = htons(0); 13532 UDPSTAT_INC(udps_opackets); 13533 } else { 13534 csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP; 13535 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13536 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13537 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13538 IPPROTO_TCP + len + optlen)); 13539 } 13540 /* IP version must be set here for ipv4/ipv6 checking later */ 13541 KASSERT(ip->ip_v == IPVERSION, 13542 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13543 } 13544 #endif 13545 13546 /* 13547 * Enable TSO and specify the size of the segments. The TCP pseudo 13548 * header checksum is always provided. XXX: Fixme: This is currently 13549 * not the case for IPv6. 13550 */ 13551 if (tso) { 13552 KASSERT(len > maxseg, 13553 ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg)); 13554 m->m_pkthdr.csum_flags |= CSUM_TSO; 13555 csum_flags |= CSUM_TSO; 13556 m->m_pkthdr.tso_segsz = maxseg; 13557 } 13558 KASSERT(len + hdrlen == m_length(m, NULL), 13559 ("%s: mbuf chain different than expected: %d + %u != %u", 13560 __func__, len, hdrlen, m_length(m, NULL))); 13561 13562 #ifdef TCP_HHOOK 13563 /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */ 13564 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13565 #endif 13566 #ifdef TCPDEBUG 13567 /* 13568 * Trace. 
13569 */ 13570 if (so->so_options & SO_DEBUG) { 13571 u_short save = 0; 13572 13573 #ifdef INET6 13574 if (!isipv6) 13575 #endif 13576 { 13577 save = ipov->ih_len; 13578 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13579 * (th->th_off << 2) */ ); 13580 } 13581 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13582 #ifdef INET6 13583 if (!isipv6) 13584 #endif 13585 ipov->ih_len = save; 13586 } 13587 #endif /* TCPDEBUG */ 13588 13589 /* Log to the black box */ 13590 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13591 union tcp_log_stackspecific log; 13592 13593 bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); 13594 /* Record info on type of transmission */ 13595 log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay; 13596 log.u_bbr.flex2 = (bbr->r_recovery_bw << 3); 13597 log.u_bbr.flex3 = maxseg; 13598 log.u_bbr.flex4 = delay_calc; 13599 /* Encode filled_all into the upper flex5 bit */ 13600 log.u_bbr.flex5 = bbr->rc_past_init_win; 13601 log.u_bbr.flex5 <<= 1; 13602 log.u_bbr.flex5 |= bbr->rc_no_pacing; 13603 log.u_bbr.flex5 <<= 29; 13604 if (filled_all) 13605 log.u_bbr.flex5 |= 0x80000000; 13606 log.u_bbr.flex5 |= tp->t_maxseg; 13607 log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs; 13608 log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr); 13609 /* lets poke in the low and the high here for debugging */ 13610 log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; 13611 if (rsm || sack_rxmit) { 13612 if (doing_tlp) 13613 log.u_bbr.flex8 = 2; 13614 else 13615 log.u_bbr.flex8 = 1; 13616 } else { 13617 log.u_bbr.flex8 = 0; 13618 } 13619 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13620 len, &log, false, NULL, NULL, 0, tv); 13621 } else { 13622 lgb = NULL; 13623 } 13624 /* 13625 * Fill in IP length and desired time to live and send to IP level. 13626 * There should be a better way to handle ttl and tos; we could keep 13627 * them in the template, but need a way to checksum without them. 13628 */ 13629 /* 13630 * m->m_pkthdr.len should have been set before cksum calcuration, 13631 * because in6_cksum() need it. 13632 */ 13633 #ifdef INET6 13634 if (isipv6) { 13635 /* 13636 * we separately set hoplimit for every segment, since the 13637 * user might want to change the value via setsockopt. Also, 13638 * desired default hop limit might be changed via Neighbor 13639 * Discovery. 13640 */ 13641 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13642 13643 /* 13644 * Set the packet size here for the benefit of DTrace 13645 * probes. ip6_output() will set it properly; it's supposed 13646 * to include the option header lengths as well. 13647 */ 13648 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13649 13650 if (V_path_mtu_discovery && maxseg > V_tcp_minmss) 13651 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13652 else 13653 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13654 13655 if (tp->t_state == TCPS_SYN_SENT) 13656 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13657 13658 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13659 /* TODO: IPv6 IP6TOS_ECT bit on */ 13660 error = ip6_output(m, inp->in6p_outputopts, 13661 &inp->inp_route6, 13662 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13663 NULL, NULL, inp); 13664 13665 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13666 mtu = inp->inp_route6.ro_nh->nh_mtu; 13667 } 13668 #endif /* INET6 */ 13669 #if defined(INET) && defined(INET6) 13670 else 13671 #endif 13672 #ifdef INET 13673 { 13674 ip->ip_len = htons(m->m_pkthdr.len); 13675 #ifdef INET6 13676 if (isipv6) 13677 ip->ip_ttl = in6_selecthlim(inp, NULL); 13678 #endif /* INET6 */ 13679 /* 13680 * If we do path MTU discovery, then we set DF on every 13681 * packet. This might not be the best thing to do according 13682 * to RFC3390 Section 2. However the tcp hostcache migitates 13683 * the problem so it affects only the first tcp connection 13684 * with a host. 13685 * 13686 * NB: Don't set DF on small MTU/MSS to have a safe 13687 * fallback. 13688 */ 13689 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13690 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13691 if (tp->t_port == 0 || len < V_tcp_minmss) { 13692 ip->ip_off |= htons(IP_DF); 13693 } 13694 } else { 13695 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13696 } 13697 13698 if (tp->t_state == TCPS_SYN_SENT) 13699 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13700 13701 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13702 13703 error = ip_output(m, inp->inp_options, &inp->inp_route, 13704 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13705 inp); 13706 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13707 mtu = inp->inp_route.ro_nh->nh_mtu; 13708 } 13709 #endif /* INET */ 13710 out: 13711 13712 if (lgb) { 13713 lgb->tlb_errno = error; 13714 lgb = NULL; 13715 } 13716 /* 13717 * In transmit state, time the transmission and arrange for the 13718 * retransmit. In persist state, just set snd_max. 13719 */ 13720 if (error == 0) { 13721 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 13722 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13723 (tp->t_flags & TF_SACK_PERMIT) && 13724 tp->rcv_numsacks > 0) 13725 tcp_clean_dsack_blocks(tp); 13726 /* We sent an ack clear the bbr_segs_rcvd count */ 13727 bbr->output_error_seen = 0; 13728 bbr->oerror_cnt = 0; 13729 bbr->bbr_segs_rcvd = 0; 13730 if (len == 0) 13731 counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1); 13732 /* Do accounting for new sends */ 13733 if ((len > 0) && (rsm == NULL)) { 13734 int idx; 13735 if (tp->snd_una == tp->snd_max) { 13736 /* 13737 * Special case to match google, when 13738 * nothing is in flight the delivered 13739 * time does get updated to the current 13740 * time (see tcp_rate_bsd.c). 13741 */ 13742 bbr->r_ctl.rc_del_time = cts; 13743 } 13744 if (len >= maxseg) { 13745 idx = (len / maxseg) + 3; 13746 if (idx >= TCP_MSS_ACCT_ATIMER) 13747 counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1); 13748 else 13749 counter_u64_add(bbr_out_size[idx], 1); 13750 } else { 13751 /* smaller than a MSS */ 13752 idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options); 13753 if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV) 13754 idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1); 13755 counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1); 13756 } 13757 } 13758 } 13759 abandon = 0; 13760 /* 13761 * We must do the send accounting before we log the output, 13762 * otherwise the state of the rsm could change and we account to the 13763 * wrong bucket. 
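	 * bbr_log_output() may also flag that the TCB went away or a RST
	 * was sent (abandon), in which case we must not touch tp again.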
13764 */ 13765 if (len > 0) { 13766 bbr_do_send_accounting(tp, bbr, rsm, len, error); 13767 if (error == 0) { 13768 if (tp->snd_una == tp->snd_max) 13769 bbr->r_ctl.rc_tlp_rxt_last_time = cts; 13770 } 13771 } 13772 bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error, 13773 cts, mb, &abandon, rsm, 0, sb); 13774 if (abandon) { 13775 /* 13776 * If bbr_log_output destroys the TCB or sees a TH_RST being 13777 * sent we should hit this condition. 13778 */ 13779 return (0); 13780 } 13781 if (bbr->rc_in_persist == 0) { 13782 /* 13783 * Advance snd_nxt over sequence space of this segment. 13784 */ 13785 if (error) 13786 /* We don't log or do anything with errors */ 13787 goto skip_upd; 13788 13789 if (tp->snd_una == tp->snd_max && 13790 (len || (flags & (TH_SYN | TH_FIN)))) { 13791 /* 13792 * Update the time we just added data since none was 13793 * outstanding. 13794 */ 13795 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); 13796 bbr->rc_tp->t_acktime = ticks; 13797 } 13798 if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) { 13799 if (flags & TH_SYN) { 13800 /* 13801 * Smack the snd_max to iss + 1 13802 * if it's a FO we will add len below. 13803 */ 13804 tp->snd_max = tp->iss + 1; 13805 } 13806 if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { 13807 tp->snd_max++; 13808 tp->t_flags |= TF_SENTFIN; 13809 } 13810 } 13811 if (sack_rxmit == 0) 13812 tp->snd_max += len; 13813 skip_upd: 13814 if ((error == 0) && len) 13815 tot_len += len; 13816 } else { 13817 /* Persists case */ 13818 int32_t xlen = len; 13819 13820 if (error) 13821 goto nomore; 13822 13823 if (flags & TH_SYN) 13824 ++xlen; 13825 if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { 13826 ++xlen; 13827 tp->t_flags |= TF_SENTFIN; 13828 } 13829 if (xlen && (tp->snd_una == tp->snd_max)) { 13830 /* 13831 * Update the time we just added data since none was 13832 * outstanding. 13833 */ 13834 bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); 13835 bbr->rc_tp->t_acktime = ticks; 13836 } 13837 if (sack_rxmit == 0) 13838 tp->snd_max += xlen; 13839 tot_len += (len + optlen + ipoptlen); 13840 } 13841 nomore: 13842 if (error) { 13843 /* 13844 * Failures do not advance the seq counter above. For the 13845 * case of ENOBUFS we will fall out and become ack-clocked, 13846 * capping the cwnd at the current flight. 13847 * Everything else will just have to retransmit with the timer 13848 * (no pacer). 13849 */ 13850 SOCKBUF_UNLOCK_ASSERT(sb); 13851 BBR_STAT_INC(bbr_saw_oerr); 13852 /* Clear all delay/early tracks */ 13853 bbr->r_ctl.rc_hptsi_agg_delay = 0; 13854 bbr->r_ctl.rc_agg_early = 0; 13855 bbr->r_agg_early_set = 0; 13856 bbr->output_error_seen = 1; 13857 if (bbr->oerror_cnt < 0xf) 13858 bbr->oerror_cnt++; 13859 if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) { 13860 /* drop the session */ 13861 return (-ENETDOWN); 13862 } 13863 switch (error) { 13864 case ENOBUFS: 13865 /* 13866 * Make this guy have to get acks to send 13867 * more, but let's make sure we don't 13868 * slam him below a T-O (1MSS).
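 *
 * For example, if ctf_flight_size() reports 10 * maxseg outstanding,
 * the clamp below leaves snd_cwnd at 9 * maxseg; with only one
 * segment in flight it is held at the 1 * maxseg floor. The retry
 * pacing delay computed below also grows with every consecutive
 * output error, since oerror_cnt (capped at 0xf) is used as a shift
 * count on bbr_error_base_paceout.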
13869 */ 13870 if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { 13871 tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + 13872 bbr->r_ctl.rc_lost_bytes)) - maxseg; 13873 if (tp->snd_cwnd < maxseg) 13874 tp->snd_cwnd = maxseg; 13875 } 13876 slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; 13877 BBR_STAT_INC(bbr_saw_enobuf); 13878 if (bbr->bbr_hdrw_pacing) 13879 counter_u64_add(bbr_hdwr_pacing_enobuf, 1); 13880 else 13881 counter_u64_add(bbr_nohdwr_pacing_enobuf, 1); 13882 /* 13883 * Here even in the enobufs case we want to do our 13884 * state update. The reason being we may have been 13885 * called by the input function. If so, we have had 13886 * things change. 13887 */ 13888 error = 0; 13889 goto enobufs; 13890 case EMSGSIZE: 13891 /* 13892 * For some reason the interface we used initially 13893 * to send segments changed to another or lowered 13894 * its MTU. If TSO was active we either got an 13895 * interface without TSO capabilities or TSO was 13896 * turned off. If we obtained mtu from ip_output() 13897 * then update it and try again. 13898 */ 13899 /* Turn on tracing (or try to) */ 13900 { 13901 int old_maxseg; 13902 13903 old_maxseg = tp->t_maxseg; 13904 BBR_STAT_INC(bbr_saw_emsgsiz); 13905 bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts); 13906 if (mtu != 0) 13907 tcp_mss_update(tp, -1, mtu, NULL, NULL); 13908 if (old_maxseg <= tp->t_maxseg) { 13909 /* Huh it did not shrink? */ 13910 tp->t_maxseg = old_maxseg - 40; 13911 bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts); 13912 } 13913 /* 13914 * Nuke all other things that can interfere 13915 * with slot 13916 */ 13917 if ((tot_len + len) && (len >= tp->t_maxseg)) { 13918 slot = bbr_get_pacing_delay(bbr, 13919 bbr->r_ctl.rc_bbr_hptsi_gain, 13920 (tot_len + len), cts, 0); 13921 if (slot < bbr_error_base_paceout) 13922 slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; 13923 } else 13924 slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; 13925 bbr->rc_output_starts_timer = 1; 13926 bbr_start_hpts_timer(bbr, tp, cts, 10, slot, 13927 tot_len); 13928 return (error); 13929 } 13930 case EPERM: 13931 tp->t_softerror = error; 13932 /* Fall through */ 13933 case EHOSTDOWN: 13934 case EHOSTUNREACH: 13935 case ENETDOWN: 13936 case ENETUNREACH: 13937 if (TCPS_HAVERCVDSYN(tp->t_state)) { 13938 tp->t_softerror = error; 13939 } 13940 /* FALLTHROUGH */ 13941 default: 13942 slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; 13943 bbr->rc_output_starts_timer = 1; 13944 bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); 13945 return (error); 13946 } 13947 #ifdef STATS 13948 } else if (((tp->t_flags & TF_GPUTINPROG) == 0) && 13949 len && 13950 (rsm == NULL) && 13951 (bbr->rc_in_persist == 0)) { 13952 tp->gput_seq = bbr_seq; 13953 tp->gput_ack = bbr_seq + 13954 min(sbavail(&so->so_snd) - sb_offset, sendwin); 13955 tp->gput_ts = cts; 13956 tp->t_flags |= TF_GPUTINPROG; 13957 #endif 13958 } 13959 KMOD_TCPSTAT_INC(tcps_sndtotal); 13960 if ((bbr->bbr_hdw_pace_ena) && 13961 (bbr->bbr_attempt_hdwr_pace == 0) && 13962 (bbr->rc_past_init_win) && 13963 (bbr->rc_bbr_state != BBR_STATE_STARTUP) && 13964 (get_filter_value(&bbr->r_ctl.rc_delrate)) && 13965 (inp->inp_route.ro_nh && 13966 inp->inp_route.ro_nh->nh_ifp)) { 13967 /* 13968 * We are past the initial window and 13969 * have at least one measurement so we 13970 * could use hardware pacing if it's available. 13971 * We have an interface and we have not attempted 13972 * to set up hardware pacing, let's try to now.
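 *
 * The call below asks the interface for a rate at or above
 * rate_wanted (RS_PACING_GEQ) while accepting a lower substitute
 * (RS_PACING_SUB_OK). If the rate entry we get back is slower than
 * rate_wanted, bbr_setup_less_of_rate() trims our gains to match;
 * if no entry is available the failure is only logged, and since
 * bbr_attempt_hdwr_pace stays set we do not retry on every send.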
13973 */ 13974 uint64_t rate_wanted; 13975 int err = 0; 13976 13977 rate_wanted = bbr_get_hardware_rate(bbr); 13978 bbr->bbr_attempt_hdwr_pace = 1; 13979 bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp, 13980 inp->inp_route.ro_nh->nh_ifp, 13981 rate_wanted, 13982 (RS_PACING_GEQ|RS_PACING_SUB_OK), 13983 &err, NULL); 13984 if (bbr->r_ctl.crte) { 13985 bbr_type_log_hdwr_pacing(bbr, 13986 bbr->r_ctl.crte->ptbl->rs_ifp, 13987 rate_wanted, 13988 bbr->r_ctl.crte->rate, 13989 __LINE__, cts, err); 13990 BBR_STAT_INC(bbr_hdwr_rl_add_ok); 13991 counter_u64_add(bbr_flows_nohdwr_pacing, -1); 13992 counter_u64_add(bbr_flows_whdwr_pacing, 1); 13993 bbr->bbr_hdrw_pacing = 1; 13994 /* Now what is our gain status? */ 13995 if (bbr->r_ctl.crte->rate < rate_wanted) { 13996 /* We have a problem */ 13997 bbr_setup_less_of_rate(bbr, cts, 13998 bbr->r_ctl.crte->rate, rate_wanted); 13999 } else { 14000 /* We are good */ 14001 bbr->gain_is_limited = 0; 14002 bbr->skip_gain = 0; 14003 } 14004 tcp_bbr_tso_size_check(bbr, cts); 14005 } else { 14006 bbr_type_log_hdwr_pacing(bbr, 14007 inp->inp_route.ro_nh->nh_ifp, 14008 rate_wanted, 14009 0, 14010 __LINE__, cts, err); 14011 BBR_STAT_INC(bbr_hdwr_rl_add_fail); 14012 } 14013 } 14014 if (bbr->bbr_hdrw_pacing) { 14015 /* 14016 * Worry about cases where the route 14017 * changes or something happened that we 14018 * lost our hardware pacing possibly during 14019 * the last ip_output call. 14020 */ 14021 if (inp->inp_snd_tag == NULL) { 14022 /* A change during ip output disabled hw pacing? */ 14023 bbr->bbr_hdrw_pacing = 0; 14024 } else if ((inp->inp_route.ro_nh == NULL) || 14025 (inp->inp_route.ro_nh->nh_ifp != inp->inp_snd_tag->ifp)) { 14026 /* 14027 * We had an interface or route change, 14028 * detach from the current hdwr pacing 14029 * and setup to re-attempt next go 14030 * round. 14031 */ 14032 bbr->bbr_hdrw_pacing = 0; 14033 bbr->bbr_attempt_hdwr_pace = 0; 14034 tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); 14035 tcp_bbr_tso_size_check(bbr, cts); 14036 } 14037 } 14038 /* 14039 * Data sent (as far as we can tell). If this advertises a larger 14040 * window than any other segment, then remember the size of the 14041 * advertised window. Any pending ACK has now been sent. 14042 */ 14043 if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 14044 tp->rcv_adv = tp->rcv_nxt + recwin; 14045 14046 tp->last_ack_sent = tp->rcv_nxt; 14047 if ((error == 0) && 14048 (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) && 14049 (doing_tlp == 0) && 14050 (tso == 0) && 14051 (len > 0) && 14052 ((flags & TH_RST) == 0) && 14053 ((flags & TH_SYN) == 0) && 14054 (IN_RECOVERY(tp->t_flags) == 0) && 14055 (bbr->rc_in_persist == 0) && 14056 (tot_len < bbr->r_ctl.rc_pace_max_segs)) { 14057 /* 14058 * For non-tso we need to goto again until we have sent out 14059 * enough data to match what we are hptsi out every hptsi 14060 * interval. 
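 *
 * For example, with rc_pace_max_segs at 4 * t_maxseg and TSO off,
 * each pass sends roughly one maxseg; the goto below re-enters the
 * send path until tot_len reaches the pacing burst size (or one of
 * the conditions above no longer holds), and only then is the hpts
 * pacing timer armed for the whole burst.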
14061 */ 14062 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 14063 /* Make sure snd_nxt is drug up */ 14064 tp->snd_nxt = tp->snd_max; 14065 } 14066 if (rsm != NULL) { 14067 rsm = NULL; 14068 goto skip_again; 14069 } 14070 rsm = NULL; 14071 sack_rxmit = 0; 14072 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 14073 goto again; 14074 } 14075 skip_again: 14076 if ((error == 0) && (flags & TH_FIN)) 14077 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 14078 if ((error == 0) && (flags & TH_RST)) 14079 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 14080 if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { 14081 /* 14082 * Calculate/Re-Calculate the hptsi slot in usecs based on 14083 * what we have sent so far 14084 */ 14085 slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); 14086 if (bbr->rc_no_pacing) 14087 slot = 0; 14088 } 14089 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 14090 enobufs: 14091 if (bbr->rc_use_google == 0) 14092 bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); 14093 bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + 14094 bbr->r_ctl.rc_lost_bytes))); 14095 bbr->rc_output_starts_timer = 1; 14096 if (bbr->bbr_use_rack_cheat && 14097 (more_to_rxt || 14098 ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { 14099 /* Rack cheats and shotguns out all rxt's 1ms apart */ 14100 if (slot > 1000) 14101 slot = 1000; 14102 } 14103 if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { 14104 /* 14105 * We don't change the tso size until some number of sends 14106 * to give the hardware commands time to get down 14107 * to the interface. 14108 */ 14109 bbr->r_ctl.bbr_hdwr_cnt_noset_snt++; 14110 if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) { 14111 bbr->hw_pacing_set = 1; 14112 tcp_bbr_tso_size_check(bbr, cts); 14113 } 14114 } 14115 bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); 14116 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 14117 /* Make sure snd_nxt is drug up */ 14118 tp->snd_nxt = tp->snd_max; 14119 } 14120 return (error); 14121 14122 } 14123 14124 /* 14125 * See bbr_output_wtime() for return values. 14126 */ 14127 static int 14128 bbr_output(struct tcpcb *tp) 14129 { 14130 int32_t ret; 14131 struct timeval tv; 14132 14133 NET_EPOCH_ASSERT(); 14134 14135 INP_WLOCK_ASSERT(tp->t_inpcb); 14136 (void)tcp_get_usecs(&tv); 14137 ret = bbr_output_wtime(tp, &tv); 14138 return (ret); 14139 } 14140 14141 static void 14142 bbr_mtu_chg(struct tcpcb *tp) 14143 { 14144 struct tcp_bbr *bbr; 14145 struct bbr_sendmap *rsm, *frsm = NULL; 14146 uint32_t maxseg; 14147 14148 /* 14149 * The MTU has changed. a) Clear the sack filter. b) Mark everything 14150 * over the current size as SACK_PASS so a retransmit will occur. 14151 */ 14152 14153 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 14154 maxseg = tp->t_maxseg - bbr->rc_last_options; 14155 sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); 14156 TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { 14157 /* Don't mess with ones acked (by sack?) */ 14158 if (rsm->r_flags & BBR_ACKED) 14159 continue; 14160 if ((rsm->r_end - rsm->r_start) > maxseg) { 14161 /* 14162 * We mark sack-passed on all the previous large 14163 * sends we did. This will force them to retransmit. 
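 *
 * For example, if the path MTU drops so that maxseg shrinks from
 * 1448 to 1200 bytes, every un-acked map entry longer than 1200
 * bytes is flagged below (and accounted as lost when bbr_is_lost()
 * agrees), and the first such entry becomes rc_resend so the
 * retransmission starts there.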
14164 */ 14165 rsm->r_flags |= BBR_SACK_PASSED; 14166 if (((rsm->r_flags & BBR_MARKED_LOST) == 0) && 14167 bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) { 14168 bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; 14169 bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; 14170 rsm->r_flags |= BBR_MARKED_LOST; 14171 } 14172 if (frsm == NULL) 14173 frsm = rsm; 14174 } 14175 } 14176 if (frsm) { 14177 bbr->r_ctl.rc_resend = frsm; 14178 } 14179 } 14180 14181 static int 14182 bbr_pru_options(struct tcpcb *tp, int flags) 14183 { 14184 if (flags & PRUS_OOB) 14185 return (EOPNOTSUPP); 14186 return (0); 14187 } 14188 14189 struct tcp_function_block __tcp_bbr = { 14190 .tfb_tcp_block_name = __XSTRING(STACKNAME), 14191 .tfb_tcp_output = bbr_output, 14192 .tfb_do_queued_segments = ctf_do_queued_segments, 14193 .tfb_do_segment_nounlock = bbr_do_segment_nounlock, 14194 .tfb_tcp_do_segment = bbr_do_segment, 14195 .tfb_tcp_ctloutput = bbr_ctloutput, 14196 .tfb_tcp_fb_init = bbr_init, 14197 .tfb_tcp_fb_fini = bbr_fini, 14198 .tfb_tcp_timer_stop_all = bbr_stopall, 14199 .tfb_tcp_timer_activate = bbr_timer_activate, 14200 .tfb_tcp_timer_active = bbr_timer_active, 14201 .tfb_tcp_timer_stop = bbr_timer_stop, 14202 .tfb_tcp_rexmit_tmr = bbr_remxt_tmr, 14203 .tfb_tcp_handoff_ok = bbr_handoff_ok, 14204 .tfb_tcp_mtu_chg = bbr_mtu_chg, 14205 .tfb_pru_options = bbr_pru_options, 14206 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, 14207 }; 14208 14209 /* 14210 * bbr_ctloutput() must drop the inpcb lock before performing copyin on 14211 * socket option arguments. When it re-acquires the lock after the copy, it 14212 * has to revalidate that the connection is still valid for the socket 14213 * option. 14214 */ 14215 static int 14216 bbr_set_sockopt(struct inpcb *inp, struct sockopt *sopt) 14217 { 14218 struct epoch_tracker et; 14219 struct tcpcb *tp; 14220 struct tcp_bbr *bbr; 14221 int32_t error = 0, optval; 14222 14223 switch (sopt->sopt_level) { 14224 case IPPROTO_IPV6: 14225 case IPPROTO_IP: 14226 return (tcp_default_ctloutput(inp, sopt)); 14227 } 14228 14229 switch (sopt->sopt_name) { 14230 case TCP_RACK_PACE_MAX_SEG: 14231 case TCP_RACK_MIN_TO: 14232 case TCP_RACK_REORD_THRESH: 14233 case TCP_RACK_REORD_FADE: 14234 case TCP_RACK_TLP_THRESH: 14235 case TCP_RACK_PKT_DELAY: 14236 case TCP_BBR_ALGORITHM: 14237 case TCP_BBR_TSLIMITS: 14238 case TCP_BBR_IWINTSO: 14239 case TCP_BBR_RECFORCE: 14240 case TCP_BBR_STARTUP_PG: 14241 case TCP_BBR_DRAIN_PG: 14242 case TCP_BBR_RWND_IS_APP: 14243 case TCP_BBR_PROBE_RTT_INT: 14244 case TCP_BBR_PROBE_RTT_GAIN: 14245 case TCP_BBR_PROBE_RTT_LEN: 14246 case TCP_BBR_STARTUP_LOSS_EXIT: 14247 case TCP_BBR_USEDEL_RATE: 14248 case TCP_BBR_MIN_RTO: 14249 case TCP_BBR_MAX_RTO: 14250 case TCP_BBR_PACE_PER_SEC: 14251 case TCP_DELACK: 14252 case TCP_BBR_PACE_DEL_TAR: 14253 case TCP_BBR_SEND_IWND_IN_TSO: 14254 case TCP_BBR_EXTRA_STATE: 14255 case TCP_BBR_UTTER_MAX_TSO: 14256 case TCP_BBR_MIN_TOPACEOUT: 14257 case TCP_BBR_FLOOR_MIN_TSO: 14258 case TCP_BBR_TSTMP_RAISES: 14259 case TCP_BBR_POLICER_DETECT: 14260 case TCP_BBR_USE_RACK_CHEAT: 14261 case TCP_DATA_AFTER_CLOSE: 14262 case TCP_BBR_HDWR_PACE: 14263 case TCP_BBR_PACE_SEG_MAX: 14264 case TCP_BBR_PACE_SEG_MIN: 14265 case TCP_BBR_PACE_CROSS: 14266 case TCP_BBR_PACE_OH: 14267 #ifdef NETFLIX_PEAKRATE 14268 case TCP_MAXPEAKRATE: 14269 #endif 14270 case TCP_BBR_TMR_PACE_OH: 14271 case TCP_BBR_RACK_RTT_USE: 14272 case TCP_BBR_RETRAN_WTSO: 14273 break; 14274 default: 14275 return (tcp_default_ctloutput(inp, sopt)); 14276 break; 14277 } 14278 INP_WUNLOCK(inp); 
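	/*
	 * The inpcb lock is dropped across the copyin. Once it is
	 * re-taken below, the connection is revalidated (the
	 * INP_DROPPED check and the t_fb == &__tcp_bbr check) before
	 * any option is applied, since the connection may have been
	 * dropped or moved to another stack while unlocked.
	 */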
14279 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14280 if (error) 14281 return (error); 14282 INP_WLOCK(inp); 14283 if (inp->inp_flags & INP_DROPPED) { 14284 INP_WUNLOCK(inp); 14285 return (ECONNRESET); 14286 } 14287 tp = intotcpcb(inp); 14288 if (tp->t_fb != &__tcp_bbr) { 14289 INP_WUNLOCK(inp); 14290 return (ENOPROTOOPT); 14291 } 14292 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 14293 switch (sopt->sopt_name) { 14294 case TCP_BBR_PACE_PER_SEC: 14295 BBR_OPTS_INC(tcp_bbr_pace_per_sec); 14296 bbr->r_ctl.bbr_hptsi_per_second = optval; 14297 break; 14298 case TCP_BBR_PACE_DEL_TAR: 14299 BBR_OPTS_INC(tcp_bbr_pace_del_tar); 14300 bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval; 14301 break; 14302 case TCP_BBR_PACE_SEG_MAX: 14303 BBR_OPTS_INC(tcp_bbr_pace_seg_max); 14304 bbr->r_ctl.bbr_hptsi_segments_max = optval; 14305 break; 14306 case TCP_BBR_PACE_SEG_MIN: 14307 BBR_OPTS_INC(tcp_bbr_pace_seg_min); 14308 bbr->r_ctl.bbr_hptsi_bytes_min = optval; 14309 break; 14310 case TCP_BBR_PACE_CROSS: 14311 BBR_OPTS_INC(tcp_bbr_pace_cross); 14312 bbr->r_ctl.bbr_cross_over = optval; 14313 break; 14314 case TCP_BBR_ALGORITHM: 14315 BBR_OPTS_INC(tcp_bbr_algorithm); 14316 if (optval && (bbr->rc_use_google == 0)) { 14317 /* Turn on the google mode */ 14318 bbr_google_mode_on(bbr); 14319 if ((optval > 3) && (optval < 500)) { 14320 /* 14321 * Must be at least greater than .3% 14322 * and must be less than 50.0%. 14323 */ 14324 bbr->r_ctl.bbr_google_discount = optval; 14325 } 14326 } else if ((optval == 0) && (bbr->rc_use_google == 1)) { 14327 /* Turn off the google mode */ 14328 bbr_google_mode_off(bbr); 14329 } 14330 break; 14331 case TCP_BBR_TSLIMITS: 14332 BBR_OPTS_INC(tcp_bbr_tslimits); 14333 if (optval == 1) 14334 bbr->rc_use_ts_limit = 1; 14335 else if (optval == 0) 14336 bbr->rc_use_ts_limit = 0; 14337 else 14338 error = EINVAL; 14339 break; 14340 14341 case TCP_BBR_IWINTSO: 14342 BBR_OPTS_INC(tcp_bbr_iwintso); 14343 if ((optval >= 0) && (optval < 128)) { 14344 uint32_t twin; 14345 14346 bbr->rc_init_win = optval; 14347 twin = bbr_initial_cwnd(bbr, tp); 14348 if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd)) 14349 tp->snd_cwnd = twin; 14350 else 14351 error = EBUSY; 14352 } else 14353 error = EINVAL; 14354 break; 14355 case TCP_BBR_STARTUP_PG: 14356 BBR_OPTS_INC(tcp_bbr_startup_pg); 14357 if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) { 14358 bbr->r_ctl.rc_startup_pg = optval; 14359 if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { 14360 bbr->r_ctl.rc_bbr_hptsi_gain = optval; 14361 } 14362 } else 14363 error = EINVAL; 14364 break; 14365 case TCP_BBR_DRAIN_PG: 14366 BBR_OPTS_INC(tcp_bbr_drain_pg); 14367 if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) 14368 bbr->r_ctl.rc_drain_pg = optval; 14369 else 14370 error = EINVAL; 14371 break; 14372 case TCP_BBR_PROBE_RTT_LEN: 14373 BBR_OPTS_INC(tcp_bbr_probertt_len); 14374 if (optval <= 1) 14375 reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND)); 14376 else 14377 error = EINVAL; 14378 break; 14379 case TCP_BBR_PROBE_RTT_GAIN: 14380 BBR_OPTS_INC(tcp_bbr_probertt_gain); 14381 if (optval <= BBR_UNIT) 14382 bbr->r_ctl.bbr_rttprobe_gain_val = optval; 14383 else 14384 error = EINVAL; 14385 break; 14386 case TCP_BBR_PROBE_RTT_INT: 14387 BBR_OPTS_INC(tcp_bbr_probe_rtt_int); 14388 if (optval > 1000) 14389 bbr->r_ctl.rc_probertt_int = optval; 14390 else 14391 error = EINVAL; 14392 break; 14393 case TCP_BBR_MIN_TOPACEOUT: 14394 BBR_OPTS_INC(tcp_bbr_topaceout); 14395 if (optval == 0) { 14396 bbr->no_pacing_until = 0; 14397 
bbr->rc_no_pacing = 0; 14398 } else if (optval <= 0x00ff) { 14399 bbr->no_pacing_until = optval; 14400 if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) && 14401 (bbr->rc_bbr_state == BBR_STATE_STARTUP)){ 14402 /* Turn on no pacing */ 14403 bbr->rc_no_pacing = 1; 14404 } 14405 } else 14406 error = EINVAL; 14407 break; 14408 case TCP_BBR_STARTUP_LOSS_EXIT: 14409 BBR_OPTS_INC(tcp_bbr_startup_loss_exit); 14410 bbr->rc_loss_exit = optval; 14411 break; 14412 case TCP_BBR_USEDEL_RATE: 14413 error = EINVAL; 14414 break; 14415 case TCP_BBR_MIN_RTO: 14416 BBR_OPTS_INC(tcp_bbr_min_rto); 14417 bbr->r_ctl.rc_min_rto_ms = optval; 14418 break; 14419 case TCP_BBR_MAX_RTO: 14420 BBR_OPTS_INC(tcp_bbr_max_rto); 14421 bbr->rc_max_rto_sec = optval; 14422 break; 14423 case TCP_RACK_MIN_TO: 14424 /* Minimum time between rack t-o's in ms */ 14425 BBR_OPTS_INC(tcp_rack_min_to); 14426 bbr->r_ctl.rc_min_to = optval; 14427 break; 14428 case TCP_RACK_REORD_THRESH: 14429 /* RACK reorder threshold (shift amount) */ 14430 BBR_OPTS_INC(tcp_rack_reord_thresh); 14431 if ((optval > 0) && (optval < 31)) 14432 bbr->r_ctl.rc_reorder_shift = optval; 14433 else 14434 error = EINVAL; 14435 break; 14436 case TCP_RACK_REORD_FADE: 14437 /* Does reordering fade after ms time */ 14438 BBR_OPTS_INC(tcp_rack_reord_fade); 14439 bbr->r_ctl.rc_reorder_fade = optval; 14440 break; 14441 case TCP_RACK_TLP_THRESH: 14442 /* RACK TLP threshold i.e. srtt+(srtt/N) */ 14443 BBR_OPTS_INC(tcp_rack_tlp_thresh); 14444 if (optval) 14445 bbr->rc_tlp_threshold = optval; 14446 else 14447 error = EINVAL; 14448 break; 14449 case TCP_BBR_USE_RACK_CHEAT: 14450 BBR_OPTS_INC(tcp_use_rackcheat); 14451 if (bbr->rc_use_google) { 14452 error = EINVAL; 14453 break; 14454 } 14455 BBR_OPTS_INC(tcp_rack_cheat); 14456 if (optval) 14457 bbr->bbr_use_rack_cheat = 1; 14458 else 14459 bbr->bbr_use_rack_cheat = 0; 14460 break; 14461 case TCP_BBR_FLOOR_MIN_TSO: 14462 BBR_OPTS_INC(tcp_utter_max_tso); 14463 if ((optval >= 0) && (optval < 40)) 14464 bbr->r_ctl.bbr_hptsi_segments_floor = optval; 14465 else 14466 error = EINVAL; 14467 break; 14468 case TCP_BBR_UTTER_MAX_TSO: 14469 BBR_OPTS_INC(tcp_utter_max_tso); 14470 if ((optval >= 0) && (optval < 0xffff)) 14471 bbr->r_ctl.bbr_utter_max = optval; 14472 else 14473 error = EINVAL; 14474 break; 14475 14476 case TCP_BBR_EXTRA_STATE: 14477 BBR_OPTS_INC(tcp_extra_state); 14478 if (optval) 14479 bbr->rc_use_idle_restart = 1; 14480 else 14481 bbr->rc_use_idle_restart = 0; 14482 break; 14483 case TCP_BBR_SEND_IWND_IN_TSO: 14484 BBR_OPTS_INC(tcp_iwnd_tso); 14485 if (optval) { 14486 bbr->bbr_init_win_cheat = 1; 14487 if (bbr->rc_past_init_win == 0) { 14488 uint32_t cts; 14489 cts = tcp_get_usecs(&bbr->rc_tv); 14490 tcp_bbr_tso_size_check(bbr, cts); 14491 } 14492 } else 14493 bbr->bbr_init_win_cheat = 0; 14494 break; 14495 case TCP_BBR_HDWR_PACE: 14496 BBR_OPTS_INC(tcp_hdwr_pacing); 14497 if (optval){ 14498 bbr->bbr_hdw_pace_ena = 1; 14499 bbr->bbr_attempt_hdwr_pace = 0; 14500 } else { 14501 bbr->bbr_hdw_pace_ena = 0; 14502 #ifdef RATELIMIT 14503 if (bbr->r_ctl.crte != NULL) { 14504 tcp_rel_pacing_rate(bbr->r_ctl.crte, tp); 14505 bbr->r_ctl.crte = NULL; 14506 } 14507 #endif 14508 } 14509 break; 14510 14511 case TCP_DELACK: 14512 BBR_OPTS_INC(tcp_delack); 14513 if (optval < 100) { 14514 if (optval == 0) /* off */ 14515 tp->t_delayed_ack = 0; 14516 else if (optval == 1) /* on which is 2 */ 14517 tp->t_delayed_ack = 2; 14518 else /* higher than 2 and less than 100 */ 14519 tp->t_delayed_ack = optval; 14520 if (tp->t_flags & TF_DELACK) { 14521
tp->t_flags &= ~TF_DELACK; 14522 tp->t_flags |= TF_ACKNOW; 14523 NET_EPOCH_ENTER(et); 14524 bbr_output(tp); 14525 NET_EPOCH_EXIT(et); 14526 } 14527 } else 14528 error = EINVAL; 14529 break; 14530 case TCP_RACK_PKT_DELAY: 14531 /* RACK added ms i.e. rack-rtt + reord + N */ 14532 BBR_OPTS_INC(tcp_rack_pkt_delay); 14533 bbr->r_ctl.rc_pkt_delay = optval; 14534 break; 14535 #ifdef NETFLIX_PEAKRATE 14536 case TCP_MAXPEAKRATE: 14537 BBR_OPTS_INC(tcp_maxpeak); 14538 error = tcp_set_maxpeakrate(tp, optval); 14539 if (!error) 14540 tp->t_peakrate_thr = tp->t_maxpeakrate; 14541 break; 14542 #endif 14543 case TCP_BBR_RETRAN_WTSO: 14544 BBR_OPTS_INC(tcp_retran_wtso); 14545 if (optval) 14546 bbr->rc_resends_use_tso = 1; 14547 else 14548 bbr->rc_resends_use_tso = 0; 14549 break; 14550 case TCP_DATA_AFTER_CLOSE: 14551 BBR_OPTS_INC(tcp_data_ac); 14552 if (optval) 14553 bbr->rc_allow_data_af_clo = 1; 14554 else 14555 bbr->rc_allow_data_af_clo = 0; 14556 break; 14557 case TCP_BBR_POLICER_DETECT: 14558 BBR_OPTS_INC(tcp_policer_det); 14559 if (bbr->rc_use_google == 0) 14560 error = EINVAL; 14561 else if (optval) 14562 bbr->r_use_policer = 1; 14563 else 14564 bbr->r_use_policer = 0; 14565 break; 14566 14567 case TCP_BBR_TSTMP_RAISES: 14568 BBR_OPTS_INC(tcp_ts_raises); 14569 if (optval) 14570 bbr->ts_can_raise = 1; 14571 else 14572 bbr->ts_can_raise = 0; 14573 break; 14574 case TCP_BBR_TMR_PACE_OH: 14575 BBR_OPTS_INC(tcp_pacing_oh_tmr); 14576 if (bbr->rc_use_google) { 14577 error = EINVAL; 14578 } else { 14579 if (optval) 14580 bbr->r_ctl.rc_incr_tmrs = 1; 14581 else 14582 bbr->r_ctl.rc_incr_tmrs = 0; 14583 } 14584 break; 14585 case TCP_BBR_PACE_OH: 14586 BBR_OPTS_INC(tcp_pacing_oh); 14587 if (bbr->rc_use_google) { 14588 error = EINVAL; 14589 } else { 14590 if (optval > (BBR_INCL_TCP_OH| 14591 BBR_INCL_IP_OH| 14592 BBR_INCL_ENET_OH)) { 14593 error = EINVAL; 14594 break; 14595 } 14596 if (optval & BBR_INCL_TCP_OH) 14597 bbr->r_ctl.rc_inc_tcp_oh = 1; 14598 else 14599 bbr->r_ctl.rc_inc_tcp_oh = 0; 14600 if (optval & BBR_INCL_IP_OH) 14601 bbr->r_ctl.rc_inc_ip_oh = 1; 14602 else 14603 bbr->r_ctl.rc_inc_ip_oh = 0; 14604 if (optval & BBR_INCL_ENET_OH) 14605 bbr->r_ctl.rc_inc_enet_oh = 1; 14606 else 14607 bbr->r_ctl.rc_inc_enet_oh = 0; 14608 } 14609 break; 14610 default: 14611 return (tcp_default_ctloutput(inp, sopt)); 14612 break; 14613 } 14614 #ifdef NETFLIX_STATS 14615 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 14616 #endif 14617 INP_WUNLOCK(inp); 14618 return (error); 14619 } 14620 14621 /* 14622 * return 0 on success, error-num on failure 14623 */ 14624 static int 14625 bbr_get_sockopt(struct inpcb *inp, struct sockopt *sopt) 14626 { 14627 struct tcpcb *tp; 14628 struct tcp_bbr *bbr; 14629 int32_t error, optval; 14630 14631 tp = intotcpcb(inp); 14632 bbr = (struct tcp_bbr *)tp->t_fb_ptr; 14633 if (bbr == NULL) { 14634 INP_WUNLOCK(inp); 14635 return (EINVAL); 14636 } 14637 /* 14638 * Because all our options are either boolean or an int, we can just 14639 * pull everything into optval and then unlock and copy. If we ever 14640 * add an option that is not an int, then this will have quite an 14641 * impact on this routine.
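 * For example, TCP_BBR_PACE_PER_SEC below simply copies
 * bbr_hptsi_per_second into optval; the single sooptcopyout() after
 * the switch then returns the value to the caller once the inpcb
 * lock has been released.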
14642 */ 14643 switch (sopt->sopt_name) { 14644 case TCP_BBR_PACE_PER_SEC: 14645 optval = bbr->r_ctl.bbr_hptsi_per_second; 14646 break; 14647 case TCP_BBR_PACE_DEL_TAR: 14648 optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar; 14649 break; 14650 case TCP_BBR_PACE_SEG_MAX: 14651 optval = bbr->r_ctl.bbr_hptsi_segments_max; 14652 break; 14653 case TCP_BBR_MIN_TOPACEOUT: 14654 optval = bbr->no_pacing_until; 14655 break; 14656 case TCP_BBR_PACE_SEG_MIN: 14657 optval = bbr->r_ctl.bbr_hptsi_bytes_min; 14658 break; 14659 case TCP_BBR_PACE_CROSS: 14660 optval = bbr->r_ctl.bbr_cross_over; 14661 break; 14662 case TCP_BBR_ALGORITHM: 14663 optval = bbr->rc_use_google; 14664 break; 14665 case TCP_BBR_TSLIMITS: 14666 optval = bbr->rc_use_ts_limit; 14667 break; 14668 case TCP_BBR_IWINTSO: 14669 optval = bbr->rc_init_win; 14670 break; 14671 case TCP_BBR_STARTUP_PG: 14672 optval = bbr->r_ctl.rc_startup_pg; 14673 break; 14674 case TCP_BBR_DRAIN_PG: 14675 optval = bbr->r_ctl.rc_drain_pg; 14676 break; 14677 case TCP_BBR_PROBE_RTT_INT: 14678 optval = bbr->r_ctl.rc_probertt_int; 14679 break; 14680 case TCP_BBR_PROBE_RTT_LEN: 14681 optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND); 14682 break; 14683 case TCP_BBR_PROBE_RTT_GAIN: 14684 optval = bbr->r_ctl.bbr_rttprobe_gain_val; 14685 break; 14686 case TCP_BBR_STARTUP_LOSS_EXIT: 14687 optval = bbr->rc_loss_exit; 14688 break; 14689 case TCP_BBR_USEDEL_RATE: 14690 error = EINVAL; 14691 break; 14692 case TCP_BBR_MIN_RTO: 14693 optval = bbr->r_ctl.rc_min_rto_ms; 14694 break; 14695 case TCP_BBR_MAX_RTO: 14696 optval = bbr->rc_max_rto_sec; 14697 break; 14698 case TCP_RACK_PACE_MAX_SEG: 14699 /* Max segments in a pace */ 14700 optval = bbr->r_ctl.rc_pace_max_segs; 14701 break; 14702 case TCP_RACK_MIN_TO: 14703 /* Minimum time between rack t-o's in ms */ 14704 optval = bbr->r_ctl.rc_min_to; 14705 break; 14706 case TCP_RACK_REORD_THRESH: 14707 /* RACK reorder threshold (shift amount) */ 14708 optval = bbr->r_ctl.rc_reorder_shift; 14709 break; 14710 case TCP_RACK_REORD_FADE: 14711 /* Does reordering fade after ms time */ 14712 optval = bbr->r_ctl.rc_reorder_fade; 14713 break; 14714 case TCP_BBR_USE_RACK_CHEAT: 14715 /* Do we use the rack cheat for rxt */ 14716 optval = bbr->bbr_use_rack_cheat; 14717 break; 14718 case TCP_BBR_FLOOR_MIN_TSO: 14719 optval = bbr->r_ctl.bbr_hptsi_segments_floor; 14720 break; 14721 case TCP_BBR_UTTER_MAX_TSO: 14722 optval = bbr->r_ctl.bbr_utter_max; 14723 break; 14724 case TCP_BBR_SEND_IWND_IN_TSO: 14725 /* Do we send TSO size segments initially */ 14726 optval = bbr->bbr_init_win_cheat; 14727 break; 14728 case TCP_BBR_EXTRA_STATE: 14729 optval = bbr->rc_use_idle_restart; 14730 break; 14731 case TCP_RACK_TLP_THRESH: 14732 /* RACK TLP threshold i.e. srtt+(srtt/N) */ 14733 optval = bbr->rc_tlp_threshold; 14734 break; 14735 case TCP_RACK_PKT_DELAY: 14736 /* RACK added ms i.e.
rack-rtt + reord + N */ 14737 optval = bbr->r_ctl.rc_pkt_delay; 14738 break; 14739 case TCP_BBR_RETRAN_WTSO: 14740 optval = bbr->rc_resends_use_tso; 14741 break; 14742 case TCP_DATA_AFTER_CLOSE: 14743 optval = bbr->rc_allow_data_af_clo; 14744 break; 14745 case TCP_DELACK: 14746 optval = tp->t_delayed_ack; 14747 break; 14748 case TCP_BBR_HDWR_PACE: 14749 optval = bbr->bbr_hdw_pace_ena; 14750 break; 14751 case TCP_BBR_POLICER_DETECT: 14752 optval = bbr->r_use_policer; 14753 break; 14754 case TCP_BBR_TSTMP_RAISES: 14755 optval = bbr->ts_can_raise; 14756 break; 14757 case TCP_BBR_TMR_PACE_OH: 14758 optval = bbr->r_ctl.rc_incr_tmrs; 14759 break; 14760 case TCP_BBR_PACE_OH: 14761 optval = 0; 14762 if (bbr->r_ctl.rc_inc_tcp_oh) 14763 optval |= BBR_INCL_TCP_OH; 14764 if (bbr->r_ctl.rc_inc_ip_oh) 14765 optval |= BBR_INCL_IP_OH; 14766 if (bbr->r_ctl.rc_inc_enet_oh) 14767 optval |= BBR_INCL_ENET_OH; 14768 break; 14769 default: 14770 return (tcp_default_ctloutput(inp, sopt)); 14771 break; 14772 } 14773 INP_WUNLOCK(inp); 14774 error = sooptcopyout(sopt, &optval, sizeof optval); 14775 return (error); 14776 } 14777 14778 /* 14779 * return 0 on success, error-num on failure 14780 */ 14781 static int 14782 bbr_ctloutput(struct inpcb *inp, struct sockopt *sopt) 14783 { 14784 if (sopt->sopt_dir == SOPT_SET) { 14785 return (bbr_set_sockopt(inp, sopt)); 14786 } else if (sopt->sopt_dir == SOPT_GET) { 14787 return (bbr_get_sockopt(inp, sopt)); 14788 } else { 14789 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir); 14790 } 14791 } 14792 14793 static const char *bbr_stack_names[] = { 14794 __XSTRING(STACKNAME), 14795 #ifdef STACKALIAS 14796 __XSTRING(STACKALIAS), 14797 #endif 14798 }; 14799 14800 static bool bbr_mod_inited = false; 14801 14802 static int 14803 tcp_addbbr(module_t mod, int32_t type, void *data) 14804 { 14805 int32_t err = 0; 14806 int num_stacks; 14807 14808 switch (type) { 14809 case MOD_LOAD: 14810 printf("Attempting to load " __XSTRING(MODNAME) "\n"); 14811 bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 14812 sizeof(struct bbr_sendmap), 14813 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 14814 bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 14815 sizeof(struct tcp_bbr), 14816 NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 14817 sysctl_ctx_init(&bbr_sysctl_ctx); 14818 bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, 14819 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 14820 OID_AUTO, 14821 #ifdef STACKALIAS 14822 __XSTRING(STACKALIAS), 14823 #else 14824 __XSTRING(STACKNAME), 14825 #endif 14826 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 14827 ""); 14828 if (bbr_sysctl_root == NULL) { 14829 printf("Failed to add sysctl node\n"); 14830 err = EFAULT; 14831 goto free_uma; 14832 } 14833 bbr_init_sysctls(); 14834 num_stacks = nitems(bbr_stack_names); 14835 err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK, 14836 bbr_stack_names, &num_stacks); 14837 if (err) { 14838 printf("Failed to register %s stack name for " 14839 "%s module\n", bbr_stack_names[num_stacks], 14840 __XSTRING(MODNAME)); 14841 sysctl_ctx_free(&bbr_sysctl_ctx); 14842 free_uma: 14843 uma_zdestroy(bbr_zone); 14844 uma_zdestroy(bbr_pcb_zone); 14845 bbr_counter_destroy(); 14846 printf("Failed to register " __XSTRING(MODNAME) 14847 " module err:%d\n", err); 14848 return (err); 14849 } 14850 tcp_lro_reg_mbufq(); 14851 bbr_mod_inited = true; 14852 printf(__XSTRING(MODNAME) " is now available\n"); 14853 break; 14854 case MOD_QUIESCE: 14855 err = deregister_tcp_functions(&__tcp_bbr, true, false); 14856 break; 14857 case MOD_UNLOAD: 14858 err = 
deregister_tcp_functions(&__tcp_bbr, false, true); 14859 if (err == EBUSY) 14860 break; 14861 if (bbr_mod_inited) { 14862 uma_zdestroy(bbr_zone); 14863 uma_zdestroy(bbr_pcb_zone); 14864 sysctl_ctx_free(&bbr_sysctl_ctx); 14865 bbr_counter_destroy(); 14866 printf(__XSTRING(MODNAME) 14867 " is now no longer available\n"); 14868 bbr_mod_inited = false; 14869 } 14870 tcp_lro_dereg_mbufq(); 14871 err = 0; 14872 break; 14873 default: 14874 return (EOPNOTSUPP); 14875 } 14876 return (err); 14877 } 14878 14879 static moduledata_t tcp_bbr = { 14880 .name = __XSTRING(MODNAME), 14881 .evhand = tcp_addbbr, 14882 .priv = 0 14883 }; 14884 14885 MODULE_VERSION(MODNAME, 1); 14886 DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 14887 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 14888
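/*
 * Usage sketch (illustrative only, not part of the stack): once this
 * module is loaded, an application selects the stack per-socket with
 * the TCP_FUNCTION_BLK socket option and may then tune the TCP_BBR_*
 * options handled by bbr_ctloutput() above. The fragment assumes the
 * usual build where MODNAME expands to "tcp_bbr" (kldload tcp_bbr)
 * and STACKNAME registers the function block under the name "bbr".
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	static int
 *	use_bbr(int s)
 *	{
 *		struct tcp_function_set tfs;
 *		int optval;
 *
 *		// Hand the connection to the bbr function block.
 *		memset(&tfs, 0, sizeof(tfs));
 *		strlcpy(tfs.function_set_name, "bbr",
 *		    sizeof(tfs.function_set_name));
 *		if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &tfs, sizeof(tfs)) == -1)
 *			return (-1);
 *		// Enable google mode (see TCP_BBR_ALGORITHM above).
 *		optval = 1;
 *		return (setsockopt(s, IPPROTO_TCP, TCP_BBR_ALGORITHM,
 *		    &optval, sizeof(optval)));
 *	}
 */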