1 /*- 2 * Copyright (c) 2016-9 3 * Netflix Inc. All rights reserved. 4 * Author Randall R. Stewart 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 * $FreeBSD$ 28 */ 29 30 #ifndef _NETINET_TCP_BBR_H_ 31 #define _NETINET_TCP_BBR_H_ 32 33 #define BBR_INITIAL_RTO 1000000 /* 1 second in micro-seconds */ 34 /* Send map flags */ 35 #define BBR_ACKED 0x0001 /* The remote endpoint acked this */ 36 #define BBR_WAS_RENEGED 0x0002 /* The peer reneged the ack */ 37 #define BBR_RXT_CLEARED 0x0004 /* ACK Cleared by the RXT timer */ 38 #define BBR_OVERMAX 0x0008 /* We have more retran's then we can 39 * fit */ 40 #define BBR_SACK_PASSED 0x0010 /* A sack was done above this block */ 41 #define BBR_WAS_SACKPASS 0x0020 /* We retransmitted due to SACK pass */ 42 #define BBR_HAS_FIN 0x0040 /* segment is sent with fin */ 43 #define BBR_TLP 0x0080 /* segment sent as tail-loss-probe */ 44 #define BBR_HAS_SYN 0x0100 /* segment has the syn */ 45 #define BBR_MARKED_LOST 0x0200 /* 46 * This segments is lost and 47 * totaled into bbr->rc_ctl.rc_lost 48 */ 49 #define BBR_RWND_COLLAPSED 0x0400 /* The peer collapsed the rwnd on the segment */ 50 #define BBR_NUM_OF_RETRANS 7 51 52 /* Defines for socket options to set pacing overheads */ 53 #define BBR_INCL_ENET_OH 0x01 54 #define BBR_INCL_IP_OH 0x02 55 #define BBR_INCL_TCP_OH 0x03 56 57 /* 58 * With the addition of both measurement algorithms 59 * I had to move over the size of a 60 * cache line (unfortunately). For now there is 61 * no way around this. We may be able to cut back 62 * at some point I hope. 63 */ 64 struct bbr_sendmap { 65 TAILQ_ENTRY(bbr_sendmap) r_next; /* seq number arrayed next */ 66 TAILQ_ENTRY(bbr_sendmap) r_tnext; /* Time of tmit based next */ 67 uint32_t r_start; /* Sequence number of the segment */ 68 uint32_t r_end; /* End seq, this is 1 beyond actually */ 69 70 uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ 71 uint32_t r_delivered; /* Delivered amount at send */ 72 73 uint32_t r_del_time; /* The time of the last delivery update */ 74 uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time 75 * sent */ 76 unused_bit:1, 77 r_is_drain:1, /* In a draining cycle */ 78 r_app_limited:1,/* We went app limited */ 79 r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */ 80 uint8_t r_dupack; /* Dup ack count */ 81 uint8_t r_in_tmap:1, /* Flag to see if its in the r_tnext array */ 82 r_is_smallmap:1,/* Was logged as a small-map send-map item */ 83 r_is_gain:1, /* Was in gain cycle */ 84 r_bbr_state:5; /* The BBR state at send */ 85 uint8_t r_limit_type; /* is this entry counted against a limit? */ 86 87 uint16_t r_flags; /* Flags as defined above */ 88 uint16_t r_spare16; 89 uint32_t r_del_ack_ts; /* At send what timestamp of peer was (if r_ts_valid set) */ 90 /****************Cache line*****************/ 91 uint32_t r_tim_lastsent[BBR_NUM_OF_RETRANS]; 92 /* 93 * Question, should we instead just grab the sending b/w 94 * from the filter with the gain and store it in a 95 * uint64_t instead? 96 */ 97 uint32_t r_first_sent_time; /* Time of first pkt in flight sent */ 98 uint32_t r_pacing_delay; /* pacing delay of this send */ 99 uint32_t r_flight_at_send; /* flight at the time of the send */ 100 #ifdef _KERNEL 101 } __aligned(CACHE_LINE_SIZE); 102 #else 103 }; 104 #endif 105 #define BBR_LIMIT_TYPE_SPLIT 1 106 107 TAILQ_HEAD(bbr_head, bbr_sendmap); 108 109 #define BBR_SEGMENT_TIME_SIZE 1500 /* How many bytes in time_between */ 110 111 #define BBR_MIN_SEG 1460 /* MSS size */ 112 #define BBR_MAX_GAIN_VALUE 0xffff 113 114 #define BBR_TIMER_FUDGE 1500 /* 1.5ms in micro seconds */ 115 116 /* BW twiddle secret codes */ 117 #define BBR_RED_BW_CONGSIG 0 /* We enter recovery and set using b/w */ 118 #define BBR_RED_BW_RATECAL 1 /* We are calculating the loss rate */ 119 #define BBR_RED_BW_USELRBW 2 /* We are dropping the lower b/w with 120 * cDR */ 121 #define BBR_RED_BW_SETHIGHLOSS 3 /* We have set our highloss value at 122 * exit from probe-rtt */ 123 #define BBR_RED_BW_PE_CLREARLY 4 /* We have decided to clear the 124 * reduction early */ 125 #define BBR_RED_BW_PE_CLAFDEL 5 /* We are clearing it on schedule 126 * delayed */ 127 #define BBR_RED_BW_REC_ENDCLL 6 /* Recover exits save high if needed 128 * an clear to start measuring */ 129 #define BBR_RED_BW_PE_NOEARLY_OUT 7 /* Set pkt epoch judged that we do not 130 * get out of jail early */ 131 /* codes for just-return */ 132 #define BBR_JR_SENT_DATA 0 133 #define BBR_JR_CWND_LIMITED 1 134 #define BBR_JR_RWND_LIMITED 2 135 #define BBR_JR_APP_LIMITED 3 136 #define BBR_JR_ASSESSING 4 137 /* For calculating a rate */ 138 #define BBR_CALC_BW 1 139 #define BBR_CALC_LOSS 2 140 141 #define BBR_RTT_BY_TIMESTAMP 0 142 #define BBR_RTT_BY_EXACTMATCH 1 143 #define BBR_RTT_BY_EARLIER_RET 2 144 #define BBR_RTT_BY_THIS_RETRAN 3 145 #define BBR_RTT_BY_SOME_RETRAN 4 146 #define BBR_RTT_BY_TSMATCHING 5 147 148 /* Markers to track where we enter persists from */ 149 #define BBR_PERSISTS_FROM_1 1 150 #define BBR_PERSISTS_FROM_2 2 151 #define BBR_PERSISTS_FROM_3 3 152 #define BBR_PERSISTS_FROM_4 4 153 #define BBR_PERSISTS_FROM_5 5 154 155 /* magic cookies to ask for the RTT */ 156 #define BBR_RTT_PROP 0 157 #define BBR_RTT_RACK 1 158 #define BBR_RTT_PKTRTT 2 159 #define BBR_SRTT 3 160 161 #define BBR_SACKED 0 162 #define BBR_CUM_ACKED 1 163 164 /* threshold in useconds where we consider we need a higher min cwnd */ 165 #define BBR_HIGH_SPEED 1000 166 #define BBR_HIGHSPEED_NUM_MSS 12 167 168 #define MAX_REDUCE_RXT 3 /* What is the maximum times we are willing to 169 * reduce b/w in RTX's. Setting this has a 170 * multiplicative effect e.g. if we are 171 * reducing by 20% then setting it to 3 means 172 * you will have reduced the b/w estimate by > 173 * 60% before you stop. */ 174 /* 175 * We use the rate sample structure to 176 * assist in single sack/ack rate and rtt 177 * calculation. In the future we will expand 178 * this in BBR to do forward rate sample 179 * b/w estimation. 180 */ 181 #define BBR_RS_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */ 182 #define BBR_RS_BW_EMPTY 0x00000002 /* Nothing yet stored in cDR */ 183 #define BBR_RS_RTT_VALID 0x00000004 /* We have at least one valid RTT */ 184 #define BBR_RS_BW_VAILD 0x00000008 /* We have a valid cDR */ 185 #define BBR_RS_EMPTY (BBR_RS_RTT_EMPTY|BBR_RS_BW_EMPTY) 186 struct bbr_rtt_sample { 187 uint32_t rs_flags; 188 uint32_t rs_rtt_lowest; 189 uint32_t rs_rtt_lowest_sendtime; 190 uint32_t rs_rtt_low_seq_start; 191 192 uint32_t rs_rtt_highest; 193 uint32_t rs_rtt_cnt; 194 195 uint64_t rs_rtt_tot; 196 uint32_t cur_rtt; 197 uint32_t cur_rtt_bytecnt; 198 199 uint32_t cur_rtt_rsmcnt; 200 uint32_t rc_crtt_set:1, 201 avail_bits:31; 202 uint64_t rs_cDR; 203 }; 204 205 /* RTT shrink reasons */ 206 #define BBR_RTTS_INIT 0 207 #define BBR_RTTS_NEWRTT 1 208 #define BBR_RTTS_RTTPROBE 2 209 #define BBR_RTTS_WASIDLE 3 210 #define BBR_RTTS_PERSIST 4 211 #define BBR_RTTS_REACHTAR 5 212 #define BBR_RTTS_ENTERPROBE 6 213 #define BBR_RTTS_SHRINK_PG 7 214 #define BBR_RTTS_SHRINK_PG_FINAL 8 215 #define BBR_RTTS_NEW_TARGET 9 216 #define BBR_RTTS_LEAVE_DRAIN 10 217 #define BBR_RTTS_RESETS_VALUES 11 218 219 #define BBR_NUM_RATES 5 220 /* Rate flags */ 221 #define BBR_RT_FLAG_FREE 0x00 /* Is on the free list */ 222 #define BBR_RT_FLAG_INUSE 0x01 /* Has been allocated */ 223 #define BBR_RT_FLAG_READY 0x02 /* Ready to initiate a measurement. */ 224 #define BBR_RT_FLAG_CAPPED_PRE 0x04 /* Ready to cap if we send the next segment */ 225 #define BBR_RT_FLAG_CAPPED 0x08 /* Measurement is capped */ 226 #define BBR_RT_FLAG_PASTFA 0x10 /* Past the first ack. */ 227 #define BBR_RT_FLAG_LIMITED 0x20 /* Saw application/cwnd or rwnd limited period */ 228 #define BBR_RT_SEEN_A_ACK 0x40 /* A ack has been saved */ 229 #define BBR_RT_PREV_RTT_SET 0x80 /* There was a RTT set in */ 230 #define BBR_RT_PREV_SEND_TIME 0x100 /* 231 *There was a RTT send time set that can be used 232 * no snd_limits 233 */ 234 #define BBR_RT_SET_GRADIENT 0x200 235 #define BBR_RT_TS_VALID 0x400 236 237 238 struct bbr_log { 239 union { 240 struct bbr_sendmap *rsm; /* For alloc/free */ 241 uint64_t sb_acc; /* For out/ack or t-o */ 242 }; 243 struct tcpcb *tp; 244 uint32_t t_flags; 245 uint32_t th_seq; 246 uint32_t th_ack; 247 uint32_t snd_una; 248 uint32_t snd_nxt; 249 uint32_t snd_max; 250 uint32_t snd_cwnd; 251 uint32_t snd_wnd; 252 uint32_t rc_lost; 253 uint32_t target_cwnd; /* UU */ 254 uint32_t inflight; /* UU */ 255 uint32_t applimited; /* UU */ 256 /* Things for BBR */ 257 uint32_t delivered; /* UU */ 258 uint64_t cur_del_rate; /* UU */ 259 uint64_t delRate; /* UU */ 260 uint64_t rttProp; /* UU */ 261 uint64_t lt_bw; /* UU */ 262 uint32_t timeStamp; 263 uint32_t time; 264 uint32_t slot; /* UU */ 265 uint32_t delayed_by; 266 uint32_t exp_del; 267 uint32_t pkts_out; 268 uint32_t new_win; 269 uint32_t hptsi_gain; /* UU */ 270 uint32_t cwnd_gain; /* UU */ 271 uint32_t epoch; /* UU */ 272 uint32_t lt_epoch; /* UU */ 273 /* Sack fun */ 274 uint32_t blk_start[4]; /* xx */ 275 uint32_t blk_end[4]; 276 uint32_t len; /* Timeout T3=1, TLP=2, RACK=3 */ 277 uint8_t type; 278 uint8_t n_sackblks; 279 uint8_t applied; /* UU */ 280 uint8_t inhpts; /* UU */ 281 uint8_t ininput; /* UU */ 282 uint8_t use_lt_bw; /* UU */ 283 }; 284 285 struct bbr_log_sysctl_out { 286 uint32_t bbr_log_at; 287 uint32_t bbr_log_max; 288 struct bbr_log entries[0]; 289 }; 290 291 /* 292 * Magic numbers for logging timeout events if the 293 * logging is enabled. 294 */ 295 #define BBR_TO_FRM_TMR 1 296 #define BBR_TO_FRM_TLP 2 297 #define BBR_TO_FRM_RACK 3 298 #define BBR_TO_FRM_KEEP 4 299 #define BBR_TO_FRM_PERSIST 5 300 #define BBR_TO_FRM_DELACK 6 301 302 #define BBR_SEES_STRETCH_ACK 1 303 #define BBR_SEES_COMPRESSED_ACKS 2 304 305 306 /* 307 * As we get each SACK we wade through the 308 * rc_map and mark off what is acked. 309 * We also increment rc_sacked as well. 310 * 311 * We also pay attention to missing entries 312 * based on the time and possibly mark them 313 * for retransmit. If we do and we are not already 314 * in recovery we enter recovery. In doing 315 * so we claer prr_delivered/holes_rxt and prr_sent_dur_rec. 316 * We also setup rc_next/rc_snd_nxt/rc_send_end so 317 * we will know where to send from. When not in 318 * recovery rc_next will be NULL and rc_snd_nxt should 319 * equal snd_max. 320 * 321 * Whenever we retransmit from recovery we increment 322 * rc_holes_rxt as we retran a block and mark it as retransmitted 323 * with the time it was sent. During non-recovery sending we 324 * add to our map and note the time down of any send expanding 325 * the rc_map at the tail and moving rc_snd_nxt up with snd_max. 326 * 327 * In recovery during SACK/ACK processing if a chunk has 328 * been retransmitted and it is now acked, we decrement rc_holes_rxt. 329 * When we retransmit from the scoreboard we use 330 * rc_next and rc_snd_nxt/rc_send_end to help us 331 * find what needs to be retran. 332 * 333 * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt 334 * This gets us the effect of RFC6675 pipe, counting twice for 335 * bytes retransmitted. 336 */ 337 338 #define TT_BBR_FR_TMR 0x2001 339 340 #define BBR_SCALE 8 341 #define BBR_UNIT (1 << BBR_SCALE) 342 343 #define BBR_NUM_RTTS_FOR_DEL_LIMIT 8 /* How many pkt-rtts do we keep 344 * Delivery rate for */ 345 #define BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT 10 /* How many pkt-rtts do we keep 346 * Delivery rate for google */ 347 348 #define BBR_SECONDS_NO_RTT 10 /* 10 seconds with no RTT shrinkage */ 349 #define BBR_PROBERTT_MAX 200 /* 200ms */ 350 #define BBR_PROBERTT_NUM_MSS 4 351 #define BBR_STARTUP_EPOCHS 3 352 #define USECS_IN_MSEC 1000 353 #define BBR_TIME_TO_SECONDS(a) (a / USECS_IN_SECOND) 354 #define BBR_TIME_TO_MILLI(a) (a / MS_IN_USEC) 355 356 357 /* BBR keeps time in usec's so we divide by 1000 and round up */ 358 #define BBR_TS_TO_MS(t) ((t+999)/MS_IN_USEC) 359 360 /* 361 * Locking for the rack control block. 362 * a) Locked by INP_WLOCK 363 * b) Locked by the hpts-mutex 364 * 365 */ 366 #define BBR_STATE_STARTUP 0x01 367 #define BBR_STATE_DRAIN 0x02 368 #define BBR_STATE_PROBE_BW 0x03 369 #define BBR_STATE_PROBE_RTT 0x04 370 #define BBR_STATE_IDLE_EXIT 0x05 371 372 /* Substate defines for STATE == PROBE_BW */ 373 #define BBR_SUB_GAIN 0 /* State 0 where we are 5/4 BBR_UNIT */ 374 #define BBR_SUB_DRAIN 1 /* State 1 where we are at 3/4 BBR_UNIT */ 375 #define BBR_SUB_LEVEL1 2 /* State 1 first BBR_UNIT */ 376 #define BBR_SUB_LEVEL2 3 /* State 2nd BBR_UNIT */ 377 #define BBR_SUB_LEVEL3 4 /* State 3rd BBR_UNIT */ 378 #define BBR_SUB_LEVEL4 5 /* State 4th BBR_UNIT */ 379 #define BBR_SUB_LEVEL5 6 /* State 5th BBR_UNIT */ 380 #define BBR_SUB_LEVEL6 7 /* State last BBR_UNIT */ 381 #define BBR_SUBSTATE_COUNT 8 382 383 /* Single remaining reduce log */ 384 #define BBR_REDUCE_AT_FR 5 385 386 #define BBR_BIG_LOG_SIZE 300000 387 388 /* Bits per second in bytes per second */ 389 #define FORTY_EIGHT_MBPS 6000000 /* 48 megabits in bytes */ 390 #define THIRTY_MBPS 3750000 /* 30 megabits in bytes */ 391 #define TWENTY_THREE_MBPS 2896000 392 #define FIVETWELVE_MBPS 64000000 /* 512 megabits in bytes */ 393 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */ 394 395 struct bbr_stats { 396 uint64_t bbr_badfr; /* 0 */ 397 uint64_t bbr_badfr_bytes; /* 1 */ 398 uint64_t bbr_saw_oerr; /* 2 */ 399 uint64_t bbr_saw_emsgsiz; /* 3 */ 400 uint64_t bbr_reorder_seen; /* 4 */ 401 uint64_t bbr_tlp_tot; /* 5 */ 402 uint64_t bbr_tlp_newdata; /* 6 */ 403 uint64_t bbr_offset_recovery; /* 7 */ 404 uint64_t bbr_tlp_retran_fail; /* 8 */ 405 uint64_t bbr_to_tot; /* 9 */ 406 uint64_t bbr_to_arm_rack; /* 10 */ 407 uint64_t bbr_enter_probertt; /* 11 */ 408 uint64_t bbr_tlp_set; /* 12 */ 409 uint64_t bbr_resends_set; /* 13 */ 410 uint64_t bbr_force_output; /* 14 */ 411 uint64_t bbr_to_arm_tlp; /* 15 */ 412 uint64_t bbr_paced_segments; /* 16 */ 413 uint64_t bbr_saw_enobuf; /* 17 */ 414 uint64_t bbr_to_alloc_failed; /* 18 */ 415 uint64_t bbr_to_alloc_emerg; /* 19 */ 416 uint64_t bbr_sack_proc_all; /* 20 */ 417 uint64_t bbr_sack_proc_short; /* 21 */ 418 uint64_t bbr_sack_proc_restart; /* 22 */ 419 uint64_t bbr_to_alloc; /* 23 */ 420 uint64_t bbr_offset_drop; /* 24 */ 421 uint64_t bbr_runt_sacks; /* 25 */ 422 uint64_t bbr_sack_passed; /* 26 */ 423 uint64_t bbr_rlock_left_ret0; /* 27 */ 424 uint64_t bbr_rlock_left_ret1; /* 28 */ 425 uint64_t bbr_dynamic_rwnd; /* 29 */ 426 uint64_t bbr_static_rwnd; /* 30 */ 427 uint64_t bbr_sack_blocks; /* 31 */ 428 uint64_t bbr_sack_blocks_skip; /* 32 */ 429 uint64_t bbr_sack_search_both; /* 33 */ 430 uint64_t bbr_sack_search_fwd; /* 34 */ 431 uint64_t bbr_sack_search_back; /* 35 */ 432 uint64_t bbr_plain_acks; /* 36 */ 433 uint64_t bbr_acks_with_sacks; /* 37 */ 434 uint64_t bbr_progress_drops; /* 38 */ 435 uint64_t bbr_early; /* 39 */ 436 uint64_t bbr_reneges_seen; /* 40 */ 437 uint64_t bbr_persist_reneg; /* 41 */ 438 uint64_t bbr_dropped_af_data; /* 42 */ 439 uint64_t bbr_failed_mbuf_aloc; /* 43 */ 440 uint64_t bbr_cwnd_limited; /* 44 */ 441 uint64_t bbr_rwnd_limited; /* 45 */ 442 uint64_t bbr_app_limited; /* 46 */ 443 uint64_t bbr_force_timer_start; /* 47 */ 444 uint64_t bbr_hpts_min_time; /* 48 */ 445 uint64_t bbr_meets_tso_thresh; /* 49 */ 446 uint64_t bbr_miss_tso_rwnd; /* 50 */ 447 uint64_t bbr_miss_tso_cwnd; /* 51 */ 448 uint64_t bbr_miss_tso_app; /* 52 */ 449 uint64_t bbr_miss_retran; /* 53 */ 450 uint64_t bbr_miss_tlp; /* 54 */ 451 uint64_t bbr_miss_unknown; /* 55 */ 452 uint64_t bbr_hdwr_rl_add_ok; /* 56 */ 453 uint64_t bbr_hdwr_rl_add_fail; /* 57 */ 454 uint64_t bbr_hdwr_rl_mod_ok; /* 58 */ 455 uint64_t bbr_hdwr_rl_mod_fail; /* 59 */ 456 uint64_t bbr_collapsed_win; /* 60 */ 457 uint64_t bbr_alloc_limited; /* 61 */ 458 uint64_t bbr_alloc_limited_conns; /* 62 */ 459 uint64_t bbr_split_limited; /* 63 */ 460 }; 461 462 /* 463 * The structure bbr_opt_stats is a simple 464 * way to see how many options are being 465 * changed in the stack. 466 */ 467 struct bbr_opts_stats { 468 uint64_t tcp_bbr_pace_per_sec; 469 uint64_t tcp_bbr_pace_del_tar; 470 uint64_t tcp_bbr_pace_seg_max; 471 uint64_t tcp_bbr_pace_seg_min; 472 uint64_t tcp_bbr_pace_cross; 473 uint64_t tcp_bbr_drain_inc_extra; 474 uint64_t tcp_bbr_unlimited; 475 uint64_t tcp_bbr_iwintso; 476 uint64_t tcp_bbr_rec_over_hpts; 477 uint64_t tcp_bbr_recforce; 478 uint64_t tcp_bbr_startup_pg; 479 uint64_t tcp_bbr_drain_pg; 480 uint64_t tcp_bbr_rwnd_is_app; 481 uint64_t tcp_bbr_probe_rtt_int; 482 uint64_t tcp_bbr_one_retran; 483 uint64_t tcp_bbr_startup_loss_exit; 484 uint64_t tcp_bbr_use_lowgain; 485 uint64_t tcp_bbr_lowgain_thresh; 486 uint64_t tcp_bbr_lowgain_half; 487 uint64_t tcp_bbr_lowgain_fd; 488 uint64_t tcp_bbr_usedel_rate; 489 uint64_t tcp_bbr_min_rto; 490 uint64_t tcp_bbr_max_rto; 491 uint64_t tcp_rack_pace_max_seg; 492 uint64_t tcp_rack_min_to; 493 uint64_t tcp_rack_reord_thresh; 494 uint64_t tcp_rack_reord_fade; 495 uint64_t tcp_rack_tlp_thresh; 496 uint64_t tcp_rack_pkt_delay; 497 uint64_t tcp_bbr_startup_exit_epoch; 498 uint64_t tcp_bbr_ack_comp_alg; 499 uint64_t tcp_rack_cheat; 500 uint64_t tcp_iwnd_tso; 501 uint64_t tcp_utter_max_tso; 502 uint64_t tcp_hdwr_pacing; 503 uint64_t tcp_extra_state; 504 uint64_t tcp_floor_min_tso; 505 /* New */ 506 uint64_t tcp_bbr_algorithm; 507 uint64_t tcp_bbr_tslimits; 508 uint64_t tcp_bbr_probertt_len; 509 uint64_t tcp_bbr_probertt_gain; 510 uint64_t tcp_bbr_topaceout; 511 uint64_t tcp_use_rackcheat; 512 uint64_t tcp_delack; 513 uint64_t tcp_maxpeak; 514 uint64_t tcp_retran_wtso; 515 uint64_t tcp_data_ac; 516 uint64_t tcp_ts_raises; 517 uint64_t tcp_pacing_oh_tmr; 518 uint64_t tcp_pacing_oh; 519 uint64_t tcp_policer_det; 520 }; 521 522 523 #ifdef _KERNEL 524 #define BBR_STAT_SIZE (sizeof(struct bbr_stats)/sizeof(uint64_t)) 525 extern counter_u64_t bbr_stat_arry[BBR_STAT_SIZE]; 526 #define BBR_STAT_ADD(name, amm) counter_u64_add(bbr_stat_arry[(offsetof(struct bbr_stats, name)/sizeof(uint64_t))], (amm)) 527 #define BBR_STAT_INC(name) BBR_STAT_ADD(name, 1) 528 #define BBR_OPTS_SIZE (sizeof(struct bbr_stats)/sizeof(uint64_t)) 529 extern counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE]; 530 #define BBR_OPTS_ADD(name, amm) counter_u64_add(bbr_opts_arry[(offsetof(struct bbr_opts_stats, name)/sizeof(uint64_t))], (amm)) 531 #define BBR_OPTS_INC(name) BBR_OPTS_ADD(name, 1) 532 #endif 533 534 #define BBR_NUM_LOSS_RATES 3 535 #define BBR_NUM_BW_RATES 3 536 537 #define BBR_RECOVERY_LOWRTT 1 538 #define BBR_RECOVERY_MEDRTT 2 539 #define BBR_RECOVERY_HIGHRTT 3 540 #define BBR_RECOVERY_EXTREMERTT 4 541 542 543 struct bbr_control { 544 /*******************************/ 545 /* Cache line 2 from bbr start */ 546 /*******************************/ 547 struct bbr_head rc_map; /* List of all segments Lock(a) */ 548 struct bbr_head rc_tmap; /* List in transmit order Lock(a) */ 549 struct bbr_sendmap *rc_resend; /* something we have been asked to 550 * resend */ 551 uint32_t rc_last_delay_val; /* How much we expect to delay Lock(a) */ 552 uint32_t rc_bbr_hptsi_gain:16, /* Current hptsi gain Lock(a) */ 553 rc_hpts_flags:16; /* flags on whats on the pacer wheel */ 554 555 uint32_t rc_delivered; /* BRR delivered amount Lock(a) */ 556 uint32_t rc_hptsi_agg_delay; /* How much time are we behind */ 557 558 uint32_t rc_flight_at_input; 559 uint32_t rc_lost_bytes; /* Total bytes currently marked lost */ 560 /*******************************/ 561 /* Cache line 3 from bbr start */ 562 /*******************************/ 563 struct time_filter rc_delrate; 564 /*******************************/ 565 /* Cache line 4 from bbr start */ 566 /*******************************/ 567 struct bbr_head rc_free; /* List of Free map entries Lock(a) */ 568 struct bbr_sendmap *rc_tlp_send; /* something we have been 569 * asked to resend */ 570 uint32_t rc_del_time; 571 uint32_t rc_target_at_state; /* Target for a state */ 572 573 uint16_t rc_free_cnt; /* Number of free entries on the rc_free list 574 * Lock(a) */ 575 uint16_t rc_startup_pg; 576 577 uint32_t cur_rtt; /* Last RTT from ack */ 578 579 580 uint32_t rc_went_idle_time; /* Used for persits to see if its 581 * probe-rtt qualified */ 582 uint32_t rc_pace_max_segs:17, /* How much in any single TSO we send Lock(a) */ 583 rc_pace_min_segs:15; /* The minimum single segment size before we enter persists */ 584 585 uint32_t rc_rtt_shrinks; /* Time of last rtt shrinkage Lock(a) */ 586 uint32_t r_app_limited_until; 587 uint32_t rc_timer_exp; /* If a timer ticks of expiry */ 588 uint32_t rc_rcv_epoch_start; /* Start time of the Epoch Lock(a) */ 589 590 /*******************************/ 591 /* Cache line 5 from bbr start */ 592 /*******************************/ 593 594 uint32_t rc_lost_at_pktepoch; /* what the lost value was at the last 595 * pkt-epoch */ 596 uint32_t r_measurement_count; /* count of measurement applied lock(a) */ 597 598 599 uint32_t rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ 600 uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ 601 uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ 602 603 struct bbr_sendmap *rc_sacklast; /* sack remembered place 604 * Lock(a) */ 605 struct bbr_sendmap *rc_next; /* remembered place where we next 606 * retransmit at Lock(a) */ 607 608 uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ 609 uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */ 610 611 uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ 612 uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */ 613 /*- --- 614 * used only inital and close 615 */ 616 uint32_t rc_high_rwnd; /* Highest rwnd seen */ 617 uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */ 618 619 uint32_t rc_last_rtt; /* Last valid measured RTT that ack'd data */ 620 uint32_t bbr_cross_over; 621 622 /*******************************/ 623 /* Cache line 6 from bbr start */ 624 /*******************************/ 625 struct sack_filter bbr_sf; 626 627 /*******************************/ 628 /* Cache line 7 from bbr start */ 629 /*******************************/ 630 struct time_filter_small rc_rttprop; 631 uint32_t last_inbound_ts; /* Peers last timestamp */ 632 633 uint32_t rc_inc_tcp_oh: 1, 634 rc_inc_ip_oh: 1, 635 rc_inc_enet_oh:1, 636 rc_incr_tmrs:1, 637 restrict_growth:28; 638 uint32_t rc_lt_epoch_use; /* When we started lt-bw use Lock(a) */ 639 640 uint32_t rc_recovery_start; /* Time we start recovery Lock(a) */ 641 uint32_t rc_lt_del; /* Delivered at lt bw sampling start Lock(a) */ 642 643 uint64_t rc_bbr_cur_del_rate; /* Current measured delivery rate 644 * Lock(a) */ 645 646 /*******************************/ 647 /* Cache line 8 from bbr start */ 648 /*******************************/ 649 uint32_t rc_cwnd_on_ent; /* On entry to recovery the cwnd 650 * Lock(a) */ 651 uint32_t rc_agg_early; /* aggregate amount early */ 652 653 uint32_t rc_rcvtime; /* When we last received data Lock(a) */ 654 uint32_t rc_pkt_epoch_del; /* seq num that we need for RTT epoch */ 655 656 uint32_t rc_pkt_epoch; /* Epoch based on packet RTTs */ 657 uint32_t rc_pkt_epoch_time; /* Time we started the pkt epoch */ 658 659 uint32_t rc_pkt_epoch_rtt; /* RTT using the packet epoch */ 660 uint32_t rc_rtt_epoch; /* Current RTT epoch, it ticks every rttProp 661 * Lock(a) */ 662 uint32_t lowest_rtt; 663 uint32_t bbr_smallest_srtt_this_state; 664 665 uint32_t rc_lt_epoch; /* LT epoch start of bw_sampling */ 666 uint32_t rc_lost_at_startup; 667 668 uint32_t rc_bbr_state_atflight; 669 uint32_t rc_bbr_last_startup_epoch; /* Last startup epoch where we 670 * increased 20% */ 671 uint32_t rc_bbr_enters_probertt; /* Timestamp we entered 672 * probertt Lock(a) */ 673 uint32_t rc_lt_time; /* Time of lt sampling start Lock(a) */ 674 675 /*******************************/ 676 /* Cache line 9 from bbr start */ 677 /*******************************/ 678 uint64_t rc_lt_bw; /* LT bw calculated Lock(a) */ 679 uint64_t rc_bbr_lastbtlbw; /* For startup, what was last btlbw I 680 * saw to check the 20% gain Lock(a) */ 681 682 683 uint32_t rc_bbr_cwnd_gain; /* Current cwnd gain Lock(a) */ 684 uint32_t rc_pkt_epoch_loss_rate; /* pkt-epoch loss rate */ 685 686 uint32_t rc_saved_cwnd; /* Saved cwnd during Probe-rtt drain Lock(a) */ 687 uint32_t substate_pe; 688 689 uint32_t rc_lost; /* Number of bytes lost Lock(a) */ 690 uint32_t rc_exta_time_gd; /* How much extra time we got in d/g */ 691 692 uint32_t rc_lt_lost; /* Number of lt bytes lost at sampling start 693 * Lock(a) */ 694 uint32_t rc_bbr_state_time; 695 696 uint32_t rc_min_to; /* Socket option value Lock(a) */ 697 uint32_t rc_initial_hptsi_bw; /* Our initial startup bw Lock(a) */ 698 699 uint32_t bbr_lost_at_state; /* Temp counter debug lost value as we 700 * enter a state */ 701 /*******************************/ 702 /* Cache line 10 from bbr start */ 703 /*******************************/ 704 uint32_t rc_level_state_extra; 705 uint32_t rc_red_cwnd_pe; 706 const struct tcp_hwrate_limit_table *crte; 707 uint64_t red_bw; 708 709 uint32_t rc_probertt_int; 710 uint32_t rc_probertt_srttchktim; /* Time we last did a srtt 711 * check */ 712 uint32_t gain_epoch; /* Epoch we should be out of gain */ 713 uint32_t rc_min_rto_ms; 714 715 uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ 716 uint32_t last_startup_measure; 717 718 int32_t bbr_hptsi_per_second; 719 int32_t bbr_hptsi_segments_delay_tar; 720 721 int32_t bbr_hptsi_segments_max; 722 uint32_t bbr_rttprobe_gain_val; 723 /*******************************/ 724 /* Cache line 11 from bbr start */ 725 /*******************************/ 726 uint32_t cur_rtt_send_time; /* Time we sent our rtt measured packet */ 727 uint32_t bbr_peer_tsratio; /* Our calculated ts ratio to multply */ 728 uint32_t bbr_ts_check_tstmp; /* When we filled it the TS that came on the ack */ 729 uint32_t bbr_ts_check_our_cts; /* When we filled it the cts of the send */ 730 uint32_t rc_tlp_rxt_last_time; 731 uint32_t bbr_smallest_srtt_state2; 732 uint32_t bbr_hdwr_cnt_noset_snt; /* count of hw pacing sends during delay */ 733 uint32_t startup_last_srtt; 734 uint32_t rc_ack_hdwr_delay; 735 uint32_t highest_hdwr_delay; /* Largest delay we have seen from hardware */ 736 uint32_t non_gain_extra; 737 uint32_t recovery_lr; /* The sum of the loss rate from the pe's during recovery */ 738 uint32_t last_in_probertt; 739 uint32_t flightsize_at_drain; /* In draining what was the last marked flight size */ 740 uint32_t rc_pe_of_prtt; /* PE we went into probe-rtt */ 741 uint32_t ts_in; /* ts that went with the last rtt */ 742 743 uint16_t rc_tlp_seg_send_cnt; /* Number of times we have TLP sent 744 * rc_last_tlp_seq Lock(a) */ 745 uint16_t rc_drain_pg; 746 uint32_t rc_num_maps_alloced; /* num send map entries allocated */ 747 uint32_t rc_num_split_allocs; /* num split map entries allocated */ 748 uint16_t rc_num_small_maps_alloced; /* Number of sack blocks 749 * allocated */ 750 uint16_t bbr_hptsi_bytes_min; 751 752 uint16_t bbr_hptsi_segments_floor; 753 uint16_t bbr_utter_max; 754 uint16_t bbr_google_discount; 755 756 }; 757 758 759 struct socket; 760 struct tcp_bbr { 761 /* First cache line 0x00 */ 762 int32_t(*r_substate) (struct mbuf *, struct tcphdr *, 763 struct socket *, struct tcpcb *, struct tcpopt *, 764 int32_t, int32_t, uint32_t, int32_t, int32_t); /* Lock(a) */ 765 struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ 766 struct inpcb *rc_inp; /* The inpcb Lock(a) */ 767 struct timeval rc_tv; 768 uint32_t rc_pacer_started; /* Time we started the pacer */ 769 uint16_t no_pacing_until:8, /* No pacing until N packet epochs */ 770 ts_can_raise:1,/* TS b/w calculations can raise the bw higher */ 771 skip_gain:1, /* Skip the gain cycle (hardware pacing) */ 772 gain_is_limited:1, /* With hardware pacing we are limiting gain */ 773 output_error_seen:1, 774 oerror_cnt:4, 775 hw_pacing_set:1; /* long enough has passed for us to start pacing */ 776 uint16_t xxx_r_ack_count; /* During recovery count of ack's received 777 * that added data since output */ 778 uint16_t bbr_segs_rcvd; /* In Segment count since we sent a ack */ 779 780 uint8_t bbr_timer_src:4, /* Used for debugging Lock(a) */ 781 bbr_use_rack_cheat:1, /* Use the rack cheat */ 782 bbr_init_win_cheat:1, /* Send full IW for TSO */ 783 bbr_attempt_hdwr_pace:1,/* Try to do hardware pacing */ 784 bbr_hdrw_pacing:1; /* Hardware pacing is available */ 785 uint8_t bbr_hdw_pace_ena:1, /* Does the connection allow hardware pacing to be attempted */ 786 bbr_prev_in_rec:1, /* We were previously in recovery */ 787 pkt_conservation:1, 788 use_policer_detection:1, 789 xxx_bbr_hdw_pace_idx:4; /* If hardware pacing is on, index to slot in pace tbl */ 790 uint16_t r_wanted_output:1, 791 rtt_valid:1, 792 rc_timer_first:1, 793 rc_output_starts_timer:1, 794 rc_resends_use_tso:1, 795 rc_all_timers_stopped:1, 796 rc_loss_exit:1, 797 rc_ack_was_delayed:1, 798 rc_lt_is_sampling:1, 799 rc_filled_pipe:1, 800 rc_tlp_new_data:1, 801 rc_hit_state_1:1, 802 rc_ts_valid:1, 803 rc_prtt_set_ts:1, 804 rc_is_pkt_epoch_now:1, 805 rc_has_collapsed:1; 806 807 uint8_t r_state:4, /* Current bbr state Lock(a) */ 808 r_agg_early_set:1, /* Did we get called early */ 809 r_init_rtt:1, 810 r_use_policer:1, /* For google mode only */ 811 r_recovery_bw:1; 812 uint8_t r_timer_override:1, /* pacer override Lock(a) 0/1 */ 813 rc_in_persist:1, 814 rc_lt_use_bw:1, 815 rc_allow_data_af_clo:1, 816 rc_tlp_rtx_out:1, /* A TLP is in flight */ 817 rc_tlp_in_progress:1, /* a TLP timer is running needed? */ 818 rc_use_idle_restart:1; /* Do we restart fast after idle (persist or applim) */ 819 uint8_t rc_bbr_state:3, /* What is the major BBR state */ 820 rc_bbr_substate:3, /* For probeBW state */ 821 r_is_v6:1, 822 rc_past_init_win:1; 823 uint8_t rc_last_options; 824 uint8_t rc_tlp_threshold; /* Socket option value Lock(a) */ 825 uint8_t rc_max_rto_sec; 826 uint8_t rc_cwnd_limited:1, /* We are cwnd limited */ 827 rc_tmr_stopped:7; /* What timers have been stopped */ 828 uint8_t rc_use_google:1, 829 rc_use_ts_limit:1, 830 rc_ts_data_set:1, /* We have filled a set point to determine */ 831 rc_ts_clock_set:1, /* We have determined the ts type */ 832 rc_ts_cant_be_used:1, /* We determined we can't use ts values */ 833 rc_ack_is_cumack:1, 834 rc_no_pacing:1, 835 alloc_limit_reported:1; 836 uint8_t rc_init_win; 837 /* Cache line 2 0x40 */ 838 struct bbr_control r_ctl; 839 #ifdef _KERNEL 840 } __aligned(CACHE_LINE_SIZE); 841 #else 842 }; 843 #endif 844 845 #endif 846