/*-
 * Copyright (c) 2016-9 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/tree.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_tlp_thresh = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_min_pace_time = 0;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;		/* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t use_rack_cheat = 1;
static int32_t rack_persist_min = 250;	/* 250ms */
static int32_t rack_persist_max = 1000;	/* 1 Second */
static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
static int32_t rack_hw_tls_max_seg = 0;	/* 0 means use hw-tls single segment */

/*
 * Currently regular tcp has a rto_min of 30ms; the backoff goes 12 times,
 * so that ends up being a total of 122.850 seconds before a
 * connection is killed.
 */
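/*
 * Editorial illustration of the arithmetic above (assuming the usual
 * exponential doubling of the retransmit timer): the successive timeouts
 * are 30, 60, 120, ..., 30 * 2^11 ms, so the total across the 12 backoffs
 * is
 *
 *     30 * (2^12 - 1) = 30 * 4095 = 122,850 ms = 122.850 seconds.
 */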
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000;	/* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
static int32_t rack_per_of_gp = 50;

/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;

/* Counters for HW TLS */
counter_u64_t rack_tls_rwnd;
counter_u64_t rack_tls_cwnd;
counter_u64_t rack_tls_app;
counter_u64_t rack_tls_other;
counter_u64_t rack_tls_filled;
counter_u64_t rack_tls_rxt;
counter_u64_t rack_tls_tlp;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen,
    int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

int32_t rack_clear_counter = 0;

static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
    uint32_t stat;
    int32_t error;

    error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
    if (error || req->newptr == NULL)
        return error;

    error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
    if (error)
        return (error);
    if (stat == 1) {
#ifdef INVARIANTS
        printf("Clearing RACK counters\n");
#endif
        counter_u64_zero(rack_badfr);
        counter_u64_zero(rack_badfr_bytes);
        counter_u64_zero(rack_rtm_prr_retran);
        counter_u64_zero(rack_rtm_prr_newdata);
        counter_u64_zero(rack_timestamp_mismatch);
        counter_u64_zero(rack_reorder_seen);
        counter_u64_zero(rack_tlp_tot);
        counter_u64_zero(rack_tlp_newdata);
        counter_u64_zero(rack_tlp_retran);
        counter_u64_zero(rack_tlp_retran_bytes);
        counter_u64_zero(rack_tlp_retran_fail);
        counter_u64_zero(rack_to_tot);
        counter_u64_zero(rack_to_arm_rack);
        counter_u64_zero(rack_to_arm_tlp);
        counter_u64_zero(rack_paced_segments);
        counter_u64_zero(rack_calc_zero);
        counter_u64_zero(rack_calc_nonzero);
        counter_u64_zero(rack_unpaced_segments);
        counter_u64_zero(rack_saw_enobuf);
        counter_u64_zero(rack_saw_enetunreach);
        counter_u64_zero(rack_per_timer_hole);
        counter_u64_zero(rack_to_alloc_hard);
        counter_u64_zero(rack_to_alloc_emerg);
        counter_u64_zero(rack_sack_proc_all);
        counter_u64_zero(rack_sack_proc_short);
        counter_u64_zero(rack_sack_proc_restart);
        counter_u64_zero(rack_to_alloc);
        counter_u64_zero(rack_to_alloc_limited);
        counter_u64_zero(rack_alloc_limited_conns);
        counter_u64_zero(rack_split_limited);
        counter_u64_zero(rack_find_high);
        counter_u64_zero(rack_tls_rwnd);
        counter_u64_zero(rack_tls_cwnd);
        counter_u64_zero(rack_tls_app);
        counter_u64_zero(rack_tls_other);
        counter_u64_zero(rack_tls_filled);
        counter_u64_zero(rack_tls_rxt);
        counter_u64_zero(rack_tls_tlp);
        counter_u64_zero(rack_sack_attacks_detected);
        counter_u64_zero(rack_sack_attacks_reversed);
        counter_u64_zero(rack_sack_used_next_merge);
        counter_u64_zero(rack_sack_used_prev_merge);
        counter_u64_zero(rack_sack_splits);
        counter_u64_zero(rack_sack_skipped_acked);
        counter_u64_zero(rack_ack_total);
        counter_u64_zero(rack_express_sack);
        counter_u64_zero(rack_sack_total);
        counter_u64_zero(rack_move_none);
        counter_u64_zero(rack_move_some);
        counter_u64_zero(rack_used_tlpmethod);
        counter_u64_zero(rack_used_tlpmethod2);
        counter_u64_zero(rack_enter_tlp_calc);
        counter_u64_zero(rack_progress_drops);
        counter_u64_zero(rack_tlp_does_nada);
        counter_u64_zero(rack_collapsed_win);
    }
    rack_clear_counter = 0;
    return (0);
}
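
/*
 * Editorial note (usage sketch): the handler above is attached in
 * rack_init_sysctls() below as the "clear" node under rack_sysctl_root.
 * Assuming the stack registers that root as net.inet.tcp.rack (the root
 * OID itself is created outside this excerpt), the counters can be
 * zeroed from userland with something like:
 *
 *     sysctl net.inet.tcp.rack.clear=1
 *
 * Any value other than 1 is accepted but leaves the counters untouched,
 * and the handler resets rack_clear_counter to 0 before returning.
 */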

static void
rack_init_sysctls(void)
{
    struct sysctl_oid *rack_counters;
    struct sysctl_oid *rack_attack;

    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rate_sample_method", CTLFLAG_RW,
        &rack_rate_sample_method, USE_RTT_LOW,
        "What method should we use for rate sampling 0=high, 1=low ");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
        &rack_hw_tls_max_seg, 0,
        "Do we have a multiplier of TLS records we can send as a max (0=1 TLS record)? ");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "data_after_close", CTLFLAG_RW,
        &rack_ignore_data_after_close, 0,
        "Do we hold off sending a RST until all pending data is ack'd");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "cheat_rxt", CTLFLAG_RW,
        &use_rack_cheat, 1,
        "Do we use the rxt cheat for rack?");

    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "persmin", CTLFLAG_RW,
        &rack_persist_min, 250,
        "What is the minimum time in milliseconds between persists");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "persmax", CTLFLAG_RW,
        &rack_persist_max, 1000,
        "What is the largest delay in milliseconds between persists");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "no_sack_needed", CTLFLAG_RW,
        &rack_sack_not_required, 0,
        "Do we allow rack to run on connections not supporting SACK?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlpmethod", CTLFLAG_RW,
        &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
        "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "gp_percentage", CTLFLAG_RW,
        &rack_per_of_gp, 50,
        "Do we pace to percentage of goodput (0=old method)?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "min_pace_time", CTLFLAG_RW,
        &rack_min_pace_time, 0,
        "Should we enforce a minimum pace time of 1ms");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "bb_verbose", CTLFLAG_RW,
        &rack_verbose_logging, 0,
        "Should RACK black box logging be verbose");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "sackfiltering", CTLFLAG_RW,
        &rack_use_sack_filter, 1,
        "Do we use sack filtering?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "delayed_ack", CTLFLAG_RW,
        &rack_delayed_ack_time, 200,
        "Delayed ack time (200ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlpminto", CTLFLAG_RW,
        &rack_tlp_min, 10,
        "TLP minimum timeout per the specification (10ms)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "send_oldest", CTLFLAG_RW,
        &rack_always_send_oldest, 1,
        "Should we always send the oldest TLP and RACK-TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rack_tlimit", CTLFLAG_RW,
        &rack_limited_retran, 0,
        "How many times can a rack timeout drive out sends");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "minrto", CTLFLAG_RW,
        &rack_rto_min, 0,
        "Minimum RTO in ms -- set with caution below 1000 due to TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "maxrto", CTLFLAG_RW,
        &rack_rto_max, 0,
        "Maximum RTO in ms -- should be at least as large as min_rto");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_retry", CTLFLAG_RW,
        &rack_tlp_max_resend, 2,
        "How many times does TLP retry a single segment or multiple with no ACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
        &rack_use_proportional_reduce, 0,
        "Should we proportionally reduce cwnd based on the number of losses ");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "recovery_prop", CTLFLAG_RW,
        &rack_proportional_rate, 10,
        "What percent reduction per loss");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
        &rack_lower_cwnd_at_tlp, 0,
        "When a TLP completes a retran should we enter recovery?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
        &rack_slot_reduction, 4,
        "When setting a slot should we reduce by divisor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
        &rack_pace_every_seg, 0,
        "Should we use the original pacing mechanism that did not pace much?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
        &rack_hptsi_segments, 40,
        "Should we pace out only a limited size of segments");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "prr_sendalot", CTLFLAG_RW,
        &rack_send_a_lot_in_prr, 1,
        "Send a lot in prr");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "minto", CTLFLAG_RW,
        &rack_min_to, 1,
        "Minimum rack timeout in milliseconds");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "earlyrecovery", CTLFLAG_RW,
        &rack_early_recovery, 1,
        "Do we do early recovery with rack");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "reorder_thresh", CTLFLAG_RW,
        &rack_reorder_thresh, 2,
        "What factor for rack will be added when seeing reordering (shift right)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
        &rack_tlp_thresh, 1,
        "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "reorder_fade", CTLFLAG_RW,
        &rack_reorder_fade, 0,
        "Does reorder detection fade, if so how many ms (0 means never)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "pktdelay", CTLFLAG_RW,
        &rack_pkt_delay, 1,
        "Extra RACK time (in ms) besides reordering thresh");

    rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "stats",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Counters");
    rack_badfr = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr", CTLFLAG_RD,
        &rack_badfr, "Total number of bad FRs");
    rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "badfr_bytes", CTLFLAG_RD,
        &rack_badfr_bytes, "Total bytes of bad FRs");
    rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndret", CTLFLAG_RD,
        &rack_rtm_prr_retran,
        "Total number of prr based retransmits");
    rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prrsndnew", CTLFLAG_RD,
        &rack_rtm_prr_newdata,
        "Total number of prr based new transmits");
    rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tsnf", CTLFLAG_RD,
        &rack_timestamp_mismatch,
        "Total number of timestamps that we could not find the reported ts");
    rack_find_high = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "findhigh", CTLFLAG_RD,
        &rack_find_high,
        "Total number of FIN causing find-high");
    rack_reorder_seen = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "reordering", CTLFLAG_RD,
        &rack_reorder_seen,
        "Total number of times we added delay due to reordering");
    rack_tlp_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_to_total", CTLFLAG_RD,
        &rack_tlp_tot,
        "Total number of tail loss probe expirations");
    rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_new", CTLFLAG_RD,
        &rack_tlp_newdata,
        "Total number of tail loss probe sending new data");

    rack_tlp_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran", CTLFLAG_RD,
        &rack_tlp_retran,
        "Total number of tail loss probe sending retransmitted data");
    rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
        &rack_tlp_retran_bytes,
        "Total bytes of tail loss probe sending retransmitted data");
    rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
        &rack_tlp_retran_fail,
        "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
    rack_to_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "rack_to_tot", CTLFLAG_RD,
        &rack_to_tot,
        "Total number of times the rack timeout expired");
    rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_rack", CTLFLAG_RD,
        &rack_to_arm_rack,
        "Total number of times the rack timer armed");
    rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "arm_tlp", CTLFLAG_RD,
        &rack_to_arm_tlp,
        "Total number of times the tlp timer armed");

    rack_calc_zero = counter_u64_alloc(M_WAITOK);
    rack_calc_nonzero = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_zero", CTLFLAG_RD,
        &rack_calc_zero,
        "Total number of times pacing time worked out to zero");
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "calc_nonzero", CTLFLAG_RD,
        &rack_calc_nonzero,
        "Total number of times pacing time worked out to non-zero");
    rack_paced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "paced", CTLFLAG_RD,
        &rack_paced_segments,
        "Total number of times a segment send caused hptsi");
    rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "unpaced", CTLFLAG_RD,
        &rack_unpaced_segments,
        "Total number of times a segment did not cause hptsi");
    rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enobufs", CTLFLAG_RD,
        &rack_saw_enobuf,
        "Total number of times a send returned ENOBUFS");
    rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
        &rack_saw_enetunreach,
        "Total number of times a send returned ENETUNREACH");
    rack_to_alloc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocs", CTLFLAG_RD,
        &rack_to_alloc,
        "Total allocations of tracking structures");
    rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allochard", CTLFLAG_RD,
        &rack_to_alloc_hard,
        "Total allocations done with sleeping the hard way");
    rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "allocemerg", CTLFLAG_RD,
        &rack_to_alloc_emerg,
        "Total allocations done from emergency cache");
    rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited", CTLFLAG_RD,
        &rack_to_alloc_limited,
        "Total allocations dropped due to limit");
    rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
        &rack_alloc_limited_conns,
        "Connections with allocations dropped due to limit");
    rack_split_limited = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "split_limited", CTLFLAG_RD,
        &rack_split_limited,
        "Split allocations dropped due to limit");
    rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_long", CTLFLAG_RD,
        &rack_sack_proc_all,
        "Total times we had to walk whole list for sack processing");

    rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_restart", CTLFLAG_RD,
        &rack_sack_proc_restart,
        "Total times we had to walk whole list due to a restart");
    rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "sack_short", CTLFLAG_RD,
        &rack_sack_proc_short,
        "Total times we took shortcut for sack processing");
    rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
        &rack_enter_tlp_calc,
        "Total times we called calc-tlp");
    rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
        &rack_used_tlpmethod,
        "Total number of runt sacks");
    rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
        &rack_used_tlpmethod2,
        "Total number of times we hit TLP method 2");
    /* Sack Attacker detection stuff */
    rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO,
        "sack_attack",
        CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
        "Rack Sack Attack Counters and Controls");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
        &rack_highest_sack_thresh_seen, 0,
        "Highest sack to ack ratio seen");
    SYSCTL_ADD_U32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
        &rack_highest_move_thresh_seen, 0,
        "Highest move to non-move ratio seen");
    rack_ack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "acktotal", CTLFLAG_RD,
        &rack_ack_total,
        "Total number of Ack's");

    rack_express_sack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
        &rack_express_sack,
        "Total express number of Sack's");
    rack_sack_total = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "sacktotal", CTLFLAG_RD,
        &rack_sack_total,
        "Total number of SACK's");
    rack_move_none = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_none", CTLFLAG_RD,
        &rack_move_none,
        "Total number of SACK index reuse of positions under threshold");
    rack_move_some = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "move_some", CTLFLAG_RD,
        &rack_move_some,
        "Total number of SACK index reuse of positions over threshold");
    rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "attacks", CTLFLAG_RD,
        &rack_sack_attacks_detected,
        "Total number of SACK attackers that had sack disabled");
    rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "reversed", CTLFLAG_RD,
        &rack_sack_attacks_reversed,
        "Total number of SACK attackers that were later determined false positive");
    rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "nextmerge", CTLFLAG_RD,
        &rack_sack_used_next_merge,
        "Total number of times we used the next merge");
    rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "prevmerge", CTLFLAG_RD,
        &rack_sack_used_prev_merge,
        "Total number of times we used the prev merge");
    rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "skipacked", CTLFLAG_RD,
        &rack_sack_skipped_acked,
        "Total number of times we skipped previously sacked");
    rack_sack_splits = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_attack),
        OID_AUTO, "ofsplit", CTLFLAG_RD,
        &rack_sack_splits,
        "Total number of times we did the old fashioned tree split");
    rack_progress_drops = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "prog_drops", CTLFLAG_RD,
        &rack_progress_drops,
        "Total number of progress drops");
    rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
        &rack_input_idle_reduces,
        "Total number of idle reductions on input");
    rack_collapsed_win = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "collapsed_win", CTLFLAG_RD,
        &rack_collapsed_win,
        "Total number of collapsed windows");
    rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tlp_nada", CTLFLAG_RD,
        &rack_tlp_does_nada,
        "Total number of nada tlp calls");

    rack_tls_rwnd = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_rwnd", CTLFLAG_RD,
        &rack_tls_rwnd,
        "Total hdwr tls rwnd limited");

    rack_tls_cwnd = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_cwnd", CTLFLAG_RD,
        &rack_tls_cwnd,
        "Total hdwr tls cwnd limited");

    rack_tls_app = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_app", CTLFLAG_RD,
        &rack_tls_app,
        "Total hdwr tls app limited");

    rack_tls_other = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_other", CTLFLAG_RD,
        &rack_tls_other,
        "Total hdwr tls other limited");

    rack_tls_filled = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_filled", CTLFLAG_RD,
        &rack_tls_filled,
        "Total hdwr tls filled");

    rack_tls_rxt = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_rxt", CTLFLAG_RD,
        &rack_tls_rxt,
        "Total hdwr rxt");

    rack_tls_tlp = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "tls_tlp", CTLFLAG_RD,
        &rack_tls_tlp,
        "Total hdwr tls tlp");
    rack_per_timer_hole = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_counters),
        OID_AUTO, "timer_hole", CTLFLAG_RD,
        &rack_per_timer_hole,
        "Total persists start in timer hole");

    COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
    SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "outsize", CTLFLAG_RD,
        rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
    COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
    SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "opts", CTLFLAG_RD,
        rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
    SYSCTL_ADD_PROC(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
        &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}

static __inline int
rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a)
{
    if (SEQ_GEQ(b->r_start, a->r_start) &&
        SEQ_LT(b->r_start, a->r_end)) {
        /*
         * The entry b is within the
         * block a. i.e.:
         * a --   |-------------|
         * b --   |----|
         * <or>
         * b --       |------|
         * <or>
         * b --   |-----------|
         */
        return (0);
    } else if (SEQ_GEQ(b->r_start, a->r_end)) {
        /*
         * b falls at or beyond the end of a,
         * so a is said to be smaller than b.
         * i.e.:
         * a --   |------|
         * b --          |--------|
         * or
         * b --              |-----|
         */
        return (1);
    }
    /*
     * What's left is where a is
     * larger than b. i.e.:
     * a --         |-------|
     * b --  |---|
     * or even possibly
     * b --  |--------------|
     */
    return (-1);
}

RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp);
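
/*
 * Editorial note: an illustrative sketch (not part of the original file)
 * of how the comparator above is typically used.  Because rb_map_cmp()
 * reports "equal" whenever the key's r_start falls inside
 * [r_start, r_end) of a tree node, an RB_FIND with a one-sequence key
 * returns the sendmap block covering that sequence number.  The helper
 * name below is hypothetical, and the tree root is assumed to live at
 * rack->r_ctl.rc_mtree as declared in tcp_rack.h.
 */
#if 0	/* illustrative sketch only, not compiled */
static struct rack_sendmap *
example_find_rsm_for_seq(struct tcp_rack *rack, tcp_seq seq)
{
    struct rack_sendmap fe;

    fe.r_start = seq;	/* only r_start of the key is examined */
    return (RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe));
}
#endif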

static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
    if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
        if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
            /*
             * There is an assumption that the caller
             * will drop the connection so we will
             * increment the counters here.
             */
            struct tcp_rack *rack;

            rack = (struct tcp_rack *)tp->t_fb_ptr;
            counter_u64_add(rack_progress_drops, 1);
#ifdef NETFLIX_STATS
            KMOD_TCPSTAT_INC(tcps_progdrops);
#endif
            rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
            return (1);
        }
    }
    return (0);
}


static void
rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = tsused;
        log.u_bbr.flex2 = thresh;
        log.u_bbr.flex3 = rsm->r_flags;
        log.u_bbr.flex4 = rsm->r_dupack;
        log.u_bbr.flex5 = rsm->r_start;
        log.u_bbr.flex6 = rsm->r_end;
        log.u_bbr.flex8 = mod;
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_SETTINGS_CHG, 0,
            0, &log, false, &tv);
    }
}


static void
rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
        log.u_bbr.flex2 = to;
        log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex4 = slot;
        log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
        log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
        log.u_bbr.flex7 = rack->rc_in_persist;
        log.u_bbr.flex8 = which;
        log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TIMERSTAR, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex8 = to_num;
        log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
        log.u_bbr.flex2 = rack->rc_rack_rtt;
        log.u_bbr.flex3 = no;
        log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_RTO, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
    uint32_t o_srtt, uint32_t o_var)
{
    if (tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = t;
        log.u_bbr.flex2 = o_srtt;
        log.u_bbr.flex3 = o_var;
        log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
        log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
        log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
        log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
        log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
        log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRRTT, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
{
    /*
     * Log the rtt sample we are
     * applying to the srtt algorithm in
     * useconds.
     */
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        /* Convert our ms to a microsecond */
        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = rtt * 1000;
        log.u_bbr.flex2 = rack->r_ctl.ack_count;
        log.u_bbr.flex3 = rack->r_ctl.sack_count;
        log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
        log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra;
        log.u_bbr.flex8 = rack->sack_attack_disable;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_LOG_RTT, 0,
            0, &log, false, &tv);
    }
}


static inline void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
{
    if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = tick;
        log.u_bbr.flex3 = tp->t_maxunacktime;
        log.u_bbr.flex4 = tp->t_acktime;
        log.u_bbr.flex8 = event;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_PROGRESS, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = slot;
        log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
        log.u_bbr.flex8 = rack->rc_in_persist;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRSND, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = did_out;
        log.u_bbr.flex2 = nxt_pkt;
        log.u_bbr.flex3 = way_out;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
        log.u_bbr.flex7 = rack->r_wanted_output;
        log.u_bbr.flex8 = rack->rc_in_persist;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_DOSEG_DONE, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm)
{
    if (tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;
        uint32_t cts;

        memset(&log, 0, sizeof(log));
        cts = tcp_get_usecs(&tv);
        log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
        log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
        log.u_bbr.flex4 = len;
        log.u_bbr.flex5 = orig_len;
        log.u_bbr.flex6 = rack->r_ctl.rc_sacked;
        log.u_bbr.flex7 = mod;
        log.u_bbr.flex8 = frm;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(tp, NULL,
            &tp->t_inpcb->inp_socket->so_rcv,
            &tp->t_inpcb->inp_socket->so_snd,
            TCP_HDWR_TLS, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = slot;
        log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex7 = hpts_calling;
        log.u_bbr.flex8 = rack->rc_in_persist;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_JUSTRET, 0,
            tlen, &log, false, &tv);
    }
}

static void
rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = 0;
        log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex4 = 0;
        log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
        log.u_bbr.flex8 = hpts_removed;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TIMERCANC, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = timers;
        log.u_bbr.flex2 = ret;
        log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex5 = cts;
        log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TO_PROCESS, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_to_prr(struct tcp_rack *rack, int frm)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
        log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
        log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
        log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
        log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
        log.u_bbr.flex8 = frm;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRUPD, 0,
            0, &log, false, &tv);
    }
}

#ifdef NETFLIX_EXP_DETECTION
static void
rack_log_sad(struct tcp_rack *rack, int event)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = rack->r_ctl.sack_count;
        log.u_bbr.flex2 = rack->r_ctl.ack_count;
        log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra;
        log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move;
        log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced;
        log.u_bbr.flex6 = tcp_sack_to_ack_thresh;
        log.u_bbr.pkts_out = tcp_sack_to_move_thresh;
        log.u_bbr.lt_epoch = (tcp_force_detection << 8);
        log.u_bbr.lt_epoch |= rack->do_detection;
        log.u_bbr.applimited = tcp_map_minimum;
        log.u_bbr.flex7 = rack->sack_attack_disable;
        log.u_bbr.flex8 = event;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.delivered = tcp_sad_decay_val;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_SAD_DETECTION, 0,
            0, &log, false, &tv);
    }
}
#endif
counter_u64_free(rack_rtm_prr_newdata); 1505 counter_u64_free(rack_timestamp_mismatch); 1506 counter_u64_free(rack_reorder_seen); 1507 counter_u64_free(rack_tlp_tot); 1508 counter_u64_free(rack_tlp_newdata); 1509 counter_u64_free(rack_tlp_retran); 1510 counter_u64_free(rack_tlp_retran_bytes); 1511 counter_u64_free(rack_tlp_retran_fail); 1512 counter_u64_free(rack_to_tot); 1513 counter_u64_free(rack_to_arm_rack); 1514 counter_u64_free(rack_to_arm_tlp); 1515 counter_u64_free(rack_paced_segments); 1516 counter_u64_free(rack_unpaced_segments); 1517 counter_u64_free(rack_saw_enobuf); 1518 counter_u64_free(rack_saw_enetunreach); 1519 counter_u64_free(rack_to_alloc_hard); 1520 counter_u64_free(rack_to_alloc_emerg); 1521 counter_u64_free(rack_sack_proc_all); 1522 counter_u64_free(rack_sack_proc_short); 1523 counter_u64_free(rack_sack_proc_restart); 1524 counter_u64_free(rack_to_alloc); 1525 counter_u64_free(rack_to_alloc_limited); 1526 counter_u64_free(rack_alloc_limited_conns); 1527 counter_u64_free(rack_split_limited); 1528 counter_u64_free(rack_find_high); 1529 counter_u64_free(rack_enter_tlp_calc); 1530 counter_u64_free(rack_used_tlpmethod); 1531 counter_u64_free(rack_used_tlpmethod2); 1532 counter_u64_free(rack_progress_drops); 1533 counter_u64_free(rack_input_idle_reduces); 1534 counter_u64_free(rack_collapsed_win); 1535 counter_u64_free(rack_tlp_does_nada); 1536 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 1537 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 1538 } 1539 1540 static struct rack_sendmap * 1541 rack_alloc(struct tcp_rack *rack) 1542 { 1543 struct rack_sendmap *rsm; 1544 1545 rsm = uma_zalloc(rack_zone, M_NOWAIT); 1546 if (rsm) { 1547 rack->r_ctl.rc_num_maps_alloced++; 1548 counter_u64_add(rack_to_alloc, 1); 1549 return (rsm); 1550 } 1551 if (rack->rc_free_cnt) { 1552 counter_u64_add(rack_to_alloc_emerg, 1); 1553 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 1554 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 1555 rack->rc_free_cnt--; 1556 return (rsm); 1557 } 1558 return (NULL); 1559 } 1560 1561 static struct rack_sendmap * 1562 rack_alloc_full_limit(struct tcp_rack *rack) 1563 { 1564 if ((V_tcp_map_entries_limit > 0) && 1565 (rack->do_detection == 0) && 1566 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 1567 counter_u64_add(rack_to_alloc_limited, 1); 1568 if (!rack->alloc_limit_reported) { 1569 rack->alloc_limit_reported = 1; 1570 counter_u64_add(rack_alloc_limited_conns, 1); 1571 } 1572 return (NULL); 1573 } 1574 return (rack_alloc(rack)); 1575 } 1576 1577 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 1578 static struct rack_sendmap * 1579 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 1580 { 1581 struct rack_sendmap *rsm; 1582 1583 if (limit_type) { 1584 /* currently there is only one limit type */ 1585 if (V_tcp_map_split_limit > 0 && 1586 (rack->do_detection == 0) && 1587 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 1588 counter_u64_add(rack_split_limited, 1); 1589 if (!rack->alloc_limit_reported) { 1590 rack->alloc_limit_reported = 1; 1591 counter_u64_add(rack_alloc_limited_conns, 1); 1592 } 1593 return (NULL); 1594 } 1595 } 1596 1597 /* allocate and mark in the limit type, if set */ 1598 rsm = rack_alloc(rack); 1599 if (rsm != NULL && limit_type) { 1600 rsm->r_limit_type = limit_type; 1601 rack->r_ctl.rc_num_split_allocs++; 1602 } 1603 return (rsm); 1604 } 1605 1606 static void 1607 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 1608 { 1609 if (rsm->r_limit_type) { 1610 /* 
currently there is only one limit type */ 1611 rack->r_ctl.rc_num_split_allocs--; 1612 } 1613 if (rack->r_ctl.rc_tlpsend == rsm) 1614 rack->r_ctl.rc_tlpsend = NULL; 1615 if (rack->r_ctl.rc_sacklast == rsm) 1616 rack->r_ctl.rc_sacklast = NULL; 1617 if (rack->rc_free_cnt < rack_free_cache) { 1618 memset(rsm, 0, sizeof(struct rack_sendmap)); 1619 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 1620 rsm->r_limit_type = 0; 1621 rack->rc_free_cnt++; 1622 return; 1623 } 1624 rack->r_ctl.rc_num_maps_alloced--; 1625 uma_zfree(rack_zone, rsm); 1626 } 1627 1628 /* 1629 * CC wrapper hook functions 1630 */ 1631 static void 1632 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 1633 uint16_t type, int32_t recovery) 1634 { 1635 #ifdef STATS 1636 int32_t gput; 1637 #endif 1638 1639 INP_WLOCK_ASSERT(tp->t_inpcb); 1640 tp->ccv->nsegs = nsegs; 1641 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 1642 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 1643 uint32_t max; 1644 1645 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 1646 if (tp->ccv->bytes_this_ack > max) { 1647 tp->ccv->bytes_this_ack = max; 1648 } 1649 } 1650 if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) || 1651 (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) && 1652 (tp->snd_cwnd < (ctf_flight_size(tp, rack->r_ctl.rc_sacked) * 2)))) 1653 tp->ccv->flags |= CCF_CWND_LIMITED; 1654 else 1655 tp->ccv->flags &= ~CCF_CWND_LIMITED; 1656 1657 if (type == CC_ACK) { 1658 #ifdef STATS 1659 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 1660 ((int32_t) tp->snd_cwnd) - tp->snd_wnd); 1661 if ((tp->t_flags & TF_GPUTINPROG) && 1662 SEQ_GEQ(th->th_ack, tp->gput_ack)) { 1663 gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / 1664 max(1, tcp_ts_getticks() - tp->gput_ts); 1665 /* We store it in bytes per ms (or kbytes per sec) */ 1666 rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8; 1667 rack->r_ctl.rc_gp_hist_idx++; 1668 if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST) 1669 rack->r_ctl.rc_gp_hist_filled = 1; 1670 rack->r_ctl.rc_gp_hist_idx %= RACK_GP_HIST; 1671 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 1672 gput); 1673 /* 1674 * XXXLAS: This is a temporary hack, and should be 1675 * chained off VOI_TCP_GPUT when stats(9) grows an 1676 * API to deal with chained VOIs. 1677 */ 1678 if (tp->t_stats_gput_prev > 0) 1679 stats_voi_update_abs_s32(tp->t_stats, 1680 VOI_TCP_GPUT_ND, 1681 ((gput - tp->t_stats_gput_prev) * 100) / 1682 tp->t_stats_gput_prev); 1683 tp->t_flags &= ~TF_GPUTINPROG; 1684 tp->t_stats_gput_prev = gput; 1685 #ifdef NETFLIX_PEAKRATE 1686 if (tp->t_maxpeakrate) { 1687 /* 1688 * We update t_peakrate_thr. This gives us roughly 1689 * one update per round trip time. 
1690 */ 1691 tcp_update_peakrate_thr(tp); 1692 } 1693 #endif 1694 } 1695 #endif 1696 if (tp->snd_cwnd > tp->snd_ssthresh) { 1697 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 1698 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 1699 if (tp->t_bytes_acked >= tp->snd_cwnd) { 1700 tp->t_bytes_acked -= tp->snd_cwnd; 1701 tp->ccv->flags |= CCF_ABC_SENTAWND; 1702 } 1703 } else { 1704 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 1705 tp->t_bytes_acked = 0; 1706 } 1707 } 1708 if (CC_ALGO(tp)->ack_received != NULL) { 1709 /* XXXLAS: Find a way to live without this */ 1710 tp->ccv->curack = th->th_ack; 1711 CC_ALGO(tp)->ack_received(tp->ccv, type); 1712 } 1713 #ifdef STATS 1714 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); 1715 #endif 1716 if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { 1717 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; 1718 } 1719 /* we enforce max peak rate if it is set. */ 1720 if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { 1721 tp->snd_cwnd = tp->t_peakrate_thr; 1722 } 1723 } 1724 1725 static void 1726 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 1727 { 1728 struct tcp_rack *rack; 1729 1730 rack = (struct tcp_rack *)tp->t_fb_ptr; 1731 INP_WLOCK_ASSERT(tp->t_inpcb); 1732 if (rack->r_ctl.rc_prr_sndcnt > 0) 1733 rack->r_wanted_output++; 1734 } 1735 1736 static void 1737 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 1738 { 1739 struct tcp_rack *rack; 1740 1741 INP_WLOCK_ASSERT(tp->t_inpcb); 1742 rack = (struct tcp_rack *)tp->t_fb_ptr; 1743 if (CC_ALGO(tp)->post_recovery != NULL) { 1744 tp->ccv->curack = th->th_ack; 1745 CC_ALGO(tp)->post_recovery(tp->ccv); 1746 } 1747 /* 1748 * Here we can in theory adjust cwnd to be based on the number of 1749 * losses in the window (rack->r_ctl.rc_loss_count). This is done 1750 * based on the rack_use_proportional flag. 
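 * For example, with illustrative values of rc_prop_rate = 10 and
 * three losses in the window, reduce works out to 30, i.e. cwnd is
 * cut by 30% (the reduction is capped at 50%). Otherwise cwnd is
 * simply dropped to ssthresh (if it is above it) on exiting recovery.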
1751 */ 1752 if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { 1753 int32_t reduce; 1754 1755 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); 1756 if (reduce > 50) { 1757 reduce = 50; 1758 } 1759 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); 1760 } else { 1761 if (tp->snd_cwnd > tp->snd_ssthresh) { 1762 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 1763 tp->snd_cwnd = tp->snd_ssthresh; 1764 } 1765 } 1766 if (rack->r_ctl.rc_prr_sndcnt > 0) { 1767 /* Suck the next prr cnt back into cwnd */ 1768 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 1769 rack->r_ctl.rc_prr_sndcnt = 0; 1770 rack_log_to_prr(rack, 1); 1771 } 1772 tp->snd_recover = tp->snd_una; 1773 EXIT_RECOVERY(tp->t_flags); 1774 1775 1776 } 1777 1778 static void 1779 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 1780 { 1781 struct tcp_rack *rack; 1782 1783 INP_WLOCK_ASSERT(tp->t_inpcb); 1784 1785 rack = (struct tcp_rack *)tp->t_fb_ptr; 1786 switch (type) { 1787 case CC_NDUPACK: 1788 tp->t_flags &= ~TF_WASFRECOVERY; 1789 tp->t_flags &= ~TF_WASCRECOVERY; 1790 if (!IN_FASTRECOVERY(tp->t_flags)) { 1791 rack->r_ctl.rc_tlp_rtx_out = 0; 1792 rack->r_ctl.rc_prr_delivered = 0; 1793 rack->r_ctl.rc_prr_out = 0; 1794 rack->r_ctl.rc_loss_count = 0; 1795 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 1796 rack_log_to_prr(rack, 2); 1797 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 1798 tp->snd_recover = tp->snd_max; 1799 if (tp->t_flags2 & TF2_ECN_PERMIT) 1800 tp->t_flags2 |= TF2_ECN_SND_CWR; 1801 } 1802 break; 1803 case CC_ECN: 1804 if (!IN_CONGRECOVERY(tp->t_flags)) { 1805 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 1806 tp->snd_recover = tp->snd_max; 1807 if (tp->t_flags2 & TF2_ECN_PERMIT) 1808 tp->t_flags2 |= TF2_ECN_SND_CWR; 1809 } 1810 break; 1811 case CC_RTO: 1812 tp->t_dupacks = 0; 1813 tp->t_bytes_acked = 0; 1814 EXIT_RECOVERY(tp->t_flags); 1815 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / 1816 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 1817 tp->snd_cwnd = ctf_fixed_maxseg(tp); 1818 if (tp->t_flags2 & TF2_ECN_PERMIT) 1819 tp->t_flags2 |= TF2_ECN_SND_CWR; 1820 break; 1821 case CC_RTO_ERR: 1822 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 1823 /* RTO was unnecessary, so reset everything. */ 1824 tp->snd_cwnd = tp->snd_cwnd_prev; 1825 tp->snd_ssthresh = tp->snd_ssthresh_prev; 1826 tp->snd_recover = tp->snd_recover_prev; 1827 if (tp->t_flags & TF_WASFRECOVERY) { 1828 ENTER_FASTRECOVERY(tp->t_flags); 1829 tp->t_flags &= ~TF_WASFRECOVERY; 1830 } 1831 if (tp->t_flags & TF_WASCRECOVERY) { 1832 ENTER_CONGRECOVERY(tp->t_flags); 1833 tp->t_flags &= ~TF_WASCRECOVERY; 1834 } 1835 tp->snd_nxt = tp->snd_max; 1836 tp->t_badrxtwin = 0; 1837 break; 1838 } 1839 1840 if (CC_ALGO(tp)->cong_signal != NULL) { 1841 if (th != NULL) 1842 tp->ccv->curack = th->th_ack; 1843 CC_ALGO(tp)->cong_signal(tp->ccv, type); 1844 } 1845 } 1846 1847 1848 1849 static inline void 1850 rack_cc_after_idle(struct tcpcb *tp) 1851 { 1852 uint32_t i_cwnd; 1853 1854 INP_WLOCK_ASSERT(tp->t_inpcb); 1855 1856 #ifdef NETFLIX_STATS 1857 KMOD_TCPSTAT_INC(tcps_idle_restarts); 1858 if (tp->t_state == TCPS_ESTABLISHED) 1859 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 1860 #endif 1861 if (CC_ALGO(tp)->after_idle != NULL) 1862 CC_ALGO(tp)->after_idle(tp->ccv); 1863 1864 if (tp->snd_cwnd == 1) 1865 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 1866 else 1867 i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); 1868 1869 /* 1870 * Being idle is no differnt than the initial window. 
If the cc 1871 * clamps it down below the initial window, raise it to the initial 1872 * window. 1873 */ 1874 if (tp->snd_cwnd < i_cwnd) { 1875 tp->snd_cwnd = i_cwnd; 1876 } 1877 } 1878 1879 1880 /* 1881 * Indicate whether this ack should be delayed. We can delay the ack if 1882 * the following conditions are met: 1883 * - There is no delayed ack timer in progress. 1884 * - Our last ack wasn't a 0-sized window. We never want to delay 1885 * the ack that opens up a 0-sized window. 1886 * - LRO wasn't used for this segment. We make sure by checking that the 1887 * segment size is not larger than the MSS. 1888 * - Delayed acks are enabled or this is a half-synchronized T/TCP 1889 * connection. 1890 */ 1891 #define DELAY_ACK(tp, tlen) \ 1892 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 1893 ((tp->t_flags & TF_DELACK) == 0) && \ 1894 (tlen <= tp->t_maxseg) && \ 1895 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 1896 1897 static struct rack_sendmap * 1898 rack_find_lowest_rsm(struct tcp_rack *rack) 1899 { 1900 struct rack_sendmap *rsm; 1901 1902 /* 1903 * Walk the time-order transmitted list looking for an rsm that is 1904 * not acked. This will be the one that was sent the longest time 1905 * ago that is still outstanding. 1906 */ 1907 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 1908 if (rsm->r_flags & RACK_ACKED) { 1909 continue; 1910 } 1911 goto finish; 1912 } 1913 finish: 1914 return (rsm); 1915 } 1916 1917 static struct rack_sendmap * 1918 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 1919 { 1920 struct rack_sendmap *prsm; 1921 1922 /* 1923 * Walk the sequence order list backward until we arrive at 1924 * the highest seq not acked. In theory when this is called it 1925 * should be the last segment (which it was not). 1926 */ 1927 counter_u64_add(rack_find_high, 1); 1928 prsm = rsm; 1929 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 1930 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 1931 continue; 1932 } 1933 return (prsm); 1934 } 1935 return (NULL); 1936 } 1937 1938 1939 static uint32_t 1940 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 1941 { 1942 int32_t lro; 1943 uint32_t thresh; 1944 1945 /* 1946 * lro is the flag we use to determine if we have seen reordering. 1947 * If it gets set we have seen reordering. The reorder logic either 1948 * works in one of two ways: 1949 * 1950 * If reorder-fade is configured, then we track the last time we saw 1951 * re-ordering occur. If we reach the point where enough time has 1952 * passed we no longer consider reordering as occurring. 1953 * 1954 * Or if reorder-fade is 0, then once we see reordering we consider 1955 * the connection to always be subject to reordering and just set lro 1956 * to 1. 1957 * 1958 * In the end if lro is non-zero we add the extra time for 1959 * reordering in. 1960 */ 1961 if (srtt == 0) 1962 srtt = 1; 1963 if (rack->r_ctl.rc_reorder_ts) { 1964 if (rack->r_ctl.rc_reorder_fade) { 1965 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 1966 lro = cts - rack->r_ctl.rc_reorder_ts; 1967 if (lro == 0) { 1968 /* 1969 * No time has passed since the last 1970 * reorder, mark it as reordering. 1971 */ 1972 lro = 1; 1973 } 1974 } else { 1975 /* Negative time?
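 * The recorded reorder timestamp is ahead of cts, so treat
 * it as though no reordering has been seen.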
*/ 1976 lro = 0; 1977 } 1978 if (lro > rack->r_ctl.rc_reorder_fade) { 1979 /* Turn off reordering seen too */ 1980 rack->r_ctl.rc_reorder_ts = 0; 1981 lro = 0; 1982 } 1983 } else { 1984 /* Reodering does not fade */ 1985 lro = 1; 1986 } 1987 } else { 1988 lro = 0; 1989 } 1990 thresh = srtt + rack->r_ctl.rc_pkt_delay; 1991 if (lro) { 1992 /* It must be set, if not you get 1/4 rtt */ 1993 if (rack->r_ctl.rc_reorder_shift) 1994 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 1995 else 1996 thresh += (srtt >> 2); 1997 } else { 1998 thresh += 1; 1999 } 2000 /* We don't let the rack timeout be above a RTO */ 2001 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 2002 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 2003 } 2004 /* And we don't want it above the RTO max either */ 2005 if (thresh > rack_rto_max) { 2006 thresh = rack_rto_max; 2007 } 2008 return (thresh); 2009 } 2010 2011 static uint32_t 2012 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 2013 struct rack_sendmap *rsm, uint32_t srtt) 2014 { 2015 struct rack_sendmap *prsm; 2016 uint32_t thresh, len; 2017 int maxseg; 2018 2019 if (srtt == 0) 2020 srtt = 1; 2021 if (rack->r_ctl.rc_tlp_threshold) 2022 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 2023 else 2024 thresh = (srtt * 2); 2025 2026 /* Get the previous sent packet, if any */ 2027 maxseg = ctf_fixed_maxseg(tp); 2028 counter_u64_add(rack_enter_tlp_calc, 1); 2029 len = rsm->r_end - rsm->r_start; 2030 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 2031 /* Exactly like the ID */ 2032 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { 2033 uint32_t alt_thresh; 2034 /* 2035 * Compensate for delayed-ack with the d-ack time. 2036 */ 2037 counter_u64_add(rack_used_tlpmethod, 1); 2038 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 2039 if (alt_thresh > thresh) 2040 thresh = alt_thresh; 2041 } 2042 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 2043 /* 2.1 behavior */ 2044 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 2045 if (prsm && (len <= maxseg)) { 2046 /* 2047 * Two packets outstanding, thresh should be (2*srtt) + 2048 * possible inter-packet delay (if any). 2049 */ 2050 uint32_t inter_gap = 0; 2051 int idx, nidx; 2052 2053 counter_u64_add(rack_used_tlpmethod, 1); 2054 idx = rsm->r_rtr_cnt - 1; 2055 nidx = prsm->r_rtr_cnt - 1; 2056 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 2057 /* Yes it was sent later (or at the same time) */ 2058 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 2059 } 2060 thresh += inter_gap; 2061 } else if (len <= maxseg) { 2062 /* 2063 * Possibly compensate for delayed-ack. 2064 */ 2065 uint32_t alt_thresh; 2066 2067 counter_u64_add(rack_used_tlpmethod2, 1); 2068 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 2069 if (alt_thresh > thresh) 2070 thresh = alt_thresh; 2071 } 2072 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 2073 /* 2.2 behavior */ 2074 if (len <= maxseg) { 2075 uint32_t alt_thresh; 2076 /* 2077 * Compensate for delayed-ack with the d-ack time. 
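 * For example, with an srtt of 40ms and a delayed-ack allowance
 * of, say, 200ms, alt_thresh works out to 40 + 20 + 200 = 260ms
 * and replaces thresh only if it is the larger of the two.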
2078 */ 2079 counter_u64_add(rack_used_tlpmethod, 1); 2080 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 2081 if (alt_thresh > thresh) 2082 thresh = alt_thresh; 2083 } 2084 } 2085 /* Not above an RTO */ 2086 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 2087 thresh = TICKS_2_MSEC(tp->t_rxtcur); 2088 } 2089 /* Not above a RTO max */ 2090 if (thresh > rack_rto_max) { 2091 thresh = rack_rto_max; 2092 } 2093 /* Apply user supplied min TLP */ 2094 if (thresh < rack_tlp_min) { 2095 thresh = rack_tlp_min; 2096 } 2097 return (thresh); 2098 } 2099 2100 static uint32_t 2101 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 2102 { 2103 /* 2104 * We want the rack_rtt which is the 2105 * last rtt we measured. However if that 2106 * does not exist we fallback to the srtt (which 2107 * we probably will never do) and then as a last 2108 * resort we use RACK_INITIAL_RTO if no srtt is 2109 * yet set. 2110 */ 2111 if (rack->rc_rack_rtt) 2112 return(rack->rc_rack_rtt); 2113 else if (tp->t_srtt == 0) 2114 return(RACK_INITIAL_RTO); 2115 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 2116 } 2117 2118 static struct rack_sendmap * 2119 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 2120 { 2121 /* 2122 * Check to see that we don't need to fall into recovery. We will 2123 * need to do so if our oldest transmit is past the time we should 2124 * have had an ack. 2125 */ 2126 struct tcp_rack *rack; 2127 struct rack_sendmap *rsm; 2128 int32_t idx; 2129 uint32_t srtt, thresh; 2130 2131 rack = (struct tcp_rack *)tp->t_fb_ptr; 2132 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 2133 return (NULL); 2134 } 2135 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2136 if (rsm == NULL) 2137 return (NULL); 2138 2139 if (rsm->r_flags & RACK_ACKED) { 2140 rsm = rack_find_lowest_rsm(rack); 2141 if (rsm == NULL) 2142 return (NULL); 2143 } 2144 idx = rsm->r_rtr_cnt - 1; 2145 srtt = rack_grab_rtt(tp, rack); 2146 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2147 if (tsused < rsm->r_tim_lastsent[idx]) { 2148 return (NULL); 2149 } 2150 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2151 return (NULL); 2152 } 2153 /* Ok if we reach here we are over-due */ 2154 rack->r_ctl.rc_rsm_start = rsm->r_start; 2155 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2156 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2157 rack_cong_signal(tp, NULL, CC_NDUPACK); 2158 return (rsm); 2159 } 2160 2161 static uint32_t 2162 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2163 { 2164 int32_t t; 2165 int32_t tt; 2166 uint32_t ret_val; 2167 2168 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2169 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2170 rack_persist_min, rack_persist_max); 2171 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2172 tp->t_rxtshift++; 2173 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2174 ret_val = (uint32_t)tt; 2175 return (ret_val); 2176 } 2177 2178 static uint32_t 2179 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 2180 { 2181 /* 2182 * Start the FR timer, we do this based on getting the first one in 2183 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2184 * events we need to stop the running timer (if its running) before 2185 * starting the new one. 
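 * Roughly: we arm the RXT timer when nothing is on the send map
 * (or sup_rack is set), the RACK timer when the oldest outstanding
 * segment has been SACK-passed, and otherwise a TLP timer, which
 * itself falls back to the RXT timer in a number of cases below.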
2186 */ 2187 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 2188 uint32_t srtt_cur; 2189 int32_t idx; 2190 int32_t is_tlp_timer = 0; 2191 struct rack_sendmap *rsm; 2192 2193 if (rack->t_timers_stopped) { 2194 /* All timers have been stopped none are to run */ 2195 return (0); 2196 } 2197 if (rack->rc_in_persist) { 2198 /* We can't start any timer in persists */ 2199 return (rack_get_persists_timer_val(tp, rack)); 2200 } 2201 if ((tp->t_state < TCPS_ESTABLISHED) || 2202 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 2203 goto activate_rxt; 2204 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2205 if ((rsm == NULL) || sup_rack) { 2206 /* Nothing on the send map */ 2207 activate_rxt: 2208 time_since_sent = 0; 2209 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2210 if (rsm) { 2211 idx = rsm->r_rtr_cnt - 1; 2212 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 2213 tstmp_touse = rsm->r_tim_lastsent[idx]; 2214 else 2215 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 2216 if (TSTMP_GT(tstmp_touse, cts)) 2217 time_since_sent = cts - tstmp_touse; 2218 } 2219 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2220 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2221 to = TICKS_2_MSEC(tp->t_rxtcur); 2222 if (to > time_since_sent) 2223 to -= time_since_sent; 2224 else 2225 to = rack->r_ctl.rc_min_to; 2226 if (to == 0) 2227 to = 1; 2228 return (to); 2229 } 2230 return (0); 2231 } 2232 if (rsm->r_flags & RACK_ACKED) { 2233 rsm = rack_find_lowest_rsm(rack); 2234 if (rsm == NULL) { 2235 /* No lowest? */ 2236 goto activate_rxt; 2237 } 2238 } 2239 if (rack->sack_attack_disable) { 2240 /* 2241 * We don't want to do 2242 * any TLP's if you are an attacker. 2243 * Though if you are doing what 2244 * is expected you may still have 2245 * SACK-PASSED marks. 2246 */ 2247 goto activate_rxt; 2248 } 2249 /* Convert from ms to usecs */ 2250 if (rsm->r_flags & RACK_SACK_PASSED) { 2251 if ((tp->t_flags & TF_SENTFIN) && 2252 ((tp->snd_max - tp->snd_una) == 1) && 2253 (rsm->r_flags & RACK_HAS_FIN)) { 2254 /* 2255 * We don't start a rack timer if all we have is a 2256 * FIN outstanding. 2257 */ 2258 goto activate_rxt; 2259 } 2260 if ((rack->use_rack_cheat == 0) && 2261 (IN_RECOVERY(tp->t_flags)) && 2262 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 2263 /* 2264 * We are not cheating, in recovery and 2265 * not enough ack's to yet get our next 2266 * retransmission out. 2267 * 2268 * Note that classified attackers do not 2269 * get to use the rack-cheat. 2270 */ 2271 goto activate_tlp; 2272 } 2273 srtt = rack_grab_rtt(tp, rack); 2274 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2275 idx = rsm->r_rtr_cnt - 1; 2276 exp = rsm->r_tim_lastsent[idx] + thresh; 2277 if (SEQ_GEQ(exp, cts)) { 2278 to = exp - cts; 2279 if (to < rack->r_ctl.rc_min_to) { 2280 to = rack->r_ctl.rc_min_to; 2281 } 2282 } else { 2283 to = rack->r_ctl.rc_min_to; 2284 } 2285 } else { 2286 /* Ok we need to do a TLP not RACK */ 2287 activate_tlp: 2288 if ((rack->rc_tlp_in_progress != 0) || 2289 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2290 /* 2291 * The previous send was a TLP or a tlp_rtx is in 2292 * process. 2293 */ 2294 goto activate_rxt; 2295 } 2296 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2297 if (rsm == NULL) { 2298 /* We found no rsm to TLP with. 
*/ 2299 goto activate_rxt; 2300 } 2301 if (rsm->r_flags & RACK_HAS_FIN) { 2302 /* If its a FIN we dont do TLP */ 2303 rsm = NULL; 2304 goto activate_rxt; 2305 } 2306 idx = rsm->r_rtr_cnt - 1; 2307 time_since_sent = 0; 2308 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 2309 tstmp_touse = rsm->r_tim_lastsent[idx]; 2310 else 2311 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 2312 if (TSTMP_GT(tstmp_touse, cts)) 2313 time_since_sent = cts - tstmp_touse; 2314 is_tlp_timer = 1; 2315 if (tp->t_srtt) { 2316 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2317 srtt = TICKS_2_MSEC(srtt_cur); 2318 } else 2319 srtt = RACK_INITIAL_RTO; 2320 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2321 if (thresh > time_since_sent) 2322 to = thresh - time_since_sent; 2323 else 2324 to = rack->r_ctl.rc_min_to; 2325 if (to > TCPTV_REXMTMAX) { 2326 /* 2327 * If the TLP time works out to larger than the max 2328 * RTO lets not do TLP.. just RTO. 2329 */ 2330 goto activate_rxt; 2331 } 2332 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2333 /* 2334 * The tail is no longer the last one I did a probe 2335 * on 2336 */ 2337 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2338 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2339 } 2340 } 2341 if (is_tlp_timer == 0) { 2342 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2343 } else { 2344 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2345 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2346 /* 2347 * We have exceeded how many times we can retran the 2348 * current TLP timer, switch to the RTO timer. 2349 */ 2350 goto activate_rxt; 2351 } else { 2352 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2353 } 2354 } 2355 if (to == 0) 2356 to = 1; 2357 return (to); 2358 } 2359 2360 static void 2361 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2362 { 2363 if (rack->rc_in_persist == 0) { 2364 rack->r_ctl.rc_went_idle_time = cts; 2365 rack_timer_cancel(tp, rack, cts, __LINE__); 2366 tp->t_rxtshift = 0; 2367 rack->rc_in_persist = 1; 2368 } 2369 } 2370 2371 static void 2372 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2373 { 2374 if (rack->rc_inp->inp_in_hpts) { 2375 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2376 rack->r_ctl.rc_hpts_flags = 0; 2377 } 2378 rack->rc_in_persist = 0; 2379 rack->r_ctl.rc_went_idle_time = 0; 2380 tp->t_flags &= ~TF_FORCEDATA; 2381 tp->t_rxtshift = 0; 2382 } 2383 2384 static void 2385 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 2386 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 2387 { 2388 struct inpcb *inp; 2389 uint32_t delayed_ack = 0; 2390 uint32_t hpts_timeout; 2391 uint8_t stopped; 2392 uint32_t left = 0; 2393 2394 inp = tp->t_inpcb; 2395 if (inp->inp_in_hpts) { 2396 /* A previous call is already set up */ 2397 return; 2398 } 2399 if ((tp->t_state == TCPS_CLOSED) || 2400 (tp->t_state == TCPS_LISTEN)) { 2401 return; 2402 } 2403 stopped = rack->rc_tmr_stopped; 2404 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2405 left = rack->r_ctl.rc_timer_exp - cts; 2406 } 2407 rack->tlp_timer_up = 0; 2408 rack->r_ctl.rc_timer_exp = 0; 2409 if (rack->rc_inp->inp_in_hpts == 0) { 2410 rack->r_ctl.rc_hpts_flags = 0; 2411 } 2412 if (slot) { 2413 /* We are hptsi too */ 2414 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2415 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2416 /* 2417 * We are still left on the hpts when the to goes 2418 * it will be for output. 
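 * In other words the pending hpts entry is a pacing (output)
 * timer, so recompute the remaining pacing slot from
 * rc_last_output_to rather than starting over.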
2419 */ 2420 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) 2421 slot = rack->r_ctl.rc_last_output_to - cts; 2422 else 2423 slot = 1; 2424 } 2425 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 2426 #ifdef NETFLIX_EXP_DETECTION 2427 if (rack->sack_attack_disable && 2428 (slot < USEC_TO_MSEC(tcp_sad_pacing_interval))) { 2429 /* 2430 * We have a potential attacker on 2431 * the line. We have possibly some 2432 * (or no) pacing time set. We want to 2433 * slow down the processing of sacks by some 2434 * amount (if it is an attacker). Set the default 2435 * slot for attackers in place (unless the original 2436 * interval is longer). It's stored in 2437 * micro-seconds, so let's convert to msecs. 2438 */ 2439 slot = USEC_TO_MSEC(tcp_sad_pacing_interval); 2440 } 2441 #endif 2442 if (tp->t_flags & TF_DELACK) { 2443 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 2444 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2445 } 2446 if (delayed_ack && ((hpts_timeout == 0) || 2447 (delayed_ack < hpts_timeout))) 2448 hpts_timeout = delayed_ack; 2449 else 2450 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2451 /* 2452 * If no timers are going to run and we will fall off the hptsi 2453 * wheel, we resort to a keep-alive timer if it's configured. 2454 */ 2455 if ((hpts_timeout == 0) && 2456 (slot == 0)) { 2457 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2458 (tp->t_state <= TCPS_CLOSING)) { 2459 /* 2460 * Ok we have no timer (persists, rack, tlp, rxt or 2461 * del-ack), we don't have segments being paced. So 2462 * all that is left is the keepalive timer. 2463 */ 2464 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2465 /* Get the established keep-alive time */ 2466 hpts_timeout = TP_KEEPIDLE(tp); 2467 } else { 2468 /* Get the initial setup keep-alive time */ 2469 hpts_timeout = TP_KEEPINIT(tp); 2470 } 2471 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2472 } 2473 } 2474 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2475 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2476 /* 2477 * RACK, TLP, persists and RXT timers all are restartable 2478 * based on input actions, i.e. we received a packet (ack 2479 * or sack) and that changes things (rwnd, or snd_una, etc.). 2480 * Thus we can restart them with a new value. For 2481 * keep-alive and delayed-ack we keep track of what was left 2482 * and restart the timer with a smaller value. 2483 */ 2484 if (left < hpts_timeout) 2485 hpts_timeout = left; 2486 } 2487 if (hpts_timeout) { 2488 /* 2489 * Hack alert: for now we can't time-out over 2,147,483 2490 * seconds (a bit more than 596 hours), which is probably ok 2491 * :). 2492 */ 2493 if (hpts_timeout > 0x7ffffffe) 2494 hpts_timeout = 0x7ffffffe; 2495 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2496 } 2497 if (slot) { 2498 rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 2499 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) 2500 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 2501 else 2502 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 2503 rack->r_ctl.rc_last_output_to = cts + slot; 2504 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2505 if (rack->rc_inp->inp_in_hpts == 0) 2506 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2507 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2508 } else { 2509 /* 2510 * Arrange for the hpts to kick back in after the 2511 * t-o if the t-o does not cause a send.
2512 */ 2513 if (rack->rc_inp->inp_in_hpts == 0) 2514 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2515 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2516 } 2517 } else if (hpts_timeout) { 2518 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 2519 /* For a rack timer, don't wake us */ 2520 rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 2521 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 2522 } else { 2523 /* All other timers wake us up */ 2524 rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 2525 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 2526 } 2527 if (rack->rc_inp->inp_in_hpts == 0) 2528 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2529 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2530 } else { 2531 /* No timer starting */ 2532 #ifdef INVARIANTS 2533 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2534 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2535 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2536 } 2537 #endif 2538 } 2539 rack->rc_tmr_stopped = 0; 2540 if (slot) 2541 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2542 } 2543 2544 /* 2545 * RACK Timer, here we simply do logging and house keeping. 2546 * the normal rack_output() function will call the 2547 * appropriate thing to check if we need to do a RACK retransmit. 2548 * We return 1, saying don't proceed with rack_output only 2549 * when all timers have been stopped (destroyed PCB?). 2550 */ 2551 static int 2552 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2553 { 2554 /* 2555 * This timer simply provides an internal trigger to send out data. 2556 * The check_recovery_mode call will see if there are needed 2557 * retransmissions, if so we will enter fast-recovery. The output 2558 * call may or may not do the same thing depending on sysctl 2559 * settings. 2560 */ 2561 struct rack_sendmap *rsm; 2562 int32_t recovery, ll; 2563 2564 if (tp->t_timers->tt_flags & TT_STOPPED) { 2565 return (1); 2566 } 2567 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2568 /* Its not time yet */ 2569 return (0); 2570 } 2571 recovery = IN_RECOVERY(tp->t_flags); 2572 counter_u64_add(rack_to_tot, 1); 2573 if (rack->r_state && (rack->r_state != tp->t_state)) 2574 rack_set_state(tp, rack); 2575 rsm = rack_check_recovery_mode(tp, cts); 2576 if (rsm) 2577 ll = rsm->r_end - rsm->r_start; 2578 else 2579 ll = 0; 2580 rack_log_to_event(rack, RACK_TO_FRM_RACK, ll); 2581 if (rsm) { 2582 uint32_t rtt; 2583 2584 rtt = rack->rc_rack_rtt; 2585 if (rtt == 0) 2586 rtt = 1; 2587 if ((recovery == 0) && 2588 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 2589 /* 2590 * The rack-timeout that enter's us into recovery 2591 * will force out one MSS and set us up so that we 2592 * can do one more send in 2*rtt (transitioning the 2593 * rack timeout into a rack-tlp). 2594 */ 2595 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 2596 rack_log_to_prr(rack, 3); 2597 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 2598 rack->use_rack_cheat) { 2599 /* 2600 * When a rack timer goes, if the rack cheat is 2601 * on, arrange it so we can send a full segment. 
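 * That is, bump rc_prr_sndcnt up to one full MSS so that the
 * subsequent rack_output() call is allowed to emit a segment.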
2602 */ 2603 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 2604 rack_log_to_prr(rack, 4); 2605 } 2606 } else { 2607 /* This is a case that should happen rarely if ever */ 2608 counter_u64_add(rack_tlp_does_nada, 1); 2609 #ifdef TCP_BLACKBOX 2610 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2611 #endif 2612 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2613 } 2614 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2615 return (0); 2616 } 2617 2618 static __inline void 2619 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 2620 struct rack_sendmap *rsm, uint32_t start) 2621 { 2622 int idx; 2623 2624 nrsm->r_start = start; 2625 nrsm->r_end = rsm->r_end; 2626 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2627 nrsm->r_flags = rsm->r_flags; 2628 nrsm->r_dupack = rsm->r_dupack; 2629 nrsm->r_rtr_bytes = 0; 2630 rsm->r_end = nrsm->r_start; 2631 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2632 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2633 } 2634 } 2635 2636 static struct rack_sendmap * 2637 rack_merge_rsm(struct tcp_rack *rack, 2638 struct rack_sendmap *l_rsm, 2639 struct rack_sendmap *r_rsm) 2640 { 2641 /* 2642 * We are merging two ack'd RSM's, 2643 * the l_rsm is on the left (lower seq 2644 * values) and the r_rsm is on the right 2645 * (higher seq value). The simplest way 2646 * to merge these is to move the right 2647 * one into the left. I don't think there 2648 * is any reason we need to try to find 2649 * the oldest (or last oldest retransmitted). 2650 */ 2651 struct rack_sendmap *rm; 2652 2653 l_rsm->r_end = r_rsm->r_end; 2654 if (l_rsm->r_dupack < r_rsm->r_dupack) 2655 l_rsm->r_dupack = r_rsm->r_dupack; 2656 if (r_rsm->r_rtr_bytes) 2657 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 2658 if (r_rsm->r_in_tmap) { 2659 /* This really should not happen */ 2660 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 2661 r_rsm->r_in_tmap = 0; 2662 } 2663 /* Now the flags */ 2664 if (r_rsm->r_flags & RACK_HAS_FIN) 2665 l_rsm->r_flags |= RACK_HAS_FIN; 2666 if (r_rsm->r_flags & RACK_TLP) 2667 l_rsm->r_flags |= RACK_TLP; 2668 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 2669 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 2670 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 2671 #ifdef INVARIANTS 2672 if (rm != r_rsm) { 2673 panic("removing head in rack:%p rsm:%p rm:%p", 2674 rack, r_rsm, rm); 2675 } 2676 #endif 2677 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 2678 /* Transfer the split limit to the map we free */ 2679 r_rsm->r_limit_type = l_rsm->r_limit_type; 2680 l_rsm->r_limit_type = 0; 2681 } 2682 rack_free(rack, r_rsm); 2683 return(l_rsm); 2684 } 2685 2686 /* 2687 * TLP Timer, here we simply setup what segment we want to 2688 * have the TLP expire on, the normal rack_output() will then 2689 * send it out. 2690 * 2691 * We return 1, saying don't proceed with rack_output only 2692 * when all timers have been stopped (destroyed PCB?). 2693 */ 2694 static int 2695 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2696 { 2697 /* 2698 * Tail Loss Probe. 
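 * The idea is to send one new segment if there is unsent data
 * (and window space) available, and otherwise to re-send the
 * highest outstanding segment, so the resulting ACK/SACK can
 * drive recovery without waiting for a full RTO.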
2699 */ 2700 struct rack_sendmap *rsm = NULL; 2701 struct rack_sendmap *insret; 2702 struct socket *so; 2703 uint32_t amm, old_prr_snd = 0; 2704 uint32_t out, avail; 2705 int collapsed_win = 0; 2706 2707 if (tp->t_timers->tt_flags & TT_STOPPED) { 2708 return (1); 2709 } 2710 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2711 /* Its not time yet */ 2712 return (0); 2713 } 2714 if (rack_progress_timeout_check(tp)) { 2715 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2716 return (1); 2717 } 2718 /* 2719 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2720 * need to figure out how to force a full MSS segment out. 2721 */ 2722 rack_log_to_event(rack, RACK_TO_FRM_TLP, 0); 2723 counter_u64_add(rack_tlp_tot, 1); 2724 if (rack->r_state && (rack->r_state != tp->t_state)) 2725 rack_set_state(tp, rack); 2726 so = tp->t_inpcb->inp_socket; 2727 #ifdef KERN_TLS 2728 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 2729 /* 2730 * For hardware TLS we do *not* want to send 2731 * new data, lets instead just do a retransmission. 2732 */ 2733 goto need_retran; 2734 } 2735 #endif 2736 avail = sbavail(&so->so_snd); 2737 out = tp->snd_max - tp->snd_una; 2738 rack->tlp_timer_up = 1; 2739 if (out > tp->snd_wnd) { 2740 /* special case, we need a retransmission */ 2741 collapsed_win = 1; 2742 goto need_retran; 2743 } 2744 /* 2745 * If we are in recovery we can jazz out a segment if new data is 2746 * present simply by setting rc_prr_sndcnt to a segment. 2747 */ 2748 if ((avail > out) && 2749 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2750 /* New data is available */ 2751 amm = avail - out; 2752 if (amm > ctf_fixed_maxseg(tp)) { 2753 amm = ctf_fixed_maxseg(tp); 2754 } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) { 2755 /* not enough to fill a MTU and no-delay is off */ 2756 goto need_retran; 2757 } 2758 if (IN_RECOVERY(tp->t_flags)) { 2759 /* Unlikely */ 2760 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2761 if (out + amm <= tp->snd_wnd) { 2762 rack->r_ctl.rc_prr_sndcnt = amm; 2763 rack_log_to_prr(rack, 4); 2764 } else 2765 goto need_retran; 2766 } else { 2767 /* Set the send-new override */ 2768 if (out + amm <= tp->snd_wnd) 2769 rack->r_ctl.rc_tlp_new_data = amm; 2770 else 2771 goto need_retran; 2772 } 2773 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2774 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2775 rack->r_ctl.rc_tlpsend = NULL; 2776 counter_u64_add(rack_tlp_newdata, 1); 2777 goto send; 2778 } 2779 need_retran: 2780 /* 2781 * Ok we need to arrange the last un-acked segment to be re-sent, or 2782 * optionally the first un-acked segment. 2783 */ 2784 if (collapsed_win == 0) { 2785 if (rack_always_send_oldest) 2786 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2787 else { 2788 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 2789 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2790 rsm = rack_find_high_nonack(rack, rsm); 2791 } 2792 } 2793 if (rsm == NULL) { 2794 counter_u64_add(rack_tlp_does_nada, 1); 2795 #ifdef TCP_BLACKBOX 2796 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2797 #endif 2798 goto out; 2799 } 2800 } else { 2801 /* 2802 * We must find the last segment 2803 * that was acceptable by the client. 2804 */ 2805 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 2806 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 2807 /* Found one */ 2808 break; 2809 } 2810 } 2811 if (rsm == NULL) { 2812 /* None? 
if so send the first */ 2813 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 2814 if (rsm == NULL) { 2815 counter_u64_add(rack_tlp_does_nada, 1); 2816 #ifdef TCP_BLACKBOX 2817 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2818 #endif 2819 goto out; 2820 } 2821 } 2822 } 2823 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 2824 /* 2825 * We need to split this the last segment in two. 2826 */ 2827 struct rack_sendmap *nrsm; 2828 2829 2830 nrsm = rack_alloc_full_limit(rack); 2831 if (nrsm == NULL) { 2832 /* 2833 * No memory to split, we will just exit and punt 2834 * off to the RXT timer. 2835 */ 2836 counter_u64_add(rack_tlp_does_nada, 1); 2837 goto out; 2838 } 2839 rack_clone_rsm(rack, nrsm, rsm, 2840 (rsm->r_end - ctf_fixed_maxseg(tp))); 2841 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 2842 #ifdef INVARIANTS 2843 if (insret != NULL) { 2844 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 2845 nrsm, insret, rack, rsm); 2846 } 2847 #endif 2848 if (rsm->r_in_tmap) { 2849 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2850 nrsm->r_in_tmap = 1; 2851 } 2852 rsm->r_flags &= (~RACK_HAS_FIN); 2853 rsm = nrsm; 2854 } 2855 rack->r_ctl.rc_tlpsend = rsm; 2856 rack->r_ctl.rc_tlp_rtx_out = 1; 2857 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2858 rack->r_ctl.rc_tlp_seg_send_cnt++; 2859 tp->t_rxtshift++; 2860 } else { 2861 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2862 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2863 } 2864 send: 2865 rack->r_ctl.rc_tlp_send_cnt++; 2866 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2867 /* 2868 * Can't [re]/transmit a segment we have not heard from the 2869 * peer in max times. We need the retransmit timer to take 2870 * over. 2871 */ 2872 restore: 2873 rack->r_ctl.rc_tlpsend = NULL; 2874 if (rsm) 2875 rsm->r_flags &= ~RACK_TLP; 2876 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2877 rack_log_to_prr(rack, 5); 2878 counter_u64_add(rack_tlp_retran_fail, 1); 2879 goto out; 2880 } else if (rsm) { 2881 rsm->r_flags |= RACK_TLP; 2882 } 2883 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2884 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2885 /* 2886 * We don't want to send a single segment more than the max 2887 * either. 2888 */ 2889 goto restore; 2890 } 2891 rack->r_timer_override = 1; 2892 rack->r_tlp_running = 1; 2893 rack->rc_tlp_in_progress = 1; 2894 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2895 return (0); 2896 out: 2897 rack->tlp_timer_up = 0; 2898 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2899 return (0); 2900 } 2901 2902 /* 2903 * Delayed ack Timer, here we simply need to setup the 2904 * ACK_NOW flag and remove the DELACK flag. From there 2905 * the output routine will send the ack out. 2906 * 2907 * We only return 1, saying don't proceed, if all timers 2908 * are stopped (destroyed PCB?). 2909 */ 2910 static int 2911 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2912 { 2913 if (tp->t_timers->tt_flags & TT_STOPPED) { 2914 return (1); 2915 } 2916 rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0); 2917 tp->t_flags &= ~TF_DELACK; 2918 tp->t_flags |= TF_ACKNOW; 2919 KMOD_TCPSTAT_INC(tcps_delack); 2920 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2921 return (0); 2922 } 2923 2924 /* 2925 * Persists timer, here we simply need to setup the 2926 * FORCE-DATA flag the output routine will send 2927 * the one byte send. 2928 * 2929 * We only return 1, saying don't proceed, if all timers 2930 * are stopped (destroyed PCB?). 
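 * The interval used while in persists comes from
 * rack_get_persists_timer_val(), which scales (srtt + 4 * rttvar)
 * by tcp_backoff[] and clamps the result between rack_persist_min
 * and rack_persist_max.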
2931 */ 2932 static int 2933 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2934 { 2935 struct tcptemp *t_template; 2936 struct inpcb *inp; 2937 int32_t retval = 1; 2938 2939 inp = tp->t_inpcb; 2940 2941 if (tp->t_timers->tt_flags & TT_STOPPED) { 2942 return (1); 2943 } 2944 if (rack->rc_in_persist == 0) 2945 return (0); 2946 if (rack_progress_timeout_check(tp)) { 2947 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2948 return (1); 2949 } 2950 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2951 /* 2952 * Persistence timer into zero window. Force a byte to be output, if 2953 * possible. 2954 */ 2955 KMOD_TCPSTAT_INC(tcps_persisttimeo); 2956 /* 2957 * Hack: if the peer is dead/unreachable, we do not time out if the 2958 * window is closed. After a full backoff, drop the connection if 2959 * the idle time (no responses to probes) reaches the maximum 2960 * backoff that we would use if retransmitting. 2961 */ 2962 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2963 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2964 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2965 KMOD_TCPSTAT_INC(tcps_persistdrop); 2966 retval = 1; 2967 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2968 goto out; 2969 } 2970 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2971 tp->snd_una == tp->snd_max) 2972 rack_exit_persist(tp, rack); 2973 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2974 /* 2975 * If the user has closed the socket then drop a persisting 2976 * connection after a much reduced timeout. 2977 */ 2978 if (tp->t_state > TCPS_CLOSE_WAIT && 2979 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2980 retval = 1; 2981 KMOD_TCPSTAT_INC(tcps_persistdrop); 2982 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2983 goto out; 2984 } 2985 t_template = tcpip_maketemplate(rack->rc_inp); 2986 if (t_template) { 2987 tcp_respond(tp, t_template->tt_ipgen, 2988 &t_template->tt_t, (struct mbuf *)NULL, 2989 tp->rcv_nxt, tp->snd_una - 1, 0); 2990 /* This sends an ack */ 2991 if (tp->t_flags & TF_DELACK) 2992 tp->t_flags &= ~TF_DELACK; 2993 free(t_template, M_TEMP); 2994 } 2995 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2996 tp->t_rxtshift++; 2997 out: 2998 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0); 2999 rack_start_hpts_timer(rack, tp, cts, 3000 0, 0, 0); 3001 return (retval); 3002 } 3003 3004 /* 3005 * If a keepalive goes off, we had no other timers 3006 * happening. We always return 1 here since this 3007 * routine either drops the connection or sends 3008 * out a segment with respond. 3009 */ 3010 static int 3011 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 3012 { 3013 struct tcptemp *t_template; 3014 struct inpcb *inp; 3015 3016 if (tp->t_timers->tt_flags & TT_STOPPED) { 3017 return (1); 3018 } 3019 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 3020 inp = tp->t_inpcb; 3021 rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0); 3022 /* 3023 * Keep-alive timer went off; send something or drop connection if 3024 * idle for too long. 
3025 */ 3026 KMOD_TCPSTAT_INC(tcps_keeptimeo); 3027 if (tp->t_state < TCPS_ESTABLISHED) 3028 goto dropit; 3029 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 3030 tp->t_state <= TCPS_CLOSING) { 3031 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 3032 goto dropit; 3033 /* 3034 * Send a packet designed to force a response if the peer is 3035 * up and reachable: either an ACK if the connection is 3036 * still alive, or an RST if the peer has closed the 3037 * connection due to timeout or reboot. Using sequence 3038 * number tp->snd_una-1 causes the transmitted zero-length 3039 * segment to lie outside the receive window; by the 3040 * protocol spec, this requires the correspondent TCP to 3041 * respond. 3042 */ 3043 KMOD_TCPSTAT_INC(tcps_keepprobe); 3044 t_template = tcpip_maketemplate(inp); 3045 if (t_template) { 3046 tcp_respond(tp, t_template->tt_ipgen, 3047 &t_template->tt_t, (struct mbuf *)NULL, 3048 tp->rcv_nxt, tp->snd_una - 1, 0); 3049 free(t_template, M_TEMP); 3050 } 3051 } 3052 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 3053 return (1); 3054 dropit: 3055 KMOD_TCPSTAT_INC(tcps_keepdrops); 3056 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 3057 return (1); 3058 } 3059 3060 /* 3061 * Retransmit helper function, clear up all the ack 3062 * flags and take care of important book keeping. 3063 */ 3064 static void 3065 rack_remxt_tmr(struct tcpcb *tp) 3066 { 3067 /* 3068 * The retransmit timer went off, all sack'd blocks must be 3069 * un-acked. 3070 */ 3071 struct rack_sendmap *rsm, *trsm = NULL; 3072 struct tcp_rack *rack; 3073 int32_t cnt = 0; 3074 3075 rack = (struct tcp_rack *)tp->t_fb_ptr; 3076 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 3077 rack_log_to_event(rack, RACK_TO_FRM_TMR, 0); 3078 if (rack->r_state && (rack->r_state != tp->t_state)) 3079 rack_set_state(tp, rack); 3080 /* 3081 * Ideally we would like to be able to 3082 * mark SACK-PASS on anything not acked here. 3083 * However, if we do that we would burst out 3084 * all that data 1ms apart. This would be unwise, 3085 * so for now we will just let the normal rxt timer 3086 * and tlp timer take care of it. 3087 */ 3088 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 3089 if (rsm->r_flags & RACK_ACKED) { 3090 cnt++; 3091 rsm->r_dupack = 0; 3092 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 3093 if (rsm->r_in_tmap == 0) { 3094 /* We must re-add it back to the tlist */ 3095 if (trsm == NULL) { 3096 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3097 } else { 3098 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 3099 } 3100 rsm->r_in_tmap = 1; 3101 } 3102 } 3103 trsm = rsm; 3104 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 3105 } 3106 /* Clear the count (we just un-acked them) */ 3107 rack->r_ctl.rc_sacked = 0; 3108 /* Clear the tlp rtx mark */ 3109 rack->r_ctl.rc_tlp_rtx_out = 0; 3110 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 3111 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3112 rack->r_ctl.rc_prr_sndcnt = 0; 3113 rack_log_to_prr(rack, 6); 3114 rack->r_timer_override = 1; 3115 } 3116 3117 /* 3118 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 3119 * we will setup to retransmit the lowest seq number outstanding. 
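 * Each expiry backs the RTO off as TCP_REXMTVAL(tp) *
 * tcp_backoff[t_rxtshift] (RACK_INITIAL_RTO is used instead while
 * still in the SYN states), bounded by rack_rto_min and
 * rack_rto_max, and once t_rxtshift exceeds TCP_MAXRXTSHIFT the
 * connection is dropped.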
3120 */ 3121 static int 3122 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 3123 { 3124 int32_t rexmt; 3125 struct inpcb *inp; 3126 int32_t retval = 0; 3127 bool isipv6; 3128 3129 inp = tp->t_inpcb; 3130 if (tp->t_timers->tt_flags & TT_STOPPED) { 3131 return (1); 3132 } 3133 if (rack_progress_timeout_check(tp)) { 3134 tcp_set_inp_to_drop(inp, ETIMEDOUT); 3135 return (1); 3136 } 3137 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 3138 if (TCPS_HAVEESTABLISHED(tp->t_state) && 3139 (tp->snd_una == tp->snd_max)) { 3140 /* Nothing outstanding .. nothing to do */ 3141 return (0); 3142 } 3143 /* 3144 * Retransmission timer went off. Message has not been acked within 3145 * retransmit interval. Back off to a longer retransmit interval 3146 * and retransmit one segment. 3147 */ 3148 rack_remxt_tmr(tp); 3149 if ((rack->r_ctl.rc_resend == NULL) || 3150 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 3151 /* 3152 * If the rwnd collapsed on 3153 * the one we are retransmitting 3154 * it does not count against the 3155 * rxt count. 3156 */ 3157 tp->t_rxtshift++; 3158 } 3159 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 3160 tp->t_rxtshift = TCP_MAXRXTSHIFT; 3161 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 3162 retval = 1; 3163 tcp_set_inp_to_drop(rack->rc_inp, 3164 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 3165 goto out; 3166 } 3167 if (tp->t_state == TCPS_SYN_SENT) { 3168 /* 3169 * If the SYN was retransmitted, indicate CWND to be limited 3170 * to 1 segment in cc_conn_init(). 3171 */ 3172 tp->snd_cwnd = 1; 3173 } else if (tp->t_rxtshift == 1) { 3174 /* 3175 * first retransmit; record ssthresh and cwnd so they can be 3176 * recovered if this turns out to be a "bad" retransmit. A 3177 * retransmit is considered "bad" if an ACK for this segment 3178 * is received within RTT/2 interval; the assumption here is 3179 * that the ACK was already in flight. See "On Estimating 3180 * End-to-End Network Path Properties" by Allman and Paxson 3181 * for more details. 3182 */ 3183 tp->snd_cwnd_prev = tp->snd_cwnd; 3184 tp->snd_ssthresh_prev = tp->snd_ssthresh; 3185 tp->snd_recover_prev = tp->snd_recover; 3186 if (IN_FASTRECOVERY(tp->t_flags)) 3187 tp->t_flags |= TF_WASFRECOVERY; 3188 else 3189 tp->t_flags &= ~TF_WASFRECOVERY; 3190 if (IN_CONGRECOVERY(tp->t_flags)) 3191 tp->t_flags |= TF_WASCRECOVERY; 3192 else 3193 tp->t_flags &= ~TF_WASCRECOVERY; 3194 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 3195 tp->t_flags |= TF_PREVVALID; 3196 } else 3197 tp->t_flags &= ~TF_PREVVALID; 3198 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 3199 if ((tp->t_state == TCPS_SYN_SENT) || 3200 (tp->t_state == TCPS_SYN_RECEIVED)) 3201 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 3202 else 3203 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 3204 TCPT_RANGESET(tp->t_rxtcur, rexmt, 3205 max(MSEC_2_TICKS(rack_rto_min), rexmt), 3206 MSEC_2_TICKS(rack_rto_max)); 3207 /* 3208 * We enter the path for PLMTUD if connection is established or, if 3209 * connection is FIN_WAIT_1 status, reason for the last is that if 3210 * amount of data we send is very small, we could send it in couple 3211 * of packets and process straight to FIN. In that case we won't 3212 * catch ESTABLISHED state. 3213 */ 3214 #ifdef INET6 3215 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 
true : false; 3216 #else 3217 isipv6 = false; 3218 #endif 3219 if (((V_tcp_pmtud_blackhole_detect == 1) || 3220 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 3221 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 3222 ((tp->t_state == TCPS_ESTABLISHED) || 3223 (tp->t_state == TCPS_FIN_WAIT_1))) { 3224 3225 /* 3226 * Idea here is that at each stage of mtu probe (usually, 3227 * 1448 -> 1188 -> 524) should be given 2 chances to recover 3228 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 3229 * should take care of that. 3230 */ 3231 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 3232 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 3233 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 3234 tp->t_rxtshift % 2 == 0)) { 3235 /* 3236 * Enter Path MTU Black-hole Detection mechanism: - 3237 * Disable Path MTU Discovery (IP "DF" bit). - 3238 * Reduce MTU to lower value than what we negotiated 3239 * with peer. 3240 */ 3241 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 3242 /* Record that we may have found a black hole. */ 3243 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 3244 /* Keep track of previous MSS. */ 3245 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 3246 } 3247 3248 /* 3249 * Reduce the MSS to blackhole value or to the 3250 * default in an attempt to retransmit. 3251 */ 3252 #ifdef INET6 3253 if (isipv6 && 3254 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 3255 /* Use the sysctl tuneable blackhole MSS. */ 3256 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 3257 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 3258 } else if (isipv6) { 3259 /* Use the default MSS. */ 3260 tp->t_maxseg = V_tcp_v6mssdflt; 3261 /* 3262 * Disable Path MTU Discovery when we switch 3263 * to minmss. 3264 */ 3265 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 3266 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 3267 } 3268 #endif 3269 #if defined(INET6) && defined(INET) 3270 else 3271 #endif 3272 #ifdef INET 3273 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 3274 /* Use the sysctl tuneable blackhole MSS. */ 3275 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 3276 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 3277 } else { 3278 /* Use the default MSS. */ 3279 tp->t_maxseg = V_tcp_mssdflt; 3280 /* 3281 * Disable Path MTU Discovery when we switch 3282 * to minmss. 3283 */ 3284 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 3285 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 3286 } 3287 #endif 3288 } else { 3289 /* 3290 * If further retransmissions are still unsuccessful 3291 * with a lowered MTU, maybe this isn't a blackhole 3292 * and we restore the previous MSS and blackhole 3293 * detection flags. The limit '6' is determined by 3294 * giving each probe stage (1448, 1188, 524) 2 3295 * chances to recover. 3296 */ 3297 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 3298 (tp->t_rxtshift >= 6)) { 3299 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 3300 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 3301 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 3302 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 3303 } 3304 } 3305 } 3306 /* 3307 * If we backed off this far, our srtt estimate is probably bogus. 3308 * Clobber it so we'll take the next rtt measurement as our srtt; 3309 * move the current srtt into rttvar to keep the current retransmit 3310 * times until then. 
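 * "This far" is more than TCP_MAXRXTSHIFT / 4 consecutive
 * backoffs; at that point we also call in_losing() (or
 * in6_losing()) so the routing layer is told the path looks bad.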
3311 */ 3312 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3313 #ifdef INET6 3314 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3315 in6_losing(tp->t_inpcb); 3316 else 3317 #endif 3318 in_losing(tp->t_inpcb); 3319 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3320 tp->t_srtt = 0; 3321 } 3322 if (rack_use_sack_filter) 3323 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3324 tp->snd_recover = tp->snd_max; 3325 tp->t_flags |= TF_ACKNOW; 3326 tp->t_rtttime = 0; 3327 rack_cong_signal(tp, NULL, CC_RTO); 3328 out: 3329 return (retval); 3330 } 3331 3332 static int 3333 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3334 { 3335 int32_t ret = 0; 3336 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3337 3338 if (timers == 0) { 3339 return (0); 3340 } 3341 if (tp->t_state == TCPS_LISTEN) { 3342 /* no timers on listen sockets */ 3343 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3344 return (0); 3345 return (1); 3346 } 3347 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3348 uint32_t left; 3349 3350 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3351 ret = -1; 3352 rack_log_to_processing(rack, cts, ret, 0); 3353 return (0); 3354 } 3355 if (hpts_calling == 0) { 3356 ret = -2; 3357 rack_log_to_processing(rack, cts, ret, 0); 3358 return (0); 3359 } 3360 /* 3361 * Ok our timer went off early and we are not paced false 3362 * alarm, go back to sleep. 3363 */ 3364 ret = -3; 3365 left = rack->r_ctl.rc_timer_exp - cts; 3366 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3367 rack_log_to_processing(rack, cts, ret, left); 3368 rack->rc_last_pto_set = 0; 3369 return (1); 3370 } 3371 rack->rc_tmr_stopped = 0; 3372 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3373 if (timers & PACE_TMR_DELACK) { 3374 ret = rack_timeout_delack(tp, rack, cts); 3375 } else if (timers & PACE_TMR_RACK) { 3376 rack->r_ctl.rc_tlp_rxt_last_time = cts; 3377 ret = rack_timeout_rack(tp, rack, cts); 3378 } else if (timers & PACE_TMR_TLP) { 3379 rack->r_ctl.rc_tlp_rxt_last_time = cts; 3380 ret = rack_timeout_tlp(tp, rack, cts); 3381 } else if (timers & PACE_TMR_RXT) { 3382 rack->r_ctl.rc_tlp_rxt_last_time = cts; 3383 ret = rack_timeout_rxt(tp, rack, cts); 3384 } else if (timers & PACE_TMR_PERSIT) { 3385 ret = rack_timeout_persist(tp, rack, cts); 3386 } else if (timers & PACE_TMR_KEEP) { 3387 ret = rack_timeout_keepalive(tp, rack, cts); 3388 } 3389 rack_log_to_processing(rack, cts, ret, timers); 3390 return (ret); 3391 } 3392 3393 static void 3394 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3395 { 3396 uint8_t hpts_removed = 0; 3397 3398 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3399 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3400 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3401 hpts_removed = 1; 3402 } 3403 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3404 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3405 if (rack->rc_inp->inp_in_hpts && 3406 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3407 /* 3408 * Canceling timer's when we have no output being 3409 * paced. We also must remove ourselves from the 3410 * hpts. 
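 * Otherwise the stale hpts slot would fire and run timer
 * processing for a timer we have just declared cancelled. The
 * pending timer type was saved in rc_tmr_stopped above, which the
 * later timer-arming path can consult to see what had been
 * outstanding.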
3411 */ 3412 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3413 hpts_removed = 1; 3414 } 3415 rack_log_to_cancel(rack, hpts_removed, line); 3416 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3417 } 3418 } 3419 3420 static void 3421 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3422 { 3423 return; 3424 } 3425 3426 static int 3427 rack_stopall(struct tcpcb *tp) 3428 { 3429 struct tcp_rack *rack; 3430 rack = (struct tcp_rack *)tp->t_fb_ptr; 3431 rack->t_timers_stopped = 1; 3432 return (0); 3433 } 3434 3435 static void 3436 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3437 { 3438 return; 3439 } 3440 3441 static int 3442 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3443 { 3444 return (0); 3445 } 3446 3447 static void 3448 rack_stop_all_timers(struct tcpcb *tp) 3449 { 3450 struct tcp_rack *rack; 3451 3452 /* 3453 * Assure no timers are running. 3454 */ 3455 if (tcp_timer_active(tp, TT_PERSIST)) { 3456 /* We enter in persists, set the flag appropriately */ 3457 rack = (struct tcp_rack *)tp->t_fb_ptr; 3458 rack->rc_in_persist = 1; 3459 } 3460 tcp_timer_suspend(tp, TT_PERSIST); 3461 tcp_timer_suspend(tp, TT_REXMT); 3462 tcp_timer_suspend(tp, TT_KEEP); 3463 tcp_timer_suspend(tp, TT_DELACK); 3464 } 3465 3466 static void 3467 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3468 struct rack_sendmap *rsm, uint32_t ts) 3469 { 3470 int32_t idx; 3471 3472 rsm->r_rtr_cnt++; 3473 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 3474 rsm->r_dupack = 0; 3475 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3476 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3477 rsm->r_flags |= RACK_OVERMAX; 3478 } 3479 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3480 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3481 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3482 } 3483 idx = rsm->r_rtr_cnt - 1; 3484 rsm->r_tim_lastsent[idx] = ts; 3485 if (rsm->r_flags & RACK_ACKED) { 3486 /* Problably MTU discovery messing with us */ 3487 rsm->r_flags &= ~RACK_ACKED; 3488 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3489 } 3490 if (rsm->r_in_tmap) { 3491 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3492 rsm->r_in_tmap = 0; 3493 } 3494 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3495 rsm->r_in_tmap = 1; 3496 if (rsm->r_flags & RACK_SACK_PASSED) { 3497 /* We have retransmitted due to the SACK pass */ 3498 rsm->r_flags &= ~RACK_SACK_PASSED; 3499 rsm->r_flags |= RACK_WAS_SACKPASS; 3500 } 3501 } 3502 3503 3504 static uint32_t 3505 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3506 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) 3507 { 3508 /* 3509 * We (re-)transmitted starting at rsm->r_start for some length 3510 * (possibly less than r_end. 3511 */ 3512 struct rack_sendmap *nrsm, *insret; 3513 uint32_t c_end; 3514 int32_t len; 3515 3516 len = *lenp; 3517 c_end = rsm->r_start + len; 3518 if (SEQ_GEQ(c_end, rsm->r_end)) { 3519 /* 3520 * We retransmitted the whole piece or more than the whole 3521 * slopping into the next rsm. 3522 */ 3523 rack_update_rsm(tp, rack, rsm, ts); 3524 if (c_end == rsm->r_end) { 3525 *lenp = 0; 3526 return (0); 3527 } else { 3528 int32_t act_len; 3529 3530 /* Hangs over the end return whats left */ 3531 act_len = rsm->r_end - rsm->r_start; 3532 *lenp = (len - act_len); 3533 return (rsm->r_end); 3534 } 3535 /* We don't get out of this block. 
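 * Both paths above return: either the whole rsm was covered (len
 * consumed, return 0), or the send ran past r_end and we hand back
 * r_end plus the residual length so the caller can continue with
 * the next rsm. For example, if the rsm covers 1, 11 and 15 bytes
 * were (re)sent starting at 1, act_len is 10, *lenp becomes 5 and
 * we return 11.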
*/ 3536 } 3537 /* 3538 * Here we retransmitted less than the whole thing which means we 3539 * have to split this into what was transmitted and what was not. 3540 */ 3541 nrsm = rack_alloc_full_limit(rack); 3542 if (nrsm == NULL) { 3543 /* 3544 * We can't get memory, so lets not proceed. 3545 */ 3546 *lenp = 0; 3547 return (0); 3548 } 3549 /* 3550 * So here we are going to take the original rsm and make it what we 3551 * retransmitted. nrsm will be the tail portion we did not 3552 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3553 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3554 * 1, 6 and the new piece will be 6, 11. 3555 */ 3556 rack_clone_rsm(rack, nrsm, rsm, c_end); 3557 nrsm->r_dupack = 0; 3558 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 3559 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 3560 #ifdef INVARIANTS 3561 if (insret != NULL) { 3562 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 3563 nrsm, insret, rack, rsm); 3564 } 3565 #endif 3566 if (rsm->r_in_tmap) { 3567 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3568 nrsm->r_in_tmap = 1; 3569 } 3570 rsm->r_flags &= (~RACK_HAS_FIN); 3571 rack_update_rsm(tp, rack, rsm, ts); 3572 *lenp = 0; 3573 return (0); 3574 } 3575 3576 3577 static void 3578 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3579 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3580 uint8_t pass, struct rack_sendmap *hintrsm) 3581 { 3582 struct tcp_rack *rack; 3583 struct rack_sendmap *rsm, *nrsm, *insret, fe; 3584 register uint32_t snd_max, snd_una; 3585 3586 /* 3587 * Add to the RACK log of packets in flight or retransmitted. If 3588 * there is a TS option we will use the TS echoed, if not we will 3589 * grab a TS. 3590 * 3591 * Retransmissions will increment the count and move the ts to its 3592 * proper place. Note that if options do not include TS's then we 3593 * won't be able to effectively use the ACK for an RTT on a retran. 3594 * 3595 * Notes about r_start and r_end. Lets consider a send starting at 3596 * sequence 1 for 10 bytes. In such an example the r_start would be 3597 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3598 * This means that r_end is actually the first sequence for the next 3599 * slot (11). 3600 * 3601 */ 3602 /* 3603 * If err is set what do we do XXXrrs? should we not add the thing? 3604 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3605 * i.e. proceed with add ** do this for now. 3606 */ 3607 INP_WLOCK_ASSERT(tp->t_inpcb); 3608 if (err) 3609 /* 3610 * We don't log errors -- we could but snd_max does not 3611 * advance in this case either. 3612 */ 3613 return; 3614 3615 if (th_flags & TH_RST) { 3616 /* 3617 * We don't log resets and we return immediately from 3618 * sending 3619 */ 3620 return; 3621 } 3622 rack = (struct tcp_rack *)tp->t_fb_ptr; 3623 snd_una = tp->snd_una; 3624 if (SEQ_LEQ((seq_out + len), snd_una)) { 3625 /* Are sending an old segment to induce an ack (keep-alive)? */ 3626 return; 3627 } 3628 if (SEQ_LT(seq_out, snd_una)) { 3629 /* huh? should we panic? */ 3630 uint32_t end; 3631 3632 end = seq_out + len; 3633 seq_out = snd_una; 3634 if (SEQ_GEQ(end, seq_out)) 3635 len = end - seq_out; 3636 else 3637 len = 0; 3638 } 3639 snd_max = tp->snd_max; 3640 if (th_flags & (TH_SYN | TH_FIN)) { 3641 /* 3642 * The call to rack_log_output is made before bumping 3643 * snd_max. 
This means we can record one extra byte on a SYN 3644 * or FIN if seq_out is adding more on and a FIN is present 3645 * (and we are not resending). 3646 */ 3647 if (th_flags & TH_SYN) 3648 len++; 3649 if (th_flags & TH_FIN) 3650 len++; 3651 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3652 /* 3653 * The add/update as not been done for the FIN/SYN 3654 * yet. 3655 */ 3656 snd_max = tp->snd_nxt; 3657 } 3658 } 3659 if (len == 0) { 3660 /* We don't log zero window probes */ 3661 return; 3662 } 3663 rack->r_ctl.rc_time_last_sent = ts; 3664 if (IN_RECOVERY(tp->t_flags)) { 3665 rack->r_ctl.rc_prr_out += len; 3666 } 3667 /* First question is it a retransmission or new? */ 3668 if (seq_out == snd_max) { 3669 /* Its new */ 3670 again: 3671 rsm = rack_alloc(rack); 3672 if (rsm == NULL) { 3673 /* 3674 * Hmm out of memory and the tcb got destroyed while 3675 * we tried to wait. 3676 */ 3677 return; 3678 } 3679 if (th_flags & TH_FIN) { 3680 rsm->r_flags = RACK_HAS_FIN; 3681 } else { 3682 rsm->r_flags = 0; 3683 } 3684 rsm->r_tim_lastsent[0] = ts; 3685 rsm->r_rtr_cnt = 1; 3686 rsm->r_rtr_bytes = 0; 3687 if (th_flags & TH_SYN) { 3688 /* The data space is one beyond snd_una */ 3689 rsm->r_start = seq_out + 1; 3690 rsm->r_end = rsm->r_start + (len - 1); 3691 } else { 3692 /* Normal case */ 3693 rsm->r_start = seq_out; 3694 rsm->r_end = rsm->r_start + len; 3695 } 3696 rsm->r_dupack = 0; 3697 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 3698 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 3699 #ifdef INVARIANTS 3700 if (insret != NULL) { 3701 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 3702 nrsm, insret, rack, rsm); 3703 } 3704 #endif 3705 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3706 rsm->r_in_tmap = 1; 3707 return; 3708 } 3709 /* 3710 * If we reach here its a retransmission and we need to find it. 3711 */ 3712 memset(&fe, 0, sizeof(fe)); 3713 more: 3714 if (hintrsm && (hintrsm->r_start == seq_out)) { 3715 rsm = hintrsm; 3716 hintrsm = NULL; 3717 } else { 3718 /* No hints sorry */ 3719 rsm = NULL; 3720 } 3721 if ((rsm) && (rsm->r_start == seq_out)) { 3722 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3723 if (len == 0) { 3724 return; 3725 } else { 3726 goto more; 3727 } 3728 } 3729 /* Ok it was not the last pointer go through it the hard way. */ 3730 refind: 3731 fe.r_start = seq_out; 3732 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3733 if (rsm) { 3734 if (rsm->r_start == seq_out) { 3735 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3736 if (len == 0) { 3737 return; 3738 } else { 3739 goto refind; 3740 } 3741 } 3742 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3743 /* Transmitted within this piece */ 3744 /* 3745 * Ok we must split off the front and then let the 3746 * update do the rest 3747 */ 3748 nrsm = rack_alloc_full_limit(rack); 3749 if (nrsm == NULL) { 3750 rack_update_rsm(tp, rack, rsm, ts); 3751 return; 3752 } 3753 /* 3754 * copy rsm to nrsm and then trim the front of rsm 3755 * to not include this part. 
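 * In other words rack_clone_rsm() splits at seq_out: if rsm covered
 * 100, 200 and the retransmit starts at 150, nrsm becomes 150, 200
 * (the part being sent now) while rsm shrinks to 100, 150, and
 * rack_update_entry() below then accounts for nrsm.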
3756 */ 3757 rack_clone_rsm(rack, nrsm, rsm, seq_out); 3758 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 3759 #ifdef INVARIANTS 3760 if (insret != NULL) { 3761 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 3762 nrsm, insret, rack, rsm); 3763 } 3764 #endif 3765 if (rsm->r_in_tmap) { 3766 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3767 nrsm->r_in_tmap = 1; 3768 } 3769 rsm->r_flags &= (~RACK_HAS_FIN); 3770 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3771 if (len == 0) { 3772 return; 3773 } else if (len > 0) 3774 goto refind; 3775 } 3776 } 3777 /* 3778 * Hmm not found in map did they retransmit both old and on into the 3779 * new? 3780 */ 3781 if (seq_out == tp->snd_max) { 3782 goto again; 3783 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3784 #ifdef INVARIANTS 3785 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3786 seq_out, len, tp->snd_una, tp->snd_max); 3787 printf("Starting Dump of all rack entries\n"); 3788 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 3789 printf("rsm:%p start:%u end:%u\n", 3790 rsm, rsm->r_start, rsm->r_end); 3791 } 3792 printf("Dump complete\n"); 3793 panic("seq_out not found rack:%p tp:%p", 3794 rack, tp); 3795 #endif 3796 } else { 3797 #ifdef INVARIANTS 3798 /* 3799 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3800 * flag) 3801 */ 3802 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3803 seq_out, len, tp->snd_max, tp); 3804 #endif 3805 } 3806 } 3807 3808 /* 3809 * Record one of the RTT updates from an ack into 3810 * our sample structure. 3811 */ 3812 static void 3813 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3814 { 3815 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3816 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3817 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3818 } 3819 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3820 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3821 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3822 } 3823 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3824 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3825 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3826 } 3827 3828 /* 3829 * Collect new round-trip time estimate 3830 * and update averages and current timeout. 
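 * A single ACK (especially one carrying SACK blocks) can produce
 * several samples; tcp_rack_xmit_timer() above only aggregates the
 * lowest, highest and running total, and the commit step below
 * picks one value, per rc_rate_sample_method, to feed into the
 * srtt/rttvar machinery.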
3831 */ 3832 static void 3833 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3834 { 3835 int32_t delta; 3836 uint32_t o_srtt, o_var; 3837 int32_t rtt; 3838 3839 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3840 /* No valid sample */ 3841 return; 3842 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3843 /* We are to use the lowest RTT seen in a single ack */ 3844 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3845 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3846 /* We are to use the highest RTT seen in a single ack */ 3847 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3848 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3849 /* We are to use the average RTT seen in a single ack */ 3850 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3851 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3852 } else { 3853 #ifdef INVARIANTS 3854 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3855 #endif 3856 return; 3857 } 3858 if (rtt == 0) 3859 rtt = 1; 3860 rack_log_rtt_sample(rack, rtt); 3861 o_srtt = tp->t_srtt; 3862 o_var = tp->t_rttvar; 3863 rack = (struct tcp_rack *)tp->t_fb_ptr; 3864 if (tp->t_srtt != 0) { 3865 /* 3866 * srtt is stored as fixed point with 5 bits after the 3867 * binary point (i.e., scaled by 8). The following magic is 3868 * equivalent to the smoothing algorithm in rfc793 with an 3869 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3870 * Adjust rtt to origin 0. 3871 */ 3872 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3873 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3874 3875 tp->t_srtt += delta; 3876 if (tp->t_srtt <= 0) 3877 tp->t_srtt = 1; 3878 3879 /* 3880 * We accumulate a smoothed rtt variance (actually, a 3881 * smoothed mean difference), then set the retransmit timer 3882 * to smoothed rtt + 4 times the smoothed variance. rttvar 3883 * is stored as fixed point with 4 bits after the binary 3884 * point (scaled by 16). The following is equivalent to 3885 * rfc793 smoothing with an alpha of .75 (rttvar = 3886 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3887 * wired-in beta. 3888 */ 3889 if (delta < 0) 3890 delta = -delta; 3891 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3892 tp->t_rttvar += delta; 3893 if (tp->t_rttvar <= 0) 3894 tp->t_rttvar = 1; 3895 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3896 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3897 } else { 3898 /* 3899 * No rtt measurement yet - use the unsmoothed rtt. Set the 3900 * variance to half the rtt (so our first retransmit happens 3901 * at 3*rtt). 3902 */ 3903 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3904 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3905 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3906 } 3907 KMOD_TCPSTAT_INC(tcps_rttupdated); 3908 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3909 tp->t_rttupdated++; 3910 #ifdef STATS 3911 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3912 #endif 3913 tp->t_rxtshift = 0; 3914 3915 /* 3916 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3917 * way we do the smoothing, srtt and rttvar will each average +1/2 3918 * tick of bias. When we compute the retransmit timer, we want 1/2 3919 * tick of rounding and 1 extra tick because of +-1/2 tick 3920 * uncertainty in the firing of the timer. The bias will give us 3921 * exactly the 1.5 tick we need. But, because the bias is 3922 * statistical, we have to test that we don't drop below the minimum 3923 * feasible timer (which is 2 ticks). 
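 * Put together, the clamp below amounts to roughly (an illustrative
 * sketch, ignoring the tick conversions):
 *
 *	t_rxtcur = clamp(TCP_REXMTVAL(tp),
 *	    max(rack_rto_min, rtt + 2), rack_rto_max);
 *
 * i.e. srtt + 4*rttvar, bounded below by the measured rtt plus two
 * ticks (and the configured minimum) and above by rack_rto_max.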
3924 */ 3925 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3926 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3927 tp->t_softerror = 0; 3928 } 3929 3930 static void 3931 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3932 uint32_t t, uint32_t cts) 3933 { 3934 /* 3935 * For this RSM, we acknowledged the data from a previous 3936 * transmission, not the last one we made. This means we did a false 3937 * retransmit. 3938 */ 3939 struct tcp_rack *rack; 3940 3941 if (rsm->r_flags & RACK_HAS_FIN) { 3942 /* 3943 * The sending of the FIN often is multiple sent when we 3944 * have everything outstanding ack'd. We ignore this case 3945 * since its over now. 3946 */ 3947 return; 3948 } 3949 if (rsm->r_flags & RACK_TLP) { 3950 /* 3951 * We expect TLP's to have this occur. 3952 */ 3953 return; 3954 } 3955 rack = (struct tcp_rack *)tp->t_fb_ptr; 3956 /* should we undo cc changes and exit recovery? */ 3957 if (IN_RECOVERY(tp->t_flags)) { 3958 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3959 /* 3960 * Undo what we ratched down and exit recovery if 3961 * possible 3962 */ 3963 EXIT_RECOVERY(tp->t_flags); 3964 tp->snd_recover = tp->snd_una; 3965 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3966 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3967 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3968 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3969 } 3970 } 3971 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3972 /* 3973 * We retransmitted based on a sack and the earlier 3974 * retransmission ack'd it - re-ordering is occuring. 3975 */ 3976 counter_u64_add(rack_reorder_seen, 1); 3977 rack->r_ctl.rc_reorder_ts = cts; 3978 } 3979 counter_u64_add(rack_badfr, 1); 3980 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3981 } 3982 3983 3984 static int 3985 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3986 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3987 { 3988 int32_t i; 3989 uint32_t t; 3990 3991 if (rsm->r_flags & RACK_ACKED) 3992 /* Already done */ 3993 return (0); 3994 3995 3996 if ((rsm->r_rtr_cnt == 1) || 3997 ((ack_type == CUM_ACKED) && 3998 (to->to_flags & TOF_TS) && 3999 (to->to_tsecr) && 4000 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 4001 ) { 4002 /* 4003 * We will only find a matching timestamp if its cum-acked. 4004 * But if its only one retransmission its for-sure matching 4005 * :-) 4006 */ 4007 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 4008 if ((int)t <= 0) 4009 t = 1; 4010 if (!tp->t_rttlow || tp->t_rttlow > t) 4011 tp->t_rttlow = t; 4012 if (!rack->r_ctl.rc_rack_min_rtt || 4013 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 4014 rack->r_ctl.rc_rack_min_rtt = t; 4015 if (rack->r_ctl.rc_rack_min_rtt == 0) { 4016 rack->r_ctl.rc_rack_min_rtt = 1; 4017 } 4018 } 4019 tcp_rack_xmit_timer(rack, t + 1); 4020 if ((rsm->r_flags & RACK_TLP) && 4021 (!IN_RECOVERY(tp->t_flags))) { 4022 /* Segment was a TLP and our retrans matched */ 4023 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 4024 rack->r_ctl.rc_rsm_start = tp->snd_max; 4025 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4026 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4027 rack_cong_signal(tp, NULL, CC_NDUPACK); 4028 /* 4029 * When we enter recovery we need to assure 4030 * we send one packet. 
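 * Seeding rc_prr_sndcnt with one MSS below lets PRR release that
 * first retransmission immediately rather than waiting to earn
 * send credit from later deliveries.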
4031 */ 4032 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4033 rack_log_to_prr(rack, 7); 4034 } 4035 } 4036 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 4037 /* New more recent rack_tmit_time */ 4038 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 4039 rack->rc_rack_rtt = t; 4040 } 4041 return (1); 4042 } 4043 /* 4044 * We clear the soft/rxtshift since we got an ack. 4045 * There is no assurance we will call the commit() function 4046 * so we need to clear these to avoid incorrect handling. 4047 */ 4048 tp->t_rxtshift = 0; 4049 tp->t_softerror = 0; 4050 if ((to->to_flags & TOF_TS) && 4051 (ack_type == CUM_ACKED) && 4052 (to->to_tsecr) && 4053 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 4054 /* 4055 * Now which timestamp does it match? In this block the ACK 4056 * must be coming from a previous transmission. 4057 */ 4058 for (i = 0; i < rsm->r_rtr_cnt; i++) { 4059 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 4060 t = cts - rsm->r_tim_lastsent[i]; 4061 if ((int)t <= 0) 4062 t = 1; 4063 if ((i + 1) < rsm->r_rtr_cnt) { 4064 /* Likely */ 4065 rack_earlier_retran(tp, rsm, t, cts); 4066 } 4067 if (!tp->t_rttlow || tp->t_rttlow > t) 4068 tp->t_rttlow = t; 4069 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 4070 rack->r_ctl.rc_rack_min_rtt = t; 4071 if (rack->r_ctl.rc_rack_min_rtt == 0) { 4072 rack->r_ctl.rc_rack_min_rtt = 1; 4073 } 4074 } 4075 /* 4076 * Note the following calls to 4077 * tcp_rack_xmit_timer() are being commented 4078 * out for now. They give us no more accuracy 4079 * and often lead to a wrong choice. We have 4080 * enough samples that have not been 4081 * retransmitted. I leave the commented out 4082 * code in here in case in the future we 4083 * decide to add it back (though I can't foresee 4084 * doing that). That way we will easily see 4085 * where they need to be placed. 4086 */ 4087 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 4088 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 4089 /* New more recent rack_tmit_time */ 4090 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 4091 rack->rc_rack_rtt = t; 4092 } 4093 return (1); 4094 } 4095 } 4096 goto ts_not_found; 4097 } else { 4098 /* 4099 * Ok, it's a SACK block that we retransmitted, or a Windows 4100 * machine without timestamps. We can tell nothing from the 4101 * timestamp since either it is not there or it reflects the time 4102 * the peer last received a segment that moved its cum-ack point forward. 4103 */ 4104 ts_not_found: 4105 i = rsm->r_rtr_cnt - 1; 4106 t = cts - rsm->r_tim_lastsent[i]; 4107 if ((int)t <= 0) 4108 t = 1; 4109 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 4110 /* 4111 * We retransmitted and the ack came back in less 4112 * than the smallest rtt we have observed. We most 4113 * likely did an improper retransmit as outlined in 4114 * 4.2 Step 3 point 2 in the rack-draft. 4115 */ 4116 i = rsm->r_rtr_cnt - 2; 4117 t = cts - rsm->r_tim_lastsent[i]; 4118 rack_earlier_retran(tp, rsm, t, cts); 4119 } else if (rack->r_ctl.rc_rack_min_rtt) { 4120 /* 4121 * We retransmitted it and the retransmit did the 4122 * job.
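 * Note that this ambiguous sample (we cannot tell which
 * transmission the ack is for) is not fed to tcp_rack_xmit_timer();
 * it only refreshes RACK's own min-RTT and most-recent-transmit
 * tracking below.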
4123 */ 4124 if (!rack->r_ctl.rc_rack_min_rtt || 4125 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 4126 rack->r_ctl.rc_rack_min_rtt = t; 4127 if (rack->r_ctl.rc_rack_min_rtt == 0) { 4128 rack->r_ctl.rc_rack_min_rtt = 1; 4129 } 4130 } 4131 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 4132 /* New more recent rack_tmit_time */ 4133 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 4134 rack->rc_rack_rtt = t; 4135 } 4136 return (1); 4137 } 4138 } 4139 return (0); 4140 } 4141 4142 /* 4143 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 4144 */ 4145 static void 4146 rack_log_sack_passed(struct tcpcb *tp, 4147 struct tcp_rack *rack, struct rack_sendmap *rsm) 4148 { 4149 struct rack_sendmap *nrsm; 4150 4151 nrsm = rsm; 4152 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 4153 rack_head, r_tnext) { 4154 if (nrsm == rsm) { 4155 /* Skip orginal segment he is acked */ 4156 continue; 4157 } 4158 if (nrsm->r_flags & RACK_ACKED) { 4159 /* 4160 * Skip ack'd segments, though we 4161 * should not see these, since tmap 4162 * should not have ack'd segments. 4163 */ 4164 continue; 4165 } 4166 if (nrsm->r_flags & RACK_SACK_PASSED) { 4167 /* 4168 * We found one that is already marked 4169 * passed, we have been here before and 4170 * so all others below this are marked. 4171 */ 4172 break; 4173 } 4174 nrsm->r_flags |= RACK_SACK_PASSED; 4175 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 4176 } 4177 } 4178 4179 static uint32_t 4180 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 4181 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 4182 { 4183 uint32_t start, end, changed = 0; 4184 struct rack_sendmap stack_map; 4185 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 4186 int32_t used_ref = 1; 4187 int moved = 0; 4188 4189 start = sack->start; 4190 end = sack->end; 4191 rsm = *prsm; 4192 memset(&fe, 0, sizeof(fe)); 4193 do_rest_ofb: 4194 if ((rsm == NULL) || 4195 (SEQ_LT(end, rsm->r_start)) || 4196 (SEQ_GEQ(start, rsm->r_end)) || 4197 (SEQ_LT(start, rsm->r_start))) { 4198 /* 4199 * We are not in the right spot, 4200 * find the correct spot in the tree. 4201 */ 4202 used_ref = 0; 4203 fe.r_start = start; 4204 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 4205 moved++; 4206 } 4207 if (rsm == NULL) { 4208 /* TSNH */ 4209 goto out; 4210 } 4211 /* Ok we have an ACK for some piece of this rsm */ 4212 if (rsm->r_start != start) { 4213 if ((rsm->r_flags & RACK_ACKED) == 0) { 4214 /** 4215 * Need to split this in two pieces the before and after, 4216 * the before remains in the map, the after must be 4217 * added. In other words we have: 4218 * rsm |--------------| 4219 * sackblk |-------> 4220 * rsm will become 4221 * rsm |---| 4222 * and nrsm will be the sacked piece 4223 * nrsm |----------| 4224 * 4225 * But before we start down that path lets 4226 * see if the sack spans over on top of 4227 * the next guy and it is already sacked. 4228 */ 4229 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4230 if (next && (next->r_flags & RACK_ACKED) && 4231 SEQ_GEQ(end, next->r_start)) { 4232 /** 4233 * So the next one is already acked, and 4234 * we can thus by hookery use our stack_map 4235 * to reflect the piece being sacked and 4236 * then adjust the two tree entries moving 4237 * the start and ends around. 
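 * (Using the on-stack copy spares allocating an rsm for the sacked
 * middle piece; it only lives long enough to feed rack_update_rtt()
 * and the byte counters.)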
So we start like: 4238 * rsm |------------| (not-acked) 4239 * next |-----------| (acked) 4240 * sackblk |--------> 4241 * We want to end like so: 4242 * rsm |------| (not-acked) 4243 * next |-----------------| (acked) 4244 * nrsm |-----| 4245 * Where nrsm is a temporary stack piece we 4246 * use to update all the gizmos. 4247 */ 4248 /* Copy up our fudge block */ 4249 nrsm = &stack_map; 4250 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 4251 /* Now adjust our tree blocks */ 4252 rsm->r_end = start; 4253 next->r_start = start; 4254 /* Clear out the dup ack count of the remainder */ 4255 rsm->r_dupack = 0; 4256 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 4257 /* Now lets make sure our fudge block is right */ 4258 nrsm->r_start = start; 4259 /* Now lets update all the stats and such */ 4260 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED); 4261 changed += (nrsm->r_end - nrsm->r_start); 4262 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 4263 if (nrsm->r_flags & RACK_SACK_PASSED) { 4264 counter_u64_add(rack_reorder_seen, 1); 4265 rack->r_ctl.rc_reorder_ts = cts; 4266 } 4267 /* 4268 * Now we want to go up from rsm (the 4269 * one left un-acked) to the next one 4270 * in the tmap. We do this so when 4271 * we walk backwards we include marking 4272 * sack-passed on rsm (The one passed in 4273 * is skipped since it is generally called 4274 * on something sacked before removing it 4275 * from the tmap). 4276 */ 4277 if (rsm->r_in_tmap) { 4278 nrsm = TAILQ_NEXT(rsm, r_tnext); 4279 /* 4280 * Now that we have the next 4281 * one walk backwards from there. 4282 */ 4283 if (nrsm && nrsm->r_in_tmap) 4284 rack_log_sack_passed(tp, rack, nrsm); 4285 } 4286 /* Now are we done? */ 4287 if (SEQ_LT(end, next->r_end) || 4288 (end == next->r_end)) { 4289 /* Done with block */ 4290 goto out; 4291 } 4292 counter_u64_add(rack_sack_used_next_merge, 1); 4293 /* Postion for the next block */ 4294 start = next->r_end; 4295 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 4296 if (rsm == NULL) 4297 goto out; 4298 } else { 4299 /** 4300 * We can't use any hookery here, so we 4301 * need to split the map. We enter like 4302 * so: 4303 * rsm |--------| 4304 * sackblk |-----> 4305 * We will add the new block nrsm and 4306 * that will be the new portion, and then 4307 * fall through after reseting rsm. So we 4308 * split and look like this: 4309 * rsm |----| 4310 * sackblk |-----> 4311 * nrsm |---| 4312 * We then fall through reseting 4313 * rsm to nrsm, so the next block 4314 * picks it up. 4315 */ 4316 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 4317 if (nrsm == NULL) { 4318 /* 4319 * failed XXXrrs what can we do but loose the sack 4320 * info? 
4321 */ 4322 goto out; 4323 } 4324 counter_u64_add(rack_sack_splits, 1); 4325 rack_clone_rsm(rack, nrsm, rsm, start); 4326 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 4327 #ifdef INVARIANTS 4328 if (insret != NULL) { 4329 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 4330 nrsm, insret, rack, rsm); 4331 } 4332 #endif 4333 if (rsm->r_in_tmap) { 4334 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4335 nrsm->r_in_tmap = 1; 4336 } 4337 rsm->r_flags &= (~RACK_HAS_FIN); 4338 /* Position us to point to the new nrsm that starts the sack blk */ 4339 rsm = nrsm; 4340 } 4341 } else { 4342 /* Already sacked this piece */ 4343 counter_u64_add(rack_sack_skipped_acked, 1); 4344 moved++; 4345 if (end == rsm->r_end) { 4346 /* Done with block */ 4347 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4348 goto out; 4349 } else if (SEQ_LT(end, rsm->r_end)) { 4350 /* A partial sack to a already sacked block */ 4351 moved++; 4352 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4353 goto out; 4354 } else { 4355 /* 4356 * The end goes beyond this guy 4357 * repostion the start to the 4358 * next block. 4359 */ 4360 start = rsm->r_end; 4361 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4362 if (rsm == NULL) 4363 goto out; 4364 } 4365 } 4366 } 4367 if (SEQ_GEQ(end, rsm->r_end)) { 4368 /** 4369 * The end of this block is either beyond this guy or right 4370 * at this guy. I.e.: 4371 * rsm --- |-----| 4372 * end |-----| 4373 * <or> 4374 * end |---------| 4375 */ 4376 if (rsm->r_flags & RACK_TLP) 4377 rack->r_ctl.rc_tlp_rtx_out = 0; 4378 if ((rsm->r_flags & RACK_ACKED) == 0) { 4379 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4380 changed += (rsm->r_end - rsm->r_start); 4381 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4382 if (rsm->r_in_tmap) /* should be true */ 4383 rack_log_sack_passed(tp, rack, rsm); 4384 /* Is Reordering occuring? */ 4385 if (rsm->r_flags & RACK_SACK_PASSED) { 4386 rsm->r_flags &= ~RACK_SACK_PASSED; 4387 counter_u64_add(rack_reorder_seen, 1); 4388 rack->r_ctl.rc_reorder_ts = cts; 4389 } 4390 rsm->r_flags |= RACK_ACKED; 4391 rsm->r_flags &= ~RACK_TLP; 4392 if (rsm->r_in_tmap) { 4393 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4394 rsm->r_in_tmap = 0; 4395 } 4396 } else { 4397 counter_u64_add(rack_sack_skipped_acked, 1); 4398 moved++; 4399 } 4400 if (end == rsm->r_end) { 4401 /* This block only - done, setup for next */ 4402 goto out; 4403 } 4404 /* 4405 * There is more not coverend by this rsm move on 4406 * to the next block in the RB tree. 4407 */ 4408 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4409 start = rsm->r_end; 4410 rsm = nrsm; 4411 if (rsm == NULL) 4412 goto out; 4413 goto do_rest_ofb; 4414 } 4415 /** 4416 * The end of this sack block is smaller than 4417 * our rsm i.e.: 4418 * rsm --- |-----| 4419 * end |--| 4420 */ 4421 if ((rsm->r_flags & RACK_ACKED) == 0) { 4422 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4423 if (prev && (prev->r_flags & RACK_ACKED)) { 4424 /** 4425 * Goal, we want the right remainder of rsm to shrink 4426 * in place and span from (rsm->r_start = end) to rsm->r_end. 4427 * We want to expand prev to go all the way 4428 * to prev->r_end <- end. 
4429 * so in the tree we have before: 4430 * prev |--------| (acked) 4431 * rsm |-------| (non-acked) 4432 * sackblk |-| 4433 * We churn it so we end up with 4434 * prev |----------| (acked) 4435 * rsm |-----| (non-acked) 4436 * nrsm |-| (temporary) 4437 */ 4438 nrsm = &stack_map; 4439 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 4440 prev->r_end = end; 4441 rsm->r_start = end; 4442 /* Now adjust nrsm (stack copy) to be 4443 * the one that is the small 4444 * piece that was "sacked". 4445 */ 4446 nrsm->r_end = end; 4447 rsm->r_dupack = 0; 4448 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 4449 /* 4450 * Now nrsm is our new little piece 4451 * that is acked (which was merged 4452 * to prev). Update the rtt and changed 4453 * based on that. Also check for reordering. 4454 */ 4455 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED); 4456 changed += (nrsm->r_end - nrsm->r_start); 4457 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 4458 if (nrsm->r_flags & RACK_SACK_PASSED) { 4459 counter_u64_add(rack_reorder_seen, 1); 4460 rack->r_ctl.rc_reorder_ts = cts; 4461 } 4462 rsm = prev; 4463 counter_u64_add(rack_sack_used_prev_merge, 1); 4464 } else { 4465 /** 4466 * This is the case where our previous 4467 * block is not acked either, so we must 4468 * split the block in two. 4469 */ 4470 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 4471 if (nrsm == NULL) { 4472 /* failed rrs what can we do but loose the sack info? */ 4473 goto out; 4474 } 4475 /** 4476 * In this case nrsm becomes 4477 * nrsm->r_start = end; 4478 * nrsm->r_end = rsm->r_end; 4479 * which is un-acked. 4480 * <and> 4481 * rsm->r_end = nrsm->r_start; 4482 * i.e. the remaining un-acked 4483 * piece is left on the left 4484 * hand side. 4485 * 4486 * So we start like this 4487 * rsm |----------| (not acked) 4488 * sackblk |---| 4489 * build it so we have 4490 * rsm |---| (acked) 4491 * nrsm |------| (not acked) 4492 */ 4493 counter_u64_add(rack_sack_splits, 1); 4494 rack_clone_rsm(rack, nrsm, rsm, end); 4495 rsm->r_flags &= (~RACK_HAS_FIN); 4496 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 4497 #ifdef INVARIANTS 4498 if (insret != NULL) { 4499 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 4500 nrsm, insret, rack, rsm); 4501 } 4502 #endif 4503 if (rsm->r_in_tmap) { 4504 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4505 nrsm->r_in_tmap = 1; 4506 } 4507 nrsm->r_dupack = 0; 4508 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 4509 if (rsm->r_flags & RACK_TLP) 4510 rack->r_ctl.rc_tlp_rtx_out = 0; 4511 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4512 changed += (rsm->r_end - rsm->r_start); 4513 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4514 if (rsm->r_in_tmap) /* should be true */ 4515 rack_log_sack_passed(tp, rack, rsm); 4516 /* Is Reordering occuring? */ 4517 if (rsm->r_flags & RACK_SACK_PASSED) { 4518 rsm->r_flags &= ~RACK_SACK_PASSED; 4519 counter_u64_add(rack_reorder_seen, 1); 4520 rack->r_ctl.rc_reorder_ts = cts; 4521 } 4522 rsm->r_flags |= RACK_ACKED; 4523 rsm->r_flags &= ~RACK_TLP; 4524 if (rsm->r_in_tmap) { 4525 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4526 rsm->r_in_tmap = 0; 4527 } 4528 } 4529 } else if (start != end){ 4530 /* 4531 * The block was already acked. 4532 */ 4533 counter_u64_add(rack_sack_skipped_acked, 1); 4534 moved++; 4535 } 4536 out: 4537 if (rsm && (rsm->r_flags & RACK_ACKED)) { 4538 /* 4539 * Now can we merge where we worked 4540 * with either the previous or 4541 * next block? 
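 * Collapsing runs of RACK_ACKED neighbors keeps the tree sized by
 * the number of holes rather than by how many SACK blocks the peer
 * has ever advertised, which keeps later lookups and walks cheap.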
4542 */ 4543 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4544 while (next) { 4545 if (next->r_flags & RACK_ACKED) { 4546 /* yep this and next can be merged */ 4547 rsm = rack_merge_rsm(rack, rsm, next); 4548 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4549 } else 4550 break; 4551 } 4552 /* Now what about the previous? */ 4553 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4554 while (prev) { 4555 if (prev->r_flags & RACK_ACKED) { 4556 /* yep the previous and this can be merged */ 4557 rsm = rack_merge_rsm(rack, prev, rsm); 4558 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4559 } else 4560 break; 4561 } 4562 } 4563 if (used_ref == 0) { 4564 counter_u64_add(rack_sack_proc_all, 1); 4565 } else { 4566 counter_u64_add(rack_sack_proc_short, 1); 4567 } 4568 /* Save off the next one for quick reference. */ 4569 if (rsm) 4570 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4571 else 4572 nrsm = NULL; 4573 *prsm = rack->r_ctl.rc_sacklast = nrsm; 4574 /* Pass back the moved. */ 4575 *moved_two = moved; 4576 return (changed); 4577 } 4578 4579 static void inline 4580 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4581 { 4582 struct rack_sendmap *tmap; 4583 4584 tmap = NULL; 4585 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4586 /* Its no longer sacked, mark it so */ 4587 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4588 #ifdef INVARIANTS 4589 if (rsm->r_in_tmap) { 4590 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4591 rack, rsm, rsm->r_flags); 4592 } 4593 #endif 4594 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4595 /* Rebuild it into our tmap */ 4596 if (tmap == NULL) { 4597 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4598 tmap = rsm; 4599 } else { 4600 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4601 tmap = rsm; 4602 } 4603 tmap->r_in_tmap = 1; 4604 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4605 } 4606 /* 4607 * Now lets possibly clear the sack filter so we start 4608 * recognizing sacks that cover this area. 4609 */ 4610 if (rack_use_sack_filter) 4611 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4612 4613 } 4614 4615 static void 4616 rack_do_decay(struct tcp_rack *rack) 4617 { 4618 #ifdef NETFLIX_EXP_DETECTION 4619 struct timeval res; 4620 4621 #define timersub(tvp, uvp, vvp) \ 4622 do { \ 4623 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 4624 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 4625 if ((vvp)->tv_usec < 0) { \ 4626 (vvp)->tv_sec--; \ 4627 (vvp)->tv_usec += 1000000; \ 4628 } \ 4629 } while (0) 4630 4631 timersub(&rack->r_ctl.rc_last_ack, &rack->r_ctl.rc_last_time_decay, &res); 4632 #undef timersub 4633 4634 rack->r_ctl.input_pkt++; 4635 if ((rack->rc_in_persist) || 4636 (res.tv_sec >= 1) || 4637 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 4638 /* 4639 * Check for decay of non-SAD, 4640 * we want all SAD detection metrics to 4641 * decay 1/4 per second (or more) passed. 4642 */ 4643 uint32_t pkt_delta; 4644 4645 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 4646 /* Update our saved tracking values */ 4647 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 4648 rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack; 4649 /* Now do we escape without decay? 
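 * The idea is to decay only while the connection is genuinely
 * active; repeatedly decaying through idle or very low packet-rate
 * stretches would drive ack_count/sack_count toward zero and make
 * the ratios the SACK-attack detection relies on meaningless.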
*/ 4650 if (rack->rc_in_persist || 4651 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 4652 (pkt_delta < tcp_sad_low_pps)){ 4653 /* 4654 * We don't decay idle connections 4655 * or ones that have a low input pps. 4656 */ 4657 return; 4658 } 4659 /* Decay the counters */ 4660 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 4661 tcp_sad_decay_val); 4662 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 4663 tcp_sad_decay_val); 4664 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 4665 tcp_sad_decay_val); 4666 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 4667 tcp_sad_decay_val); 4668 } 4669 #endif 4670 } 4671 4672 static void 4673 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4674 { 4675 uint32_t changed, entered_recovery = 0; 4676 struct tcp_rack *rack; 4677 struct rack_sendmap *rsm, *rm; 4678 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4679 register uint32_t th_ack; 4680 int32_t i, j, k, num_sack_blks = 0; 4681 uint32_t cts, acked, ack_point, sack_changed = 0; 4682 int loop_start = 0, moved_two = 0; 4683 4684 INP_WLOCK_ASSERT(tp->t_inpcb); 4685 if (th->th_flags & TH_RST) { 4686 /* We don't log resets */ 4687 return; 4688 } 4689 rack = (struct tcp_rack *)tp->t_fb_ptr; 4690 cts = tcp_ts_getticks(); 4691 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 4692 changed = 0; 4693 th_ack = th->th_ack; 4694 if (rack->sack_attack_disable == 0) 4695 rack_do_decay(rack); 4696 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 4697 /* 4698 * You only get credit for 4699 * MSS and greater (and you get extra 4700 * credit for larger cum-ack moves). 4701 */ 4702 int ac; 4703 4704 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 4705 rack->r_ctl.ack_count += ac; 4706 counter_u64_add(rack_ack_total, ac); 4707 } 4708 if (rack->r_ctl.ack_count > 0xfff00000) { 4709 /* 4710 * reduce the number to keep us under 4711 * a uint32_t. 4712 */ 4713 rack->r_ctl.ack_count /= 2; 4714 rack->r_ctl.sack_count /= 2; 4715 } 4716 if (SEQ_GT(th_ack, tp->snd_una)) { 4717 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4718 tp->t_acktime = ticks; 4719 } 4720 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4721 changed = th_ack - rsm->r_start; 4722 if (changed) { 4723 /* 4724 * The ACK point is advancing to th_ack, we must drop off 4725 * the packets in the rack log and calculate any eligble 4726 * RTT's. 4727 */ 4728 rack->r_wanted_output++; 4729 more: 4730 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 4731 if (rsm == NULL) { 4732 if ((th_ack - 1) == tp->iss) { 4733 /* 4734 * For the SYN incoming case we will not 4735 * have called tcp_output for the sending of 4736 * the SYN, so there will be no map. All 4737 * other cases should probably be a panic. 4738 */ 4739 goto proc_sack; 4740 } 4741 if (tp->t_flags & TF_SENTFIN) { 4742 /* if we send a FIN we will not hav a map */ 4743 goto proc_sack; 4744 } 4745 #ifdef INVARIANTS 4746 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4747 tp, 4748 th, tp->t_state, rack, 4749 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4750 #endif 4751 goto proc_sack; 4752 } 4753 if (SEQ_LT(th_ack, rsm->r_start)) { 4754 /* Huh map is missing this */ 4755 #ifdef INVARIANTS 4756 printf("Rack map starts at r_start:%u for th_ack:%u huh? 
ts:%d rs:%d\n", 4757 rsm->r_start, 4758 th_ack, tp->t_state, rack->r_state); 4759 #endif 4760 goto proc_sack; 4761 } 4762 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4763 /* Now do we consume the whole thing? */ 4764 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4765 /* Its all consumed. */ 4766 uint32_t left; 4767 4768 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4769 rsm->r_rtr_bytes = 0; 4770 if (rsm->r_flags & RACK_TLP) 4771 rack->r_ctl.rc_tlp_rtx_out = 0; 4772 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 4773 #ifdef INVARIANTS 4774 if (rm != rsm) { 4775 panic("removing head in rack:%p rsm:%p rm:%p", 4776 rack, rsm, rm); 4777 } 4778 #endif 4779 if (rsm->r_in_tmap) { 4780 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4781 rsm->r_in_tmap = 0; 4782 } 4783 if (rsm->r_flags & RACK_ACKED) { 4784 /* 4785 * It was acked on the scoreboard -- remove 4786 * it from total 4787 */ 4788 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4789 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4790 /* 4791 * There are segments ACKED on the 4792 * scoreboard further up. We are seeing 4793 * reordering. 4794 */ 4795 rsm->r_flags &= ~RACK_SACK_PASSED; 4796 counter_u64_add(rack_reorder_seen, 1); 4797 rsm->r_flags |= RACK_ACKED; 4798 rack->r_ctl.rc_reorder_ts = cts; 4799 } 4800 left = th_ack - rsm->r_end; 4801 if (rsm->r_rtr_cnt > 1) { 4802 /* 4803 * Technically we should make r_rtr_cnt be 4804 * monotonicly increasing and just mod it to 4805 * the timestamp it is replacing.. that way 4806 * we would have the last 3 retransmits. Now 4807 * rc_loss_count will be wrong if we 4808 * retransmit something more than 2 times in 4809 * recovery :( 4810 */ 4811 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4812 } 4813 /* Free back to zone */ 4814 rack_free(rack, rsm); 4815 if (left) { 4816 goto more; 4817 } 4818 goto proc_sack; 4819 } 4820 if (rsm->r_flags & RACK_ACKED) { 4821 /* 4822 * It was acked on the scoreboard -- remove it from 4823 * total for the part being cum-acked. 4824 */ 4825 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4826 } 4827 /* 4828 * Clear the dup ack count for 4829 * the piece that remains. 4830 */ 4831 rsm->r_dupack = 0; 4832 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 4833 if (rsm->r_rtr_bytes) { 4834 /* 4835 * It was retransmitted adjust the 4836 * sack holes for what was acked. 4837 */ 4838 int ack_am; 4839 4840 ack_am = (th_ack - rsm->r_start); 4841 if (ack_am >= rsm->r_rtr_bytes) { 4842 rack->r_ctl.rc_holes_rxt -= ack_am; 4843 rsm->r_rtr_bytes -= ack_am; 4844 } 4845 } 4846 /* Update where the piece starts */ 4847 rsm->r_start = th_ack; 4848 } 4849 proc_sack: 4850 /* Check for reneging */ 4851 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 4852 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4853 /* 4854 * The peer has moved snd_una up to 4855 * the edge of this send, i.e. one 4856 * that it had previously acked. The only 4857 * way that can be true if the peer threw 4858 * away data (space issues) that it had 4859 * previously sacked (else it would have 4860 * given us snd_una up to (rsm->r_end). 4861 * We need to undo the acked markings here. 4862 * 4863 * Note we have to look to make sure th_ack is 4864 * our rsm->r_start in case we get an old ack 4865 * where th_ack is behind snd_una. 
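 * rack_peer_reneges() walks forward from this rsm clearing
 * RACK_ACKED, rebuilds the tmap entries in order so the data is
 * again eligible for (re)transmission, and (if enabled) clears the
 * sack filter so fresh SACKs covering the range are believed.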
4866 */ 4867 rack_peer_reneges(rack, rsm, th->th_ack); 4868 } 4869 if ((to->to_flags & TOF_SACK) == 0) { 4870 /* We are done nothing left */ 4871 goto out; 4872 } 4873 /* Sack block processing */ 4874 if (SEQ_GT(th_ack, tp->snd_una)) 4875 ack_point = th_ack; 4876 else 4877 ack_point = tp->snd_una; 4878 for (i = 0; i < to->to_nsacks; i++) { 4879 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4880 &sack, sizeof(sack)); 4881 sack.start = ntohl(sack.start); 4882 sack.end = ntohl(sack.end); 4883 if (SEQ_GT(sack.end, sack.start) && 4884 SEQ_GT(sack.start, ack_point) && 4885 SEQ_LT(sack.start, tp->snd_max) && 4886 SEQ_GT(sack.end, ack_point) && 4887 SEQ_LEQ(sack.end, tp->snd_max)) { 4888 sack_blocks[num_sack_blks] = sack; 4889 num_sack_blks++; 4890 #ifdef NETFLIX_STATS 4891 } else if (SEQ_LEQ(sack.start, th_ack) && 4892 SEQ_LEQ(sack.end, th_ack)) { 4893 /* 4894 * Its a D-SACK block. 4895 */ 4896 tcp_record_dsack(sack.start, sack.end); 4897 #endif 4898 } 4899 4900 } 4901 /* 4902 * Sort the SACK blocks so we can update the rack scoreboard with 4903 * just one pass. 4904 */ 4905 if (rack_use_sack_filter) { 4906 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 4907 num_sack_blks, th->th_ack); 4908 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 4909 } 4910 if (num_sack_blks == 0) { 4911 /* Nothing to sack (DSACKs?) */ 4912 goto out_with_totals; 4913 } 4914 if (num_sack_blks < 2) { 4915 /* Only one, we don't need to sort */ 4916 goto do_sack_work; 4917 } 4918 /* Sort the sacks */ 4919 for (i = 0; i < num_sack_blks; i++) { 4920 for (j = i + 1; j < num_sack_blks; j++) { 4921 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4922 sack = sack_blocks[i]; 4923 sack_blocks[i] = sack_blocks[j]; 4924 sack_blocks[j] = sack; 4925 } 4926 } 4927 } 4928 /* 4929 * Now are any of the sack block ends the same (yes some 4930 * implementations send these)? 4931 */ 4932 again: 4933 if (num_sack_blks == 0) 4934 goto out_with_totals; 4935 if (num_sack_blks > 1) { 4936 for (i = 0; i < num_sack_blks; i++) { 4937 for (j = i + 1; j < num_sack_blks; j++) { 4938 if (sack_blocks[i].end == sack_blocks[j].end) { 4939 /* 4940 * Ok these two have the same end we 4941 * want the smallest end and then 4942 * throw away the larger and start 4943 * again. 4944 */ 4945 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4946 /* 4947 * The second block covers 4948 * more area use that 4949 */ 4950 sack_blocks[i].start = sack_blocks[j].start; 4951 } 4952 /* 4953 * Now collapse out the dup-sack and 4954 * lower the count 4955 */ 4956 for (k = (j + 1); k < num_sack_blks; k++) { 4957 sack_blocks[j].start = sack_blocks[k].start; 4958 sack_blocks[j].end = sack_blocks[k].end; 4959 j++; 4960 } 4961 num_sack_blks--; 4962 goto again; 4963 } 4964 } 4965 } 4966 } 4967 do_sack_work: 4968 /* 4969 * First lets look to see if 4970 * we have retransmitted and 4971 * can use the transmit next? 4972 */ 4973 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4974 if (rsm && 4975 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 4976 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 4977 /* 4978 * We probably did the FR and the next 4979 * SACK in continues as we would expect. 
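 * The head of the tmap is the oldest outstanding transmission, so
 * after a fast retransmit the next arriving SACK block usually
 * overlaps it; checking that first avoids a tree walk in the
 * common case.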
4980 */ 4981 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 4982 if (acked) { 4983 rack->r_wanted_output++; 4984 changed += acked; 4985 sack_changed += acked; 4986 } 4987 if (num_sack_blks == 1) { 4988 /* 4989 * This is what we would expect from 4990 * a normal implementation to happen 4991 * after we have retransmitted the FR, 4992 * i.e the sack-filter pushes down 4993 * to 1 block and the next to be retransmitted 4994 * is the sequence in the sack block (has more 4995 * are acked). Count this as ACK'd data to boost 4996 * up the chances of recovering any false positives. 4997 */ 4998 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 4999 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 5000 counter_u64_add(rack_express_sack, 1); 5001 if (rack->r_ctl.ack_count > 0xfff00000) { 5002 /* 5003 * reduce the number to keep us under 5004 * a uint32_t. 5005 */ 5006 rack->r_ctl.ack_count /= 2; 5007 rack->r_ctl.sack_count /= 2; 5008 } 5009 goto out_with_totals; 5010 } else { 5011 /* 5012 * Start the loop through the 5013 * rest of blocks, past the first block. 5014 */ 5015 moved_two = 0; 5016 loop_start = 1; 5017 } 5018 } 5019 /* Its a sack of some sort */ 5020 rack->r_ctl.sack_count++; 5021 if (rack->r_ctl.sack_count > 0xfff00000) { 5022 /* 5023 * reduce the number to keep us under 5024 * a uint32_t. 5025 */ 5026 rack->r_ctl.ack_count /= 2; 5027 rack->r_ctl.sack_count /= 2; 5028 } 5029 counter_u64_add(rack_sack_total, 1); 5030 if (rack->sack_attack_disable) { 5031 /* An attacker disablement is in place */ 5032 if (num_sack_blks > 1) { 5033 rack->r_ctl.sack_count += (num_sack_blks - 1); 5034 rack->r_ctl.sack_moved_extra++; 5035 counter_u64_add(rack_move_some, 1); 5036 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 5037 rack->r_ctl.sack_moved_extra /= 2; 5038 rack->r_ctl.sack_noextra_move /= 2; 5039 } 5040 } 5041 goto out; 5042 } 5043 rsm = rack->r_ctl.rc_sacklast; 5044 for (i = loop_start; i < num_sack_blks; i++) { 5045 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 5046 if (acked) { 5047 rack->r_wanted_output++; 5048 changed += acked; 5049 sack_changed += acked; 5050 } 5051 if (moved_two) { 5052 /* 5053 * If we did not get a SACK for at least a MSS and 5054 * had to move at all, or if we moved more than our 5055 * threshold, it counts against the "extra" move. 5056 */ 5057 rack->r_ctl.sack_moved_extra += moved_two; 5058 counter_u64_add(rack_move_some, 1); 5059 } else { 5060 /* 5061 * else we did not have to move 5062 * any more than we would expect. 5063 */ 5064 rack->r_ctl.sack_noextra_move++; 5065 counter_u64_add(rack_move_none, 1); 5066 } 5067 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 5068 /* 5069 * If the SACK was not a full MSS then 5070 * we add to sack_count the number of 5071 * MSS's (or possibly more than 5072 * a MSS if its a TSO send) we had to skip by. 5073 */ 5074 rack->r_ctl.sack_count += moved_two; 5075 counter_u64_add(rack_sack_total, moved_two); 5076 } 5077 /* 5078 * Now we need to setup for the next 5079 * round. First we make sure we won't 5080 * exceed the size of our uint32_t on 5081 * the various counts, and then clear out 5082 * moved_two. 
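 * Halving the paired counters together preserves the sack/ack and
 * moved/no-move ratios that the attack detection below actually
 * consumes.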
5083 */ 5084 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 5085 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 5086 rack->r_ctl.sack_moved_extra /= 2; 5087 rack->r_ctl.sack_noextra_move /= 2; 5088 } 5089 if (rack->r_ctl.sack_count > 0xfff00000) { 5090 rack->r_ctl.ack_count /= 2; 5091 rack->r_ctl.sack_count /= 2; 5092 } 5093 moved_two = 0; 5094 } 5095 out_with_totals: 5096 if (num_sack_blks > 1) { 5097 /* 5098 * You get an extra stroke if 5099 * you have more than one sack-blk, this 5100 * could be where we are skipping forward 5101 * and the sack-filter is still working, or 5102 * it could be an attacker constantly 5103 * moving us. 5104 */ 5105 rack->r_ctl.sack_moved_extra++; 5106 counter_u64_add(rack_move_some, 1); 5107 } 5108 out: 5109 #ifdef NETFLIX_EXP_DETECTION 5110 if ((rack->do_detection || tcp_force_detection) && 5111 tcp_sack_to_ack_thresh && 5112 tcp_sack_to_move_thresh && 5113 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 5114 /* 5115 * We have thresholds set to find 5116 * possible attackers and disable sack. 5117 * Check them. 5118 */ 5119 uint64_t ackratio, moveratio, movetotal; 5120 5121 /* Log detecting */ 5122 rack_log_sad(rack, 1); 5123 ackratio = (uint64_t)(rack->r_ctl.sack_count); 5124 ackratio *= (uint64_t)(1000); 5125 if (rack->r_ctl.ack_count) 5126 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 5127 else { 5128 /* We really should not hit here */ 5129 ackratio = 1000; 5130 } 5131 if ((rack->sack_attack_disable == 0) && 5132 (ackratio > rack_highest_sack_thresh_seen)) 5133 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 5134 movetotal = rack->r_ctl.sack_moved_extra; 5135 movetotal += rack->r_ctl.sack_noextra_move; 5136 moveratio = rack->r_ctl.sack_moved_extra; 5137 moveratio *= (uint64_t)1000; 5138 if (movetotal) 5139 moveratio /= movetotal; 5140 else { 5141 /* No moves, thats pretty good */ 5142 moveratio = 0; 5143 } 5144 if ((rack->sack_attack_disable == 0) && 5145 (moveratio > rack_highest_move_thresh_seen)) 5146 rack_highest_move_thresh_seen = (uint32_t)moveratio; 5147 if (rack->sack_attack_disable == 0) { 5148 if ((ackratio > tcp_sack_to_ack_thresh) && 5149 (moveratio > tcp_sack_to_move_thresh)) { 5150 /* Disable sack processing */ 5151 rack->sack_attack_disable = 1; 5152 if (rack->r_rep_attack == 0) { 5153 rack->r_rep_attack = 1; 5154 counter_u64_add(rack_sack_attacks_detected, 1); 5155 } 5156 if (tcp_attack_on_turns_on_logging) { 5157 /* 5158 * Turn on logging, used for debugging 5159 * false positives. 
5160 */ 5161 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 5162 } 5163 /* Clamp the cwnd at flight size */ 5164 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 5165 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5166 rack_log_sad(rack, 2); 5167 } 5168 } else { 5169 /* We are sack-disabled check for false positives */ 5170 if ((ackratio <= tcp_restoral_thresh) || 5171 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 5172 rack->sack_attack_disable = 0; 5173 rack_log_sad(rack, 3); 5174 /* Restart counting */ 5175 rack->r_ctl.sack_count = 0; 5176 rack->r_ctl.sack_moved_extra = 0; 5177 rack->r_ctl.sack_noextra_move = 1; 5178 rack->r_ctl.ack_count = max(1, 5179 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 5180 5181 if (rack->r_rep_reverse == 0) { 5182 rack->r_rep_reverse = 1; 5183 counter_u64_add(rack_sack_attacks_reversed, 1); 5184 } 5185 /* Restore the cwnd */ 5186 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 5187 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 5188 } 5189 } 5190 } 5191 #endif 5192 if (changed) { 5193 /* Something changed cancel the rack timer */ 5194 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5195 } 5196 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 5197 /* 5198 * Ok we have a high probability that we need to go in to 5199 * recovery since we have data sack'd 5200 */ 5201 struct rack_sendmap *rsm; 5202 uint32_t tsused; 5203 5204 tsused = tcp_ts_getticks(); 5205 rsm = tcp_rack_output(tp, rack, tsused); 5206 if (rsm) { 5207 /* Enter recovery */ 5208 rack->r_ctl.rc_rsm_start = rsm->r_start; 5209 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 5210 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 5211 entered_recovery = 1; 5212 rack_cong_signal(tp, NULL, CC_NDUPACK); 5213 /* 5214 * When we enter recovery we need to assure we send 5215 * one packet. 
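* For illustration (hypothetical numbers): prr_sndcnt is seeded below with one fixed MSS (e.g. 1448 bytes) so the timer override forces a segment out right away; on later ACKs in recovery the PRR block below scales this as roughly prr_delivered * ssthresh / recovery_fs minus prr_out, so with an ssthresh of 10 segments and 20 segments in flight at the start of recovery about one new segment is released for every two segments delivered.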
5216 */ 5217 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5218 rack_log_to_prr(rack, 8); 5219 rack->r_timer_override = 1; 5220 } 5221 } 5222 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 5223 /* Deal with changed and PRR here (in recovery only) */ 5224 uint32_t pipe, snd_una; 5225 5226 rack->r_ctl.rc_prr_delivered += changed; 5227 /* Compute prr_sndcnt */ 5228 if (SEQ_GT(tp->snd_una, th_ack)) { 5229 snd_una = tp->snd_una; 5230 } else { 5231 snd_una = th_ack; 5232 } 5233 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 5234 if (pipe > tp->snd_ssthresh) { 5235 long sndcnt; 5236 5237 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 5238 if (rack->r_ctl.rc_prr_recovery_fs > 0) 5239 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 5240 else { 5241 rack->r_ctl.rc_prr_sndcnt = 0; 5242 rack_log_to_prr(rack, 9); 5243 sndcnt = 0; 5244 } 5245 sndcnt++; 5246 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 5247 sndcnt -= rack->r_ctl.rc_prr_out; 5248 else 5249 sndcnt = 0; 5250 rack->r_ctl.rc_prr_sndcnt = sndcnt; 5251 rack_log_to_prr(rack, 10); 5252 } else { 5253 uint32_t limit; 5254 5255 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 5256 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 5257 else 5258 limit = 0; 5259 if (changed > limit) 5260 limit = changed; 5261 limit += ctf_fixed_maxseg(tp); 5262 if (tp->snd_ssthresh > pipe) { 5263 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 5264 rack_log_to_prr(rack, 11); 5265 } else { 5266 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 5267 rack_log_to_prr(rack, 12); 5268 } 5269 } 5270 if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) { 5271 rack->r_timer_override = 1; 5272 } 5273 } 5274 } 5275 5276 static void 5277 rack_strike_dupack(struct tcp_rack *rack) 5278 { 5279 struct rack_sendmap *rsm; 5280 5281 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5282 if (rsm && (rsm->r_dupack < 0xff)) { 5283 rsm->r_dupack++; 5284 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 5285 rack->r_wanted_output = 1; 5286 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 5287 } else { 5288 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 5289 } 5290 } 5291 } 5292 5293 /* 5294 * Return value of 1, we do not need to call rack_process_data(). 5295 * return value of 0, rack_process_data can be called. 5296 * For ret_val if its 0 the TCP is locked, if its non-zero 5297 * its unlocked and probably unsafe to touch the TCB. 5298 */ 5299 static int 5300 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5301 struct tcpcb *tp, struct tcpopt *to, 5302 uint32_t tiwin, int32_t tlen, 5303 int32_t * ofia, int32_t thflags, int32_t * ret_val) 5304 { 5305 int32_t ourfinisacked = 0; 5306 int32_t nsegs, acked_amount; 5307 int32_t acked; 5308 struct mbuf *mfree; 5309 struct tcp_rack *rack; 5310 int32_t recovery = 0; 5311 5312 rack = (struct tcp_rack *)tp->t_fb_ptr; 5313 if (SEQ_GT(th->th_ack, tp->snd_max)) { 5314 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 5315 rack->r_wanted_output++; 5316 return (1); 5317 } 5318 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 5319 if (rack->rc_in_persist) 5320 tp->t_rxtshift = 0; 5321 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 5322 rack_strike_dupack(rack); 5323 rack_log_ack(tp, to, th); 5324 } 5325 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5326 /* 5327 * Old ack, behind (or duplicate to) the last one rcv'd 5328 * Note: Should mark reordering is occuring! 
We should also 5329 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 5330 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 5331 * retran and> ack 3 5332 */ 5333 return (0); 5334 } 5335 /* 5336 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 5337 * something we sent. 5338 */ 5339 if (tp->t_flags & TF_NEEDSYN) { 5340 /* 5341 * T/TCP: Connection was half-synchronized, and our SYN has 5342 * been ACK'd (so connection is now fully synchronized). Go 5343 * to non-starred state, increment snd_una for ACK of SYN, 5344 * and check if we can do window scaling. 5345 */ 5346 tp->t_flags &= ~TF_NEEDSYN; 5347 tp->snd_una++; 5348 /* Do window scaling? */ 5349 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5350 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5351 tp->rcv_scale = tp->request_r_scale; 5352 /* Send window already scaled. */ 5353 } 5354 } 5355 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5356 INP_WLOCK_ASSERT(tp->t_inpcb); 5357 5358 acked = BYTES_THIS_ACK(tp, th); 5359 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5360 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 5361 5362 /* 5363 * If we just performed our first retransmit, and the ACK arrives 5364 * within our recovery window, then it was a mistake to do the 5365 * retransmit in the first place. Recover our original cwnd and 5366 * ssthresh, and proceed to transmit where we left off. 5367 */ 5368 if (tp->t_flags & TF_PREVVALID) { 5369 tp->t_flags &= ~TF_PREVVALID; 5370 if (tp->t_rxtshift == 1 && 5371 (int)(ticks - tp->t_badrxtwin) < 0) 5372 rack_cong_signal(tp, th, CC_RTO_ERR); 5373 } 5374 /* 5375 * If we have a timestamp reply, update smoothed round trip time. If 5376 * no timestamp is present but transmit timer is running and timed 5377 * sequence number was acked, update smoothed round trip time. Since 5378 * we now have an rtt measurement, cancel the timer backoff (cf., 5379 * Phil Karn's retransmit alg.). Recompute the initial retransmit 5380 * timer. 5381 * 5382 * Some boxes send broken timestamp replies during the SYN+ACK 5383 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5384 * and blow up the retransmit timer. 5385 */ 5386 /* 5387 * If all outstanding data is acked, stop retransmit timer and 5388 * remember to restart (more output or persist). If there is more 5389 * data to be acked, restart retransmit timer, using current 5390 * (possibly backed-off) value. 5391 */ 5392 if (th->th_ack == tp->snd_max) { 5393 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5394 rack->r_wanted_output++; 5395 } 5396 if (acked == 0) { 5397 if (ofia) 5398 *ofia = ourfinisacked; 5399 return (0); 5400 } 5401 if (rack->r_ctl.rc_early_recovery) { 5402 if (IN_RECOVERY(tp->t_flags)) { 5403 if (SEQ_LT(th->th_ack, tp->snd_recover) && 5404 (SEQ_LT(th->th_ack, tp->snd_max))) { 5405 tcp_rack_partialack(tp, th); 5406 } else { 5407 rack_post_recovery(tp, th); 5408 recovery = 1; 5409 } 5410 } 5411 } 5412 /* 5413 * Let the congestion control algorithm update congestion control 5414 * related information. This typically means increasing the 5415 * congestion window. 5416 */ 5417 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 5418 SOCKBUF_LOCK(&so->so_snd); 5419 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 5420 tp->snd_wnd -= acked_amount; 5421 mfree = sbcut_locked(&so->so_snd, acked_amount); 5422 if ((sbused(&so->so_snd) == 0) && 5423 (acked > acked_amount) && 5424 (tp->t_state >= TCPS_FIN_WAIT_1)) { 5425 ourfinisacked = 1; 5426 } 5427 /* NB: sowwakeup_locked() does an implicit unlock. 
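* The lock released is the socket-buffer lock taken above, so no explicit SOCKBUF_UNLOCK() is needed here. Also worth noting (illustrative): acked can exceed acked_amount by one when the peer's ACK covers our FIN; in that case sbused() is zero and ourfinisacked was set above so the FIN_WAIT states can advance.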
*/ 5428 sowwakeup_locked(so); 5429 m_freem(mfree); 5430 if (rack->r_ctl.rc_early_recovery == 0) { 5431 if (IN_RECOVERY(tp->t_flags)) { 5432 if (SEQ_LT(th->th_ack, tp->snd_recover) && 5433 (SEQ_LT(th->th_ack, tp->snd_max))) { 5434 tcp_rack_partialack(tp, th); 5435 } else { 5436 rack_post_recovery(tp, th); 5437 } 5438 } 5439 } 5440 tp->snd_una = th->th_ack; 5441 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 5442 tp->snd_recover = tp->snd_una; 5443 5444 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 5445 tp->snd_nxt = tp->snd_una; 5446 } 5447 if (tp->snd_una == tp->snd_max) { 5448 /* Nothing left outstanding */ 5449 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5450 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 5451 tp->t_acktime = 0; 5452 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5453 /* Set need output so persist might get set */ 5454 rack->r_wanted_output++; 5455 if (rack_use_sack_filter) 5456 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5457 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 5458 (sbavail(&so->so_snd) == 0) && 5459 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 5460 /* 5461 * The socket was gone and the 5462 * peer sent data, time to 5463 * reset him. 5464 */ 5465 *ret_val = 1; 5466 tp = tcp_close(tp); 5467 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 5468 return (1); 5469 } 5470 } 5471 if (ofia) 5472 *ofia = ourfinisacked; 5473 return (0); 5474 } 5475 5476 static void 5477 rack_collapsed_window(struct tcp_rack *rack) 5478 { 5479 /* 5480 * Now we must walk the 5481 * send map and split off the 5482 * entries left stranded beyond 5483 * the collapsed window. These 5484 * cannot cause us to abort 5485 * the connection and are really 5486 * "unsent". However, a buggy 5487 * peer may actually have kept some 5488 * of that data, i.e. collapsed the 5489 * window, refused to ACK it, and then 5490 * reopened the window and ACKed that 5491 * data. In that case the simpler approach 5492 * of just pretending we never sent those 5493 * segments would lead to an ACK war, so it won't work. 5494 */ 5495 struct rack_sendmap *rsm, *nrsm, fe, *insret; 5496 tcp_seq max_seq; 5497 uint32_t maxseg; 5498 5499 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 5500 maxseg = ctf_fixed_maxseg(rack->rc_tp); 5501 memset(&fe, 0, sizeof(fe)); 5502 fe.r_start = max_seq; 5503 /* Find the first seq past or at maxseq */ 5504 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 5505 if (rsm == NULL) { 5506 /* Nothing to do, strange */ 5507 rack->rc_has_collapsed = 0; 5508 return; 5509 } 5510 /* 5511 * Now do we need to split at 5512 * the collapse point? 5513 */ 5514 if (SEQ_GT(max_seq, rsm->r_start)) { 5515 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 5516 if (nrsm == NULL) { 5517 /* We can't get a rsm, mark all?
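* If the allocation fails we simply fall through and flag from this whole rsm onward without splitting. For illustration (hypothetical numbers): with snd_una = 1000 and snd_wnd = 5000, max_seq is 6000; an rsm covering 5500-7000 is normally cloned at 6000 so that only 6000-7000, and everything after it, gets flagged RACK_RWND_COLLAPSED below.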
*/ 5518 nrsm = rsm; 5519 goto no_split; 5520 } 5521 /* Clone it */ 5522 rack_clone_rsm(rack, nrsm, rsm, max_seq); 5523 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5524 #ifdef INVARIANTS 5525 if (insret != NULL) { 5526 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5527 nrsm, insret, rack, rsm); 5528 } 5529 #endif 5530 if (rsm->r_in_tmap) { 5531 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5532 nrsm->r_in_tmap = 1; 5533 } 5534 /* 5535 * Set in the new RSM as the 5536 * collapsed starting point 5537 */ 5538 rsm = nrsm; 5539 } 5540 no_split: 5541 counter_u64_add(rack_collapsed_win, 1); 5542 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 5543 nrsm->r_flags |= RACK_RWND_COLLAPSED; 5544 rack->rc_has_collapsed = 1; 5545 } 5546 } 5547 5548 static void 5549 rack_un_collapse_window(struct tcp_rack *rack) 5550 { 5551 struct rack_sendmap *rsm; 5552 5553 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5554 if (rsm->r_flags & RACK_RWND_COLLAPSED) 5555 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 5556 else 5557 break; 5558 } 5559 rack->rc_has_collapsed = 0; 5560 } 5561 5562 /* 5563 * Return value of 1, the TCB is unlocked and most 5564 * likely gone, return value of 0, the TCP is still 5565 * locked. 5566 */ 5567 static int 5568 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 5569 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 5570 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5571 { 5572 /* 5573 * Update window information. Don't look at window if no ACK: TAC's 5574 * send garbage on first SYN. 5575 */ 5576 int32_t nsegs; 5577 int32_t tfo_syn; 5578 struct tcp_rack *rack; 5579 5580 rack = (struct tcp_rack *)tp->t_fb_ptr; 5581 INP_WLOCK_ASSERT(tp->t_inpcb); 5582 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5583 if ((thflags & TH_ACK) && 5584 (SEQ_LT(tp->snd_wl1, th->th_seq) || 5585 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 5586 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 5587 /* keep track of pure window updates */ 5588 if (tlen == 0 && 5589 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 5590 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 5591 tp->snd_wnd = tiwin; 5592 tp->snd_wl1 = th->th_seq; 5593 tp->snd_wl2 = th->th_ack; 5594 if (tp->snd_wnd > tp->max_sndwnd) 5595 tp->max_sndwnd = tp->snd_wnd; 5596 rack->r_wanted_output++; 5597 } else if (thflags & TH_ACK) { 5598 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 5599 tp->snd_wnd = tiwin; 5600 tp->snd_wl1 = th->th_seq; 5601 tp->snd_wl2 = th->th_ack; 5602 } 5603 } 5604 if (tp->snd_wnd < ctf_outstanding(tp)) 5605 /* The peer collapsed the window */ 5606 rack_collapsed_window(rack); 5607 else if (rack->rc_has_collapsed) 5608 rack_un_collapse_window(rack); 5609 /* Was persist timer active and now we have window space? */ 5610 if ((rack->rc_in_persist != 0) && 5611 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 5612 rack->r_ctl.rc_pace_min_segs))) { 5613 rack_exit_persist(tp, rack); 5614 tp->snd_nxt = tp->snd_max; 5615 /* Make sure we output to start the timer */ 5616 rack->r_wanted_output++; 5617 } 5618 /* Do we enter persists? 
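* For illustration (hypothetical numbers): with rc_pace_min_segs of two 1448-byte segments (2896) and a high-water receive window (rc_high_rwnd) of 64 kB, the threshold below is min(32768, 2896) = 2896 bytes, so persist is only entered once the peer's advertised window drops below that while nothing is outstanding and data is still queued to send.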
*/ 5619 if ((rack->rc_in_persist == 0) && 5620 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 5621 TCPS_HAVEESTABLISHED(tp->t_state) && 5622 (tp->snd_max == tp->snd_una) && 5623 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 5624 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 5625 /* 5626 * Here the rwnd is less than 5627 * the pacing size, we are established, 5628 * nothing is outstanding, and there is 5629 * data to send. Enter persists. 5630 */ 5631 tp->snd_nxt = tp->snd_una; 5632 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 5633 } 5634 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 5635 m_freem(m); 5636 return (0); 5637 } 5638 /* 5639 * Process segments with URG. 5640 */ 5641 if ((thflags & TH_URG) && th->th_urp && 5642 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 5643 /* 5644 * This is a kludge, but if we receive and accept random 5645 * urgent pointers, we'll crash in soreceive. It's hard to 5646 * imagine someone actually wanting to send this much urgent 5647 * data. 5648 */ 5649 SOCKBUF_LOCK(&so->so_rcv); 5650 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 5651 th->th_urp = 0; /* XXX */ 5652 thflags &= ~TH_URG; /* XXX */ 5653 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 5654 goto dodata; /* XXX */ 5655 } 5656 /* 5657 * If this segment advances the known urgent pointer, then 5658 * mark the data stream. This should not happen in 5659 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 5660 * FIN has been received from the remote side. In these 5661 * states we ignore the URG. 5662 * 5663 * According to RFC961 (Assigned Protocols), the urgent 5664 * pointer points to the last octet of urgent data. We 5665 * continue, however, to consider it to indicate the first 5666 * octet of data past the urgent section as the original 5667 * spec states (in one of two places). 5668 */ 5669 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 5670 tp->rcv_up = th->th_seq + th->th_urp; 5671 so->so_oobmark = sbavail(&so->so_rcv) + 5672 (tp->rcv_up - tp->rcv_nxt) - 1; 5673 if (so->so_oobmark == 0) 5674 so->so_rcv.sb_state |= SBS_RCVATMARK; 5675 sohasoutofband(so); 5676 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 5677 } 5678 SOCKBUF_UNLOCK(&so->so_rcv); 5679 /* 5680 * Remove out of band data so doesn't get presented to user. 5681 * This can happen independent of advancing the URG pointer, 5682 * but if two URG's are pending at once, some out-of-band 5683 * data may creep in... ick. 5684 */ 5685 if (th->th_urp <= (uint32_t) tlen && 5686 !(so->so_options & SO_OOBINLINE)) { 5687 /* hdr drop is delayed */ 5688 tcp_pulloutofband(so, th, m, drop_hdrlen); 5689 } 5690 } else { 5691 /* 5692 * If no out of band data is expected, pull receive urgent 5693 * pointer along with the receive window. 5694 */ 5695 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 5696 tp->rcv_up = tp->rcv_nxt; 5697 } 5698 dodata: /* XXX */ 5699 INP_WLOCK_ASSERT(tp->t_inpcb); 5700 5701 /* 5702 * Process the segment text, merging it into the TCP sequencing 5703 * queue, and arranging for acknowledgment of receipt if necessary. 5704 * This process logically involves adjusting tp->rcv_wnd as data is 5705 * presented to the user (this happens in tcp_usrreq.c, case 5706 * PRU_RCVD). If a FIN has already been received on this connection 5707 * then we just ignore the text. 
5708 */ 5709 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 5710 IS_FASTOPEN(tp->t_flags)); 5711 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 5712 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 5713 tcp_seq save_start = th->th_seq; 5714 tcp_seq save_rnxt = tp->rcv_nxt; 5715 int save_tlen = tlen; 5716 5717 m_adj(m, drop_hdrlen); /* delayed header drop */ 5718 /* 5719 * Insert segment which includes th into TCP reassembly 5720 * queue with control block tp. Set thflags to whether 5721 * reassembly now includes a segment with FIN. This handles 5722 * the common case inline (segment is the next to be 5723 * received on an established connection, and the queue is 5724 * empty), avoiding linkage into and removal from the queue 5725 * and repetition of various conversions. Set DELACK for 5726 * segments received in order, but ack immediately when 5727 * segments are out of order (so fast retransmit can work). 5728 */ 5729 if (th->th_seq == tp->rcv_nxt && 5730 SEGQ_EMPTY(tp) && 5731 (TCPS_HAVEESTABLISHED(tp->t_state) || 5732 tfo_syn)) { 5733 #ifdef NETFLIX_SB_LIMITS 5734 u_int mcnt, appended; 5735 5736 if (so->so_rcv.sb_shlim) { 5737 mcnt = m_memcnt(m); 5738 appended = 0; 5739 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 5740 CFO_NOSLEEP, NULL) == false) { 5741 counter_u64_add(tcp_sb_shlim_fails, 1); 5742 m_freem(m); 5743 return (0); 5744 } 5745 } 5746 #endif 5747 if (DELAY_ACK(tp, tlen) || tfo_syn) { 5748 rack_timer_cancel(tp, rack, 5749 rack->r_ctl.rc_rcvtime, __LINE__); 5750 tp->t_flags |= TF_DELACK; 5751 } else { 5752 rack->r_wanted_output++; 5753 tp->t_flags |= TF_ACKNOW; 5754 } 5755 tp->rcv_nxt += tlen; 5756 thflags = th->th_flags & TH_FIN; 5757 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 5758 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 5759 SOCKBUF_LOCK(&so->so_rcv); 5760 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5761 m_freem(m); 5762 } else 5763 #ifdef NETFLIX_SB_LIMITS 5764 appended = 5765 #endif 5766 sbappendstream_locked(&so->so_rcv, m, 0); 5767 /* NB: sorwakeup_locked() does an implicit unlock. */ 5768 sorwakeup_locked(so); 5769 #ifdef NETFLIX_SB_LIMITS 5770 if (so->so_rcv.sb_shlim && appended != mcnt) 5771 counter_fo_release(so->so_rcv.sb_shlim, 5772 mcnt - appended); 5773 #endif 5774 } else { 5775 /* 5776 * XXX: Due to the header drop above "th" is 5777 * theoretically invalid by now. Fortunately 5778 * m_adj() doesn't actually frees any mbufs when 5779 * trimming from the head. 5780 */ 5781 tcp_seq temp = save_start; 5782 thflags = tcp_reass(tp, th, &temp, &tlen, m); 5783 tp->t_flags |= TF_ACKNOW; 5784 } 5785 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 5786 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 5787 /* 5788 * DSACK actually handled in the fastpath 5789 * above. 5790 */ 5791 tcp_update_sack_list(tp, save_start, 5792 save_start + save_tlen); 5793 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 5794 if ((tp->rcv_numsacks >= 1) && 5795 (tp->sackblks[0].end == save_start)) { 5796 /* 5797 * Partial overlap, recorded at todrop 5798 * above. 5799 */ 5800 tcp_update_sack_list(tp, 5801 tp->sackblks[0].start, 5802 tp->sackblks[0].end); 5803 } else { 5804 tcp_update_dsack_list(tp, save_start, 5805 save_start + save_tlen); 5806 } 5807 } else if (tlen >= save_tlen) { 5808 /* Update of sackblks. 
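* In other words (illustrative summary): duplicate or partially duplicate data that did not advance rcv_nxt is echoed back to the sender as a D-SACK block, so a sender that retransmitted spuriously can detect it (RFC 2883).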
*/ 5809 tcp_update_dsack_list(tp, save_start, 5810 save_start + save_tlen); 5811 } else if (tlen > 0) { 5812 tcp_update_dsack_list(tp, save_start, 5813 save_start + tlen); 5814 } 5815 } 5816 } else { 5817 m_freem(m); 5818 thflags &= ~TH_FIN; 5819 } 5820 5821 /* 5822 * If FIN is received ACK the FIN and let the user know that the 5823 * connection is closing. 5824 */ 5825 if (thflags & TH_FIN) { 5826 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 5827 socantrcvmore(so); 5828 /* 5829 * If connection is half-synchronized (ie NEEDSYN 5830 * flag on) then delay ACK, so it may be piggybacked 5831 * when SYN is sent. Otherwise, since we received a 5832 * FIN then no more input can be expected, send ACK 5833 * now. 5834 */ 5835 if (tp->t_flags & TF_NEEDSYN) { 5836 rack_timer_cancel(tp, rack, 5837 rack->r_ctl.rc_rcvtime, __LINE__); 5838 tp->t_flags |= TF_DELACK; 5839 } else { 5840 tp->t_flags |= TF_ACKNOW; 5841 } 5842 tp->rcv_nxt++; 5843 } 5844 switch (tp->t_state) { 5845 5846 /* 5847 * In SYN_RECEIVED and ESTABLISHED STATES enter the 5848 * CLOSE_WAIT state. 5849 */ 5850 case TCPS_SYN_RECEIVED: 5851 tp->t_starttime = ticks; 5852 /* FALLTHROUGH */ 5853 case TCPS_ESTABLISHED: 5854 rack_timer_cancel(tp, rack, 5855 rack->r_ctl.rc_rcvtime, __LINE__); 5856 tcp_state_change(tp, TCPS_CLOSE_WAIT); 5857 break; 5858 5859 /* 5860 * If still in FIN_WAIT_1 STATE FIN has not been 5861 * acked so enter the CLOSING state. 5862 */ 5863 case TCPS_FIN_WAIT_1: 5864 rack_timer_cancel(tp, rack, 5865 rack->r_ctl.rc_rcvtime, __LINE__); 5866 tcp_state_change(tp, TCPS_CLOSING); 5867 break; 5868 5869 /* 5870 * In FIN_WAIT_2 state enter the TIME_WAIT state, 5871 * starting the time-wait timer, turning off the 5872 * other standard timers. 5873 */ 5874 case TCPS_FIN_WAIT_2: 5875 rack_timer_cancel(tp, rack, 5876 rack->r_ctl.rc_rcvtime, __LINE__); 5877 tcp_twstart(tp); 5878 return (1); 5879 } 5880 } 5881 /* 5882 * Return any desired output. 5883 */ 5884 if ((tp->t_flags & TF_ACKNOW) || 5885 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 5886 rack->r_wanted_output++; 5887 } 5888 INP_WLOCK_ASSERT(tp->t_inpcb); 5889 return (0); 5890 } 5891 5892 /* 5893 * Here nothing is really faster, its just that we 5894 * have broken out the fast-data path also just like 5895 * the fast-ack. 5896 */ 5897 static int 5898 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 5899 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5900 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 5901 { 5902 int32_t nsegs; 5903 int32_t newsize = 0; /* automatic sockbuf scaling */ 5904 struct tcp_rack *rack; 5905 #ifdef NETFLIX_SB_LIMITS 5906 u_int mcnt, appended; 5907 #endif 5908 #ifdef TCPDEBUG 5909 /* 5910 * The size of tcp_saveipgen must be the size of the max ip header, 5911 * now IPv6. 5912 */ 5913 u_char tcp_saveipgen[IP6_HDR_LEN]; 5914 struct tcphdr tcp_savetcp; 5915 short ostate = 0; 5916 5917 #endif 5918 /* 5919 * If last ACK falls within this segment's sequence numbers, record 5920 * the timestamp. NOTE that the test is modified according to the 5921 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
5922 */ 5923 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 5924 return (0); 5925 } 5926 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5927 return (0); 5928 } 5929 if (tiwin && tiwin != tp->snd_wnd) { 5930 return (0); 5931 } 5932 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 5933 return (0); 5934 } 5935 if (__predict_false((to->to_flags & TOF_TS) && 5936 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 5937 return (0); 5938 } 5939 if (__predict_false((th->th_ack != tp->snd_una))) { 5940 return (0); 5941 } 5942 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 5943 return (0); 5944 } 5945 if ((to->to_flags & TOF_TS) != 0 && 5946 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5947 tp->ts_recent_age = tcp_ts_getticks(); 5948 tp->ts_recent = to->to_tsval; 5949 } 5950 rack = (struct tcp_rack *)tp->t_fb_ptr; 5951 /* 5952 * This is a pure, in-sequence data packet with nothing on the 5953 * reassembly queue and we have enough buffer space to take it. 5954 */ 5955 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5956 5957 #ifdef NETFLIX_SB_LIMITS 5958 if (so->so_rcv.sb_shlim) { 5959 mcnt = m_memcnt(m); 5960 appended = 0; 5961 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 5962 CFO_NOSLEEP, NULL) == false) { 5963 counter_u64_add(tcp_sb_shlim_fails, 1); 5964 m_freem(m); 5965 return (1); 5966 } 5967 } 5968 #endif 5969 /* Clean receiver SACK report if present */ 5970 if (tp->rcv_numsacks) 5971 tcp_clean_sackreport(tp); 5972 KMOD_TCPSTAT_INC(tcps_preddat); 5973 tp->rcv_nxt += tlen; 5974 /* 5975 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 5976 */ 5977 tp->snd_wl1 = th->th_seq; 5978 /* 5979 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 5980 */ 5981 tp->rcv_up = tp->rcv_nxt; 5982 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 5983 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 5984 #ifdef TCPDEBUG 5985 if (so->so_options & SO_DEBUG) 5986 tcp_trace(TA_INPUT, ostate, tp, 5987 (void *)tcp_saveipgen, &tcp_savetcp, 0); 5988 #endif 5989 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 5990 5991 /* Add data to socket buffer. */ 5992 SOCKBUF_LOCK(&so->so_rcv); 5993 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5994 m_freem(m); 5995 } else { 5996 /* 5997 * Set new socket buffer size. Give up when limit is 5998 * reached. 5999 */ 6000 if (newsize) 6001 if (!sbreserve_locked(&so->so_rcv, 6002 newsize, so, NULL)) 6003 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 6004 m_adj(m, drop_hdrlen); /* delayed header drop */ 6005 #ifdef NETFLIX_SB_LIMITS 6006 appended = 6007 #endif 6008 sbappendstream_locked(&so->so_rcv, m, 0); 6009 ctf_calc_rwin(so, tp); 6010 } 6011 /* NB: sorwakeup_locked() does an implicit unlock. */ 6012 sorwakeup_locked(so); 6013 #ifdef NETFLIX_SB_LIMITS 6014 if (so->so_rcv.sb_shlim && mcnt != appended) 6015 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 6016 #endif 6017 if (DELAY_ACK(tp, tlen)) { 6018 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6019 tp->t_flags |= TF_DELACK; 6020 } else { 6021 tp->t_flags |= TF_ACKNOW; 6022 rack->r_wanted_output++; 6023 } 6024 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 6025 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 6026 return (1); 6027 } 6028 6029 /* 6030 * This subfunction is used to try to highly optimize the 6031 * fast path. We again allow window updates that are 6032 * in sequence to remain in the fast-path. We also add 6033 * in the __predict's to attempt to help the compiler. 
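* For illustration: a segment carrying nothing but an in-window cumulative ACK for new data (no outstanding SACK holes, not in recovery, a non-zero and possibly updated window, no timestamp regression) is consumed entirely here.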
6034 * Note that if we return a 0, then we can *not* process 6035 * it and the caller should push the packet into the 6036 * slow-path. 6037 */ 6038 static int 6039 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6040 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6041 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts, uint8_t iptos) 6042 { 6043 int32_t acked; 6044 int32_t nsegs; 6045 6046 #ifdef TCPDEBUG 6047 /* 6048 * The size of tcp_saveipgen must be the size of the max ip header, 6049 * now IPv6. 6050 */ 6051 u_char tcp_saveipgen[IP6_HDR_LEN]; 6052 struct tcphdr tcp_savetcp; 6053 short ostate = 0; 6054 6055 #endif 6056 struct tcp_rack *rack; 6057 6058 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 6059 /* Old ack, behind (or duplicate to) the last one rcv'd */ 6060 return (0); 6061 } 6062 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 6063 /* Above what we have sent? */ 6064 return (0); 6065 } 6066 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 6067 /* We are retransmitting */ 6068 return (0); 6069 } 6070 if (__predict_false(tiwin == 0)) { 6071 /* zero window */ 6072 return (0); 6073 } 6074 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 6075 /* We need a SYN or a FIN, unlikely.. */ 6076 return (0); 6077 } 6078 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 6079 /* Timestamp is behind .. old ack with seq wrap? */ 6080 return (0); 6081 } 6082 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 6083 /* Still recovering */ 6084 return (0); 6085 } 6086 rack = (struct tcp_rack *)tp->t_fb_ptr; 6087 if (rack->r_ctl.rc_sacked) { 6088 /* We have sack holes on our scoreboard */ 6089 return (0); 6090 } 6091 /* Ok if we reach here, we can process a fast-ack */ 6092 nsegs = max(1, m->m_pkthdr.lro_nsegs); 6093 rack_log_ack(tp, to, th); 6094 /* 6095 * We made progress, clear the tlp 6096 * out flag so we could start a TLP 6097 * again. 6098 */ 6099 rack->r_ctl.rc_tlp_rtx_out = 0; 6100 /* Did the window get updated? */ 6101 if (tiwin != tp->snd_wnd) { 6102 tp->snd_wnd = tiwin; 6103 tp->snd_wl1 = th->th_seq; 6104 if (tp->snd_wnd > tp->max_sndwnd) 6105 tp->max_sndwnd = tp->snd_wnd; 6106 } 6107 /* Do we exit persists? */ 6108 if ((rack->rc_in_persist != 0) && 6109 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 6110 rack->r_ctl.rc_pace_min_segs))) { 6111 rack_exit_persist(tp, rack); 6112 } 6113 /* Do we enter persists? */ 6114 if ((rack->rc_in_persist == 0) && 6115 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 6116 TCPS_HAVEESTABLISHED(tp->t_state) && 6117 (tp->snd_max == tp->snd_una) && 6118 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 6119 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 6120 /* 6121 * Here the rwnd is less than 6122 * the pacing size, we are established, 6123 * nothing is outstanding, and there is 6124 * data to send. Enter persists. 6125 */ 6126 tp->snd_nxt = tp->snd_una; 6127 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 6128 } 6129 /* 6130 * If last ACK falls within this segment's sequence numbers, record 6131 * the timestamp. NOTE that the test is modified according to the 6132 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 6133 */ 6134 if ((to->to_flags & TOF_TS) != 0 && 6135 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 6136 tp->ts_recent_age = tcp_ts_getticks(); 6137 tp->ts_recent = to->to_tsval; 6138 } 6139 /* 6140 * This is a pure ack for outstanding data. 
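* Illustrative note: the number of newly acknowledged bytes is BYTES_THIS_ACK(tp, th), i.e. th_ack minus snd_una, and exactly that many bytes are dropped from the head of the send buffer by the sbdrop() below.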
6141 */ 6142 KMOD_TCPSTAT_INC(tcps_predack); 6143 6144 /* 6145 * "bad retransmit" recovery. 6146 */ 6147 if (tp->t_flags & TF_PREVVALID) { 6148 tp->t_flags &= ~TF_PREVVALID; 6149 if (tp->t_rxtshift == 1 && 6150 (int)(ticks - tp->t_badrxtwin) < 0) 6151 rack_cong_signal(tp, th, CC_RTO_ERR); 6152 } 6153 /* 6154 * Recalculate the transmit timer / rtt. 6155 * 6156 * Some boxes send broken timestamp replies during the SYN+ACK 6157 * phase, ignore timestamps of 0 or we could calculate a huge RTT 6158 * and blow up the retransmit timer. 6159 */ 6160 acked = BYTES_THIS_ACK(tp, th); 6161 6162 #ifdef TCP_HHOOK 6163 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 6164 hhook_run_tcp_est_in(tp, th, to); 6165 #endif 6166 6167 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 6168 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 6169 sbdrop(&so->so_snd, acked); 6170 /* 6171 * Let the congestion control algorithm update congestion control 6172 * related information. This typically means increasing the 6173 * congestion window. 6174 */ 6175 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 6176 6177 tp->snd_una = th->th_ack; 6178 if (tp->snd_wnd < ctf_outstanding(tp)) { 6179 /* The peer collapsed the window */ 6180 rack_collapsed_window(rack); 6181 } else if (rack->rc_has_collapsed) 6182 rack_un_collapse_window(rack); 6183 6184 /* 6185 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 6186 */ 6187 tp->snd_wl2 = th->th_ack; 6188 tp->t_dupacks = 0; 6189 m_freem(m); 6190 /* ND6_HINT(tp); *//* Some progress has been made. */ 6191 6192 /* 6193 * If all outstanding data are acked, stop retransmit timer, 6194 * otherwise restart timer using current (possibly backed-off) 6195 * value. If process is waiting for space, wakeup/selwakeup/signal. 6196 * If data are ready to send, let tcp_output decide between more 6197 * output or persist. 6198 */ 6199 #ifdef TCPDEBUG 6200 if (so->so_options & SO_DEBUG) 6201 tcp_trace(TA_INPUT, ostate, tp, 6202 (void *)tcp_saveipgen, 6203 &tcp_savetcp, 0); 6204 #endif 6205 if (tp->snd_una == tp->snd_max) { 6206 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 6207 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 6208 tp->t_acktime = 0; 6209 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6210 } 6211 /* Wake up the socket if we have room to write more */ 6212 sowwakeup(so); 6213 if (sbavail(&so->so_snd)) { 6214 rack->r_wanted_output++; 6215 } 6216 return (1); 6217 } 6218 6219 /* 6220 * Return value of 1, the TCB is unlocked and most 6221 * likely gone, return value of 0, the TCP is still 6222 * locked. 6223 */ 6224 static int 6225 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 6226 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6227 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t tos) 6228 { 6229 int32_t ret_val = 0; 6230 int32_t todrop; 6231 int32_t ourfinisacked = 0; 6232 struct tcp_rack *rack; 6233 6234 ctf_calc_rwin(so, tp); 6235 /* 6236 * If the state is SYN_SENT: if seg contains an ACK, but not for our 6237 * SYN, drop the input. if seg contains a RST, then drop the 6238 * connection. if seg does not contain SYN, then drop it. Otherwise 6239 * this is an acceptable SYN segment initialize tp->rcv_nxt and 6240 * tp->irs if seg contains ack then advance tp->snd_una if seg 6241 * contains an ECE and ECN support is enabled, the stream is ECN 6242 * capable. 
if SYN has been acked change to ESTABLISHED else 6243 * SYN_RCVD state arrange for segment to be acked (eventually) 6244 * continue processing rest of data/controls, beginning with URG 6245 */ 6246 if ((thflags & TH_ACK) && 6247 (SEQ_LEQ(th->th_ack, tp->iss) || 6248 SEQ_GT(th->th_ack, tp->snd_max))) { 6249 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6250 return (1); 6251 } 6252 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 6253 TCP_PROBE5(connect__refused, NULL, tp, 6254 mtod(m, const char *), tp, th); 6255 tp = tcp_drop(tp, ECONNREFUSED); 6256 ctf_do_drop(m, tp); 6257 return (1); 6258 } 6259 if (thflags & TH_RST) { 6260 ctf_do_drop(m, tp); 6261 return (1); 6262 } 6263 if (!(thflags & TH_SYN)) { 6264 ctf_do_drop(m, tp); 6265 return (1); 6266 } 6267 tp->irs = th->th_seq; 6268 tcp_rcvseqinit(tp); 6269 rack = (struct tcp_rack *)tp->t_fb_ptr; 6270 if (thflags & TH_ACK) { 6271 int tfo_partial = 0; 6272 6273 KMOD_TCPSTAT_INC(tcps_connects); 6274 soisconnected(so); 6275 #ifdef MAC 6276 mac_socketpeer_set_from_mbuf(m, so); 6277 #endif 6278 /* Do window scaling on this connection? */ 6279 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 6280 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 6281 tp->rcv_scale = tp->request_r_scale; 6282 } 6283 tp->rcv_adv += min(tp->rcv_wnd, 6284 TCP_MAXWIN << tp->rcv_scale); 6285 /* 6286 * If not all the data that was sent in the TFO SYN 6287 * has been acked, resend the remainder right away. 6288 */ 6289 if (IS_FASTOPEN(tp->t_flags) && 6290 (tp->snd_una != tp->snd_max)) { 6291 tp->snd_nxt = th->th_ack; 6292 tfo_partial = 1; 6293 } 6294 /* 6295 * If there's data, delay ACK; if there's also a FIN ACKNOW 6296 * will be turned on later. 6297 */ 6298 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { 6299 rack_timer_cancel(tp, rack, 6300 rack->r_ctl.rc_rcvtime, __LINE__); 6301 tp->t_flags |= TF_DELACK; 6302 } else { 6303 rack->r_wanted_output++; 6304 tp->t_flags |= TF_ACKNOW; 6305 } 6306 6307 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 6308 (V_tcp_do_ecn == 1)) { 6309 tp->t_flags2 |= TF2_ECN_PERMIT; 6310 KMOD_TCPSTAT_INC(tcps_ecn_shs); 6311 } 6312 if (SEQ_GT(th->th_ack, tp->snd_una)) { 6313 /* 6314 * We advance snd_una for the 6315 * fast open case. If th_ack is 6316 * acknowledging data beyond 6317 * snd_una we can't just call 6318 * ack-processing since the 6319 * data stream in our send-map 6320 * will start at snd_una + 1 (one 6321 * beyond the SYN). If its just 6322 * equal we don't need to do that 6323 * and there is no send_map. 6324 */ 6325 tp->snd_una++; 6326 } 6327 /* 6328 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 6329 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 6330 */ 6331 tp->t_starttime = ticks; 6332 if (tp->t_flags & TF_NEEDFIN) { 6333 tcp_state_change(tp, TCPS_FIN_WAIT_1); 6334 tp->t_flags &= ~TF_NEEDFIN; 6335 thflags &= ~TH_SYN; 6336 } else { 6337 tcp_state_change(tp, TCPS_ESTABLISHED); 6338 TCP_PROBE5(connect__established, NULL, tp, 6339 mtod(m, const char *), tp, th); 6340 cc_conn_init(tp); 6341 } 6342 } else { 6343 /* 6344 * Received initial SYN in SYN-SENT[*] state => simultaneous 6345 * open. If segment contains CC option and there is a 6346 * cached CC, apply TAO test. If it succeeds, connection is * 6347 * half-synchronized. Otherwise, do 3-way handshake: 6348 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 6349 * there was no CC option, clear cached CC value. 
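* (The T/TCP CC option handling described above is historical; in practice we simply mark the connection half-synchronized with TF_ACKNOW | TF_NEEDSYN and move to SYN_RECEIVED below.)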
6350 */ 6351 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 6352 tcp_state_change(tp, TCPS_SYN_RECEIVED); 6353 } 6354 INP_WLOCK_ASSERT(tp->t_inpcb); 6355 /* 6356 * Advance th->th_seq to correspond to first data byte. If data, 6357 * trim to stay within window, dropping FIN if necessary. 6358 */ 6359 th->th_seq++; 6360 if (tlen > tp->rcv_wnd) { 6361 todrop = tlen - tp->rcv_wnd; 6362 m_adj(m, -todrop); 6363 tlen = tp->rcv_wnd; 6364 thflags &= ~TH_FIN; 6365 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 6366 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 6367 } 6368 tp->snd_wl1 = th->th_seq - 1; 6369 tp->rcv_up = th->th_seq; 6370 /* 6371 * Client side of transaction: already sent SYN and data. If the 6372 * remote host used T/TCP to validate the SYN, our data will be 6373 * ACK'd; if so, enter normal data segment processing in the middle 6374 * of step 5, ack processing. Otherwise, goto step 6. 6375 */ 6376 if (thflags & TH_ACK) { 6377 /* For syn-sent we need to possibly update the rtt */ 6378 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 6379 uint32_t t; 6380 6381 t = tcp_ts_getticks() - to->to_tsecr; 6382 if (!tp->t_rttlow || tp->t_rttlow > t) 6383 tp->t_rttlow = t; 6384 tcp_rack_xmit_timer(rack, t + 1); 6385 tcp_rack_xmit_timer_commit(rack, tp); 6386 } 6387 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 6388 return (ret_val); 6389 /* We may have changed to FIN_WAIT_1 above */ 6390 if (tp->t_state == TCPS_FIN_WAIT_1) { 6391 /* 6392 * In FIN_WAIT_1 STATE in addition to the processing 6393 * for the ESTABLISHED state if our FIN is now 6394 * acknowledged then enter FIN_WAIT_2. 6395 */ 6396 if (ourfinisacked) { 6397 /* 6398 * If we can't receive any more data, then 6399 * closing user can proceed. Starting the 6400 * timer is contrary to the specification, 6401 * but if we don't get a FIN we'll hang 6402 * forever. 6403 * 6404 * XXXjl: we should release the tp also, and 6405 * use a compressed state. 6406 */ 6407 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 6408 soisdisconnected(so); 6409 tcp_timer_activate(tp, TT_2MSL, 6410 (tcp_fast_finwait2_recycle ? 6411 tcp_finwait2_timeout : 6412 TP_MAXIDLE(tp))); 6413 } 6414 tcp_state_change(tp, TCPS_FIN_WAIT_2); 6415 } 6416 } 6417 } 6418 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6419 tiwin, thflags, nxt_pkt)); 6420 } 6421 6422 /* 6423 * Return value of 1, the TCB is unlocked and most 6424 * likely gone, return value of 0, the TCP is still 6425 * locked. 6426 */ 6427 static int 6428 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 6429 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6430 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 6431 { 6432 struct tcp_rack *rack; 6433 int32_t ret_val = 0; 6434 int32_t ourfinisacked = 0; 6435 6436 ctf_calc_rwin(so, tp); 6437 if ((thflags & TH_ACK) && 6438 (SEQ_LEQ(th->th_ack, tp->snd_una) || 6439 SEQ_GT(th->th_ack, tp->snd_max))) { 6440 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6441 return (1); 6442 } 6443 rack = (struct tcp_rack *)tp->t_fb_ptr; 6444 if (IS_FASTOPEN(tp->t_flags)) { 6445 /* 6446 * When a TFO connection is in SYN_RECEIVED, the 6447 * only valid packets are the initial SYN, a 6448 * retransmit/copy of the initial SYN (possibly with 6449 * a subset of the original data), a valid ACK, a 6450 * FIN, or a RST. 
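* As handled below: a SYN|ACK is answered with a reset, and a bare retransmitted SYN is tolerated unless one of our retransmit, TLP or RACK timers is already pending, in which case it is simply dropped.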
6451 */ 6452 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 6453 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6454 return (1); 6455 } else if (thflags & TH_SYN) { 6456 /* non-initial SYN is ignored */ 6457 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 6458 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 6459 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 6460 ctf_do_drop(m, NULL); 6461 return (0); 6462 } 6463 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 6464 ctf_do_drop(m, NULL); 6465 return (0); 6466 } 6467 } 6468 if ((thflags & TH_RST) || 6469 (tp->t_fin_is_rst && (thflags & TH_FIN))) 6470 return (ctf_process_rst(m, th, so, tp)); 6471 /* 6472 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6473 * it's less than ts_recent, drop it. 6474 */ 6475 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6476 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6477 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6478 return (ret_val); 6479 } 6480 /* 6481 * In the SYN-RECEIVED state, validate that the packet belongs to 6482 * this connection before trimming the data to fit the receive 6483 * window. Check the sequence number versus IRS since we know the 6484 * sequence numbers haven't wrapped. This is a partial fix for the 6485 * "LAND" DoS attack. 6486 */ 6487 if (SEQ_LT(th->th_seq, tp->irs)) { 6488 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6489 return (1); 6490 } 6491 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6492 return (ret_val); 6493 } 6494 /* 6495 * If last ACK falls within this segment's sequence numbers, record 6496 * its timestamp. NOTE: 1) That the test incorporates suggestions 6497 * from the latest proposal of the tcplw@cray.com list (Braden 6498 * 1993/04/26). 2) That updating only on newer timestamps interferes 6499 * with our earlier PAWS tests, so this check should be solely 6500 * predicated on the sequence space of this segment. 3) That we 6501 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6502 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6503 * SEG.Len, This modified check allows us to overcome RFC1323's 6504 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6505 * p.869. In such cases, we can still calculate the RTT correctly 6506 * when RCV.NXT == Last.ACK.Sent. 6507 */ 6508 if ((to->to_flags & TOF_TS) != 0 && 6509 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6510 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6511 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6512 tp->ts_recent_age = tcp_ts_getticks(); 6513 tp->ts_recent = to->to_tsval; 6514 } 6515 tp->snd_wnd = tiwin; 6516 /* 6517 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6518 * is on (half-synchronized state), then queue data for later 6519 * processing; else drop segment and return. 6520 */ 6521 if ((thflags & TH_ACK) == 0) { 6522 if (IS_FASTOPEN(tp->t_flags)) { 6523 cc_conn_init(tp); 6524 } 6525 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6526 tiwin, thflags, nxt_pkt)); 6527 } 6528 KMOD_TCPSTAT_INC(tcps_connects); 6529 soisconnected(so); 6530 /* Do window scaling? 
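* Only if both sides negotiated it, i.e. TF_REQ_SCALE (we asked for it) and TF_RCVD_SCALE (the peer's SYN carried the option) are both set; otherwise rcv_scale stays at its unscaled default.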
*/ 6531 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 6532 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 6533 tp->rcv_scale = tp->request_r_scale; 6534 } 6535 /* 6536 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 6537 * FIN-WAIT-1 6538 */ 6539 tp->t_starttime = ticks; 6540 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 6541 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 6542 tp->t_tfo_pending = NULL; 6543 6544 /* 6545 * Account for the ACK of our SYN prior to 6546 * regular ACK processing below. 6547 */ 6548 tp->snd_una++; 6549 } 6550 if (tp->t_flags & TF_NEEDFIN) { 6551 tcp_state_change(tp, TCPS_FIN_WAIT_1); 6552 tp->t_flags &= ~TF_NEEDFIN; 6553 } else { 6554 tcp_state_change(tp, TCPS_ESTABLISHED); 6555 TCP_PROBE5(accept__established, NULL, tp, 6556 mtod(m, const char *), tp, th); 6557 /* 6558 * TFO connections call cc_conn_init() during SYN 6559 * processing. Calling it again here for such connections 6560 * is not harmless as it would undo the snd_cwnd reduction 6561 * that occurs when a TFO SYN|ACK is retransmitted. 6562 */ 6563 if (!IS_FASTOPEN(tp->t_flags)) 6564 cc_conn_init(tp); 6565 } 6566 /* 6567 * If segment contains data or ACK, will call tcp_reass() later; if 6568 * not, do so now to pass queued data to user. 6569 */ 6570 if (tlen == 0 && (thflags & TH_FIN) == 0) 6571 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 6572 (struct mbuf *)0); 6573 tp->snd_wl1 = th->th_seq - 1; 6574 /* For syn-recv we need to possibly update the rtt */ 6575 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 6576 uint32_t t; 6577 6578 t = tcp_ts_getticks() - to->to_tsecr; 6579 if (!tp->t_rttlow || tp->t_rttlow > t) 6580 tp->t_rttlow = t; 6581 tcp_rack_xmit_timer(rack, t + 1); 6582 tcp_rack_xmit_timer_commit(rack, tp); 6583 } 6584 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6585 return (ret_val); 6586 } 6587 if (tp->t_state == TCPS_FIN_WAIT_1) { 6588 /* We could have went to FIN_WAIT_1 (or EST) above */ 6589 /* 6590 * In FIN_WAIT_1 STATE in addition to the processing for the 6591 * ESTABLISHED state if our FIN is now acknowledged then 6592 * enter FIN_WAIT_2. 6593 */ 6594 if (ourfinisacked) { 6595 /* 6596 * If we can't receive any more data, then closing 6597 * user can proceed. Starting the timer is contrary 6598 * to the specification, but if we don't get a FIN 6599 * we'll hang forever. 6600 * 6601 * XXXjl: we should release the tp also, and use a 6602 * compressed state. 6603 */ 6604 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 6605 soisdisconnected(so); 6606 tcp_timer_activate(tp, TT_2MSL, 6607 (tcp_fast_finwait2_recycle ? 6608 tcp_finwait2_timeout : 6609 TP_MAXIDLE(tp))); 6610 } 6611 tcp_state_change(tp, TCPS_FIN_WAIT_2); 6612 } 6613 } 6614 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6615 tiwin, thflags, nxt_pkt)); 6616 } 6617 6618 /* 6619 * Return value of 1, the TCB is unlocked and most 6620 * likely gone, return value of 0, the TCP is still 6621 * locked. 6622 */ 6623 static int 6624 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 6625 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6626 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 6627 { 6628 int32_t ret_val = 0; 6629 6630 /* 6631 * Header prediction: check for the two common cases of a 6632 * uni-directional data xfer. If the packet has no control flags, 6633 * is in-sequence, the window didn't change and we're not 6634 * retransmitting, it's a candidate. 
If the length is zero and the 6635 * ack moved forward, we're the sender side of the xfer. Just free 6636 * the data acked & wake any higher level process that was blocked 6637 * waiting for space. If the length is non-zero and the ack didn't 6638 * move, we're the receiver side. If we're getting packets in-order 6639 * (the reassembly queue is empty), add the data to the socket 6640 * buffer and note that we need a delayed ack. Make sure that the 6641 * hidden state-flags are also off. Since we check for 6642 * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN. 6643 */ 6644 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 6645 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 6646 __predict_true(SEGQ_EMPTY(tp)) && 6647 __predict_true(th->th_seq == tp->rcv_nxt)) { 6648 struct tcp_rack *rack; 6649 6650 rack = (struct tcp_rack *)tp->t_fb_ptr; 6651 if (tlen == 0) { 6652 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 6653 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime, iptos)) { 6654 return (0); 6655 } 6656 } else { 6657 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 6658 tiwin, nxt_pkt, iptos)) { 6659 return (0); 6660 } 6661 } 6662 } 6663 ctf_calc_rwin(so, tp); 6664 6665 if ((thflags & TH_RST) || 6666 (tp->t_fin_is_rst && (thflags & TH_FIN))) 6667 return (ctf_process_rst(m, th, so, tp)); 6668 6669 /* 6670 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6671 * synchronized state. 6672 */ 6673 if (thflags & TH_SYN) { 6674 ctf_challenge_ack(m, th, tp, &ret_val); 6675 return (ret_val); 6676 } 6677 /* 6678 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6679 * it's less than ts_recent, drop it. 6680 */ 6681 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6682 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6683 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6684 return (ret_val); 6685 } 6686 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6687 return (ret_val); 6688 } 6689 /* 6690 * If last ACK falls within this segment's sequence numbers, record 6691 * its timestamp. NOTE: 1) That the test incorporates suggestions 6692 * from the latest proposal of the tcplw@cray.com list (Braden 6693 * 1993/04/26). 2) That updating only on newer timestamps interferes 6694 * with our earlier PAWS tests, so this check should be solely 6695 * predicated on the sequence space of this segment. 3) That we 6696 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6697 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6698 * SEG.Len, This modified check allows us to overcome RFC1323's 6699 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6700 * p.869. In such cases, we can still calculate the RTT correctly 6701 * when RCV.NXT == Last.ACK.Sent. 6702 */ 6703 if ((to->to_flags & TOF_TS) != 0 && 6704 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6705 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6706 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6707 tp->ts_recent_age = tcp_ts_getticks(); 6708 tp->ts_recent = to->to_tsval; 6709 } 6710 /* 6711 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6712 * is on (half-synchronized state), then queue data for later 6713 * processing; else drop segment and return.
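* Concretely, in the ACK-bit-off handling below only the half-synchronized (TF_NEEDSYN) case hands the segment on to rack_process_data(); with TF_ACKNOW pending we answer with an ACK and drop it, and otherwise the segment is dropped outright.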
6714 */ 6715 if ((thflags & TH_ACK) == 0) { 6716 if (tp->t_flags & TF_NEEDSYN) { 6717 6718 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6719 tiwin, thflags, nxt_pkt)); 6720 6721 } else if (tp->t_flags & TF_ACKNOW) { 6722 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6723 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 6724 return (ret_val); 6725 } else { 6726 ctf_do_drop(m, NULL); 6727 return (0); 6728 } 6729 } 6730 /* 6731 * Ack processing. 6732 */ 6733 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 6734 return (ret_val); 6735 } 6736 if (sbavail(&so->so_snd)) { 6737 if (rack_progress_timeout_check(tp)) { 6738 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6739 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6740 return (1); 6741 } 6742 } 6743 /* State changes only happen in rack_process_data() */ 6744 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6745 tiwin, thflags, nxt_pkt)); 6746 } 6747 6748 /* 6749 * Return value of 1, the TCB is unlocked and most 6750 * likely gone, return value of 0, the TCP is still 6751 * locked. 6752 */ 6753 static int 6754 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 6755 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6756 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 6757 { 6758 int32_t ret_val = 0; 6759 6760 ctf_calc_rwin(so, tp); 6761 if ((thflags & TH_RST) || 6762 (tp->t_fin_is_rst && (thflags & TH_FIN))) 6763 return (ctf_process_rst(m, th, so, tp)); 6764 /* 6765 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6766 * synchronized state. 6767 */ 6768 if (thflags & TH_SYN) { 6769 ctf_challenge_ack(m, th, tp, &ret_val); 6770 return (ret_val); 6771 } 6772 /* 6773 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6774 * it's less than ts_recent, drop it. 6775 */ 6776 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6777 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6778 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6779 return (ret_val); 6780 } 6781 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6782 return (ret_val); 6783 } 6784 /* 6785 * If last ACK falls within this segment's sequence numbers, record 6786 * its timestamp. NOTE: 1) That the test incorporates suggestions 6787 * from the latest proposal of the tcplw@cray.com list (Braden 6788 * 1993/04/26). 2) That updating only on newer timestamps interferes 6789 * with our earlier PAWS tests, so this check should be solely 6790 * predicated on the sequence space of this segment. 3) That we 6791 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6792 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6793 * SEG.Len, This modified check allows us to overcome RFC1323's 6794 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6795 * p.869. In such cases, we can still calculate the RTT correctly 6796 * when RCV.NXT == Last.ACK.Sent. 6797 */ 6798 if ((to->to_flags & TOF_TS) != 0 && 6799 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6800 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6801 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6802 tp->ts_recent_age = tcp_ts_getticks(); 6803 tp->ts_recent = to->to_tsval; 6804 } 6805 /* 6806 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6807 * is on (half-synchronized state), then queue data for later 6808 * processing; else drop segment and return. 
6809 */ 6810 if ((thflags & TH_ACK) == 0) { 6811 if (tp->t_flags & TF_NEEDSYN) { 6812 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6813 tiwin, thflags, nxt_pkt)); 6814 6815 } else if (tp->t_flags & TF_ACKNOW) { 6816 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6817 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 6818 return (ret_val); 6819 } else { 6820 ctf_do_drop(m, NULL); 6821 return (0); 6822 } 6823 } 6824 /* 6825 * Ack processing. 6826 */ 6827 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 6828 return (ret_val); 6829 } 6830 if (sbavail(&so->so_snd)) { 6831 if (rack_progress_timeout_check(tp)) { 6832 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6833 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6834 return (1); 6835 } 6836 } 6837 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6838 tiwin, thflags, nxt_pkt)); 6839 } 6840 6841 static int 6842 rack_check_data_after_close(struct mbuf *m, 6843 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 6844 { 6845 struct tcp_rack *rack; 6846 6847 rack = (struct tcp_rack *)tp->t_fb_ptr; 6848 if (rack->rc_allow_data_af_clo == 0) { 6849 close_now: 6850 tp = tcp_close(tp); 6851 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 6852 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 6853 return (1); 6854 } 6855 if (sbavail(&so->so_snd) == 0) 6856 goto close_now; 6857 /* Ok we allow data that is ignored and a followup reset */ 6858 tp->rcv_nxt = th->th_seq + *tlen; 6859 tp->t_flags2 |= TF2_DROP_AF_DATA; 6860 rack->r_wanted_output = 1; 6861 *tlen = 0; 6862 return (0); 6863 } 6864 6865 /* 6866 * Return value of 1, the TCB is unlocked and most 6867 * likely gone, return value of 0, the TCP is still 6868 * locked. 6869 */ 6870 static int 6871 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 6872 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6873 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 6874 { 6875 int32_t ret_val = 0; 6876 int32_t ourfinisacked = 0; 6877 6878 ctf_calc_rwin(so, tp); 6879 6880 if ((thflags & TH_RST) || 6881 (tp->t_fin_is_rst && (thflags & TH_FIN))) 6882 return (ctf_process_rst(m, th, so, tp)); 6883 /* 6884 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6885 * synchronized state. 6886 */ 6887 if (thflags & TH_SYN) { 6888 ctf_challenge_ack(m, th, tp, &ret_val); 6889 return (ret_val); 6890 } 6891 /* 6892 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6893 * it's less than ts_recent, drop it. 6894 */ 6895 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6896 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6897 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6898 return (ret_val); 6899 } 6900 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6901 return (ret_val); 6902 } 6903 /* 6904 * If new data are received on a connection after the user processes 6905 * are gone, then RST the other end. 6906 */ 6907 if ((so->so_state & SS_NOFDREF) && tlen) { 6908 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 6909 return (1); 6910 } 6911 /* 6912 * If last ACK falls within this segment's sequence numbers, record 6913 * its timestamp. NOTE: 1) That the test incorporates suggestions 6914 * from the latest proposal of the tcplw@cray.com list (Braden 6915 * 1993/04/26). 
2) That updating only on newer timestamps interferes 6916 * with our earlier PAWS tests, so this check should be solely 6917 * predicated on the sequence space of this segment. 3) That we 6918 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6919 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6920 * SEG.Len, This modified check allows us to overcome RFC1323's 6921 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6922 * p.869. In such cases, we can still calculate the RTT correctly 6923 * when RCV.NXT == Last.ACK.Sent. 6924 */ 6925 if ((to->to_flags & TOF_TS) != 0 && 6926 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6927 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6928 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6929 tp->ts_recent_age = tcp_ts_getticks(); 6930 tp->ts_recent = to->to_tsval; 6931 } 6932 /* 6933 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6934 * is on (half-synchronized state), then queue data for later 6935 * processing; else drop segment and return. 6936 */ 6937 if ((thflags & TH_ACK) == 0) { 6938 if (tp->t_flags & TF_NEEDSYN) { 6939 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6940 tiwin, thflags, nxt_pkt)); 6941 } else if (tp->t_flags & TF_ACKNOW) { 6942 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6943 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 6944 return (ret_val); 6945 } else { 6946 ctf_do_drop(m, NULL); 6947 return (0); 6948 } 6949 } 6950 /* 6951 * Ack processing. 6952 */ 6953 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6954 return (ret_val); 6955 } 6956 if (ourfinisacked) { 6957 /* 6958 * If we can't receive any more data, then closing user can 6959 * proceed. Starting the timer is contrary to the 6960 * specification, but if we don't get a FIN we'll hang 6961 * forever. 6962 * 6963 * XXXjl: we should release the tp also, and use a 6964 * compressed state. 6965 */ 6966 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 6967 soisdisconnected(so); 6968 tcp_timer_activate(tp, TT_2MSL, 6969 (tcp_fast_finwait2_recycle ? 6970 tcp_finwait2_timeout : 6971 TP_MAXIDLE(tp))); 6972 } 6973 tcp_state_change(tp, TCPS_FIN_WAIT_2); 6974 } 6975 if (sbavail(&so->so_snd)) { 6976 if (rack_progress_timeout_check(tp)) { 6977 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6978 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6979 return (1); 6980 } 6981 } 6982 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6983 tiwin, thflags, nxt_pkt)); 6984 } 6985 6986 /* 6987 * Return value of 1, the TCB is unlocked and most 6988 * likely gone, return value of 0, the TCP is still 6989 * locked. 6990 */ 6991 static int 6992 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 6993 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6994 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 6995 { 6996 int32_t ret_val = 0; 6997 int32_t ourfinisacked = 0; 6998 6999 ctf_calc_rwin(so, tp); 7000 7001 if ((thflags & TH_RST) || 7002 (tp->t_fin_is_rst && (thflags & TH_FIN))) 7003 return (ctf_process_rst(m, th, so, tp)); 7004 /* 7005 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 7006 * synchronized state. 7007 */ 7008 if (thflags & TH_SYN) { 7009 ctf_challenge_ack(m, th, tp, &ret_val); 7010 return (ret_val); 7011 } 7012 /* 7013 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 7014 * it's less than ts_recent, drop it. 
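 * Illustrative example (assumed values, not taken from a trace): with
 * tp->ts_recent = 1000 and an arriving segment echoing to->to_tsval = 900,
 * TSTMP_LT() is true and ctf_ts_check() below decides whether the presumed
 * old duplicate is dropped (and, if need be, acked).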
7015 */ 7016 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 7017 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 7018 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 7019 return (ret_val); 7020 } 7021 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 7022 return (ret_val); 7023 } 7024 /* 7025 * If new data are received on a connection after the user processes 7026 * are gone, then RST the other end. 7027 */ 7028 if ((so->so_state & SS_NOFDREF) && tlen) { 7029 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 7030 return (1); 7031 } 7032 /* 7033 * If last ACK falls within this segment's sequence numbers, record 7034 * its timestamp. NOTE: 1) That the test incorporates suggestions 7035 * from the latest proposal of the tcplw@cray.com list (Braden 7036 * 1993/04/26). 2) That updating only on newer timestamps interferes 7037 * with our earlier PAWS tests, so this check should be solely 7038 * predicated on the sequence space of this segment. 3) That we 7039 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 7040 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 7041 * SEG.Len, This modified check allows us to overcome RFC1323's 7042 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 7043 * p.869. In such cases, we can still calculate the RTT correctly 7044 * when RCV.NXT == Last.ACK.Sent. 7045 */ 7046 if ((to->to_flags & TOF_TS) != 0 && 7047 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 7048 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 7049 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 7050 tp->ts_recent_age = tcp_ts_getticks(); 7051 tp->ts_recent = to->to_tsval; 7052 } 7053 /* 7054 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 7055 * is on (half-synchronized state), then queue data for later 7056 * processing; else drop segment and return. 7057 */ 7058 if ((thflags & TH_ACK) == 0) { 7059 if (tp->t_flags & TF_NEEDSYN) { 7060 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 7061 tiwin, thflags, nxt_pkt)); 7062 } else if (tp->t_flags & TF_ACKNOW) { 7063 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 7064 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 7065 return (ret_val); 7066 } else { 7067 ctf_do_drop(m, NULL); 7068 return (0); 7069 } 7070 } 7071 /* 7072 * Ack processing. 7073 */ 7074 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 7075 return (ret_val); 7076 } 7077 if (ourfinisacked) { 7078 tcp_twstart(tp); 7079 m_freem(m); 7080 return (1); 7081 } 7082 if (sbavail(&so->so_snd)) { 7083 if (rack_progress_timeout_check(tp)) { 7084 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 7085 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 7086 return (1); 7087 } 7088 } 7089 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 7090 tiwin, thflags, nxt_pkt)); 7091 } 7092 7093 /* 7094 * Return value of 1, the TCB is unlocked and most 7095 * likely gone, return value of 0, the TCP is still 7096 * locked. 
7097 */ 7098 static int 7099 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 7100 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 7101 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 7102 { 7103 int32_t ret_val = 0; 7104 int32_t ourfinisacked = 0; 7105 7106 ctf_calc_rwin(so, tp); 7107 7108 if ((thflags & TH_RST) || 7109 (tp->t_fin_is_rst && (thflags & TH_FIN))) 7110 return (ctf_process_rst(m, th, so, tp)); 7111 /* 7112 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 7113 * synchronized state. 7114 */ 7115 if (thflags & TH_SYN) { 7116 ctf_challenge_ack(m, th, tp, &ret_val); 7117 return (ret_val); 7118 } 7119 /* 7120 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 7121 * it's less than ts_recent, drop it. 7122 */ 7123 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 7124 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 7125 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 7126 return (ret_val); 7127 } 7128 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 7129 return (ret_val); 7130 } 7131 /* 7132 * If new data are received on a connection after the user processes 7133 * are gone, then RST the other end. 7134 */ 7135 if ((so->so_state & SS_NOFDREF) && tlen) { 7136 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 7137 return (1); 7138 } 7139 /* 7140 * If last ACK falls within this segment's sequence numbers, record 7141 * its timestamp. NOTE: 1) That the test incorporates suggestions 7142 * from the latest proposal of the tcplw@cray.com list (Braden 7143 * 1993/04/26). 2) That updating only on newer timestamps interferes 7144 * with our earlier PAWS tests, so this check should be solely 7145 * predicated on the sequence space of this segment. 3) That we 7146 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 7147 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 7148 * SEG.Len, This modified check allows us to overcome RFC1323's 7149 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 7150 * p.869. In such cases, we can still calculate the RTT correctly 7151 * when RCV.NXT == Last.ACK.Sent. 7152 */ 7153 if ((to->to_flags & TOF_TS) != 0 && 7154 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 7155 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 7156 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 7157 tp->ts_recent_age = tcp_ts_getticks(); 7158 tp->ts_recent = to->to_tsval; 7159 } 7160 /* 7161 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 7162 * is on (half-synchronized state), then queue data for later 7163 * processing; else drop segment and return. 7164 */ 7165 if ((thflags & TH_ACK) == 0) { 7166 if (tp->t_flags & TF_NEEDSYN) { 7167 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 7168 tiwin, thflags, nxt_pkt)); 7169 } else if (tp->t_flags & TF_ACKNOW) { 7170 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 7171 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 7172 return (ret_val); 7173 } else { 7174 ctf_do_drop(m, NULL); 7175 return (0); 7176 } 7177 } 7178 /* 7179 * case TCPS_LAST_ACK: Ack processing. 
7180 */ 7181 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 7182 return (ret_val); 7183 } 7184 if (ourfinisacked) { 7185 tp = tcp_close(tp); 7186 ctf_do_drop(m, tp); 7187 return (1); 7188 } 7189 if (sbavail(&so->so_snd)) { 7190 if (rack_progress_timeout_check(tp)) { 7191 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 7192 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 7193 return (1); 7194 } 7195 } 7196 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 7197 tiwin, thflags, nxt_pkt)); 7198 } 7199 7200 7201 /* 7202 * Return value of 1, the TCB is unlocked and most 7203 * likely gone, return value of 0, the TCP is still 7204 * locked. 7205 */ 7206 static int 7207 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 7208 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 7209 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 7210 { 7211 int32_t ret_val = 0; 7212 int32_t ourfinisacked = 0; 7213 7214 ctf_calc_rwin(so, tp); 7215 7216 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 7217 if ((thflags & TH_RST) || 7218 (tp->t_fin_is_rst && (thflags & TH_FIN))) 7219 return (ctf_process_rst(m, th, so, tp)); 7220 /* 7221 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 7222 * synchronized state. 7223 */ 7224 if (thflags & TH_SYN) { 7225 ctf_challenge_ack(m, th, tp, &ret_val); 7226 return (ret_val); 7227 } 7228 /* 7229 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 7230 * it's less than ts_recent, drop it. 7231 */ 7232 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 7233 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 7234 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 7235 return (ret_val); 7236 } 7237 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 7238 return (ret_val); 7239 } 7240 /* 7241 * If new data are received on a connection after the user processes 7242 * are gone, then RST the other end. 7243 */ 7244 if ((so->so_state & SS_NOFDREF) && 7245 tlen) { 7246 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 7247 return (1); 7248 } 7249 /* 7250 * If last ACK falls within this segment's sequence numbers, record 7251 * its timestamp. NOTE: 1) That the test incorporates suggestions 7252 * from the latest proposal of the tcplw@cray.com list (Braden 7253 * 1993/04/26). 2) That updating only on newer timestamps interferes 7254 * with our earlier PAWS tests, so this check should be solely 7255 * predicated on the sequence space of this segment. 3) That we 7256 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 7257 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 7258 * SEG.Len, This modified check allows us to overcome RFC1323's 7259 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 7260 * p.869. In such cases, we can still calculate the RTT correctly 7261 * when RCV.NXT == Last.ACK.Sent. 7262 */ 7263 if ((to->to_flags & TOF_TS) != 0 && 7264 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 7265 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 7266 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 7267 tp->ts_recent_age = tcp_ts_getticks(); 7268 tp->ts_recent = to->to_tsval; 7269 } 7270 /* 7271 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 7272 * is on (half-synchronized state), then queue data for later 7273 * processing; else drop segment and return. 
7274 */ 7275 if ((thflags & TH_ACK) == 0) { 7276 if (tp->t_flags & TF_NEEDSYN) { 7277 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 7278 tiwin, thflags, nxt_pkt)); 7279 } else if (tp->t_flags & TF_ACKNOW) { 7280 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 7281 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 7282 return (ret_val); 7283 } else { 7284 ctf_do_drop(m, NULL); 7285 return (0); 7286 } 7287 } 7288 /* 7289 * Ack processing. 7290 */ 7291 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 7292 return (ret_val); 7293 } 7294 if (sbavail(&so->so_snd)) { 7295 if (rack_progress_timeout_check(tp)) { 7296 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 7297 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 7298 return (1); 7299 } 7300 } 7301 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 7302 tiwin, thflags, nxt_pkt)); 7303 } 7304 7305 7306 static void inline 7307 rack_clear_rate_sample(struct tcp_rack *rack) 7308 { 7309 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 7310 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 7311 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 7312 } 7313 7314 static void 7315 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack) 7316 { 7317 uint32_t tls_seg = 0; 7318 7319 #ifdef KERN_TLS 7320 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 7321 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd); 7322 rack->r_ctl.rc_pace_min_segs = tls_seg; 7323 } else 7324 #endif 7325 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 7326 rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs; 7327 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) 7328 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 7329 #ifdef KERN_TLS 7330 if (tls_seg != 0) { 7331 if (rack_hw_tls_max_seg > 1) { 7332 rack->r_ctl.rc_pace_max_segs /= tls_seg; 7333 if (rack_hw_tls_max_seg < rack->r_ctl.rc_pace_max_segs) 7334 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg; 7335 } else { 7336 rack->r_ctl.rc_pace_max_segs = 1; 7337 } 7338 if (rack->r_ctl.rc_pace_max_segs == 0) 7339 rack->r_ctl.rc_pace_max_segs = 1; 7340 rack->r_ctl.rc_pace_max_segs *= tls_seg; 7341 } 7342 #endif 7343 rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, 0, 2); 7344 } 7345 7346 static int 7347 rack_init(struct tcpcb *tp) 7348 { 7349 struct tcp_rack *rack = NULL; 7350 struct rack_sendmap *insret; 7351 7352 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 7353 if (tp->t_fb_ptr == NULL) { 7354 /* 7355 * We need to allocate memory but can't. The INP and INP_INFO 7356 * locks are held and they are recursive (this happens during setup).
So a 7357 * scheme to drop the locks fails :( 7358 * 7359 */ 7360 return (ENOMEM); 7361 } 7362 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 7363 7364 rack = (struct tcp_rack *)tp->t_fb_ptr; 7365 RB_INIT(&rack->r_ctl.rc_mtree); 7366 TAILQ_INIT(&rack->r_ctl.rc_free); 7367 TAILQ_INIT(&rack->r_ctl.rc_tmap); 7368 rack->rc_tp = tp; 7369 if (tp->t_inpcb) { 7370 rack->rc_inp = tp->t_inpcb; 7371 } 7372 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 7373 /* Probably not needed but lets be sure */ 7374 rack_clear_rate_sample(rack); 7375 rack->r_cpu = 0; 7376 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 7377 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 7378 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 7379 rack->rc_pace_reduce = rack_slot_reduction; 7380 if (use_rack_cheat) 7381 rack->use_rack_cheat = 1; 7382 if (V_tcp_delack_enabled) 7383 tp->t_delayed_ack = 1; 7384 else 7385 tp->t_delayed_ack = 0; 7386 rack->rc_pace_max_segs = rack_hptsi_segments; 7387 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 7388 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 7389 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 7390 rack->r_enforce_min_pace = rack_min_pace_time; 7391 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 7392 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 7393 rack->r_ctl.rc_early_recovery = rack_early_recovery; 7394 rack->rc_always_pace = rack_pace_every_seg; 7395 rack_set_pace_segments(tp, rack); 7396 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 7397 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 7398 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 7399 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 7400 rack->r_ctl.rc_min_to = rack_min_to; 7401 rack->rack_per_of_gp = rack_per_of_gp; 7402 microuptime(&rack->r_ctl.rc_last_ack); 7403 rack->r_ctl.rc_last_time_decay = rack->r_ctl.rc_last_ack; 7404 rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks(); 7405 /* Do we force on detection? */ 7406 #ifdef NETFLIX_EXP_DETECTION 7407 if (tcp_force_detection) 7408 rack->do_detection = 1; 7409 else 7410 #endif 7411 rack->do_detection = 0; 7412 if (tp->snd_una != tp->snd_max) { 7413 /* Create a send map for the current outstanding data */ 7414 struct rack_sendmap *rsm; 7415 7416 rsm = rack_alloc(rack); 7417 if (rsm == NULL) { 7418 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 7419 tp->t_fb_ptr = NULL; 7420 return (ENOMEM); 7421 } 7422 rsm->r_flags = RACK_OVERMAX; 7423 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 7424 rsm->r_rtr_cnt = 1; 7425 rsm->r_rtr_bytes = 0; 7426 rsm->r_start = tp->snd_una; 7427 rsm->r_end = tp->snd_max; 7428 rsm->r_dupack = 0; 7429 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7430 #ifdef INVARIANTS 7431 if (insret != NULL) { 7432 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 7433 insret, rack, rsm); 7434 } 7435 #endif 7436 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7437 rsm->r_in_tmap = 1; 7438 } 7439 rack_stop_all_timers(tp); 7440 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 7441 return (0); 7442 } 7443 7444 static int 7445 rack_handoff_ok(struct tcpcb *tp) 7446 { 7447 if ((tp->t_state == TCPS_CLOSED) || 7448 (tp->t_state == TCPS_LISTEN)) { 7449 /* Sure no problem though it may not stick */ 7450 return (0); 7451 } 7452 if ((tp->t_state == TCPS_SYN_SENT) || 7453 (tp->t_state == TCPS_SYN_RECEIVED)) { 7454 /* 7455 * We really don't know you have to get to ESTAB or beyond 7456 * to tell. 
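 * Put differently, a sketch of the decision table implemented here:
 * CLOSED/LISTEN accept the handoff, SYN_SENT/SYN_RECEIVED get EAGAIN so
 * the caller can retry once the handshake settles, and a connection
 * without TF_SACK_PERMIT (unless rack_sack_not_required is set) gets
 * EINVAL below.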
7457 */ 7458 return (EAGAIN); 7459 } 7460 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 7461 return (0); 7462 } 7463 /* 7464 * If we reach here we don't do SACK on this connection so we can 7465 * never do rack. 7466 */ 7467 return (EINVAL); 7468 } 7469 7470 static void 7471 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 7472 { 7473 if (tp->t_fb_ptr) { 7474 struct tcp_rack *rack; 7475 struct rack_sendmap *rsm, *nrsm, *rm; 7476 if (tp->t_inpcb) { 7477 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 7478 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 7479 } 7480 rack = (struct tcp_rack *)tp->t_fb_ptr; 7481 #ifdef TCP_BLACKBOX 7482 tcp_log_flowend(tp); 7483 #endif 7484 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 7485 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7486 #ifdef INVARIANTS 7487 if (rm != rsm) { 7488 panic("At fini, rack:%p rsm:%p rm:%p", 7489 rack, rsm, rm); 7490 } 7491 #endif 7492 uma_zfree(rack_zone, rsm); 7493 } 7494 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 7495 while (rsm) { 7496 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 7497 uma_zfree(rack_zone, rsm); 7498 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 7499 } 7500 rack->rc_free_cnt = 0; 7501 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 7502 tp->t_fb_ptr = NULL; 7503 } 7504 /* Make sure snd_nxt is correctly set */ 7505 tp->snd_nxt = tp->snd_max; 7506 } 7507 7508 7509 static void 7510 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 7511 { 7512 switch (tp->t_state) { 7513 case TCPS_SYN_SENT: 7514 rack->r_state = TCPS_SYN_SENT; 7515 rack->r_substate = rack_do_syn_sent; 7516 break; 7517 case TCPS_SYN_RECEIVED: 7518 rack->r_state = TCPS_SYN_RECEIVED; 7519 rack->r_substate = rack_do_syn_recv; 7520 break; 7521 case TCPS_ESTABLISHED: 7522 rack_set_pace_segments(tp, rack); 7523 rack->r_state = TCPS_ESTABLISHED; 7524 rack->r_substate = rack_do_established; 7525 break; 7526 case TCPS_CLOSE_WAIT: 7527 rack->r_state = TCPS_CLOSE_WAIT; 7528 rack->r_substate = rack_do_close_wait; 7529 break; 7530 case TCPS_FIN_WAIT_1: 7531 rack->r_state = TCPS_FIN_WAIT_1; 7532 rack->r_substate = rack_do_fin_wait_1; 7533 break; 7534 case TCPS_CLOSING: 7535 rack->r_state = TCPS_CLOSING; 7536 rack->r_substate = rack_do_closing; 7537 break; 7538 case TCPS_LAST_ACK: 7539 rack->r_state = TCPS_LAST_ACK; 7540 rack->r_substate = rack_do_lastack; 7541 break; 7542 case TCPS_FIN_WAIT_2: 7543 rack->r_state = TCPS_FIN_WAIT_2; 7544 rack->r_substate = rack_do_fin_wait_2; 7545 break; 7546 case TCPS_LISTEN: 7547 case TCPS_CLOSED: 7548 case TCPS_TIME_WAIT: 7549 default: 7550 break; 7551 }; 7552 } 7553 7554 7555 static void 7556 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 7557 { 7558 /* 7559 * We received an ack, and then did not 7560 * call send or were bounced out due to the 7561 * hpts was running. Now a timer is up as well, is 7562 * it the right timer? 7563 */ 7564 struct rack_sendmap *rsm; 7565 int tmr_up; 7566 7567 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 7568 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 7569 return; 7570 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7571 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 7572 (tmr_up == PACE_TMR_RXT)) { 7573 /* Should be an RXT */ 7574 return; 7575 } 7576 if (rsm == NULL) { 7577 /* Nothing outstanding? 
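 * With nothing on the tmap the only timers we expect below are a delayed
 * ACK, an RXT left armed after something like ENOBUFS, or a keepalive
 * once everything is acked; any other pending timer is treated as stale
 * and rearmed at the bottom of this function.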
*/ 7578 if (tp->t_flags & TF_DELACK) { 7579 if (tmr_up == PACE_TMR_DELACK) 7580 /* We are supposed to have delayed ack up and we do */ 7581 return; 7582 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 7583 /* 7584 * if we hit enobufs then we would expect the possiblity 7585 * of nothing outstanding and the RXT up (and the hptsi timer). 7586 */ 7587 return; 7588 } else if (((V_tcp_always_keepalive || 7589 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 7590 (tp->t_state <= TCPS_CLOSING)) && 7591 (tmr_up == PACE_TMR_KEEP) && 7592 (tp->snd_max == tp->snd_una)) { 7593 /* We should have keep alive up and we do */ 7594 return; 7595 } 7596 } 7597 if (SEQ_GT(tp->snd_max, tp->snd_una) && 7598 ((tmr_up == PACE_TMR_TLP) || 7599 (tmr_up == PACE_TMR_RACK) || 7600 (tmr_up == PACE_TMR_RXT))) { 7601 /* 7602 * Either a Rack, TLP or RXT is fine if we 7603 * have outstanding data. 7604 */ 7605 return; 7606 } else if (tmr_up == PACE_TMR_DELACK) { 7607 /* 7608 * If the delayed ack was going to go off 7609 * before the rtx/tlp/rack timer were going to 7610 * expire, then that would be the timer in control. 7611 * Note we don't check the time here trusting the 7612 * code is correct. 7613 */ 7614 return; 7615 } 7616 /* 7617 * Ok the timer originally started is not what we want now. 7618 * We will force the hpts to be stopped if any, and restart 7619 * with the slot set to what was in the saved slot. 7620 */ 7621 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 7622 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 7623 } 7624 7625 static int 7626 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 7627 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 7628 int32_t nxt_pkt, struct timeval *tv) 7629 { 7630 int32_t thflags, retval, did_out = 0; 7631 int32_t way_out = 0; 7632 uint32_t cts; 7633 uint32_t tiwin; 7634 struct tcpopt to; 7635 struct tcp_rack *rack; 7636 struct rack_sendmap *rsm; 7637 int32_t prev_state = 0; 7638 7639 if (m->m_flags & M_TSTMP_LRO) { 7640 tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 7641 tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 7642 } 7643 cts = tcp_tv_to_mssectick(tv); 7644 rack = (struct tcp_rack *)tp->t_fb_ptr; 7645 7646 kern_prefetch(rack, &prev_state); 7647 prev_state = 0; 7648 thflags = th->th_flags; 7649 7650 NET_EPOCH_ASSERT(); 7651 INP_WLOCK_ASSERT(tp->t_inpcb); 7652 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 7653 __func__)); 7654 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 7655 __func__)); 7656 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 7657 union tcp_log_stackspecific log; 7658 struct timeval tv; 7659 7660 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 7661 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 7662 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 7663 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 7664 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 7665 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 7666 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 7667 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 7668 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 7669 tlen, &log, true, &tv); 7670 } 7671 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 7672 way_out = 4; 7673 retval = 0; 7674 goto done_with_input; 7675 } 7676 /* 7677 * If a segment with the ACK-bit set arrives in the SYN-SENT state 7678 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 
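 * Concretely, the check below drops with a reset when SEG.ACK <= ISS or
 * SEG.ACK > SND.MAX, i.e. the peer is acking something we never sent on
 * this connection attempt.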
7679 */ 7680 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 7681 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 7682 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 7683 return(1); 7684 } 7685 /* 7686 * Segment received on connection. Reset idle time and keep-alive 7687 * timer. XXX: This should be done after segment validation to 7688 * ignore broken/spoofed segs. 7689 */ 7690 if (tp->t_idle_reduce && 7691 (tp->snd_max == tp->snd_una) && 7692 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 7693 counter_u64_add(rack_input_idle_reduces, 1); 7694 rack_cc_after_idle(tp); 7695 } 7696 tp->t_rcvtime = ticks; 7697 7698 /* 7699 * Unscale the window into a 32-bit value. For the SYN_SENT state 7700 * the scale is zero. 7701 */ 7702 tiwin = th->th_win << tp->snd_scale; 7703 #ifdef STATS 7704 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 7705 #endif 7706 if (tiwin > rack->r_ctl.rc_high_rwnd) 7707 rack->r_ctl.rc_high_rwnd = tiwin; 7708 /* 7709 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 7710 * this to occur after we've validated the segment. 7711 */ 7712 if (tp->t_flags2 & TF2_ECN_PERMIT) { 7713 if (thflags & TH_CWR) { 7714 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 7715 tp->t_flags |= TF_ACKNOW; 7716 } 7717 switch (iptos & IPTOS_ECN_MASK) { 7718 case IPTOS_ECN_CE: 7719 tp->t_flags2 |= TF2_ECN_SND_ECE; 7720 KMOD_TCPSTAT_INC(tcps_ecn_ce); 7721 break; 7722 case IPTOS_ECN_ECT0: 7723 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 7724 break; 7725 case IPTOS_ECN_ECT1: 7726 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 7727 break; 7728 } 7729 7730 /* Process a packet differently from RFC3168. */ 7731 cc_ecnpkt_handler(tp, th, iptos); 7732 7733 /* Congestion experienced. */ 7734 if (thflags & TH_ECE) { 7735 rack_cong_signal(tp, th, CC_ECN); 7736 } 7737 } 7738 /* 7739 * Parse options on any incoming segment. 7740 */ 7741 tcp_dooptions(&to, (u_char *)(th + 1), 7742 (th->th_off << 2) - sizeof(struct tcphdr), 7743 (thflags & TH_SYN) ? TO_SYN : 0); 7744 7745 /* 7746 * If echoed timestamp is later than the current time, fall back to 7747 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 7748 * were used when this connection was established. 7749 */ 7750 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 7751 to.to_tsecr -= tp->ts_offset; 7752 if (TSTMP_GT(to.to_tsecr, cts)) 7753 to.to_tsecr = 0; 7754 } 7755 /* 7756 * If its the first time in we need to take care of options and 7757 * verify we can do SACK for rack! 7758 */ 7759 if (rack->r_state == 0) { 7760 /* Should be init'd by rack_init() */ 7761 KASSERT(rack->rc_inp != NULL, 7762 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 7763 if (rack->rc_inp == NULL) { 7764 rack->rc_inp = tp->t_inpcb; 7765 } 7766 7767 /* 7768 * Process options only when we get SYN/ACK back. The SYN 7769 * case for incoming connections is handled in tcp_syncache. 7770 * According to RFC1323 the window field in a SYN (i.e., a 7771 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 7772 * this is traditional behavior, may need to be cleaned up. 7773 */ 7774 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 7775 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 7776 if ((to.to_flags & TOF_SCALE) && 7777 (tp->t_flags & TF_REQ_SCALE)) { 7778 tp->t_flags |= TF_RCVD_SCALE; 7779 tp->snd_scale = to.to_wscale; 7780 } 7781 /* 7782 * Initial send window. It will be updated with the 7783 * next incoming segment to the scaled value. 
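 * The window field of the SYN itself is never scaled (RFC 1323), so
 * th_win is taken verbatim just below; e.g. (illustrative numbers) a
 * SYN|ACK carrying th_win = 65535 and wscale = 7 sets snd_wnd = 65535
 * now, and only later segments are shifted by snd_scale.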
7784 */ 7785 tp->snd_wnd = th->th_win; 7786 if (to.to_flags & TOF_TS) { 7787 tp->t_flags |= TF_RCVD_TSTMP; 7788 tp->ts_recent = to.to_tsval; 7789 tp->ts_recent_age = cts; 7790 } 7791 if (to.to_flags & TOF_MSS) 7792 tcp_mss(tp, to.to_mss); 7793 if ((tp->t_flags & TF_SACK_PERMIT) && 7794 (to.to_flags & TOF_SACKPERM) == 0) 7795 tp->t_flags &= ~TF_SACK_PERMIT; 7796 if (IS_FASTOPEN(tp->t_flags)) { 7797 if (to.to_flags & TOF_FASTOPEN) { 7798 uint16_t mss; 7799 7800 if (to.to_flags & TOF_MSS) 7801 mss = to.to_mss; 7802 else 7803 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 7804 mss = TCP6_MSS; 7805 else 7806 mss = TCP_MSS; 7807 tcp_fastopen_update_cache(tp, mss, 7808 to.to_tfo_len, to.to_tfo_cookie); 7809 } else 7810 tcp_fastopen_disable_path(tp); 7811 } 7812 } 7813 /* 7814 * At this point we are at the initial call. Here we decide 7815 * if we are doing RACK or not. We do this by seeing if 7816 * TF_SACK_PERMIT is set, if not rack is *not* possible and 7817 * we switch to the default code. 7818 */ 7819 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 7820 tcp_switch_back_to_default(tp); 7821 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 7822 tlen, iptos); 7823 return (1); 7824 } 7825 /* Set the flag */ 7826 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 7827 tcp_set_hpts(tp->t_inpcb); 7828 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 7829 } 7830 /* 7831 * This is the one exception case where we set the rack state 7832 * always. All other times (timers etc) we must have a rack-state 7833 * set (so we assure we have done the checks above for SACK). 7834 */ 7835 memcpy(&rack->r_ctl.rc_last_ack, tv, sizeof(struct timeval)); 7836 rack->r_ctl.rc_rcvtime = cts; 7837 if (rack->r_state != tp->t_state) 7838 rack_set_state(tp, rack); 7839 if (SEQ_GT(th->th_ack, tp->snd_una) && 7840 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 7841 kern_prefetch(rsm, &prev_state); 7842 prev_state = rack->r_state; 7843 rack->r_ctl.rc_tlp_send_cnt = 0; 7844 rack_clear_rate_sample(rack); 7845 retval = (*rack->r_substate) (m, th, so, 7846 tp, &to, drop_hdrlen, 7847 tlen, tiwin, thflags, nxt_pkt, iptos); 7848 #ifdef INVARIANTS 7849 if ((retval == 0) && 7850 (tp->t_inpcb == NULL)) { 7851 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 7852 retval, tp, prev_state); 7853 } 7854 #endif 7855 if (retval == 0) { 7856 /* 7857 * If retval is 1 the tcb is unlocked and most likely the tp 7858 * is gone. 7859 */ 7860 INP_WLOCK_ASSERT(tp->t_inpcb); 7861 if (rack->set_pacing_done_a_iw == 0) { 7862 /* How much has been acked? */ 7863 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 7864 /* We have enough to set in the pacing segment size */ 7865 rack->set_pacing_done_a_iw = 1; 7866 rack_set_pace_segments(tp, rack); 7867 } 7868 } 7869 tcp_rack_xmit_timer_commit(rack, tp); 7870 if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) { 7871 if (rack->r_wanted_output != 0) { 7872 did_out = 1; 7873 (void)tp->t_fb->tfb_tcp_output(tp); 7874 } 7875 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 7876 } 7877 if ((nxt_pkt == 0) && 7878 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 7879 (SEQ_GT(tp->snd_max, tp->snd_una) || 7880 (tp->t_flags & TF_DELACK) || 7881 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 7882 (tp->t_state <= TCPS_CLOSING)))) { 7883 /* We could not send (probably in the hpts but stopped the timer earlier)? 
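 * That is: no pacing/retransmit timer is armed, yet we still have
 * unacked data, a pending delayed ACK, or keepalive duty. Below we
 * either leave things to an already scheduled hpts output slot or pull
 * the inp off the wheel and rearm a timer.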
*/ 7884 if ((tp->snd_max == tp->snd_una) && 7885 ((tp->t_flags & TF_DELACK) == 0) && 7886 (rack->rc_inp->inp_in_hpts) && 7887 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 7888 /* keep alive not needed if we are hptsi output yet */ 7889 ; 7890 } else { 7891 if (rack->rc_inp->inp_in_hpts) { 7892 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 7893 counter_u64_add(rack_per_timer_hole, 1); 7894 } 7895 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 7896 } 7897 way_out = 1; 7898 } else if (nxt_pkt == 0) { 7899 /* Do we have the correct timer running? */ 7900 rack_timer_audit(tp, rack, &so->so_snd); 7901 way_out = 2; 7902 } 7903 done_with_input: 7904 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 7905 if (did_out) 7906 rack->r_wanted_output = 0; 7907 #ifdef INVARIANTS 7908 if (tp->t_inpcb == NULL) { 7909 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 7910 did_out, 7911 retval, tp, prev_state); 7912 } 7913 #endif 7914 } 7915 return (retval); 7916 } 7917 7918 void 7919 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 7920 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 7921 { 7922 struct timeval tv; 7923 7924 /* First lets see if we have old packets */ 7925 if (tp->t_in_pkt) { 7926 if (ctf_do_queued_segments(so, tp, 1)) { 7927 m_freem(m); 7928 return; 7929 } 7930 } 7931 if (m->m_flags & M_TSTMP_LRO) { 7932 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 7933 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 7934 } else { 7935 /* Should not be should we kassert instead? */ 7936 tcp_get_usecs(&tv); 7937 } 7938 if(rack_do_segment_nounlock(m, th, so, tp, 7939 drop_hdrlen, tlen, iptos, 0, &tv) == 0) 7940 INP_WUNLOCK(tp->t_inpcb); 7941 } 7942 7943 struct rack_sendmap * 7944 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 7945 { 7946 struct rack_sendmap *rsm = NULL; 7947 int32_t idx; 7948 uint32_t srtt = 0, thresh = 0, ts_low = 0; 7949 7950 /* Return the next guy to be re-transmitted */ 7951 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 7952 return (NULL); 7953 } 7954 if (tp->t_flags & TF_SENTFIN) { 7955 /* retran the end FIN? */ 7956 return (NULL); 7957 } 7958 /* ok lets look at this one */ 7959 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7960 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 7961 goto check_it; 7962 } 7963 rsm = rack_find_lowest_rsm(rack); 7964 if (rsm == NULL) { 7965 return (NULL); 7966 } 7967 check_it: 7968 if (rsm->r_flags & RACK_ACKED) { 7969 return (NULL); 7970 } 7971 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 7972 /* Its not yet ready */ 7973 return (NULL); 7974 } 7975 srtt = rack_grab_rtt(tp, rack); 7976 idx = rsm->r_rtr_cnt - 1; 7977 ts_low = rsm->r_tim_lastsent[idx]; 7978 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 7979 if ((tsused == ts_low) || 7980 (TSTMP_LT(tsused, ts_low))) { 7981 /* No time since sending */ 7982 return (NULL); 7983 } 7984 if ((tsused - ts_low) < thresh) { 7985 /* It has not been long enough yet */ 7986 return (NULL); 7987 } 7988 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 7989 ((rsm->r_flags & RACK_SACK_PASSED) && 7990 (rack->sack_attack_disable == 0))) { 7991 /* 7992 * We have passed the dup-ack threshold <or> 7993 * a SACK has indicated this is missing. 7994 * Note that if you are a declared attacker 7995 * it is only the dup-ack threshold that 7996 * will cause retransmits. 
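 * Illustrative timing (assumed numbers): if rack_calc_thresh_rack()
 * yields a thresh of ~50 ms, the segment is returned for retransmission
 * only once tsused - ts_low has reached that, and then only if r_dupack
 * has hit DUP_ACK_THRESHOLD or a SACK has passed it (RACK_SACK_PASSED
 * with sack_attack_disable clear).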
7997 */ 7998 /* log retransmit reason */ 7999 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 8000 return (rsm); 8001 } 8002 return (NULL); 8003 } 8004 8005 static int32_t 8006 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len) 8007 { 8008 int32_t slot = 0; 8009 8010 if ((rack->rack_per_of_gp == 0) || 8011 (rack->rc_always_pace == 0)) { 8012 /* 8013 * We use the most optimistic possible cwnd/srtt for 8014 * sending calculations. This will make our 8015 * calculation anticipate getting more through 8016 * quicker than possible. But that's OK; we don't want 8017 * the peer to have a gap in data sending. 8018 */ 8019 uint32_t srtt, cwnd, tr_perms = 0; 8020 8021 old_method: 8022 if (rack->r_ctl.rc_rack_min_rtt) 8023 srtt = rack->r_ctl.rc_rack_min_rtt; 8024 else 8025 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8026 if (rack->r_ctl.rc_rack_largest_cwnd) 8027 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8028 else 8029 cwnd = tp->snd_cwnd; 8030 tr_perms = cwnd / srtt; 8031 if (tr_perms == 0) { 8032 tr_perms = ctf_fixed_maxseg(tp); 8033 } 8034 /* 8035 * Calculate how long this will take to drain. If 8036 * the calculation comes out to zero, that's OK; we 8037 * will use send_a_lot to possibly spin around for 8038 * more, increasing tot_len_this_send to the point 8039 * that it is going to require a pace, or we hit the 8040 * cwnd, in which case we are just waiting for 8041 * an ACK. 8042 */ 8043 slot = len / tr_perms; 8044 /* Now do we reduce the time so we don't run dry? */ 8045 if (slot && rack->rc_pace_reduce) { 8046 int32_t reduce; 8047 8048 reduce = (slot / rack->rc_pace_reduce); 8049 if (reduce < slot) { 8050 slot -= reduce; 8051 } else 8052 slot = 0; 8053 } 8054 } else { 8055 int cnt; 8056 uint64_t bw_est, bw_raise, res, lentim; 8057 8058 bw_est = 0; 8059 for (cnt=0; cnt<RACK_GP_HIST; cnt++) { 8060 if ((rack->r_ctl.rc_gp_hist_filled == 0) && 8061 (rack->r_ctl.rc_gp_history[cnt] == 0)) 8062 break; 8063 bw_est += rack->r_ctl.rc_gp_history[cnt]; 8064 } 8065 if (bw_est == 0) { 8066 /* 8067 * No way yet to make a b/w estimate 8068 * (no goodput est yet). 8069 */ 8070 goto old_method; 8071 } 8072 /* Convert to bytes per second */ 8073 bw_est *= MSEC_IN_SECOND; 8074 /* 8075 * Now ratchet it up by our percentage. Note 8076 * that the minimum you can do is 1 which would 8077 * get you 101% of the average last N goodput estimates. 8078 * The max you can do is 256 which would yield you 8079 * 356% of the last N goodput estimates.
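 * Illustrative pacing arithmetic (assumed numbers, not measured): under
 * the old method above, cwnd = 64000 bytes and srtt = 40 ms give
 * tr_perms = 1600 bytes/ms, so a 16000 byte send gets slot = 10 ms,
 * trimmed to 8 ms when rc_pace_reduce = 4. Under this goodput-based
 * method the slot works out to roughly len * MSEC_IN_SECOND / bw_est
 * milliseconds once bw_est has been converted to bytes per second.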
8080 */ 8081 bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp; 8082 bw_est += bw_raise; 8083 /* average by the number we added */ 8084 bw_est /= cnt; 8085 /* Now calculate a rate based on this b/w */ 8086 lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND; 8087 res = lentim / bw_est; 8088 slot = (uint32_t)res; 8089 } 8090 if (rack->r_enforce_min_pace && 8091 (slot == 0)) { 8092 /* We are enforcing a minimum pace time of 1ms */ 8093 slot = rack->r_enforce_min_pace; 8094 } 8095 if (slot) 8096 counter_u64_add(rack_calc_nonzero, 1); 8097 else 8098 counter_u64_add(rack_calc_zero, 1); 8099 return (slot); 8100 } 8101 8102 static int 8103 rack_output(struct tcpcb *tp) 8104 { 8105 struct socket *so; 8106 uint32_t recwin, sendwin; 8107 uint32_t sb_offset; 8108 int32_t len, flags, error = 0; 8109 struct mbuf *m; 8110 struct mbuf *mb; 8111 uint32_t if_hw_tsomaxsegcount = 0; 8112 uint32_t if_hw_tsomaxsegsize = 0; 8113 int32_t maxseg; 8114 long tot_len_this_send = 0; 8115 struct ip *ip = NULL; 8116 #ifdef TCPDEBUG 8117 struct ipovly *ipov = NULL; 8118 #endif 8119 struct udphdr *udp = NULL; 8120 struct tcp_rack *rack; 8121 struct tcphdr *th; 8122 uint8_t pass = 0; 8123 uint8_t wanted_cookie = 0; 8124 u_char opt[TCP_MAXOLEN]; 8125 unsigned ipoptlen, optlen, hdrlen, ulen=0; 8126 uint32_t rack_seq; 8127 8128 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8129 unsigned ipsec_optlen = 0; 8130 8131 #endif 8132 int32_t idle, sendalot; 8133 int32_t sub_from_prr = 0; 8134 volatile int32_t sack_rxmit; 8135 struct rack_sendmap *rsm = NULL; 8136 int32_t tso, mtu; 8137 struct tcpopt to; 8138 int32_t slot = 0; 8139 int32_t sup_rack = 0; 8140 uint32_t cts; 8141 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 8142 int32_t do_a_prefetch; 8143 int32_t prefetch_rsm = 0; 8144 int force_tso = 0; 8145 int32_t orig_len; 8146 int32_t prefetch_so_done = 0; 8147 struct tcp_log_buffer *lgb = NULL; 8148 struct inpcb *inp; 8149 struct sockbuf *sb; 8150 #ifdef INET6 8151 struct ip6_hdr *ip6 = NULL; 8152 int32_t isipv6; 8153 #endif 8154 uint8_t filled_all = 0; 8155 bool hw_tls = false; 8156 8157 /* setup and take the cache hits here */ 8158 rack = (struct tcp_rack *)tp->t_fb_ptr; 8159 inp = rack->rc_inp; 8160 so = inp->inp_socket; 8161 sb = &so->so_snd; 8162 kern_prefetch(sb, &do_a_prefetch); 8163 do_a_prefetch = 1; 8164 8165 #ifdef KERN_TLS 8166 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 8167 #endif 8168 8169 NET_EPOCH_ASSERT(); 8170 INP_WLOCK_ASSERT(inp); 8171 8172 #ifdef TCP_OFFLOAD 8173 if (tp->t_flags & TF_TOE) 8174 return (tcp_offload_output(tp)); 8175 #endif 8176 maxseg = ctf_fixed_maxseg(tp); 8177 /* 8178 * For TFO connections in SYN_RECEIVED, only allow the initial 8179 * SYN|ACK and those sent by the retransmit timer. 8180 */ 8181 if (IS_FASTOPEN(tp->t_flags) && 8182 (tp->t_state == TCPS_SYN_RECEIVED) && 8183 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 8184 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 8185 return (0); 8186 #ifdef INET6 8187 if (rack->r_state) { 8188 /* Use the cache line loaded if possible */ 8189 isipv6 = rack->r_is_v6; 8190 } else { 8191 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 8192 } 8193 #endif 8194 cts = tcp_ts_getticks(); 8195 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 8196 inp->inp_in_hpts) { 8197 /* 8198 * We are on the hpts for some timer but not hptsi output. 8199 * Remove from the hpts unconditionally. 8200 */ 8201 rack_timer_cancel(tp, rack, cts, __LINE__); 8202 } 8203 /* Mark that we have called rack_output(). 
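 * Just below, a pending timer override, forced data, or a
 * pre-ESTABLISHED state pulls the inp off the hpts wheel so we can send
 * now; otherwise, if we are still queued on the hpts, we return and let
 * the pacer call us back.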
*/ 8204 if ((rack->r_timer_override) || 8205 (tp->t_flags & TF_FORCEDATA) || 8206 (tp->t_state < TCPS_ESTABLISHED)) { 8207 if (tp->t_inpcb->inp_in_hpts) 8208 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 8209 } else if (tp->t_inpcb->inp_in_hpts) { 8210 /* 8211 * On the hpts you can't pass even if ACKNOW is on, we will 8212 * when the hpts fires. 8213 */ 8214 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 8215 return (0); 8216 } 8217 hpts_calling = inp->inp_hpts_calls; 8218 inp->inp_hpts_calls = 0; 8219 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 8220 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 8221 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 8222 return (0); 8223 } 8224 } 8225 rack->r_wanted_output = 0; 8226 rack->r_timer_override = 0; 8227 /* 8228 * For TFO connections in SYN_SENT or SYN_RECEIVED, 8229 * only allow the initial SYN or SYN|ACK and those sent 8230 * by the retransmit timer. 8231 */ 8232 if (IS_FASTOPEN(tp->t_flags) && 8233 ((tp->t_state == TCPS_SYN_RECEIVED) || 8234 (tp->t_state == TCPS_SYN_SENT)) && 8235 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 8236 (tp->t_rxtshift == 0)) /* not a retransmit */ 8237 return (0); 8238 /* 8239 * Determine length of data that should be transmitted, and flags 8240 * that will be used. If there is some data or critical controls 8241 * (SYN, RST) to send, then transmit; otherwise, investigate 8242 * further. 8243 */ 8244 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 8245 if (tp->t_idle_reduce) { 8246 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 8247 rack_cc_after_idle(tp); 8248 } 8249 tp->t_flags &= ~TF_LASTIDLE; 8250 if (idle) { 8251 if (tp->t_flags & TF_MORETOCOME) { 8252 tp->t_flags |= TF_LASTIDLE; 8253 idle = 0; 8254 } 8255 } 8256 again: 8257 /* 8258 * If we've recently taken a timeout, snd_max will be greater than 8259 * snd_nxt. There may be SACK information that allows us to avoid 8260 * resending already delivered data. Adjust snd_nxt accordingly. 8261 */ 8262 sendalot = 0; 8263 cts = tcp_ts_getticks(); 8264 tso = 0; 8265 mtu = 0; 8266 sb_offset = tp->snd_max - tp->snd_una; 8267 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 8268 8269 flags = tcp_outflags[tp->t_state]; 8270 while (rack->rc_free_cnt < rack_free_cache) { 8271 rsm = rack_alloc(rack); 8272 if (rsm == NULL) { 8273 if (inp->inp_hpts_calls) 8274 /* Retry in a ms */ 8275 slot = 1; 8276 goto just_return_nolock; 8277 } 8278 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 8279 rack->rc_free_cnt++; 8280 rsm = NULL; 8281 } 8282 if (inp->inp_hpts_calls) 8283 inp->inp_hpts_calls = 0; 8284 sack_rxmit = 0; 8285 len = 0; 8286 rsm = NULL; 8287 if (flags & TH_RST) { 8288 SOCKBUF_LOCK(sb); 8289 goto send; 8290 } 8291 if (rack->r_ctl.rc_tlpsend) { 8292 /* Tail loss probe */ 8293 long cwin; 8294 long tlen; 8295 8296 doing_tlp = 1; 8297 /* 8298 * Check if we can do a TLP with a RACK'd packet 8299 * this can happen if we are not doing the rack 8300 * cheat and we skipped to a TLP and it 8301 * went off. 
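 * In that case tcp_rack_output() below may hand back the RACK candidate;
 * otherwise we fall back to rc_tlpsend, and the probe is clamped to one
 * fixed maxseg and to the peer's send window.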
8302 */ 8303 rsm = tcp_rack_output(tp, rack, cts); 8304 if (rsm == NULL) 8305 rsm = rack->r_ctl.rc_tlpsend; 8306 rack->r_ctl.rc_tlpsend = NULL; 8307 sack_rxmit = 1; 8308 tlen = rsm->r_end - rsm->r_start; 8309 if (tlen > ctf_fixed_maxseg(tp)) 8310 tlen = ctf_fixed_maxseg(tp); 8311 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 8312 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 8313 __func__, __LINE__, 8314 rsm->r_start, tp->snd_una, tp, rack, rsm)); 8315 sb_offset = rsm->r_start - tp->snd_una; 8316 cwin = min(tp->snd_wnd, tlen); 8317 len = cwin; 8318 } else if (rack->r_ctl.rc_resend) { 8319 /* Retransmit timer */ 8320 rsm = rack->r_ctl.rc_resend; 8321 rack->r_ctl.rc_resend = NULL; 8322 len = rsm->r_end - rsm->r_start; 8323 sack_rxmit = 1; 8324 sendalot = 0; 8325 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 8326 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 8327 __func__, __LINE__, 8328 rsm->r_start, tp->snd_una, tp, rack, rsm)); 8329 sb_offset = rsm->r_start - tp->snd_una; 8330 if (len >= ctf_fixed_maxseg(tp)) { 8331 len = ctf_fixed_maxseg(tp); 8332 } 8333 } else if ((rack->rc_in_persist == 0) && 8334 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 8335 int maxseg; 8336 8337 maxseg = ctf_fixed_maxseg(tp); 8338 if ((!IN_RECOVERY(tp->t_flags)) && 8339 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 8340 /* Enter recovery if not induced by a time-out */ 8341 rack->r_ctl.rc_rsm_start = rsm->r_start; 8342 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 8343 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 8344 rack_cong_signal(tp, NULL, CC_NDUPACK); 8345 /* 8346 * When we enter recovery we need to assure we send 8347 * one packet. 8348 */ 8349 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 8350 rack_log_to_prr(rack, 13); 8351 } 8352 #ifdef INVARIANTS 8353 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 8354 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 8355 tp, rack, rsm, rsm->r_start, tp->snd_una); 8356 } 8357 #endif 8358 len = rsm->r_end - rsm->r_start; 8359 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 8360 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 8361 __func__, __LINE__, 8362 rsm->r_start, tp->snd_una, tp, rack, rsm)); 8363 sb_offset = rsm->r_start - tp->snd_una; 8364 /* Can we send it within the PRR boundary? */ 8365 if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 8366 /* It does not fit */ 8367 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 8368 (rack->r_ctl.rc_prr_sndcnt < maxseg)) { 8369 /* 8370 * prr is less than a segment, we 8371 * have more acks due in besides 8372 * what we need to resend. Lets not send 8373 * to avoid sending small pieces of 8374 * what we need to retransmit. 8375 */ 8376 len = 0; 8377 goto just_return_nolock; 8378 } 8379 len = rack->r_ctl.rc_prr_sndcnt; 8380 } 8381 sendalot = 0; 8382 if (len >= maxseg) { 8383 len = maxseg; 8384 } 8385 if (len > 0) { 8386 sub_from_prr = 1; 8387 sack_rxmit = 1; 8388 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 8389 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 8390 min(len, ctf_fixed_maxseg(tp))); 8391 counter_u64_add(rack_rtm_prr_retran, 1); 8392 } 8393 } 8394 /* 8395 * Enforce a connection sendmap count limit if set 8396 * as long as we are not retransmiting. 
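 * Illustrative example (sysctl value assumed): with
 * V_tcp_map_entries_limit set to 1024, a connection already holding 1024
 * sendmap entries sends no new (non-retransmit) data until entries are
 * freed; the event is counted via rack_to_alloc_limited and reported
 * once per connection.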
8397 */ 8398 if ((rsm == NULL) && 8399 (rack->do_detection == 0) && 8400 (V_tcp_map_entries_limit > 0) && 8401 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 8402 counter_u64_add(rack_to_alloc_limited, 1); 8403 if (!rack->alloc_limit_reported) { 8404 rack->alloc_limit_reported = 1; 8405 counter_u64_add(rack_alloc_limited_conns, 1); 8406 } 8407 goto just_return_nolock; 8408 } 8409 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 8410 /* we are retransmitting the fin */ 8411 len--; 8412 if (len) { 8413 /* 8414 * When retransmitting data do *not* include the 8415 * FIN. This could happen from a TLP probe. 8416 */ 8417 flags &= ~TH_FIN; 8418 } 8419 } 8420 #ifdef INVARIANTS 8421 /* For debugging */ 8422 rack->r_ctl.rc_rsm_at_retran = rsm; 8423 #endif 8424 /* 8425 * Get standard flags, and add SYN or FIN if requested by 'hidden' 8426 * state flags. 8427 */ 8428 if (tp->t_flags & TF_NEEDFIN) 8429 flags |= TH_FIN; 8430 if (tp->t_flags & TF_NEEDSYN) 8431 flags |= TH_SYN; 8432 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 8433 void *end_rsm; 8434 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 8435 if (end_rsm) 8436 kern_prefetch(end_rsm, &prefetch_rsm); 8437 prefetch_rsm = 1; 8438 } 8439 SOCKBUF_LOCK(sb); 8440 /* 8441 * If in persist timeout with window of 0, send 1 byte. Otherwise, 8442 * if window is small but nonzero and time TF_SENTFIN expired, we 8443 * will send what we can and go to transmit state. 8444 */ 8445 if (tp->t_flags & TF_FORCEDATA) { 8446 if (sendwin == 0) { 8447 /* 8448 * If we still have some data to send, then clear 8449 * the FIN bit. Usually this would happen below 8450 * when it realizes that we aren't sending all the 8451 * data. However, if we have exactly 1 byte of 8452 * unsent data, then it won't clear the FIN bit 8453 * below, and if we are in persist state, we wind up 8454 * sending the packet without recording that we sent 8455 * the FIN bit. 8456 * 8457 * We can't just blindly clear the FIN bit, because 8458 * if we don't have any more data to send then the 8459 * probe will be the FIN itself. 8460 */ 8461 if (sb_offset < sbused(sb)) 8462 flags &= ~TH_FIN; 8463 sendwin = 1; 8464 } else { 8465 if ((rack->rc_in_persist != 0) && 8466 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8467 rack->r_ctl.rc_pace_min_segs))) 8468 rack_exit_persist(tp, rack); 8469 /* 8470 * If we are dropping persist mode then we need to 8471 * correct snd_nxt/snd_max and off. 8472 */ 8473 tp->snd_nxt = tp->snd_max; 8474 sb_offset = tp->snd_nxt - tp->snd_una; 8475 } 8476 } 8477 /* 8478 * If snd_nxt == snd_max and we have transmitted a FIN, the 8479 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 8480 * negative length. This can also occur when TCP opens up its 8481 * congestion window while receiving additional duplicate acks after 8482 * fast-retransmit because TCP will reset snd_nxt to snd_max after 8483 * the fast-retransmit. 8484 * 8485 * In the normal retransmit-FIN-only case, however, snd_nxt will be 8486 * set to snd_una, the sb_offset will be 0, and the length may wind 8487 * up 0. 8488 * 8489 * If sack_rxmit is true we are retransmitting from the scoreboard 8490 * in which case len is already set. 
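 * Otherwise len is derived from the socket buffer below, e.g.
 * (illustrative numbers) sendwin = 20000, sb_offset = 8000 and 30000
 * bytes available give len = 12000 (window limited); inside recovery the
 * new-data amount is additionally capped by rc_prr_sndcnt.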
8491 */ 8492 if (sack_rxmit == 0) { 8493 uint32_t avail; 8494 8495 avail = sbavail(sb); 8496 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 8497 sb_offset = tp->snd_nxt - tp->snd_una; 8498 else 8499 sb_offset = 0; 8500 if (IN_RECOVERY(tp->t_flags) == 0) { 8501 if (rack->r_ctl.rc_tlp_new_data) { 8502 /* TLP is forcing out new data */ 8503 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 8504 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 8505 } 8506 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 8507 len = tp->snd_wnd; 8508 else 8509 len = rack->r_ctl.rc_tlp_new_data; 8510 rack->r_ctl.rc_tlp_new_data = 0; 8511 new_data_tlp = doing_tlp = 1; 8512 } else { 8513 if (sendwin > avail) { 8514 /* use the available */ 8515 if (avail > sb_offset) { 8516 len = (int32_t)(avail - sb_offset); 8517 } else { 8518 len = 0; 8519 } 8520 } else { 8521 if (sendwin > sb_offset) { 8522 len = (int32_t)(sendwin - sb_offset); 8523 } else { 8524 len = 0; 8525 } 8526 } 8527 } 8528 } else { 8529 uint32_t outstanding; 8530 8531 /* 8532 * We are inside of a SACK recovery episode and are 8533 * sending new data, having retransmitted all the 8534 * data possible so far in the scoreboard. 8535 */ 8536 outstanding = tp->snd_max - tp->snd_una; 8537 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 8538 if (tp->snd_wnd > outstanding) { 8539 len = tp->snd_wnd - outstanding; 8540 /* Check to see if we have the data */ 8541 if (((sb_offset + len) > avail) && 8542 (avail > sb_offset)) 8543 len = avail - sb_offset; 8544 else 8545 len = 0; 8546 } else 8547 len = 0; 8548 } else if (avail > sb_offset) 8549 len = avail - sb_offset; 8550 else 8551 len = 0; 8552 if (len > 0) { 8553 if (len > rack->r_ctl.rc_prr_sndcnt) 8554 len = rack->r_ctl.rc_prr_sndcnt; 8555 if (len > 0) { 8556 sub_from_prr = 1; 8557 counter_u64_add(rack_rtm_prr_newdata, 1); 8558 } 8559 } 8560 if (len > ctf_fixed_maxseg(tp)) { 8561 /* 8562 * We should never send more than a MSS when 8563 * retransmitting or sending new data in prr 8564 * mode unless the override flag is on. Most 8565 * likely the PRR algorithm is not going to 8566 * let us send a lot as well :-) 8567 */ 8568 if (rack->r_ctl.rc_prr_sendalot == 0) 8569 len = ctf_fixed_maxseg(tp); 8570 } else if (len < ctf_fixed_maxseg(tp)) { 8571 /* 8572 * Do we send any? The idea here is if the 8573 * send empty's the socket buffer we want to 8574 * do it. However if not then lets just wait 8575 * for our prr_sndcnt to get bigger. 8576 */ 8577 long leftinsb; 8578 8579 leftinsb = sbavail(sb) - sb_offset; 8580 if (leftinsb > len) { 8581 /* This send does not empty the sb */ 8582 len = 0; 8583 } 8584 } 8585 } 8586 } 8587 if (prefetch_so_done == 0) { 8588 kern_prefetch(so, &prefetch_so_done); 8589 prefetch_so_done = 1; 8590 } 8591 /* 8592 * Lop off SYN bit if it has already been sent. However, if this is 8593 * SYN-SENT state and if segment contains data and if we don't know 8594 * that foreign host supports TAO, suppress sending segment. 8595 */ 8596 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 8597 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 8598 if (tp->t_state != TCPS_SYN_RECEIVED) 8599 flags &= ~TH_SYN; 8600 /* 8601 * When sending additional segments following a TFO SYN|ACK, 8602 * do not include the SYN bit. 8603 */ 8604 if (IS_FASTOPEN(tp->t_flags) && 8605 (tp->t_state == TCPS_SYN_RECEIVED)) 8606 flags &= ~TH_SYN; 8607 sb_offset--, len++; 8608 } 8609 /* 8610 * Be careful not to send data and/or FIN on SYN segments. 
This 8611 * measure is needed to prevent interoperability problems with not 8612 * fully conformant TCP implementations. 8613 */ 8614 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 8615 len = 0; 8616 flags &= ~TH_FIN; 8617 } 8618 /* 8619 * On TFO sockets, ensure no data is sent in the following cases: 8620 * 8621 * - When retransmitting SYN|ACK on a passively-created socket 8622 * 8623 * - When retransmitting SYN on an actively created socket 8624 * 8625 * - When sending a zero-length cookie (cookie request) on an 8626 * actively created socket 8627 * 8628 * - When the socket is in the CLOSED state (RST is being sent) 8629 */ 8630 if (IS_FASTOPEN(tp->t_flags) && 8631 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 8632 ((tp->t_state == TCPS_SYN_SENT) && 8633 (tp->t_tfo_client_cookie_len == 0)) || 8634 (flags & TH_RST))) { 8635 sack_rxmit = 0; 8636 len = 0; 8637 } 8638 /* Without fast-open there should never be data sent on a SYN */ 8639 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) 8640 len = 0; 8641 orig_len = len; 8642 if (len <= 0) { 8643 /* 8644 * If FIN has been sent but not acked, but we haven't been 8645 * called to retransmit, len will be < 0. Otherwise, window 8646 * shrank after we sent into it. If window shrank to 0, 8647 * cancel pending retransmit, pull snd_nxt back to (closed) 8648 * window, and set the persist timer if it isn't already 8649 * going. If the window didn't close completely, just wait 8650 * for an ACK. 8651 * 8652 * We also do a general check here to ensure that we will 8653 * set the persist timer when we have data to send, but a 8654 * 0-byte window. This makes sure the persist timer is set 8655 * even if the packet hits one of the "goto send" lines 8656 * below. 8657 */ 8658 len = 0; 8659 if ((tp->snd_wnd == 0) && 8660 (TCPS_HAVEESTABLISHED(tp->t_state)) && 8661 (tp->snd_una == tp->snd_max) && 8662 (sb_offset < (int)sbavail(sb))) { 8663 tp->snd_nxt = tp->snd_una; 8664 rack_enter_persist(tp, rack, cts); 8665 } 8666 } else if ((rsm == NULL) && 8667 ((doing_tlp == 0) || (new_data_tlp == 1)) && 8668 (len < rack->r_ctl.rc_pace_max_segs)) { 8669 /* 8670 * We are not sending a full segment for 8671 * some reason. Should we not send anything (think 8672 * sws or persists)? 8673 */ 8674 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8675 (TCPS_HAVEESTABLISHED(tp->t_state)) && 8676 (len < (int)(sbavail(sb) - sb_offset))) { 8677 /* 8678 * Here the rwnd is less than 8679 * the pacing size, this is not a retransmit, 8680 * we are established and 8681 * the send is not the last in the socket buffer 8682 * we send nothing, and may enter persists. 8683 */ 8684 len = 0; 8685 if (tp->snd_max == tp->snd_una) { 8686 /* 8687 * Nothing out we can 8688 * go into persists. 8689 */ 8690 rack_enter_persist(tp, rack, cts); 8691 tp->snd_nxt = tp->snd_una; 8692 } 8693 } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) && 8694 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && 8695 (len < (int)(sbavail(sb) - sb_offset)) && 8696 (len < rack->r_ctl.rc_pace_min_segs)) { 8697 /* 8698 * Here we are not retransmitting, and 8699 * the cwnd is not so small that we could 8700 * not send at least a min size (rxt timer 8701 * not having gone off), We have 2 segments or 8702 * more already in flight, its not the tail end 8703 * of the socket buffer and the cwnd is blocking 8704 * us from sending out a minimum pacing segment size. 8705 * Lets not send anything. 
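 * Illustrative numbers (assumed): maxseg = 1460, rc_pace_min_segs =
 * 11680 (8 segments), cwnd = 64000 with 20000 bytes in flight, and a
 * 4000 byte candidate send that would not empty the socket buffer; all
 * four conditions hold, so we send nothing and wait for acks to open a
 * full pacing-sized opportunity.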
8706 */ 8707 len = 0; 8708 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 8709 min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8710 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && 8711 (len < (int)(sbavail(sb) - sb_offset)) && 8712 (TCPS_HAVEESTABLISHED(tp->t_state))) { 8713 /* 8714 * Here we have a send window but we have 8715 * filled it up and we can't send another pacing segment. 8716 * We also have in flight more than 2 segments 8717 * and we are not completing the sb i.e. we allow 8718 * the last bytes of the sb to go out even if 8719 * its not a full pacing segment. 8720 */ 8721 len = 0; 8722 } 8723 } 8724 /* len will be >= 0 after this point. */ 8725 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 8726 tcp_sndbuf_autoscale(tp, so, sendwin); 8727 /* 8728 * Decide if we can use TCP Segmentation Offloading (if supported by 8729 * hardware). 8730 * 8731 * TSO may only be used if we are in a pure bulk sending state. The 8732 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 8733 * options prevent using TSO. With TSO the TCP header is the same 8734 * (except for the sequence number) for all generated packets. This 8735 * makes it impossible to transmit any options which vary per 8736 * generated segment or packet. 8737 * 8738 * IPv4 handling has a clear separation of ip options and ip header 8739 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 8740 * the right thing below to provide length of just ip options and thus 8741 * checking for ipoptlen is enough to decide if ip options are present. 8742 */ 8743 8744 #ifdef INET6 8745 if (isipv6) 8746 ipoptlen = ip6_optlen(tp->t_inpcb); 8747 else 8748 #endif 8749 if (tp->t_inpcb->inp_options) 8750 ipoptlen = tp->t_inpcb->inp_options->m_len - 8751 offsetof(struct ipoption, ipopt_list); 8752 else 8753 ipoptlen = 0; 8754 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8755 /* 8756 * Pre-calculate here as we save another lookup into the darknesses 8757 * of IPsec that way and can actually decide if TSO is ok. 8758 */ 8759 #ifdef INET6 8760 if (isipv6 && IPSEC_ENABLED(ipv6)) 8761 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 8762 #ifdef INET 8763 else 8764 #endif 8765 #endif /* INET6 */ 8766 #ifdef INET 8767 if (IPSEC_ENABLED(ipv4)) 8768 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 8769 #endif /* INET */ 8770 #endif 8771 8772 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8773 ipoptlen += ipsec_optlen; 8774 #endif 8775 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) && 8776 (tp->t_port == 0) && 8777 ((tp->t_flags & TF_SIGNATURE) == 0) && 8778 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 8779 ipoptlen == 0) 8780 tso = 1; 8781 { 8782 uint32_t outstanding; 8783 8784 outstanding = tp->snd_max - tp->snd_una; 8785 if (tp->t_flags & TF_SENTFIN) { 8786 /* 8787 * If we sent a fin, snd_max is 1 higher than 8788 * snd_una 8789 */ 8790 outstanding--; 8791 } 8792 if (sack_rxmit) { 8793 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 8794 flags &= ~TH_FIN; 8795 } else { 8796 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 8797 sbused(sb))) 8798 flags &= ~TH_FIN; 8799 } 8800 } 8801 recwin = sbspace(&so->so_rcv); 8802 8803 /* 8804 * Sender silly window avoidance. We transmit under the following 8805 * conditions when len is non-zero: 8806 * 8807 * - We have a full segment (or more with TSO) - This is the last 8808 * buffer in a write()/send() and we are either idle or running 8809 * NODELAY - we've timed out (e.g. 
persist timer) - we have more
8810 * than 1/2 the maximum send window's worth of data (the receiver may be
8811 * limiting the window size) - we need to retransmit
8812 */
8813 if (len) {
8814 if (len >= ctf_fixed_maxseg(tp)) {
8815 pass = 1;
8816 goto send;
8817 }
8818 /*
8819 * NOTE! on localhost connections an 'ack' from the remote
8820 * end may occur synchronously with the output and cause us
8821 * to flush a buffer queued with moretocome. XXX
8822 *
8823 */
8824 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
8825 (idle || (tp->t_flags & TF_NODELAY)) &&
8826 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
8827 (tp->t_flags & TF_NOPUSH) == 0) {
8828 pass = 2;
8829 goto send;
8830 }
8831 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */
8832 pass = 3;
8833 goto send;
8834 }
8835 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */
8836 goto send;
8837 }
8838 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
8839 pass = 4;
8840 goto send;
8841 }
8842 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */
8843 pass = 5;
8844 goto send;
8845 }
8846 if (sack_rxmit) {
8847 pass = 6;
8848 goto send;
8849 }
8850 }
8851 /*
8852 * Sending of standalone window updates.
8853 *
8854 * Window updates are important when we close our window due to a
8855 * full socket buffer and are opening it again after the application
8856 * reads data from it. Once the window has opened again and the
8857 * remote end starts to send again the ACK clock takes over and
8858 * provides the most current window information.
8859 *
8860 * We must avoid the silly window syndrome whereby every read from
8861 * the receive buffer, no matter how small, causes a window update
8862 * to be sent. We also should avoid sending a flurry of window
8863 * updates when the socket buffer had queued a lot of data and the
8864 * application is doing small reads.
8865 *
8866 * Prevent a flurry of pointless window updates by only sending an
8867 * update when we can increase the advertised window by more than
8868 * 1/4th of the socket buffer capacity. When the buffer is getting
8869 * full or is very small be more aggressive and send an update
8870 * whenever we can increase by two MSS-sized segments. In all other
8871 * situations the ACKs to new incoming data will carry further
8872 * window increases.
8873 *
8874 * Don't send an independent window update if a delayed ACK is
8875 * pending (it will get piggy-backed on it) or the remote side
8876 * already has done a half-close and won't send more data. Skip
8877 * this if the connection is in T/TCP half-open state.
8878 */
8879 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
8880 !(tp->t_flags & TF_DELACK) &&
8881 !TCPS_HAVERCVDFIN(tp->t_state)) {
8882 /*
8883 * "adv" is the amount we could increase the window, taking
8884 * into account that we are limited by TCP_MAXWIN <<
8885 * tp->rcv_scale.
8886 */
8887 int32_t adv;
8888 int oldwin;
8889
8890 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
8891 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
8892 oldwin = (tp->rcv_adv - tp->rcv_nxt);
8893 adv -= oldwin;
8894 } else
8895 oldwin = 0;
8896
8897 /*
8898 * If the new window size ends up being the same as the old
8899 * size when it is scaled, then don't force a window update.
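 * As a worked example of the checks just below (illustrative
 * numbers): with a 64 KB receive buffer and a 1448 byte MSS, a
 * standalone update goes out once the window can grow by at least
 * two segments and either the increase is >= 16 KB (sb_hiwat / 4)
 * or the space left to advertise has fallen to 8 KB (sb_hiwat / 8)
 * or less, i.e. the buffer is nearly full.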
8900 */ 8901 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 8902 goto dontupdate; 8903 8904 if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) && 8905 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 8906 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 8907 so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) { 8908 pass = 7; 8909 goto send; 8910 } 8911 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 8912 goto send; 8913 } 8914 dontupdate: 8915 8916 /* 8917 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 8918 * is also a catch-all for the retransmit timer timeout case. 8919 */ 8920 if (tp->t_flags & TF_ACKNOW) { 8921 pass = 8; 8922 goto send; 8923 } 8924 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 8925 pass = 9; 8926 goto send; 8927 } 8928 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 8929 pass = 10; 8930 goto send; 8931 } 8932 /* 8933 * If our state indicates that FIN should be sent and we have not 8934 * yet done so, then we need to send. 8935 */ 8936 if ((flags & TH_FIN) && 8937 (tp->snd_nxt == tp->snd_una)) { 8938 pass = 11; 8939 goto send; 8940 } 8941 /* 8942 * No reason to send a segment, just return. 8943 */ 8944 just_return: 8945 SOCKBUF_UNLOCK(sb); 8946 just_return_nolock: 8947 if (tot_len_this_send == 0) 8948 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 8949 if (slot) { 8950 /* set the rack tcb into the slot N */ 8951 counter_u64_add(rack_paced_segments, 1); 8952 } else if (tot_len_this_send) { 8953 counter_u64_add(rack_unpaced_segments, 1); 8954 } 8955 /* Check if we need to go into persists or not */ 8956 if ((rack->rc_in_persist == 0) && 8957 (tp->snd_max == tp->snd_una) && 8958 TCPS_HAVEESTABLISHED(tp->t_state) && 8959 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8960 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) && 8961 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { 8962 /* Yes lets make sure to move to persist before timer-start */ 8963 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8964 } 8965 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 8966 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 8967 tp->t_flags &= ~TF_FORCEDATA; 8968 return (0); 8969 8970 send: 8971 if ((flags & TH_FIN) && 8972 sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 8973 /* 8974 * We do not transmit a FIN 8975 * with data outstanding. We 8976 * need to make it so all data 8977 * is acked first. 8978 */ 8979 flags &= ~TH_FIN; 8980 } 8981 if (doing_tlp == 0) { 8982 /* 8983 * Data not a TLP, and its not the rxt firing. If it is the 8984 * rxt firing, we want to leave the tlp_in_progress flag on 8985 * so we don't send another TLP. It has to be a rack timer 8986 * or normal send (response to acked data) to clear the tlp 8987 * in progress flag. 8988 */ 8989 rack->rc_tlp_in_progress = 0; 8990 } 8991 SOCKBUF_LOCK_ASSERT(sb); 8992 if (len > 0) { 8993 if (len >= ctf_fixed_maxseg(tp)) 8994 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 8995 else 8996 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 8997 } 8998 /* 8999 * Before ESTABLISHED, force sending of initial options unless TCP 9000 * set not to do any options. NOTE: we assume that the IP/TCP header 9001 * plus TCP options always fit in a single mbuf, leaving room for a 9002 * maximum link header, i.e. 
max_linkhdr + sizeof (struct tcpiphdr) 9003 * + optlen <= MCLBYTES 9004 */ 9005 optlen = 0; 9006 #ifdef INET6 9007 if (isipv6) 9008 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 9009 else 9010 #endif 9011 hdrlen = sizeof(struct tcpiphdr); 9012 9013 /* 9014 * Compute options for segment. We only have to care about SYN and 9015 * established connection segments. Options for SYN-ACK segments 9016 * are handled in TCP syncache. 9017 */ 9018 to.to_flags = 0; 9019 if ((tp->t_flags & TF_NOOPT) == 0) { 9020 /* Maximum segment size. */ 9021 if (flags & TH_SYN) { 9022 tp->snd_nxt = tp->iss; 9023 to.to_mss = tcp_mssopt(&inp->inp_inc); 9024 #ifdef NETFLIX_TCPOUDP 9025 if (tp->t_port) 9026 to.to_mss -= V_tcp_udp_tunneling_overhead; 9027 #endif 9028 to.to_flags |= TOF_MSS; 9029 9030 /* 9031 * On SYN or SYN|ACK transmits on TFO connections, 9032 * only include the TFO option if it is not a 9033 * retransmit, as the presence of the TFO option may 9034 * have caused the original SYN or SYN|ACK to have 9035 * been dropped by a middlebox. 9036 */ 9037 if (IS_FASTOPEN(tp->t_flags) && 9038 (tp->t_rxtshift == 0)) { 9039 if (tp->t_state == TCPS_SYN_RECEIVED) { 9040 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 9041 to.to_tfo_cookie = 9042 (u_int8_t *)&tp->t_tfo_cookie.server; 9043 to.to_flags |= TOF_FASTOPEN; 9044 wanted_cookie = 1; 9045 } else if (tp->t_state == TCPS_SYN_SENT) { 9046 to.to_tfo_len = 9047 tp->t_tfo_client_cookie_len; 9048 to.to_tfo_cookie = 9049 tp->t_tfo_cookie.client; 9050 to.to_flags |= TOF_FASTOPEN; 9051 wanted_cookie = 1; 9052 /* 9053 * If we wind up having more data to 9054 * send with the SYN than can fit in 9055 * one segment, don't send any more 9056 * until the SYN|ACK comes back from 9057 * the other end. 9058 */ 9059 sendalot = 0; 9060 } 9061 } 9062 } 9063 /* Window scaling. */ 9064 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 9065 to.to_wscale = tp->request_r_scale; 9066 to.to_flags |= TOF_SCALE; 9067 } 9068 /* Timestamps. */ 9069 if ((tp->t_flags & TF_RCVD_TSTMP) || 9070 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 9071 to.to_tsval = cts + tp->ts_offset; 9072 to.to_tsecr = tp->ts_recent; 9073 to.to_flags |= TOF_TS; 9074 } 9075 /* Set receive buffer autosizing timestamp. */ 9076 if (tp->rfbuf_ts == 0 && 9077 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 9078 tp->rfbuf_ts = tcp_ts_getticks(); 9079 /* Selective ACK's. */ 9080 if (flags & TH_SYN) 9081 to.to_flags |= TOF_SACKPERM; 9082 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 9083 tp->rcv_numsacks > 0) { 9084 to.to_flags |= TOF_SACK; 9085 to.to_nsacks = tp->rcv_numsacks; 9086 to.to_sacks = (u_char *)tp->sackblks; 9087 } 9088 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 9089 /* TCP-MD5 (RFC2385). */ 9090 if (tp->t_flags & TF_SIGNATURE) 9091 to.to_flags |= TOF_SIGNATURE; 9092 #endif /* TCP_SIGNATURE */ 9093 9094 /* Processing the options. */ 9095 hdrlen += optlen = tcp_addoptions(&to, opt); 9096 /* 9097 * If we wanted a TFO option to be added, but it was unable 9098 * to fit, ensure no data is sent. 9099 */ 9100 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 9101 !(to.to_flags & TOF_FASTOPEN)) 9102 len = 0; 9103 } 9104 #ifdef NETFLIX_TCPOUDP 9105 if (tp->t_port) { 9106 if (V_tcp_udp_tunneling_port == 0) { 9107 /* The port was removed?? 
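 * (the UDP tunneling port has been cleared globally while this
 * connection still has t_port set, so the encapsulating UDP header
 * can no longer be built and the send fails with EHOSTUNREACH)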
*/ 9108 SOCKBUF_UNLOCK(&so->so_snd); 9109 return (EHOSTUNREACH); 9110 } 9111 hdrlen += sizeof(struct udphdr); 9112 } 9113 #endif 9114 #ifdef INET6 9115 if (isipv6) 9116 ipoptlen = ip6_optlen(tp->t_inpcb); 9117 else 9118 #endif 9119 if (tp->t_inpcb->inp_options) 9120 ipoptlen = tp->t_inpcb->inp_options->m_len - 9121 offsetof(struct ipoption, ipopt_list); 9122 else 9123 ipoptlen = 0; 9124 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 9125 ipoptlen += ipsec_optlen; 9126 #endif 9127 9128 #ifdef KERN_TLS 9129 /* force TSO for so TLS offload can get mss */ 9130 if (sb->sb_flags & SB_TLS_IFNET) { 9131 force_tso = 1; 9132 } 9133 #endif 9134 /* 9135 * Adjust data length if insertion of options will bump the packet 9136 * length beyond the t_maxseg length. Clear the FIN bit because we 9137 * cut off the tail of the segment. 9138 */ 9139 if (len + optlen + ipoptlen > tp->t_maxseg) { 9140 if (tso) { 9141 uint32_t if_hw_tsomax; 9142 uint32_t moff; 9143 int32_t max_len; 9144 9145 /* extract TSO information */ 9146 if_hw_tsomax = tp->t_tsomax; 9147 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 9148 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 9149 KASSERT(ipoptlen == 0, 9150 ("%s: TSO can't do IP options", __func__)); 9151 9152 /* 9153 * Check if we should limit by maximum payload 9154 * length: 9155 */ 9156 if (if_hw_tsomax != 0) { 9157 /* compute maximum TSO length */ 9158 max_len = (if_hw_tsomax - hdrlen - 9159 max_linkhdr); 9160 if (max_len <= 0) { 9161 len = 0; 9162 } else if (len > max_len) { 9163 sendalot = 1; 9164 len = max_len; 9165 } 9166 } 9167 /* 9168 * Prevent the last segment from being fractional 9169 * unless the send sockbuf can be emptied: 9170 */ 9171 max_len = (tp->t_maxseg - optlen); 9172 if (((sb_offset + len) < sbavail(sb)) && 9173 (hw_tls == 0)) { 9174 moff = len % (u_int)max_len; 9175 if (moff != 0) { 9176 len -= moff; 9177 sendalot = 1; 9178 } 9179 } 9180 /* 9181 * In case there are too many small fragments don't 9182 * use TSO: 9183 */ 9184 if (len <= maxseg) { 9185 len = max_len; 9186 sendalot = 1; 9187 tso = 0; 9188 } 9189 /* 9190 * Send the FIN in a separate segment after the bulk 9191 * sending is done. We don't trust the TSO 9192 * implementations to clear the FIN flag on all but 9193 * the last segment. 9194 */ 9195 if (tp->t_flags & TF_NEEDFIN) 9196 sendalot = 1; 9197 9198 } else { 9199 if (optlen + ipoptlen >= tp->t_maxseg) { 9200 /* 9201 * Since we don't have enough space to put 9202 * the IP header chain and the TCP header in 9203 * one packet as required by RFC 7112, don't 9204 * send it. Also ensure that at least one 9205 * byte of the payload can be put into the 9206 * TCP segment. 9207 */ 9208 SOCKBUF_UNLOCK(&so->so_snd); 9209 error = EMSGSIZE; 9210 sack_rxmit = 0; 9211 goto out; 9212 } 9213 len = tp->t_maxseg - optlen - ipoptlen; 9214 sendalot = 1; 9215 } 9216 } else 9217 tso = 0; 9218 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 9219 ("%s: len > IP_MAXPACKET", __func__)); 9220 #ifdef DIAGNOSTIC 9221 #ifdef INET6 9222 if (max_linkhdr + hdrlen > MCLBYTES) 9223 #else 9224 if (max_linkhdr + hdrlen > MHLEN) 9225 #endif 9226 panic("tcphdr too big"); 9227 #endif 9228 9229 /* 9230 * This KASSERT is here to catch edge cases at a well defined place. 9231 * Before, those had triggered (random) panic conditions further 9232 * down. 9233 */ 9234 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 9235 if ((len == 0) && 9236 (flags & TH_FIN) && 9237 (sbused(sb))) { 9238 /* 9239 * We have outstanding data, don't send a fin by itself!. 
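 * (The FIN is not lost; a later call into this routine re-evaluates
 * the flags and sends it once the remaining data has gone out.)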
9240 */ 9241 goto just_return; 9242 } 9243 /* 9244 * Grab a header mbuf, attaching a copy of data to be transmitted, 9245 * and initialize the header from the template for sends on this 9246 * connection. 9247 */ 9248 if (len) { 9249 uint32_t max_val; 9250 uint32_t moff; 9251 9252 if (rack->rc_pace_max_segs) 9253 max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp); 9254 else 9255 max_val = len; 9256 if (rack->r_ctl.rc_pace_max_segs < max_val) 9257 max_val = rack->r_ctl.rc_pace_max_segs; 9258 /* 9259 * We allow a limit on sending with hptsi. 9260 */ 9261 if (len > max_val) { 9262 len = max_val; 9263 } 9264 #ifdef INET6 9265 if (MHLEN < hdrlen + max_linkhdr) 9266 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 9267 else 9268 #endif 9269 m = m_gethdr(M_NOWAIT, MT_DATA); 9270 9271 if (m == NULL) { 9272 SOCKBUF_UNLOCK(sb); 9273 error = ENOBUFS; 9274 sack_rxmit = 0; 9275 goto out; 9276 } 9277 m->m_data += max_linkhdr; 9278 m->m_len = hdrlen; 9279 9280 /* 9281 * Start the m_copy functions from the closest mbuf to the 9282 * sb_offset in the socket buffer chain. 9283 */ 9284 mb = sbsndptr_noadv(sb, sb_offset, &moff); 9285 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 9286 m_copydata(mb, moff, (int)len, 9287 mtod(m, caddr_t)+hdrlen); 9288 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 9289 sbsndptr_adv(sb, mb, len); 9290 m->m_len += len; 9291 } else { 9292 struct sockbuf *msb; 9293 9294 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 9295 msb = NULL; 9296 else 9297 msb = sb; 9298 m->m_next = tcp_m_copym( 9299 #ifdef NETFLIX_COPY_ARGS 9300 tp, 9301 #endif 9302 mb, moff, &len, 9303 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 9304 ((rsm == NULL) ? hw_tls : 0) 9305 #ifdef NETFLIX_COPY_ARGS 9306 , &filled_all 9307 #endif 9308 ); 9309 if (len <= (tp->t_maxseg - optlen)) { 9310 /* 9311 * Must have ran out of mbufs for the copy 9312 * shorten it to no longer need tso. Lets 9313 * not put on sendalot since we are low on 9314 * mbufs. 9315 */ 9316 tso = 0; 9317 } 9318 if (m->m_next == NULL) { 9319 SOCKBUF_UNLOCK(sb); 9320 (void)m_free(m); 9321 error = ENOBUFS; 9322 sack_rxmit = 0; 9323 goto out; 9324 } 9325 } 9326 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 9327 KMOD_TCPSTAT_INC(tcps_sndprobe); 9328 #ifdef STATS 9329 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 9330 stats_voi_update_abs_u32(tp->t_stats, 9331 VOI_TCP_RETXPB, len); 9332 else 9333 stats_voi_update_abs_u64(tp->t_stats, 9334 VOI_TCP_TXPB, len); 9335 #endif 9336 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 9337 if (rsm && (rsm->r_flags & RACK_TLP)) { 9338 /* 9339 * TLP should not count in retran count, but 9340 * in its own bin 9341 */ 9342 counter_u64_add(rack_tlp_retran, 1); 9343 counter_u64_add(rack_tlp_retran_bytes, len); 9344 } else { 9345 tp->t_sndrexmitpack++; 9346 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 9347 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 9348 } 9349 #ifdef STATS 9350 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 9351 len); 9352 #endif 9353 } else { 9354 KMOD_TCPSTAT_INC(tcps_sndpack); 9355 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 9356 #ifdef STATS 9357 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 9358 len); 9359 #endif 9360 } 9361 /* 9362 * If we're sending everything we've got, set PUSH. (This 9363 * will keep happy those implementations which only give 9364 * data to the user when a buffer fills or a PUSH comes in.) 9365 */ 9366 if (sb_offset + len == sbused(sb) && 9367 sbused(sb) && 9368 !(flags & TH_SYN)) 9369 flags |= TH_PUSH; 9370 9371 /* 9372 * Are we doing pacing, if so we must calculate the slot. 
We 9373 * only do hptsi in ESTABLISHED and with no RESET being 9374 * sent where we have data to send. 9375 */ 9376 if (((tp->t_state == TCPS_ESTABLISHED) || 9377 (tp->t_state == TCPS_CLOSE_WAIT) || 9378 ((tp->t_state == TCPS_FIN_WAIT_1) && 9379 ((tp->t_flags & TF_SENTFIN) == 0) && 9380 ((flags & TH_FIN) == 0))) && 9381 ((flags & TH_RST) == 0)) { 9382 /* Get our pacing rate */ 9383 tot_len_this_send += len; 9384 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send); 9385 } 9386 SOCKBUF_UNLOCK(sb); 9387 } else { 9388 SOCKBUF_UNLOCK(sb); 9389 if (tp->t_flags & TF_ACKNOW) 9390 KMOD_TCPSTAT_INC(tcps_sndacks); 9391 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 9392 KMOD_TCPSTAT_INC(tcps_sndctrl); 9393 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 9394 KMOD_TCPSTAT_INC(tcps_sndurg); 9395 else 9396 KMOD_TCPSTAT_INC(tcps_sndwinup); 9397 9398 m = m_gethdr(M_NOWAIT, MT_DATA); 9399 if (m == NULL) { 9400 error = ENOBUFS; 9401 sack_rxmit = 0; 9402 goto out; 9403 } 9404 #ifdef INET6 9405 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 9406 MHLEN >= hdrlen) { 9407 M_ALIGN(m, hdrlen); 9408 } else 9409 #endif 9410 m->m_data += max_linkhdr; 9411 m->m_len = hdrlen; 9412 } 9413 SOCKBUF_UNLOCK_ASSERT(sb); 9414 m->m_pkthdr.rcvif = (struct ifnet *)0; 9415 #ifdef MAC 9416 mac_inpcb_create_mbuf(inp, m); 9417 #endif 9418 #ifdef INET6 9419 if (isipv6) { 9420 ip6 = mtod(m, struct ip6_hdr *); 9421 #ifdef NETFLIX_TCPOUDP 9422 if (tp->t_port) { 9423 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 9424 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 9425 udp->uh_dport = tp->t_port; 9426 ulen = hdrlen + len - sizeof(struct ip6_hdr); 9427 udp->uh_ulen = htons(ulen); 9428 th = (struct tcphdr *)(udp + 1); 9429 } else 9430 #endif 9431 th = (struct tcphdr *)(ip6 + 1); 9432 tcpip_fillheaders(inp, 9433 #ifdef NETFLIX_TCPOUDP 9434 tp->t_port, 9435 #endif 9436 ip6, th); 9437 } else 9438 #endif /* INET6 */ 9439 { 9440 ip = mtod(m, struct ip *); 9441 #ifdef TCPDEBUG 9442 ipov = (struct ipovly *)ip; 9443 #endif 9444 #ifdef NETFLIX_TCPOUDP 9445 if (tp->t_port) { 9446 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 9447 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 9448 udp->uh_dport = tp->t_port; 9449 ulen = hdrlen + len - sizeof(struct ip); 9450 udp->uh_ulen = htons(ulen); 9451 th = (struct tcphdr *)(udp + 1); 9452 } else 9453 #endif 9454 th = (struct tcphdr *)(ip + 1); 9455 tcpip_fillheaders(inp, 9456 #ifdef NETFLIX_TCPOUDP 9457 tp->t_port, 9458 #endif 9459 ip, th); 9460 } 9461 /* 9462 * Fill in fields, remembering maximum advertised window for use in 9463 * delaying messages about window sizes. If resending a FIN, be sure 9464 * not to use a new sequence number. 9465 */ 9466 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 9467 tp->snd_nxt == tp->snd_max) 9468 tp->snd_nxt--; 9469 /* 9470 * If we are starting a connection, send ECN setup SYN packet. If we 9471 * are on a retransmit, we may resend those bits a number of times 9472 * as per RFC 3168. 9473 */ 9474 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 9475 if (tp->t_rxtshift >= 1) { 9476 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 9477 flags |= TH_ECE | TH_CWR; 9478 } else 9479 flags |= TH_ECE | TH_CWR; 9480 } 9481 if (tp->t_state == TCPS_ESTABLISHED && 9482 (tp->t_flags2 & TF2_ECN_PERMIT)) { 9483 /* 9484 * If the peer has ECN, mark data packets with ECN capable 9485 * transmission (ECT). Ignore pure ack packets, 9486 * retransmissions and window probes. 
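 * (ECT(0) is the 0b10 ECN codepoint, i.e. IPTOS_ECN_ECT0 == 0x02 in
 * the IPv4 TOS byte; for IPv6 the same value is shifted into the
 * traffic class bits of ip6_flow below.)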
9487 */ 9488 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 9489 (sack_rxmit == 0) && 9490 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 9491 #ifdef INET6 9492 if (isipv6) 9493 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 9494 else 9495 #endif 9496 ip->ip_tos |= IPTOS_ECN_ECT0; 9497 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 9498 } 9499 /* 9500 * Reply with proper ECN notifications. 9501 */ 9502 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 9503 flags |= TH_CWR; 9504 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 9505 } 9506 if (tp->t_flags2 & TF2_ECN_SND_ECE) 9507 flags |= TH_ECE; 9508 } 9509 /* 9510 * If we are doing retransmissions, then snd_nxt will not reflect 9511 * the first unsent octet. For ACK only packets, we do not want the 9512 * sequence number of the retransmitted packet, we want the sequence 9513 * number of the next unsent octet. So, if there is no data (and no 9514 * SYN or FIN), use snd_max instead of snd_nxt when filling in 9515 * ti_seq. But if we are in persist state, snd_max might reflect 9516 * one byte beyond the right edge of the window, so use snd_nxt in 9517 * that case, since we know we aren't doing a retransmission. 9518 * (retransmit and persist are mutually exclusive...) 9519 */ 9520 if (sack_rxmit == 0) { 9521 if (len || (flags & (TH_SYN | TH_FIN)) || 9522 rack->rc_in_persist) { 9523 th->th_seq = htonl(tp->snd_nxt); 9524 rack_seq = tp->snd_nxt; 9525 } else if (flags & TH_RST) { 9526 /* 9527 * For a Reset send the last cum ack in sequence 9528 * (this like any other choice may still generate a 9529 * challenge ack, if a ack-update packet is in 9530 * flight). 9531 */ 9532 th->th_seq = htonl(tp->snd_una); 9533 rack_seq = tp->snd_una; 9534 } else { 9535 th->th_seq = htonl(tp->snd_max); 9536 rack_seq = tp->snd_max; 9537 } 9538 } else { 9539 th->th_seq = htonl(rsm->r_start); 9540 rack_seq = rsm->r_start; 9541 } 9542 th->th_ack = htonl(tp->rcv_nxt); 9543 if (optlen) { 9544 bcopy(opt, th + 1, optlen); 9545 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 9546 } 9547 th->th_flags = flags; 9548 /* 9549 * Calculate receive window. Don't shrink window, but avoid silly 9550 * window syndrome. 9551 * If a RST segment is sent, advertise a window of zero. 9552 */ 9553 if (flags & TH_RST) { 9554 recwin = 0; 9555 } else { 9556 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 9557 recwin < (long)ctf_fixed_maxseg(tp)) 9558 recwin = 0; 9559 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 9560 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 9561 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 9562 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 9563 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 9564 } 9565 9566 /* 9567 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 9568 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 9569 * handled in syncache. 9570 */ 9571 if (flags & TH_SYN) 9572 th->th_win = htons((u_short) 9573 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 9574 else 9575 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 9576 /* 9577 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 9578 * window. This may cause the remote transmitter to stall. This 9579 * flag tells soreceive() to disable delayed acknowledgements when 9580 * draining the buffer. This can occur if the receiver is 9581 * attempting to read more data than can be buffered prior to 9582 * transmitting on the connection. 
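 * (TF_RXWIN0SENT is cleared again just below as soon as a non-zero
 * window is advertised.)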
9583 */ 9584 if (th->th_win == 0) { 9585 tp->t_sndzerowin++; 9586 tp->t_flags |= TF_RXWIN0SENT; 9587 } else 9588 tp->t_flags &= ~TF_RXWIN0SENT; 9589 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 9590 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 9591 th->th_flags |= TH_URG; 9592 } else 9593 /* 9594 * If no urgent pointer to send, then we pull the urgent 9595 * pointer to the left edge of the send window so that it 9596 * doesn't drift into the send window on sequence number 9597 * wraparound. 9598 */ 9599 tp->snd_up = tp->snd_una; /* drag it along */ 9600 9601 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 9602 if (to.to_flags & TOF_SIGNATURE) { 9603 /* 9604 * Calculate MD5 signature and put it into the place 9605 * determined before. 9606 * NOTE: since TCP options buffer doesn't point into 9607 * mbuf's data, calculate offset and use it. 9608 */ 9609 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 9610 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 9611 /* 9612 * Do not send segment if the calculation of MD5 9613 * digest has failed. 9614 */ 9615 goto out; 9616 } 9617 } 9618 #endif 9619 9620 /* 9621 * Put TCP length in extended header, and then checksum extended 9622 * header and data. 9623 */ 9624 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 9625 #ifdef INET6 9626 if (isipv6) { 9627 /* 9628 * ip6_plen is not need to be filled now, and will be filled 9629 * in ip6_output. 9630 */ 9631 if (tp->t_port) { 9632 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 9633 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 9634 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 9635 th->th_sum = htons(0); 9636 UDPSTAT_INC(udps_opackets); 9637 } else { 9638 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 9639 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 9640 th->th_sum = in6_cksum_pseudo(ip6, 9641 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 9642 0); 9643 } 9644 } 9645 #endif 9646 #if defined(INET6) && defined(INET) 9647 else 9648 #endif 9649 #ifdef INET 9650 { 9651 if (tp->t_port) { 9652 m->m_pkthdr.csum_flags = CSUM_UDP; 9653 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 9654 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 9655 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 9656 th->th_sum = htons(0); 9657 UDPSTAT_INC(udps_opackets); 9658 } else { 9659 m->m_pkthdr.csum_flags = CSUM_TCP; 9660 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 9661 th->th_sum = in_pseudo(ip->ip_src.s_addr, 9662 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 9663 IPPROTO_TCP + len + optlen)); 9664 } 9665 /* IP version must be set here for ipv4/ipv6 checking later */ 9666 KASSERT(ip->ip_v == IPVERSION, 9667 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 9668 } 9669 #endif 9670 /* 9671 * Enable TSO and specify the size of the segments. The TCP pseudo 9672 * header checksum is always provided. XXX: Fixme: This is currently 9673 * not the case for IPv6. 9674 */ 9675 if (tso || force_tso) { 9676 KASSERT(force_tso || len > tp->t_maxseg - optlen, 9677 ("%s: len <= tso_segsz", __func__)); 9678 m->m_pkthdr.csum_flags |= CSUM_TSO; 9679 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 9680 } 9681 KASSERT(len + hdrlen == m_length(m, NULL), 9682 ("%s: mbuf chain different than expected: %d + %u != %u", 9683 __func__, len, hdrlen, m_length(m, NULL))); 9684 9685 #ifdef TCP_HHOOK 9686 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 9687 hhook_run_tcp_est_out(tp, th, &to, len, tso); 9688 #endif 9689 #ifdef TCPDEBUG 9690 /* 9691 * Trace. 
9692 */ 9693 if (so->so_options & SO_DEBUG) { 9694 u_short save = 0; 9695 9696 #ifdef INET6 9697 if (!isipv6) 9698 #endif 9699 { 9700 save = ipov->ih_len; 9701 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 9702 * (th->th_off << 2) */ ); 9703 } 9704 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 9705 #ifdef INET6 9706 if (!isipv6) 9707 #endif 9708 ipov->ih_len = save; 9709 } 9710 #endif /* TCPDEBUG */ 9711 9712 /* We're getting ready to send; log now. */ 9713 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 9714 union tcp_log_stackspecific log; 9715 struct timeval tv; 9716 9717 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 9718 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 9719 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 9720 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 9721 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 9722 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 9723 log.u_bbr.flex4 = orig_len; 9724 if (filled_all) 9725 log.u_bbr.flex5 = 0x80000000; 9726 else 9727 log.u_bbr.flex5 = 0; 9728 if (rsm || sack_rxmit) { 9729 log.u_bbr.flex8 = 1; 9730 } else { 9731 log.u_bbr.flex8 = 0; 9732 } 9733 log.u_bbr.pkts_out = tp->t_maxseg; 9734 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 9735 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 9736 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 9737 len, &log, false, NULL, NULL, 0, &tv); 9738 } else 9739 lgb = NULL; 9740 9741 /* 9742 * Fill in IP length and desired time to live and send to IP level. 9743 * There should be a better way to handle ttl and tos; we could keep 9744 * them in the template, but need a way to checksum without them. 9745 */ 9746 /* 9747 * m->m_pkthdr.len should have been set before cksum calcuration, 9748 * because in6_cksum() need it. 9749 */ 9750 #ifdef INET6 9751 if (isipv6) { 9752 /* 9753 * we separately set hoplimit for every segment, since the 9754 * user might want to change the value via setsockopt. Also, 9755 * desired default hop limit might be changed via Neighbor 9756 * Discovery. 9757 */ 9758 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 9759 9760 /* 9761 * Set the packet size here for the benefit of DTrace 9762 * probes. ip6_output() will set it properly; it's supposed 9763 * to include the option header lengths as well. 9764 */ 9765 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 9766 9767 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 9768 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 9769 else 9770 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 9771 9772 if (tp->t_state == TCPS_SYN_SENT) 9773 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 9774 9775 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 9776 /* TODO: IPv6 IP6TOS_ECT bit on */ 9777 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 9778 &inp->inp_route6, 9779 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 9780 NULL, NULL, inp); 9781 9782 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 9783 mtu = inp->inp_route6.ro_nh->nh_mtu; 9784 } 9785 #endif /* INET6 */ 9786 #if defined(INET) && defined(INET6) 9787 else 9788 #endif 9789 #ifdef INET 9790 { 9791 ip->ip_len = htons(m->m_pkthdr.len); 9792 #ifdef INET6 9793 if (inp->inp_vflag & INP_IPV6PROTO) 9794 ip->ip_ttl = in6_selecthlim(inp, NULL); 9795 #endif /* INET6 */ 9796 /* 9797 * If we do path MTU discovery, then we set DF on every 9798 * packet. This might not be the best thing to do according 9799 * to RFC3390 Section 2. 
However the tcp hostcache migitates 9800 * the problem so it affects only the first tcp connection 9801 * with a host. 9802 * 9803 * NB: Don't set DF on small MTU/MSS to have a safe 9804 * fallback. 9805 */ 9806 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 9807 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 9808 if (tp->t_port == 0 || len < V_tcp_minmss) { 9809 ip->ip_off |= htons(IP_DF); 9810 } 9811 } else { 9812 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 9813 } 9814 9815 if (tp->t_state == TCPS_SYN_SENT) 9816 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 9817 9818 TCP_PROBE5(send, NULL, tp, ip, tp, th); 9819 9820 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 9821 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 9822 inp); 9823 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 9824 mtu = inp->inp_route.ro_nh->nh_mtu; 9825 } 9826 #endif /* INET */ 9827 9828 out: 9829 if (lgb) { 9830 lgb->tlb_errno = error; 9831 lgb = NULL; 9832 } 9833 /* 9834 * In transmit state, time the transmission and arrange for the 9835 * retransmit. In persist state, just set snd_max. 9836 */ 9837 if (error == 0) { 9838 if (TCPS_HAVEESTABLISHED(tp->t_state) && 9839 (tp->t_flags & TF_SACK_PERMIT) && 9840 tp->rcv_numsacks > 0) 9841 tcp_clean_dsack_blocks(tp); 9842 if (len == 0) 9843 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 9844 else if (len == 1) { 9845 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 9846 } else if (len > 1) { 9847 int idx; 9848 9849 idx = (len / ctf_fixed_maxseg(tp)) + 3; 9850 if (idx >= TCP_MSS_ACCT_ATIMER) 9851 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 9852 else 9853 counter_u64_add(rack_out_size[idx], 1); 9854 } 9855 if (hw_tls && len > 0) { 9856 if (filled_all) { 9857 counter_u64_add(rack_tls_filled, 1); 9858 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1); 9859 } else { 9860 if (rsm) { 9861 counter_u64_add(rack_tls_rxt, 1); 9862 rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1); 9863 } else if (doing_tlp) { 9864 counter_u64_add(rack_tls_tlp, 1); 9865 rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1); 9866 } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) { 9867 counter_u64_add(rack_tls_app, 1); 9868 rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1); 9869 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) { 9870 counter_u64_add(rack_tls_cwnd, 1); 9871 rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1); 9872 } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) { 9873 counter_u64_add(rack_tls_rwnd, 1); 9874 rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1); 9875 } else { 9876 rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1); 9877 counter_u64_add(rack_tls_other, 1); 9878 } 9879 } 9880 } 9881 } 9882 if (sub_from_prr && (error == 0)) { 9883 if (rack->r_ctl.rc_prr_sndcnt >= len) 9884 rack->r_ctl.rc_prr_sndcnt -= len; 9885 else 9886 rack->r_ctl.rc_prr_sndcnt = 0; 9887 } 9888 sub_from_prr = 0; 9889 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 9890 pass, rsm); 9891 if ((error == 0) && 9892 (len > 0) && 9893 (tp->snd_una == tp->snd_max)) 9894 rack->r_ctl.rc_tlp_rxt_last_time = cts; 9895 if ((tp->t_flags & TF_FORCEDATA) == 0 || 9896 (rack->rc_in_persist == 0)) { 9897 tcp_seq startseq = tp->snd_nxt; 9898 9899 /* 9900 * Advance snd_nxt over sequence space of this segment. 
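 * (A SYN or FIN each consume one unit of sequence space in addition
 * to the len bytes of payload, which is why snd_nxt is bumped for
 * them below before the payload length is added.)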
9901 */
9902 if (error)
9903 /* We don't log or do anything with errors */
9904 goto nomore;
9905
9906 if (flags & (TH_SYN | TH_FIN)) {
9907 if (flags & TH_SYN)
9908 tp->snd_nxt++;
9909 if (flags & TH_FIN) {
9910 tp->snd_nxt++;
9911 tp->t_flags |= TF_SENTFIN;
9912 }
9913 }
9914 /* In the ENOBUFS case we do *not* update snd_max */
9915 if (sack_rxmit)
9916 goto nomore;
9917
9918 tp->snd_nxt += len;
9919 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
9920 if (tp->snd_una == tp->snd_max) {
9921 /*
9922 * Update the time we just added data since
9923 * none was outstanding.
9924 */
9925 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9926 tp->t_acktime = ticks;
9927 }
9928 tp->snd_max = tp->snd_nxt;
9929 /*
9930 * Time this transmission if not a retransmission and
9931 * not currently timing anything.
9932 * This is only relevant in case of switching back to
9933 * the base stack.
9934 */
9935 if (tp->t_rtttime == 0) {
9936 tp->t_rtttime = ticks;
9937 tp->t_rtseq = startseq;
9938 KMOD_TCPSTAT_INC(tcps_segstimed);
9939 }
9940 #ifdef STATS
9941 if (!(tp->t_flags & TF_GPUTINPROG) && len) {
9942 tp->t_flags |= TF_GPUTINPROG;
9943 tp->gput_seq = startseq;
9944 tp->gput_ack = startseq +
9945 ulmin(sbavail(sb) - sb_offset, sendwin);
9946 tp->gput_ts = tcp_ts_getticks();
9947 }
9948 #endif
9949 }
9950 } else {
9951 /*
9952 * Persist case, update snd_max but since we are in persist
9953 * mode (no window) we do not update snd_nxt.
9954 */
9955 int32_t xlen = len;
9956
9957 if (error)
9958 goto nomore;
9959
9960 if (flags & TH_SYN)
9961 ++xlen;
9962 if (flags & TH_FIN) {
9963 ++xlen;
9964 tp->t_flags |= TF_SENTFIN;
9965 }
9966 /* In the ENOBUFS case we do *not* update snd_max */
9967 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
9968 if (tp->snd_una == tp->snd_max) {
9969 /*
9970 * Update the time we just added data since
9971 * none was outstanding.
9972 */
9973 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
9974 tp->t_acktime = ticks;
9975 }
9976 tp->snd_max = tp->snd_nxt + len;
9977 }
9978 }
9979 nomore:
9980 if (error) {
9981 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */
9982 /*
9983 * Failures do not advance the seq counter above. For the
9984 * case of ENOBUFS we will fall out and retry in 1ms with
9985 * the hpts. Everything else will just have to retransmit
9986 * with the timer.
9987 *
9988 * In any case, we do not want to loop around for another
9989 * send without a good reason.
9990 */
9991 sendalot = 0;
9992 switch (error) {
9993 case EPERM:
9994 tp->t_flags &= ~TF_FORCEDATA;
9995 tp->t_softerror = error;
9996 return (error);
9997 case ENOBUFS:
9998 if (slot == 0) {
9999 /*
10000 * Pace us right away so that we retry in a
10001 * short time
10002 */
10003 slot = 1 + rack->rc_enobuf;
10004 if (rack->rc_enobuf < 255)
10005 rack->rc_enobuf++;
10006 if (slot > (rack->rc_rack_rtt / 2)) {
10007 slot = rack->rc_rack_rtt / 2;
10008 }
10009 if (slot < 10)
10010 slot = 10;
10011 }
10012 counter_u64_add(rack_saw_enobuf, 1);
10013 error = 0;
10014 goto enobufs;
10015 case EMSGSIZE:
10016 /*
10017 * For some reason the interface we used initially
10018 * to send segments changed to another or lowered
10019 * its MTU. If TSO was active we either got an
10020 * interface without TSO capabilities or TSO was
10021 * turned off. If we obtained mtu from ip_output()
10022 * then update it and try again.
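 * For example (illustrative): if the route moved to an interface
 * whose MTU dropped from 9000 to 1500, tcp_mss_update() shrinks
 * t_maxseg accordingly and the "goto again" below rebuilds this
 * send with the smaller segment size.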
10023 */
10024 if (tso)
10025 tp->t_flags &= ~TF_TSO;
10026 if (mtu != 0) {
10027 tcp_mss_update(tp, -1, mtu, NULL, NULL);
10028 goto again;
10029 }
10030 slot = 10;
10031 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10032 tp->t_flags &= ~TF_FORCEDATA;
10033 return (error);
10034 case ENETUNREACH:
10035 counter_u64_add(rack_saw_enetunreach, 1);
10036 case EHOSTDOWN:
10037 case EHOSTUNREACH:
10038 case ENETDOWN:
10039 if (TCPS_HAVERCVDSYN(tp->t_state)) {
10040 tp->t_softerror = error;
10041 }
10042 /* FALLTHROUGH */
10043 default:
10044 slot = 10;
10045 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0);
10046 tp->t_flags &= ~TF_FORCEDATA;
10047 return (error);
10048 }
10049 } else {
10050 rack->rc_enobuf = 0;
10051 }
10052 KMOD_TCPSTAT_INC(tcps_sndtotal);
10053
10054 /*
10055 * Data sent (as far as we can tell). If this advertises a larger
10056 * window than any other segment, then remember the size of the
10057 * advertised window. Any pending ACK has now been sent.
10058 */
10059 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
10060 tp->rcv_adv = tp->rcv_nxt + recwin;
10061 tp->last_ack_sent = tp->rcv_nxt;
10062 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
10063 enobufs:
10064 rack->r_tlp_running = 0;
10065 if (flags & TH_RST) {
10066 /*
10067 * We don't send again after sending a RST.
10068 */
10069 slot = 0;
10070 sendalot = 0;
10071 }
10072 if (rsm && (slot == 0)) {
10073 /*
10074 * This is possibly a dup-ack retransmission, so
10075 * let's assure we wait at least the minimum rack
10076 * time; if it is a rack resend then the rack
10077 * timeout will also be set to this value.
10078 */
10079 slot = rack->r_ctl.rc_min_to;
10080 }
10081 if (slot) {
10082 /* set the rack tcb into the slot N */
10083 counter_u64_add(rack_paced_segments, 1);
10084 } else if (sendalot) {
10085 if (len)
10086 counter_u64_add(rack_unpaced_segments, 1);
10087 sack_rxmit = 0;
10088 tp->t_flags &= ~TF_FORCEDATA;
10089 goto again;
10090 } else if (len) {
10091 counter_u64_add(rack_unpaced_segments, 1);
10092 }
10093 tp->t_flags &= ~TF_FORCEDATA;
10094 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0);
10095 return (error);
10096 }
10097
10098 /*
10099 * rack_ctloutput() must drop the inpcb lock before performing copyin on
10100 * socket option arguments. When it re-acquires the lock after the copy, it
10101 * has to revalidate that the connection is still valid for the socket
10102 * option.
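 * Concretely, rack_set_sockopt() below does INP_WUNLOCK(),
 * sooptcopyin(), INP_WLOCK() and then re-checks INP_TIMEWAIT /
 * INP_DROPPED (returning ECONNRESET) before it touches the tcpcb.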
10103 */ 10104 static int 10105 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 10106 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 10107 { 10108 struct epoch_tracker et; 10109 int32_t error = 0, optval; 10110 10111 switch (sopt->sopt_name) { 10112 case TCP_RACK_PROP_RATE: 10113 case TCP_RACK_PROP: 10114 case TCP_RACK_TLP_REDUCE: 10115 case TCP_RACK_EARLY_RECOV: 10116 case TCP_RACK_PACE_ALWAYS: 10117 case TCP_DELACK: 10118 case TCP_RACK_PACE_REDUCE: 10119 case TCP_RACK_PACE_MAX_SEG: 10120 case TCP_RACK_PRR_SENDALOT: 10121 case TCP_RACK_MIN_TO: 10122 case TCP_RACK_EARLY_SEG: 10123 case TCP_RACK_REORD_THRESH: 10124 case TCP_RACK_REORD_FADE: 10125 case TCP_RACK_TLP_THRESH: 10126 case TCP_RACK_PKT_DELAY: 10127 case TCP_RACK_TLP_USE: 10128 case TCP_RACK_TLP_INC_VAR: 10129 case TCP_RACK_IDLE_REDUCE_HIGH: 10130 case TCP_RACK_MIN_PACE: 10131 case TCP_RACK_GP_INCREASE: 10132 case TCP_BBR_RACK_RTT_USE: 10133 case TCP_BBR_USE_RACK_CHEAT: 10134 case TCP_RACK_DO_DETECTION: 10135 case TCP_DATA_AFTER_CLOSE: 10136 break; 10137 default: 10138 return (tcp_default_ctloutput(so, sopt, inp, tp)); 10139 break; 10140 } 10141 INP_WUNLOCK(inp); 10142 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 10143 if (error) 10144 return (error); 10145 INP_WLOCK(inp); 10146 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 10147 INP_WUNLOCK(inp); 10148 return (ECONNRESET); 10149 } 10150 tp = intotcpcb(inp); 10151 rack = (struct tcp_rack *)tp->t_fb_ptr; 10152 switch (sopt->sopt_name) { 10153 case TCP_RACK_DO_DETECTION: 10154 RACK_OPTS_INC(tcp_rack_do_detection); 10155 if (optval == 0) 10156 rack->do_detection = 0; 10157 else 10158 rack->do_detection = 1; 10159 break; 10160 case TCP_RACK_PROP_RATE: 10161 if ((optval <= 0) || (optval >= 100)) { 10162 error = EINVAL; 10163 break; 10164 } 10165 RACK_OPTS_INC(tcp_rack_prop_rate); 10166 rack->r_ctl.rc_prop_rate = optval; 10167 break; 10168 case TCP_RACK_TLP_USE: 10169 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 10170 error = EINVAL; 10171 break; 10172 } 10173 RACK_OPTS_INC(tcp_tlp_use); 10174 rack->rack_tlp_threshold_use = optval; 10175 break; 10176 case TCP_RACK_PROP: 10177 /* RACK proportional rate reduction (bool) */ 10178 RACK_OPTS_INC(tcp_rack_prop); 10179 rack->r_ctl.rc_prop_reduce = optval; 10180 break; 10181 case TCP_RACK_TLP_REDUCE: 10182 /* RACK TLP cwnd reduction (bool) */ 10183 RACK_OPTS_INC(tcp_rack_tlp_reduce); 10184 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 10185 break; 10186 case TCP_RACK_EARLY_RECOV: 10187 /* Should recovery happen early (bool) */ 10188 RACK_OPTS_INC(tcp_rack_early_recov); 10189 rack->r_ctl.rc_early_recovery = optval; 10190 break; 10191 case TCP_RACK_PACE_ALWAYS: 10192 /* Use the always pace method (bool) */ 10193 RACK_OPTS_INC(tcp_rack_pace_always); 10194 if (optval > 0) 10195 rack->rc_always_pace = 1; 10196 else 10197 rack->rc_always_pace = 0; 10198 break; 10199 case TCP_RACK_PACE_REDUCE: 10200 /* RACK Hptsi reduction factor (divisor) */ 10201 RACK_OPTS_INC(tcp_rack_pace_reduce); 10202 if (optval) 10203 /* Must be non-zero */ 10204 rack->rc_pace_reduce = optval; 10205 else 10206 error = EINVAL; 10207 break; 10208 case TCP_RACK_PACE_MAX_SEG: 10209 /* Max segments in a pace */ 10210 RACK_OPTS_INC(tcp_rack_max_seg); 10211 rack->rc_pace_max_segs = optval; 10212 rack_set_pace_segments(tp, rack); 10213 break; 10214 case TCP_RACK_PRR_SENDALOT: 10215 /* Allow PRR to send more than one seg */ 10216 RACK_OPTS_INC(tcp_rack_prr_sendalot); 10217 rack->r_ctl.rc_prr_sendalot = optval; 10218 break; 
10219 case TCP_RACK_MIN_TO:
10220 /* Minimum time between rack t-o's in ms */
10221 RACK_OPTS_INC(tcp_rack_min_to);
10222 rack->r_ctl.rc_min_to = optval;
10223 break;
10224 case TCP_RACK_EARLY_SEG:
10225 /* If early recovery max segments */
10226 RACK_OPTS_INC(tcp_rack_early_seg);
10227 rack->r_ctl.rc_early_recovery_segs = optval;
10228 break;
10229 case TCP_RACK_REORD_THRESH:
10230 /* RACK reorder threshold (shift amount) */
10231 RACK_OPTS_INC(tcp_rack_reord_thresh);
10232 if ((optval > 0) && (optval < 31))
10233 rack->r_ctl.rc_reorder_shift = optval;
10234 else
10235 error = EINVAL;
10236 break;
10237 case TCP_RACK_REORD_FADE:
10238 /* Does reordering fade after ms time */
10239 RACK_OPTS_INC(tcp_rack_reord_fade);
10240 rack->r_ctl.rc_reorder_fade = optval;
10241 break;
10242 case TCP_RACK_TLP_THRESH:
10243 /* RACK TLP threshold i.e. srtt+(srtt/N) */
10244 RACK_OPTS_INC(tcp_rack_tlp_thresh);
10245 if (optval)
10246 rack->r_ctl.rc_tlp_threshold = optval;
10247 else
10248 error = EINVAL;
10249 break;
10250 case TCP_BBR_USE_RACK_CHEAT:
10251 RACK_OPTS_INC(tcp_rack_cheat);
10252 if (optval)
10253 rack->use_rack_cheat = 1;
10254 else
10255 rack->use_rack_cheat = 0;
10256 break;
10257 case TCP_RACK_PKT_DELAY:
10258 /* RACK added ms i.e. rack-rtt + reord + N */
10259 RACK_OPTS_INC(tcp_rack_pkt_delay);
10260 rack->r_ctl.rc_pkt_delay = optval;
10261 break;
10262 case TCP_RACK_TLP_INC_VAR:
10263 /* Does TLP include rtt variance in t-o */
10264 error = EINVAL;
10265 break;
10266 case TCP_RACK_IDLE_REDUCE_HIGH:
10267 error = EINVAL;
10268 break;
10269 case TCP_DELACK:
10270 if (optval == 0)
10271 tp->t_delayed_ack = 0;
10272 else
10273 tp->t_delayed_ack = 1;
10274 if (tp->t_flags & TF_DELACK) {
10275 tp->t_flags &= ~TF_DELACK;
10276 tp->t_flags |= TF_ACKNOW;
10277 NET_EPOCH_ENTER(et);
10278 rack_output(tp);
10279 NET_EPOCH_EXIT(et);
10280 }
10281 break;
10282 case TCP_RACK_MIN_PACE:
10283 RACK_OPTS_INC(tcp_rack_min_pace);
10284 if (optval > 3)
10285 rack->r_enforce_min_pace = 3;
10286 else
10287 rack->r_enforce_min_pace = optval;
10288 break;
10289 case TCP_RACK_GP_INCREASE:
10290 if ((optval >= 0) &&
10291 (optval <= 256))
10292 rack->rack_per_of_gp = optval;
10293 else
10294 error = EINVAL;
10295
10296 break;
10297 case TCP_BBR_RACK_RTT_USE:
10298 if ((optval != USE_RTT_HIGH) &&
10299 (optval != USE_RTT_LOW) &&
10300 (optval != USE_RTT_AVG))
10301 error = EINVAL;
10302 else
10303 rack->r_ctl.rc_rate_sample_method = optval;
10304 break;
10305 case TCP_DATA_AFTER_CLOSE:
10306 if (optval)
10307 rack->rc_allow_data_af_clo = 1;
10308 else
10309 rack->rc_allow_data_af_clo = 0;
10310 break;
10311 default:
10312 return (tcp_default_ctloutput(so, sopt, inp, tp));
10313 break;
10314 }
10315 #ifdef NETFLIX_STATS
10316 tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
10317 #endif
10318 INP_WUNLOCK(inp);
10319 return (error);
10320 }
10321
10322 static int
10323 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
10324 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
10325 {
10326 int32_t error, optval;
10327
10328 /*
10329 * Because all our options are either boolean or an int, we can just
10330 * pull everything into optval and then unlock and copy. If we ever
10331 * add an option that is not an int, this will have quite an
10332 * impact on this routine.
10333 */ 10334 error = 0; 10335 switch (sopt->sopt_name) { 10336 case TCP_RACK_DO_DETECTION: 10337 optval = rack->do_detection; 10338 break; 10339 10340 case TCP_RACK_PROP_RATE: 10341 optval = rack->r_ctl.rc_prop_rate; 10342 break; 10343 case TCP_RACK_PROP: 10344 /* RACK proportional rate reduction (bool) */ 10345 optval = rack->r_ctl.rc_prop_reduce; 10346 break; 10347 case TCP_RACK_TLP_REDUCE: 10348 /* RACK TLP cwnd reduction (bool) */ 10349 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 10350 break; 10351 case TCP_RACK_EARLY_RECOV: 10352 /* Should recovery happen early (bool) */ 10353 optval = rack->r_ctl.rc_early_recovery; 10354 break; 10355 case TCP_RACK_PACE_REDUCE: 10356 /* RACK Hptsi reduction factor (divisor) */ 10357 optval = rack->rc_pace_reduce; 10358 break; 10359 case TCP_RACK_PACE_MAX_SEG: 10360 /* Max segments in a pace */ 10361 optval = rack->rc_pace_max_segs; 10362 break; 10363 case TCP_RACK_PACE_ALWAYS: 10364 /* Use the always pace method */ 10365 optval = rack->rc_always_pace; 10366 break; 10367 case TCP_RACK_PRR_SENDALOT: 10368 /* Allow PRR to send more than one seg */ 10369 optval = rack->r_ctl.rc_prr_sendalot; 10370 break; 10371 case TCP_RACK_MIN_TO: 10372 /* Minimum time between rack t-o's in ms */ 10373 optval = rack->r_ctl.rc_min_to; 10374 break; 10375 case TCP_RACK_EARLY_SEG: 10376 /* If early recovery max segments */ 10377 optval = rack->r_ctl.rc_early_recovery_segs; 10378 break; 10379 case TCP_RACK_REORD_THRESH: 10380 /* RACK reorder threshold (shift amount) */ 10381 optval = rack->r_ctl.rc_reorder_shift; 10382 break; 10383 case TCP_RACK_REORD_FADE: 10384 /* Does reordering fade after ms time */ 10385 optval = rack->r_ctl.rc_reorder_fade; 10386 break; 10387 case TCP_BBR_USE_RACK_CHEAT: 10388 /* Do we use the rack cheat for rxt */ 10389 optval = rack->use_rack_cheat; 10390 break; 10391 case TCP_RACK_TLP_THRESH: 10392 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 10393 optval = rack->r_ctl.rc_tlp_threshold; 10394 break; 10395 case TCP_RACK_PKT_DELAY: 10396 /* RACK added ms i.e. rack-rtt + reord + N */ 10397 optval = rack->r_ctl.rc_pkt_delay; 10398 break; 10399 case TCP_RACK_TLP_USE: 10400 optval = rack->rack_tlp_threshold_use; 10401 break; 10402 case TCP_RACK_TLP_INC_VAR: 10403 /* Does TLP include rtt variance in t-o */ 10404 error = EINVAL; 10405 break; 10406 case TCP_RACK_IDLE_REDUCE_HIGH: 10407 error = EINVAL; 10408 break; 10409 case TCP_RACK_MIN_PACE: 10410 optval = rack->r_enforce_min_pace; 10411 break; 10412 case TCP_RACK_GP_INCREASE: 10413 optval = rack->rack_per_of_gp; 10414 break; 10415 case TCP_BBR_RACK_RTT_USE: 10416 optval = rack->r_ctl.rc_rate_sample_method; 10417 break; 10418 case TCP_DELACK: 10419 optval = tp->t_delayed_ack; 10420 break; 10421 case TCP_DATA_AFTER_CLOSE: 10422 optval = rack->rc_allow_data_af_clo; 10423 break; 10424 default: 10425 return (tcp_default_ctloutput(so, sopt, inp, tp)); 10426 break; 10427 } 10428 INP_WUNLOCK(inp); 10429 if (error == 0) { 10430 error = sooptcopyout(sopt, &optval, sizeof optval); 10431 } 10432 return (error); 10433 } 10434 10435 static int 10436 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 10437 { 10438 int32_t error = EINVAL; 10439 struct tcp_rack *rack; 10440 10441 rack = (struct tcp_rack *)tp->t_fb_ptr; 10442 if (rack == NULL) { 10443 /* Huh? 
*/ 10444 goto out; 10445 } 10446 if (sopt->sopt_dir == SOPT_SET) { 10447 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 10448 } else if (sopt->sopt_dir == SOPT_GET) { 10449 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 10450 } 10451 out: 10452 INP_WUNLOCK(inp); 10453 return (error); 10454 } 10455 10456 10457 static struct tcp_function_block __tcp_rack = { 10458 .tfb_tcp_block_name = __XSTRING(STACKNAME), 10459 .tfb_tcp_output = rack_output, 10460 .tfb_do_queued_segments = ctf_do_queued_segments, 10461 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 10462 .tfb_tcp_do_segment = rack_do_segment, 10463 .tfb_tcp_ctloutput = rack_ctloutput, 10464 .tfb_tcp_fb_init = rack_init, 10465 .tfb_tcp_fb_fini = rack_fini, 10466 .tfb_tcp_timer_stop_all = rack_stopall, 10467 .tfb_tcp_timer_activate = rack_timer_activate, 10468 .tfb_tcp_timer_active = rack_timer_active, 10469 .tfb_tcp_timer_stop = rack_timer_stop, 10470 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 10471 .tfb_tcp_handoff_ok = rack_handoff_ok 10472 }; 10473 10474 static const char *rack_stack_names[] = { 10475 __XSTRING(STACKNAME), 10476 #ifdef STACKALIAS 10477 __XSTRING(STACKALIAS), 10478 #endif 10479 }; 10480 10481 static int 10482 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 10483 { 10484 memset(mem, 0, size); 10485 return (0); 10486 } 10487 10488 static void 10489 rack_dtor(void *mem, int32_t size, void *arg) 10490 { 10491 10492 } 10493 10494 static bool rack_mod_inited = false; 10495 10496 static int 10497 tcp_addrack(module_t mod, int32_t type, void *data) 10498 { 10499 int32_t err = 0; 10500 int num_stacks; 10501 10502 switch (type) { 10503 case MOD_LOAD: 10504 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 10505 sizeof(struct rack_sendmap), 10506 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 10507 10508 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 10509 sizeof(struct tcp_rack), 10510 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 10511 10512 sysctl_ctx_init(&rack_sysctl_ctx); 10513 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 10514 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 10515 OID_AUTO, 10516 #ifdef STACKALIAS 10517 __XSTRING(STACKALIAS), 10518 #else 10519 __XSTRING(STACKNAME), 10520 #endif 10521 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 10522 ""); 10523 if (rack_sysctl_root == NULL) { 10524 printf("Failed to add sysctl node\n"); 10525 err = EFAULT; 10526 goto free_uma; 10527 } 10528 rack_init_sysctls(); 10529 num_stacks = nitems(rack_stack_names); 10530 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 10531 rack_stack_names, &num_stacks); 10532 if (err) { 10533 printf("Failed to register %s stack name for " 10534 "%s module\n", rack_stack_names[num_stacks], 10535 __XSTRING(MODNAME)); 10536 sysctl_ctx_free(&rack_sysctl_ctx); 10537 free_uma: 10538 uma_zdestroy(rack_zone); 10539 uma_zdestroy(rack_pcb_zone); 10540 rack_counter_destroy(); 10541 printf("Failed to register rack module -- err:%d\n", err); 10542 return (err); 10543 } 10544 tcp_lro_reg_mbufq(); 10545 rack_mod_inited = true; 10546 break; 10547 case MOD_QUIESCE: 10548 err = deregister_tcp_functions(&__tcp_rack, true, false); 10549 break; 10550 case MOD_UNLOAD: 10551 err = deregister_tcp_functions(&__tcp_rack, false, true); 10552 if (err == EBUSY) 10553 break; 10554 if (rack_mod_inited) { 10555 uma_zdestroy(rack_zone); 10556 uma_zdestroy(rack_pcb_zone); 10557 sysctl_ctx_free(&rack_sysctl_ctx); 10558 rack_counter_destroy(); 10559 rack_mod_inited = false; 10560 } 10561 tcp_lro_dereg_mbufq(); 10562 err = 0; 10563 break; 10564 
default: 10565 return (EOPNOTSUPP); 10566 } 10567 return (err); 10568 } 10569 10570 static moduledata_t tcp_rack = { 10571 .name = __XSTRING(MODNAME), 10572 .evhand = tcp_addrack, 10573 .priv = 0 10574 }; 10575 10576 MODULE_VERSION(MODNAME, 1); 10577 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 10578 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 10579
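/*
 * Illustrative userland usage (a minimal sketch, not part of this
 * module): assuming STACKNAME expands to "rack" and that the kernel
 * headers expose TCP_FUNCTION_BLK together with the TCP_RACK_*
 * options handled in rack_set_sockopt() above, an application could
 * switch a socket onto this stack and tune it roughly as follows:
 *
 *	struct tcp_function_set tfs;
 *	int one = 1;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack",
 *	    sizeof(tfs.function_set_name));
 *	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs,
 *	    sizeof(tfs)) == -1)
 *		err(1, "TCP_FUNCTION_BLK");
 *	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &one,
 *	    sizeof(one)) == -1)
 *		err(1, "TCP_RACK_PACE_ALWAYS");
 */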