1 /*- 2 * Copyright (c) 2016-2018 3 * Netflix Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 #include "opt_ipsec.h" 34 #include "opt_tcpdebug.h" 35 36 #include <sys/param.h> 37 #include <sys/module.h> 38 #include <sys/kernel.h> 39 #ifdef TCP_HHOOK 40 #include <sys/hhook.h> 41 #endif 42 #include <sys/lock.h> 43 #include <sys/malloc.h> 44 #include <sys/lock.h> 45 #include <sys/mutex.h> 46 #include <sys/mbuf.h> 47 #include <sys/proc.h> /* for proc0 declaration */ 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sysctl.h> 51 #include <sys/systm.h> 52 #ifdef NETFLIX_STATS 53 #include <sys/stats.h> 54 #endif 55 #include <sys/refcount.h> 56 #include <sys/queue.h> 57 #include <sys/smp.h> 58 #include <sys/kthread.h> 59 #include <sys/kern_prefetch.h> 60 61 #include <vm/uma.h> 62 63 #include <net/route.h> 64 #include <net/vnet.h> 65 66 #define TCPSTATES /* for logging */ 67 68 #include <netinet/in.h> 69 #include <netinet/in_kdtrace.h> 70 #include <netinet/in_pcb.h> 71 #include <netinet/ip.h> 72 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 73 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 74 #include <netinet/ip_var.h> 75 #include <netinet/ip6.h> 76 #include <netinet6/in6_pcb.h> 77 #include <netinet6/ip6_var.h> 78 #define TCPOUTFLAGS 79 #include <netinet/tcp.h> 80 #include <netinet/tcp_fsm.h> 81 #include <netinet/tcp_log_buf.h> 82 #include <netinet/tcp_seq.h> 83 #include <netinet/tcp_timer.h> 84 #include <netinet/tcp_var.h> 85 #include <netinet/tcp_hpts.h> 86 #include <netinet/tcpip.h> 87 #include <netinet/cc/cc.h> 88 #ifdef NETFLIX_CWV 89 #include <netinet/tcp_newcwv.h> 90 #endif 91 #include <netinet/tcp_fastopen.h> 92 #ifdef TCPDEBUG 93 #include <netinet/tcp_debug.h> 94 #endif /* TCPDEBUG */ 95 #ifdef TCP_OFFLOAD 96 #include <netinet/tcp_offload.h> 97 #endif 98 #ifdef INET6 99 #include <netinet6/tcp6_var.h> 100 #endif 101 102 #include <netipsec/ipsec_support.h> 103 104 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 105 #include <netipsec/ipsec.h> 106 #include <netipsec/ipsec6.h> 107 #endif /* IPSEC */ 108 109 #include <netinet/udp.h> 110 #include <netinet/udp_var.h> 111 #include <machine/in_cksum.h> 112 113 #ifdef MAC 114 #include <security/mac/mac_framework.h> 115 #endif 116 #include "sack_filter.h" 117 #include "tcp_rack.h" 118 #include "rack_bbr_common.h" 119 120 uma_zone_t rack_zone; 121 uma_zone_t rack_pcb_zone; 122 123 #ifndef TICKS2SBT 124 #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) 125 #endif 126 127 struct sysctl_ctx_list rack_sysctl_ctx; 128 struct sysctl_oid *rack_sysctl_root; 129 130 #define CUM_ACKED 1 131 #define SACKED 2 132 133 /* 134 * The RACK module incorporates a number of 135 * TCP ideas that have been put out into the IETF 136 * over the last few years: 137 * - Matt Mathis's Rate Halving which slowly drops 138 * the congestion window so that the ack clock can 139 * be maintained during a recovery. 140 * - Yuchung Cheng's RACK TCP (for which its named) that 141 * will stop us using the number of dup acks and instead 142 * use time as the gage of when we retransmit. 143 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft 144 * of Dukkipati et.al. 145 * RACK depends on SACK, so if an endpoint arrives that 146 * cannot do SACK the state machine below will shuttle the 147 * connection back to using the "default" TCP stack that is 148 * in FreeBSD. 149 * 150 * To implement RACK the original TCP stack was first decomposed 151 * into a functional state machine with individual states 152 * for each of the possible TCP connection states. The do_segement 153 * functions role in life is to mandate the connection supports SACK 154 * initially and then assure that the RACK state matches the conenction 155 * state before calling the states do_segment function. Each 156 * state is simplified due to the fact that the original do_segment 157 * has been decomposed and we *know* what state we are in (no 158 * switches on the state) and all tests for SACK are gone. This 159 * greatly simplifies what each state does. 160 * 161 * TCP output is also over-written with a new version since it 162 * must maintain the new rack scoreboard. 163 * 164 */ 165 static int32_t rack_precache = 1; 166 static int32_t rack_tlp_thresh = 1; 167 static int32_t rack_reorder_thresh = 2; 168 static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 169 * - 60 seconds */ 170 static int32_t rack_pkt_delay = 1; 171 static int32_t rack_inc_var = 0;/* For TLP */ 172 static int32_t rack_reduce_largest_on_idle = 0; 173 static int32_t rack_min_pace_time = 0; 174 static int32_t rack_min_pace_time_seg_req=6; 175 static int32_t rack_early_recovery = 1; 176 static int32_t rack_early_recovery_max_seg = 6; 177 static int32_t rack_send_a_lot_in_prr = 1; 178 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ 179 static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ 180 static int32_t rack_verbose_logging = 0; 181 static int32_t rack_ignore_data_after_close = 1; 182 /* 183 * Currently regular tcp has a rto_min of 30ms 184 * the backoff goes 12 times so that ends up 185 * being a total of 122.850 seconds before a 186 * connection is killed. 187 */ 188 static int32_t rack_tlp_min = 10; 189 static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ 190 static int32_t rack_rto_max = 30000; /* 30 seconds */ 191 static const int32_t rack_free_cache = 2; 192 static int32_t rack_hptsi_segments = 40; 193 static int32_t rack_rate_sample_method = USE_RTT_LOW; 194 static int32_t rack_pace_every_seg = 1; 195 static int32_t rack_delayed_ack_time = 200; /* 200ms */ 196 static int32_t rack_slot_reduction = 4; 197 static int32_t rack_lower_cwnd_at_tlp = 0; 198 static int32_t rack_use_proportional_reduce = 0; 199 static int32_t rack_proportional_rate = 10; 200 static int32_t rack_tlp_max_resend = 2; 201 static int32_t rack_limited_retran = 0; 202 static int32_t rack_always_send_oldest = 0; 203 static int32_t rack_sack_block_limit = 128; 204 static int32_t rack_use_sack_filter = 1; 205 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; 206 207 /* Rack specific counters */ 208 counter_u64_t rack_badfr; 209 counter_u64_t rack_badfr_bytes; 210 counter_u64_t rack_rtm_prr_retran; 211 counter_u64_t rack_rtm_prr_newdata; 212 counter_u64_t rack_timestamp_mismatch; 213 counter_u64_t rack_reorder_seen; 214 counter_u64_t rack_paced_segments; 215 counter_u64_t rack_unpaced_segments; 216 counter_u64_t rack_saw_enobuf; 217 counter_u64_t rack_saw_enetunreach; 218 219 /* Tail loss probe counters */ 220 counter_u64_t rack_tlp_tot; 221 counter_u64_t rack_tlp_newdata; 222 counter_u64_t rack_tlp_retran; 223 counter_u64_t rack_tlp_retran_bytes; 224 counter_u64_t rack_tlp_retran_fail; 225 counter_u64_t rack_to_tot; 226 counter_u64_t rack_to_arm_rack; 227 counter_u64_t rack_to_arm_tlp; 228 counter_u64_t rack_to_alloc; 229 counter_u64_t rack_to_alloc_hard; 230 counter_u64_t rack_to_alloc_emerg; 231 232 counter_u64_t rack_sack_proc_all; 233 counter_u64_t rack_sack_proc_short; 234 counter_u64_t rack_sack_proc_restart; 235 counter_u64_t rack_runt_sacks; 236 counter_u64_t rack_used_tlpmethod; 237 counter_u64_t rack_used_tlpmethod2; 238 counter_u64_t rack_enter_tlp_calc; 239 counter_u64_t rack_input_idle_reduces; 240 counter_u64_t rack_tlp_does_nada; 241 242 /* Temp CPU counters */ 243 counter_u64_t rack_find_high; 244 245 counter_u64_t rack_progress_drops; 246 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 247 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 248 249 static void 250 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 251 252 static int 253 rack_process_ack(struct mbuf *m, struct tcphdr *th, 254 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked, 255 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 256 static int 257 rack_process_data(struct mbuf *m, struct tcphdr *th, 258 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 259 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 260 static void 261 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 262 struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); 263 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 264 static struct rack_sendmap * 265 rack_check_recovery_mode(struct tcpcb *tp, 266 uint32_t tsused); 267 static void 268 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, 269 uint32_t type); 270 static void rack_counter_destroy(void); 271 static int 272 rack_ctloutput(struct socket *so, struct sockopt *sopt, 273 struct inpcb *inp, struct tcpcb *tp); 274 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 275 static void 276 rack_do_segment(struct mbuf *m, struct tcphdr *th, 277 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 278 uint8_t iptos, int32_t ti_locked); 279 static void rack_dtor(void *mem, int32_t size, void *arg); 280 static void 281 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 282 uint32_t t, uint32_t cts); 283 static struct rack_sendmap * 284 rack_find_high_nonack(struct tcp_rack *rack, 285 struct rack_sendmap *rsm); 286 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 287 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 288 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 289 static int 290 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 291 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 292 static int32_t rack_handoff_ok(struct tcpcb *tp); 293 static int32_t rack_init(struct tcpcb *tp); 294 static void rack_init_sysctls(void); 295 static void 296 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 297 struct tcphdr *th); 298 static void 299 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 300 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 301 uint8_t pass, struct rack_sendmap *hintrsm); 302 static void 303 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 304 struct rack_sendmap *rsm); 305 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); 306 static int32_t rack_output(struct tcpcb *tp); 307 static void 308 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, 309 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 310 uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv); 311 312 static uint32_t 313 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 314 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 315 uint32_t cts); 316 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); 317 static void rack_remxt_tmr(struct tcpcb *tp); 318 static int 319 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 320 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 321 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 322 static int32_t rack_stopall(struct tcpcb *tp); 323 static void 324 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, 325 uint32_t delta); 326 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); 327 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 328 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); 329 static uint32_t 330 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 331 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); 332 static void 333 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 334 struct rack_sendmap *rsm, uint32_t ts); 335 static int 336 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 337 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); 338 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 339 static void 340 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, 341 struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val); 342 static int 343 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 344 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 345 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 346 static int 347 rack_do_closing(struct mbuf *m, struct tcphdr *th, 348 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 349 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 350 static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked); 351 static void 352 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, 353 struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val); 354 static void 355 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, 356 struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen); 357 static int 358 rack_do_established(struct mbuf *m, struct tcphdr *th, 359 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 360 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 361 static int 362 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 363 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 364 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt); 365 static int 366 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 367 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 368 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 369 static int 370 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 371 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 372 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 373 static int 374 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 375 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 376 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 377 static int 378 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 379 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 380 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 381 static int 382 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 383 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 384 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 385 static int 386 rack_drop_checks(struct tcpopt *to, struct mbuf *m, 387 struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, 388 int32_t * drop_hdrlen, int32_t * ret_val); 389 static int 390 rack_process_rst(struct mbuf *m, struct tcphdr *th, 391 struct socket *so, struct tcpcb *tp, int32_t * ti_locked); 392 struct rack_sendmap * 393 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 394 uint32_t tsused); 395 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); 396 static void 397 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); 398 399 static int 400 rack_ts_check(struct mbuf *m, struct tcphdr *th, 401 struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val); 402 403 int32_t rack_clear_counter=0; 404 405 406 static int 407 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 408 { 409 uint32_t stat; 410 int32_t error; 411 412 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 413 if (error || req->newptr == NULL) 414 return error; 415 416 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 417 if (error) 418 return (error); 419 if (stat == 1) { 420 #ifdef INVARIANTS 421 printf("Clearing RACK counters\n"); 422 #endif 423 counter_u64_zero(rack_badfr); 424 counter_u64_zero(rack_badfr_bytes); 425 counter_u64_zero(rack_rtm_prr_retran); 426 counter_u64_zero(rack_rtm_prr_newdata); 427 counter_u64_zero(rack_timestamp_mismatch); 428 counter_u64_zero(rack_reorder_seen); 429 counter_u64_zero(rack_tlp_tot); 430 counter_u64_zero(rack_tlp_newdata); 431 counter_u64_zero(rack_tlp_retran); 432 counter_u64_zero(rack_tlp_retran_bytes); 433 counter_u64_zero(rack_tlp_retran_fail); 434 counter_u64_zero(rack_to_tot); 435 counter_u64_zero(rack_to_arm_rack); 436 counter_u64_zero(rack_to_arm_tlp); 437 counter_u64_zero(rack_paced_segments); 438 counter_u64_zero(rack_unpaced_segments); 439 counter_u64_zero(rack_saw_enobuf); 440 counter_u64_zero(rack_saw_enetunreach); 441 counter_u64_zero(rack_to_alloc_hard); 442 counter_u64_zero(rack_to_alloc_emerg); 443 counter_u64_zero(rack_sack_proc_all); 444 counter_u64_zero(rack_sack_proc_short); 445 counter_u64_zero(rack_sack_proc_restart); 446 counter_u64_zero(rack_to_alloc); 447 counter_u64_zero(rack_find_high); 448 counter_u64_zero(rack_runt_sacks); 449 counter_u64_zero(rack_used_tlpmethod); 450 counter_u64_zero(rack_used_tlpmethod2); 451 counter_u64_zero(rack_enter_tlp_calc); 452 counter_u64_zero(rack_progress_drops); 453 counter_u64_zero(rack_tlp_does_nada); 454 } 455 rack_clear_counter = 0; 456 return (0); 457 } 458 459 460 461 static void 462 rack_init_sysctls() 463 { 464 SYSCTL_ADD_S32(&rack_sysctl_ctx, 465 SYSCTL_CHILDREN(rack_sysctl_root), 466 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 467 &rack_rate_sample_method , USE_RTT_LOW, 468 "What method should we use for rate sampling 0=high, 1=low "); 469 SYSCTL_ADD_S32(&rack_sysctl_ctx, 470 SYSCTL_CHILDREN(rack_sysctl_root), 471 OID_AUTO, "data_after_close", CTLFLAG_RW, 472 &rack_ignore_data_after_close, 0, 473 "Do we hold off sending a RST until all pending data is ack'd"); 474 SYSCTL_ADD_S32(&rack_sysctl_ctx, 475 SYSCTL_CHILDREN(rack_sysctl_root), 476 OID_AUTO, "tlpmethod", CTLFLAG_RW, 477 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 478 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 479 SYSCTL_ADD_S32(&rack_sysctl_ctx, 480 SYSCTL_CHILDREN(rack_sysctl_root), 481 OID_AUTO, "min_pace_time", CTLFLAG_RW, 482 &rack_min_pace_time, 0, 483 "Should we enforce a minimum pace time of 1ms"); 484 SYSCTL_ADD_S32(&rack_sysctl_ctx, 485 SYSCTL_CHILDREN(rack_sysctl_root), 486 OID_AUTO, "min_pace_segs", CTLFLAG_RW, 487 &rack_min_pace_time_seg_req, 6, 488 "How many segments have to be in the len to enforce min-pace-time"); 489 SYSCTL_ADD_S32(&rack_sysctl_ctx, 490 SYSCTL_CHILDREN(rack_sysctl_root), 491 OID_AUTO, "idle_reduce_high", CTLFLAG_RW, 492 &rack_reduce_largest_on_idle, 0, 493 "Should we reduce the largest cwnd seen to IW on idle reduction"); 494 SYSCTL_ADD_S32(&rack_sysctl_ctx, 495 SYSCTL_CHILDREN(rack_sysctl_root), 496 OID_AUTO, "bb_verbose", CTLFLAG_RW, 497 &rack_verbose_logging, 0, 498 "Should RACK black box logging be verbose"); 499 SYSCTL_ADD_S32(&rack_sysctl_ctx, 500 SYSCTL_CHILDREN(rack_sysctl_root), 501 OID_AUTO, "sackfiltering", CTLFLAG_RW, 502 &rack_use_sack_filter, 1, 503 "Do we use sack filtering?"); 504 SYSCTL_ADD_S32(&rack_sysctl_ctx, 505 SYSCTL_CHILDREN(rack_sysctl_root), 506 OID_AUTO, "delayed_ack", CTLFLAG_RW, 507 &rack_delayed_ack_time, 200, 508 "Delayed ack time (200ms)"); 509 SYSCTL_ADD_S32(&rack_sysctl_ctx, 510 SYSCTL_CHILDREN(rack_sysctl_root), 511 OID_AUTO, "tlpminto", CTLFLAG_RW, 512 &rack_tlp_min, 10, 513 "TLP minimum timeout per the specification (10ms)"); 514 SYSCTL_ADD_S32(&rack_sysctl_ctx, 515 SYSCTL_CHILDREN(rack_sysctl_root), 516 OID_AUTO, "precache", CTLFLAG_RW, 517 &rack_precache, 0, 518 "Where should we precache the mcopy (0 is not at all)"); 519 SYSCTL_ADD_S32(&rack_sysctl_ctx, 520 SYSCTL_CHILDREN(rack_sysctl_root), 521 OID_AUTO, "sblklimit", CTLFLAG_RW, 522 &rack_sack_block_limit, 128, 523 "When do we start paying attention to small sack blocks"); 524 SYSCTL_ADD_S32(&rack_sysctl_ctx, 525 SYSCTL_CHILDREN(rack_sysctl_root), 526 OID_AUTO, "send_oldest", CTLFLAG_RW, 527 &rack_always_send_oldest, 1, 528 "Should we always send the oldest TLP and RACK-TLP"); 529 SYSCTL_ADD_S32(&rack_sysctl_ctx, 530 SYSCTL_CHILDREN(rack_sysctl_root), 531 OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, 532 &rack_tlp_in_recovery, 1, 533 "Can we do a TLP during recovery?"); 534 SYSCTL_ADD_S32(&rack_sysctl_ctx, 535 SYSCTL_CHILDREN(rack_sysctl_root), 536 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 537 &rack_limited_retran, 0, 538 "How many times can a rack timeout drive out sends"); 539 SYSCTL_ADD_S32(&rack_sysctl_ctx, 540 SYSCTL_CHILDREN(rack_sysctl_root), 541 OID_AUTO, "minrto", CTLFLAG_RW, 542 &rack_rto_min, 0, 543 "Minimum RTO in ms -- set with caution below 1000 due to TLP"); 544 SYSCTL_ADD_S32(&rack_sysctl_ctx, 545 SYSCTL_CHILDREN(rack_sysctl_root), 546 OID_AUTO, "maxrto", CTLFLAG_RW, 547 &rack_rto_max, 0, 548 "Maxiumum RTO in ms -- should be at least as large as min_rto"); 549 SYSCTL_ADD_S32(&rack_sysctl_ctx, 550 SYSCTL_CHILDREN(rack_sysctl_root), 551 OID_AUTO, "tlp_retry", CTLFLAG_RW, 552 &rack_tlp_max_resend, 2, 553 "How many times does TLP retry a single segment or multiple with no ACK"); 554 SYSCTL_ADD_S32(&rack_sysctl_ctx, 555 SYSCTL_CHILDREN(rack_sysctl_root), 556 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, 557 &rack_use_proportional_reduce, 0, 558 "Should we proportionaly reduce cwnd based on the number of losses "); 559 SYSCTL_ADD_S32(&rack_sysctl_ctx, 560 SYSCTL_CHILDREN(rack_sysctl_root), 561 OID_AUTO, "recovery_prop", CTLFLAG_RW, 562 &rack_proportional_rate, 10, 563 "What percent reduction per loss"); 564 SYSCTL_ADD_S32(&rack_sysctl_ctx, 565 SYSCTL_CHILDREN(rack_sysctl_root), 566 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 567 &rack_lower_cwnd_at_tlp, 0, 568 "When a TLP completes a retran should we enter recovery?"); 569 SYSCTL_ADD_S32(&rack_sysctl_ctx, 570 SYSCTL_CHILDREN(rack_sysctl_root), 571 OID_AUTO, "hptsi_reduces", CTLFLAG_RW, 572 &rack_slot_reduction, 4, 573 "When setting a slot should we reduce by divisor"); 574 SYSCTL_ADD_S32(&rack_sysctl_ctx, 575 SYSCTL_CHILDREN(rack_sysctl_root), 576 OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, 577 &rack_pace_every_seg, 1, 578 "Should we pace out every segment hptsi"); 579 SYSCTL_ADD_S32(&rack_sysctl_ctx, 580 SYSCTL_CHILDREN(rack_sysctl_root), 581 OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, 582 &rack_hptsi_segments, 6, 583 "Should we pace out only a limited size of segments"); 584 SYSCTL_ADD_S32(&rack_sysctl_ctx, 585 SYSCTL_CHILDREN(rack_sysctl_root), 586 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 587 &rack_send_a_lot_in_prr, 1, 588 "Send a lot in prr"); 589 SYSCTL_ADD_S32(&rack_sysctl_ctx, 590 SYSCTL_CHILDREN(rack_sysctl_root), 591 OID_AUTO, "minto", CTLFLAG_RW, 592 &rack_min_to, 1, 593 "Minimum rack timeout in milliseconds"); 594 SYSCTL_ADD_S32(&rack_sysctl_ctx, 595 SYSCTL_CHILDREN(rack_sysctl_root), 596 OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, 597 &rack_early_recovery_max_seg, 6, 598 "Max segments in early recovery"); 599 SYSCTL_ADD_S32(&rack_sysctl_ctx, 600 SYSCTL_CHILDREN(rack_sysctl_root), 601 OID_AUTO, "earlyrecovery", CTLFLAG_RW, 602 &rack_early_recovery, 1, 603 "Do we do early recovery with rack"); 604 SYSCTL_ADD_S32(&rack_sysctl_ctx, 605 SYSCTL_CHILDREN(rack_sysctl_root), 606 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 607 &rack_reorder_thresh, 2, 608 "What factor for rack will be added when seeing reordering (shift right)"); 609 SYSCTL_ADD_S32(&rack_sysctl_ctx, 610 SYSCTL_CHILDREN(rack_sysctl_root), 611 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 612 &rack_tlp_thresh, 1, 613 "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 614 SYSCTL_ADD_S32(&rack_sysctl_ctx, 615 SYSCTL_CHILDREN(rack_sysctl_root), 616 OID_AUTO, "reorder_fade", CTLFLAG_RW, 617 &rack_reorder_fade, 0, 618 "Does reorder detection fade, if so how many ms (0 means never)"); 619 SYSCTL_ADD_S32(&rack_sysctl_ctx, 620 SYSCTL_CHILDREN(rack_sysctl_root), 621 OID_AUTO, "pktdelay", CTLFLAG_RW, 622 &rack_pkt_delay, 1, 623 "Extra RACK time (in ms) besides reordering thresh"); 624 SYSCTL_ADD_S32(&rack_sysctl_ctx, 625 SYSCTL_CHILDREN(rack_sysctl_root), 626 OID_AUTO, "inc_var", CTLFLAG_RW, 627 &rack_inc_var, 0, 628 "Should rack add to the TLP timer the variance in rtt calculation"); 629 rack_badfr = counter_u64_alloc(M_WAITOK); 630 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 631 SYSCTL_CHILDREN(rack_sysctl_root), 632 OID_AUTO, "badfr", CTLFLAG_RD, 633 &rack_badfr, "Total number of bad FRs"); 634 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 635 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 636 SYSCTL_CHILDREN(rack_sysctl_root), 637 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 638 &rack_badfr_bytes, "Total number of bad FRs"); 639 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 640 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 641 SYSCTL_CHILDREN(rack_sysctl_root), 642 OID_AUTO, "prrsndret", CTLFLAG_RD, 643 &rack_rtm_prr_retran, 644 "Total number of prr based retransmits"); 645 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 646 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 647 SYSCTL_CHILDREN(rack_sysctl_root), 648 OID_AUTO, "prrsndnew", CTLFLAG_RD, 649 &rack_rtm_prr_newdata, 650 "Total number of prr based new transmits"); 651 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 652 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 653 SYSCTL_CHILDREN(rack_sysctl_root), 654 OID_AUTO, "tsnf", CTLFLAG_RD, 655 &rack_timestamp_mismatch, 656 "Total number of timestamps that we could not find the reported ts"); 657 rack_find_high = counter_u64_alloc(M_WAITOK); 658 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 659 SYSCTL_CHILDREN(rack_sysctl_root), 660 OID_AUTO, "findhigh", CTLFLAG_RD, 661 &rack_find_high, 662 "Total number of FIN causing find-high"); 663 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 664 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 665 SYSCTL_CHILDREN(rack_sysctl_root), 666 OID_AUTO, "reordering", CTLFLAG_RD, 667 &rack_reorder_seen, 668 "Total number of times we added delay due to reordering"); 669 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 670 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 671 SYSCTL_CHILDREN(rack_sysctl_root), 672 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 673 &rack_tlp_tot, 674 "Total number of tail loss probe expirations"); 675 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 676 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 677 SYSCTL_CHILDREN(rack_sysctl_root), 678 OID_AUTO, "tlp_new", CTLFLAG_RD, 679 &rack_tlp_newdata, 680 "Total number of tail loss probe sending new data"); 681 682 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 683 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 684 SYSCTL_CHILDREN(rack_sysctl_root), 685 OID_AUTO, "tlp_retran", CTLFLAG_RD, 686 &rack_tlp_retran, 687 "Total number of tail loss probe sending retransmitted data"); 688 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 689 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 690 SYSCTL_CHILDREN(rack_sysctl_root), 691 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 692 &rack_tlp_retran_bytes, 693 "Total bytes of tail loss probe sending retransmitted data"); 694 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 695 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 696 SYSCTL_CHILDREN(rack_sysctl_root), 697 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 698 &rack_tlp_retran_fail, 699 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 700 rack_to_tot = counter_u64_alloc(M_WAITOK); 701 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 702 SYSCTL_CHILDREN(rack_sysctl_root), 703 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 704 &rack_to_tot, 705 "Total number of times the rack to expired?"); 706 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 707 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 708 SYSCTL_CHILDREN(rack_sysctl_root), 709 OID_AUTO, "arm_rack", CTLFLAG_RD, 710 &rack_to_arm_rack, 711 "Total number of times the rack timer armed?"); 712 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 713 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 714 SYSCTL_CHILDREN(rack_sysctl_root), 715 OID_AUTO, "arm_tlp", CTLFLAG_RD, 716 &rack_to_arm_tlp, 717 "Total number of times the tlp timer armed?"); 718 rack_paced_segments = counter_u64_alloc(M_WAITOK); 719 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 720 SYSCTL_CHILDREN(rack_sysctl_root), 721 OID_AUTO, "paced", CTLFLAG_RD, 722 &rack_paced_segments, 723 "Total number of times a segment send caused hptsi"); 724 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 725 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 726 SYSCTL_CHILDREN(rack_sysctl_root), 727 OID_AUTO, "unpaced", CTLFLAG_RD, 728 &rack_unpaced_segments, 729 "Total number of times a segment did not cause hptsi"); 730 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 731 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 732 SYSCTL_CHILDREN(rack_sysctl_root), 733 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 734 &rack_saw_enobuf, 735 "Total number of times a segment did not cause hptsi"); 736 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 737 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 738 SYSCTL_CHILDREN(rack_sysctl_root), 739 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 740 &rack_saw_enetunreach, 741 "Total number of times a segment did not cause hptsi"); 742 rack_to_alloc = counter_u64_alloc(M_WAITOK); 743 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 744 SYSCTL_CHILDREN(rack_sysctl_root), 745 OID_AUTO, "allocs", CTLFLAG_RD, 746 &rack_to_alloc, 747 "Total allocations of tracking structures"); 748 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 749 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 750 SYSCTL_CHILDREN(rack_sysctl_root), 751 OID_AUTO, "allochard", CTLFLAG_RD, 752 &rack_to_alloc_hard, 753 "Total allocations done with sleeping the hard way"); 754 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 755 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 756 SYSCTL_CHILDREN(rack_sysctl_root), 757 OID_AUTO, "allocemerg", CTLFLAG_RD, 758 &rack_to_alloc_emerg, 759 "Total alocations done from emergency cache"); 760 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 761 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 762 SYSCTL_CHILDREN(rack_sysctl_root), 763 OID_AUTO, "sack_long", CTLFLAG_RD, 764 &rack_sack_proc_all, 765 "Total times we had to walk whole list for sack processing"); 766 767 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 768 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 769 SYSCTL_CHILDREN(rack_sysctl_root), 770 OID_AUTO, "sack_restart", CTLFLAG_RD, 771 &rack_sack_proc_restart, 772 "Total times we had to walk whole list due to a restart"); 773 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 774 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 775 SYSCTL_CHILDREN(rack_sysctl_root), 776 OID_AUTO, "sack_short", CTLFLAG_RD, 777 &rack_sack_proc_short, 778 "Total times we took shortcut for sack processing"); 779 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 781 SYSCTL_CHILDREN(rack_sysctl_root), 782 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 783 &rack_enter_tlp_calc, 784 "Total times we called calc-tlp"); 785 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 787 SYSCTL_CHILDREN(rack_sysctl_root), 788 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 789 &rack_used_tlpmethod, 790 "Total number of runt sacks"); 791 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 793 SYSCTL_CHILDREN(rack_sysctl_root), 794 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 795 &rack_used_tlpmethod2, 796 "Total number of runt sacks 2"); 797 rack_runt_sacks = counter_u64_alloc(M_WAITOK); 798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 799 SYSCTL_CHILDREN(rack_sysctl_root), 800 OID_AUTO, "runtsacks", CTLFLAG_RD, 801 &rack_runt_sacks, 802 "Total number of runt sacks"); 803 rack_progress_drops = counter_u64_alloc(M_WAITOK); 804 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 805 SYSCTL_CHILDREN(rack_sysctl_root), 806 OID_AUTO, "prog_drops", CTLFLAG_RD, 807 &rack_progress_drops, 808 "Total number of progress drops"); 809 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 810 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 811 SYSCTL_CHILDREN(rack_sysctl_root), 812 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 813 &rack_input_idle_reduces, 814 "Total number of idle reductions on input"); 815 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 816 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 817 SYSCTL_CHILDREN(rack_sysctl_root), 818 OID_AUTO, "tlp_nada", CTLFLAG_RD, 819 &rack_tlp_does_nada, 820 "Total number of nada tlp calls"); 821 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 822 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 823 OID_AUTO, "outsize", CTLFLAG_RD, 824 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 825 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 826 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 827 OID_AUTO, "opts", CTLFLAG_RD, 828 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 829 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 830 SYSCTL_CHILDREN(rack_sysctl_root), 831 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 832 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 833 } 834 835 static inline int32_t 836 rack_progress_timeout_check(struct tcpcb *tp) 837 { 838 if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { 839 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { 840 /* 841 * There is an assumption that the caller 842 * will drop the connection so we will 843 * increment the counters here. 844 */ 845 struct tcp_rack *rack; 846 rack = (struct tcp_rack *)tp->t_fb_ptr; 847 counter_u64_add(rack_progress_drops, 1); 848 #ifdef NETFLIX_STATS 849 TCPSTAT_INC(tcps_progdrops); 850 #endif 851 rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); 852 return (1); 853 } 854 } 855 return (0); 856 } 857 858 859 static void 860 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 861 { 862 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 863 union tcp_log_stackspecific log; 864 865 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 866 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 867 log.u_bbr.flex2 = to; 868 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 869 log.u_bbr.flex4 = slot; 870 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 871 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 872 log.u_bbr.flex8 = which; 873 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 874 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 875 TCP_LOG_EVENT(rack->rc_tp, NULL, 876 &rack->rc_inp->inp_socket->so_rcv, 877 &rack->rc_inp->inp_socket->so_snd, 878 BBR_LOG_TIMERSTAR, 0, 879 0, &log, false); 880 } 881 } 882 883 static void 884 rack_log_to_event(struct tcp_rack *rack, int32_t to_num) 885 { 886 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 887 union tcp_log_stackspecific log; 888 889 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 890 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 891 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 892 log.u_bbr.flex8 = to_num; 893 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 894 log.u_bbr.flex2 = rack->rc_rack_rtt; 895 TCP_LOG_EVENT(rack->rc_tp, NULL, 896 &rack->rc_inp->inp_socket->so_rcv, 897 &rack->rc_inp->inp_socket->so_snd, 898 BBR_LOG_RTO, 0, 899 0, &log, false); 900 } 901 } 902 903 static void 904 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, 905 uint32_t o_srtt, uint32_t o_var) 906 { 907 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 908 union tcp_log_stackspecific log; 909 910 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 911 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 912 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 913 log.u_bbr.flex1 = t; 914 log.u_bbr.flex2 = o_srtt; 915 log.u_bbr.flex3 = o_var; 916 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 917 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 918 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 919 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; 920 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 921 TCP_LOG_EVENT(tp, NULL, 922 &rack->rc_inp->inp_socket->so_rcv, 923 &rack->rc_inp->inp_socket->so_snd, 924 BBR_LOG_BBRRTT, 0, 925 0, &log, false); 926 } 927 } 928 929 static void 930 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 931 { 932 /* 933 * Log the rtt sample we are 934 * applying to the srtt algorithm in 935 * useconds. 936 */ 937 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 938 union tcp_log_stackspecific log; 939 struct timeval tv; 940 941 /* Convert our ms to a microsecond */ 942 log.u_bbr.flex1 = rtt * 1000; 943 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 944 TCP_LOG_EVENTP(rack->rc_tp, NULL, 945 &rack->rc_inp->inp_socket->so_rcv, 946 &rack->rc_inp->inp_socket->so_snd, 947 TCP_LOG_RTT, 0, 948 0, &log, false, &tv); 949 } 950 } 951 952 953 static inline void 954 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 955 { 956 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 957 union tcp_log_stackspecific log; 958 959 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 960 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 961 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 962 log.u_bbr.flex1 = line; 963 log.u_bbr.flex2 = tick; 964 log.u_bbr.flex3 = tp->t_maxunacktime; 965 log.u_bbr.flex4 = tp->t_acktime; 966 log.u_bbr.flex8 = event; 967 TCP_LOG_EVENT(tp, NULL, 968 &rack->rc_inp->inp_socket->so_rcv, 969 &rack->rc_inp->inp_socket->so_snd, 970 BBR_LOG_PROGRESS, 0, 971 0, &log, false); 972 } 973 } 974 975 static void 976 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) 977 { 978 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 979 union tcp_log_stackspecific log; 980 981 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 982 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 983 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 984 log.u_bbr.flex1 = slot; 985 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 986 log.u_bbr.flex8 = rack->rc_in_persist; 987 TCP_LOG_EVENT(rack->rc_tp, NULL, 988 &rack->rc_inp->inp_socket->so_rcv, 989 &rack->rc_inp->inp_socket->so_snd, 990 BBR_LOG_BBRSND, 0, 991 0, &log, false); 992 } 993 } 994 995 static void 996 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 997 { 998 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 999 union tcp_log_stackspecific log; 1000 log.u_bbr.flex1 = did_out; 1001 log.u_bbr.flex2 = nxt_pkt; 1002 log.u_bbr.flex3 = way_out; 1003 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1004 log.u_bbr.flex7 = rack->r_wanted_output; 1005 log.u_bbr.flex8 = rack->rc_in_persist; 1006 TCP_LOG_EVENT(rack->rc_tp, NULL, 1007 &rack->rc_inp->inp_socket->so_rcv, 1008 &rack->rc_inp->inp_socket->so_snd, 1009 BBR_LOG_DOSEG_DONE, 0, 1010 0, &log, false); 1011 } 1012 } 1013 1014 1015 static void 1016 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) 1017 { 1018 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1019 union tcp_log_stackspecific log; 1020 1021 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1022 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1023 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1024 log.u_bbr.flex1 = slot; 1025 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1026 log.u_bbr.flex7 = hpts_calling; 1027 log.u_bbr.flex8 = rack->rc_in_persist; 1028 TCP_LOG_EVENT(rack->rc_tp, NULL, 1029 &rack->rc_inp->inp_socket->so_rcv, 1030 &rack->rc_inp->inp_socket->so_snd, 1031 BBR_LOG_JUSTRET, 0, 1032 tlen, &log, false); 1033 } 1034 } 1035 1036 static void 1037 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) 1038 { 1039 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1040 union tcp_log_stackspecific log; 1041 1042 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1043 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1044 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1045 log.u_bbr.flex1 = line; 1046 log.u_bbr.flex2 = 0; 1047 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1048 log.u_bbr.flex4 = 0; 1049 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1050 log.u_bbr.flex8 = hpts_removed; 1051 TCP_LOG_EVENT(rack->rc_tp, NULL, 1052 &rack->rc_inp->inp_socket->so_rcv, 1053 &rack->rc_inp->inp_socket->so_snd, 1054 BBR_LOG_TIMERCANC, 0, 1055 0, &log, false); 1056 } 1057 } 1058 1059 static void 1060 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 1061 { 1062 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1063 union tcp_log_stackspecific log; 1064 1065 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1066 log.u_bbr.flex1 = timers; 1067 log.u_bbr.flex2 = ret; 1068 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 1069 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1070 log.u_bbr.flex5 = cts; 1071 TCP_LOG_EVENT(rack->rc_tp, NULL, 1072 &rack->rc_inp->inp_socket->so_rcv, 1073 &rack->rc_inp->inp_socket->so_snd, 1074 BBR_LOG_TO_PROCESS, 0, 1075 0, &log, false); 1076 } 1077 } 1078 1079 static void 1080 rack_counter_destroy() 1081 { 1082 counter_u64_free(rack_badfr); 1083 counter_u64_free(rack_badfr_bytes); 1084 counter_u64_free(rack_rtm_prr_retran); 1085 counter_u64_free(rack_rtm_prr_newdata); 1086 counter_u64_free(rack_timestamp_mismatch); 1087 counter_u64_free(rack_reorder_seen); 1088 counter_u64_free(rack_tlp_tot); 1089 counter_u64_free(rack_tlp_newdata); 1090 counter_u64_free(rack_tlp_retran); 1091 counter_u64_free(rack_tlp_retran_bytes); 1092 counter_u64_free(rack_tlp_retran_fail); 1093 counter_u64_free(rack_to_tot); 1094 counter_u64_free(rack_to_arm_rack); 1095 counter_u64_free(rack_to_arm_tlp); 1096 counter_u64_free(rack_paced_segments); 1097 counter_u64_free(rack_unpaced_segments); 1098 counter_u64_free(rack_saw_enobuf); 1099 counter_u64_free(rack_saw_enetunreach); 1100 counter_u64_free(rack_to_alloc_hard); 1101 counter_u64_free(rack_to_alloc_emerg); 1102 counter_u64_free(rack_sack_proc_all); 1103 counter_u64_free(rack_sack_proc_short); 1104 counter_u64_free(rack_sack_proc_restart); 1105 counter_u64_free(rack_to_alloc); 1106 counter_u64_free(rack_find_high); 1107 counter_u64_free(rack_runt_sacks); 1108 counter_u64_free(rack_enter_tlp_calc); 1109 counter_u64_free(rack_used_tlpmethod); 1110 counter_u64_free(rack_used_tlpmethod2); 1111 counter_u64_free(rack_progress_drops); 1112 counter_u64_free(rack_input_idle_reduces); 1113 counter_u64_free(rack_tlp_does_nada); 1114 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 1115 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 1116 } 1117 1118 static struct rack_sendmap * 1119 rack_alloc(struct tcp_rack *rack) 1120 { 1121 struct rack_sendmap *rsm; 1122 1123 counter_u64_add(rack_to_alloc, 1); 1124 rack->r_ctl.rc_num_maps_alloced++; 1125 rsm = uma_zalloc(rack_zone, M_NOWAIT); 1126 if (rsm) { 1127 return (rsm); 1128 } 1129 if (rack->rc_free_cnt) { 1130 counter_u64_add(rack_to_alloc_emerg, 1); 1131 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 1132 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 1133 rack->rc_free_cnt--; 1134 return (rsm); 1135 } 1136 return (NULL); 1137 } 1138 1139 static void 1140 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 1141 { 1142 rack->r_ctl.rc_num_maps_alloced--; 1143 if (rack->r_ctl.rc_tlpsend == rsm) 1144 rack->r_ctl.rc_tlpsend = NULL; 1145 if (rack->r_ctl.rc_next == rsm) 1146 rack->r_ctl.rc_next = NULL; 1147 if (rack->r_ctl.rc_sacklast == rsm) 1148 rack->r_ctl.rc_sacklast = NULL; 1149 if (rack->rc_free_cnt < rack_free_cache) { 1150 memset(rsm, 0, sizeof(struct rack_sendmap)); 1151 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 1152 rack->rc_free_cnt++; 1153 return; 1154 } 1155 uma_zfree(rack_zone, rsm); 1156 } 1157 1158 /* 1159 * CC wrapper hook functions 1160 */ 1161 static void 1162 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 1163 uint16_t type, int32_t recovery) 1164 { 1165 #ifdef NETFLIX_STATS 1166 int32_t gput; 1167 #endif 1168 #ifdef NETFLIX_CWV 1169 u_long old_cwnd = tp->snd_cwnd; 1170 #endif 1171 1172 INP_WLOCK_ASSERT(tp->t_inpcb); 1173 tp->ccv->nsegs = nsegs; 1174 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 1175 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 1176 uint32_t max; 1177 1178 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; 1179 if (tp->ccv->bytes_this_ack > max) { 1180 tp->ccv->bytes_this_ack = max; 1181 } 1182 } 1183 if (tp->snd_cwnd <= tp->snd_wnd) 1184 tp->ccv->flags |= CCF_CWND_LIMITED; 1185 else 1186 tp->ccv->flags &= ~CCF_CWND_LIMITED; 1187 1188 if (type == CC_ACK) { 1189 #ifdef NETFLIX_STATS 1190 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 1191 ((int32_t) tp->snd_cwnd) - tp->snd_wnd); 1192 if ((tp->t_flags & TF_GPUTINPROG) && 1193 SEQ_GEQ(th->th_ack, tp->gput_ack)) { 1194 gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / 1195 max(1, tcp_ts_getticks() - tp->gput_ts); 1196 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 1197 gput); 1198 /* 1199 * XXXLAS: This is a temporary hack, and should be 1200 * chained off VOI_TCP_GPUT when stats(9) grows an 1201 * API to deal with chained VOIs. 1202 */ 1203 if (tp->t_stats_gput_prev > 0) 1204 stats_voi_update_abs_s32(tp->t_stats, 1205 VOI_TCP_GPUT_ND, 1206 ((gput - tp->t_stats_gput_prev) * 100) / 1207 tp->t_stats_gput_prev); 1208 tp->t_flags &= ~TF_GPUTINPROG; 1209 tp->t_stats_gput_prev = gput; 1210 1211 if (tp->t_maxpeakrate) { 1212 /* 1213 * We update t_peakrate_thr. This gives us roughly 1214 * one update per round trip time. 1215 */ 1216 tcp_update_peakrate_thr(tp); 1217 } 1218 } 1219 #endif 1220 if (tp->snd_cwnd > tp->snd_ssthresh) { 1221 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 1222 nsegs * V_tcp_abc_l_var * tp->t_maxseg); 1223 if (tp->t_bytes_acked >= tp->snd_cwnd) { 1224 tp->t_bytes_acked -= tp->snd_cwnd; 1225 tp->ccv->flags |= CCF_ABC_SENTAWND; 1226 } 1227 } else { 1228 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 1229 tp->t_bytes_acked = 0; 1230 } 1231 } 1232 if (CC_ALGO(tp)->ack_received != NULL) { 1233 /* XXXLAS: Find a way to live without this */ 1234 tp->ccv->curack = th->th_ack; 1235 CC_ALGO(tp)->ack_received(tp->ccv, type); 1236 } 1237 #ifdef NETFLIX_STATS 1238 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); 1239 #endif 1240 if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { 1241 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; 1242 } 1243 #ifdef NETFLIX_CWV 1244 if (tp->cwv_enabled) { 1245 /* 1246 * Per RFC 7661: The behaviour in the non-validated phase is 1247 * specified as: o A sender determines whether to increase 1248 * the cwnd based upon whether it is cwnd-limited (see 1249 * Section 4.5.3): * A sender that is cwnd-limited MAY use 1250 * the standard TCP method to increase cwnd (i.e., the 1251 * standard method permits a TCP sender that fully utilises 1252 * the cwnd to increase the cwnd each time it receives an 1253 * ACK). * A sender that is not cwnd-limited MUST NOT 1254 * increase the cwnd when ACK packets are received in this 1255 * phase (i.e., needs to avoid growing the cwnd when it has 1256 * not recently sent using the current size of cwnd). 1257 */ 1258 if ((tp->snd_cwnd > old_cwnd) && 1259 (tp->cwv_cwnd_valid == 0) && 1260 (!(tp->ccv->flags & CCF_CWND_LIMITED))) { 1261 tp->snd_cwnd = old_cwnd; 1262 } 1263 /* Try to update pipeAck and NCWV state */ 1264 if (TCPS_HAVEESTABLISHED(tp->t_state) && 1265 !IN_RECOVERY(tp->t_flags)) { 1266 uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); 1267 1268 tcp_newcwv_update_pipeack(tp, data); 1269 } 1270 } 1271 #endif 1272 /* we enforce max peak rate if it is set. */ 1273 if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { 1274 tp->snd_cwnd = tp->t_peakrate_thr; 1275 } 1276 } 1277 1278 static void 1279 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 1280 { 1281 struct tcp_rack *rack; 1282 1283 rack = (struct tcp_rack *)tp->t_fb_ptr; 1284 INP_WLOCK_ASSERT(tp->t_inpcb); 1285 if (rack->r_ctl.rc_prr_sndcnt > 0) 1286 rack->r_wanted_output++; 1287 } 1288 1289 static void 1290 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 1291 { 1292 struct tcp_rack *rack; 1293 1294 INP_WLOCK_ASSERT(tp->t_inpcb); 1295 rack = (struct tcp_rack *)tp->t_fb_ptr; 1296 if (CC_ALGO(tp)->post_recovery != NULL) { 1297 tp->ccv->curack = th->th_ack; 1298 CC_ALGO(tp)->post_recovery(tp->ccv); 1299 } 1300 /* 1301 * Here we can in theory adjust cwnd to be based on the number of 1302 * losses in the window (rack->r_ctl.rc_loss_count). This is done 1303 * based on the rack_use_proportional flag. 1304 */ 1305 if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { 1306 int32_t reduce; 1307 1308 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); 1309 if (reduce > 50) { 1310 reduce = 50; 1311 } 1312 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); 1313 } else { 1314 if (tp->snd_cwnd > tp->snd_ssthresh) { 1315 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 1316 tp->snd_cwnd = tp->snd_ssthresh; 1317 } 1318 } 1319 if (rack->r_ctl.rc_prr_sndcnt > 0) { 1320 /* Suck the next prr cnt back into cwnd */ 1321 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 1322 rack->r_ctl.rc_prr_sndcnt = 0; 1323 } 1324 EXIT_RECOVERY(tp->t_flags); 1325 1326 1327 #ifdef NETFLIX_CWV 1328 if (tp->cwv_enabled) { 1329 if ((tp->cwv_cwnd_valid == 0) && 1330 (tp->snd_cwv.in_recovery)) 1331 tcp_newcwv_end_recovery(tp); 1332 } 1333 #endif 1334 } 1335 1336 static void 1337 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 1338 { 1339 struct tcp_rack *rack; 1340 1341 INP_WLOCK_ASSERT(tp->t_inpcb); 1342 1343 rack = (struct tcp_rack *)tp->t_fb_ptr; 1344 switch (type) { 1345 case CC_NDUPACK: 1346 /* rack->r_ctl.rc_ssthresh_set = 1;*/ 1347 if (!IN_FASTRECOVERY(tp->t_flags)) { 1348 rack->r_ctl.rc_tlp_rtx_out = 0; 1349 rack->r_ctl.rc_prr_delivered = 0; 1350 rack->r_ctl.rc_prr_out = 0; 1351 rack->r_ctl.rc_loss_count = 0; 1352 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 1353 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 1354 tp->snd_recover = tp->snd_max; 1355 if (tp->t_flags & TF_ECN_PERMIT) 1356 tp->t_flags |= TF_ECN_SND_CWR; 1357 } 1358 break; 1359 case CC_ECN: 1360 if (!IN_CONGRECOVERY(tp->t_flags)) { 1361 TCPSTAT_INC(tcps_ecn_rcwnd); 1362 tp->snd_recover = tp->snd_max; 1363 if (tp->t_flags & TF_ECN_PERMIT) 1364 tp->t_flags |= TF_ECN_SND_CWR; 1365 } 1366 break; 1367 case CC_RTO: 1368 tp->t_dupacks = 0; 1369 tp->t_bytes_acked = 0; 1370 EXIT_RECOVERY(tp->t_flags); 1371 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / 1372 tp->t_maxseg) * tp->t_maxseg; 1373 tp->snd_cwnd = tp->t_maxseg; 1374 break; 1375 case CC_RTO_ERR: 1376 TCPSTAT_INC(tcps_sndrexmitbad); 1377 /* RTO was unnecessary, so reset everything. */ 1378 tp->snd_cwnd = tp->snd_cwnd_prev; 1379 tp->snd_ssthresh = tp->snd_ssthresh_prev; 1380 tp->snd_recover = tp->snd_recover_prev; 1381 if (tp->t_flags & TF_WASFRECOVERY) 1382 ENTER_FASTRECOVERY(tp->t_flags); 1383 if (tp->t_flags & TF_WASCRECOVERY) 1384 ENTER_CONGRECOVERY(tp->t_flags); 1385 tp->snd_nxt = tp->snd_max; 1386 tp->t_badrxtwin = 0; 1387 break; 1388 } 1389 1390 if (CC_ALGO(tp)->cong_signal != NULL) { 1391 if (th != NULL) 1392 tp->ccv->curack = th->th_ack; 1393 CC_ALGO(tp)->cong_signal(tp->ccv, type); 1394 } 1395 #ifdef NETFLIX_CWV 1396 if (tp->cwv_enabled) { 1397 if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { 1398 tcp_newcwv_enter_recovery(tp); 1399 } 1400 if (type == CC_RTO) { 1401 tcp_newcwv_reset(tp); 1402 } 1403 } 1404 #endif 1405 } 1406 1407 1408 1409 static inline void 1410 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) 1411 { 1412 uint32_t i_cwnd; 1413 1414 INP_WLOCK_ASSERT(tp->t_inpcb); 1415 1416 #ifdef NETFLIX_STATS 1417 TCPSTAT_INC(tcps_idle_restarts); 1418 if (tp->t_state == TCPS_ESTABLISHED) 1419 TCPSTAT_INC(tcps_idle_estrestarts); 1420 #endif 1421 if (CC_ALGO(tp)->after_idle != NULL) 1422 CC_ALGO(tp)->after_idle(tp->ccv); 1423 1424 if (tp->snd_cwnd == 1) 1425 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 1426 else if (V_tcp_initcwnd_segments) 1427 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), 1428 max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460)); 1429 else if (V_tcp_do_rfc3390) 1430 i_cwnd = min(4 * tp->t_maxseg, 1431 max(2 * tp->t_maxseg, 4380)); 1432 else { 1433 /* Per RFC5681 Section 3.1 */ 1434 if (tp->t_maxseg > 2190) 1435 i_cwnd = 2 * tp->t_maxseg; 1436 else if (tp->t_maxseg > 1095) 1437 i_cwnd = 3 * tp->t_maxseg; 1438 else 1439 i_cwnd = 4 * tp->t_maxseg; 1440 } 1441 if (reduce_largest) { 1442 /* 1443 * Do we reduce the largest cwnd to make 1444 * rack play nice on restart hptsi wise? 1445 */ 1446 if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) 1447 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; 1448 } 1449 /* 1450 * Being idle is no differnt than the initial window. If the cc 1451 * clamps it down below the initial window raise it to the initial 1452 * window. 1453 */ 1454 if (tp->snd_cwnd < i_cwnd) { 1455 tp->snd_cwnd = i_cwnd; 1456 } 1457 } 1458 1459 1460 /* 1461 * Indicate whether this ack should be delayed. We can delay the ack if 1462 * following conditions are met: 1463 * - There is no delayed ack timer in progress. 1464 * - Our last ack wasn't a 0-sized window. We never want to delay 1465 * the ack that opens up a 0-sized window. 1466 * - LRO wasn't used for this segment. We make sure by checking that the 1467 * segment size is not larger than the MSS. 1468 * - Delayed acks are enabled or this is a half-synchronized T/TCP 1469 * connection. 1470 */ 1471 #define DELAY_ACK(tp, tlen) \ 1472 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 1473 ((tp->t_flags & TF_DELACK) == 0) && \ 1474 (tlen <= tp->t_maxseg) && \ 1475 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 1476 1477 static inline void 1478 rack_calc_rwin(struct socket *so, struct tcpcb *tp) 1479 { 1480 int32_t win; 1481 1482 /* 1483 * Calculate amount of space in receive window, and then do TCP 1484 * input processing. Receive window is amount of space in rcv queue, 1485 * but not less than advertised window. 1486 */ 1487 win = sbspace(&so->so_rcv); 1488 if (win < 0) 1489 win = 0; 1490 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1491 } 1492 1493 static void 1494 rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked) 1495 { 1496 if (*ti_locked == TI_RLOCKED) { 1497 INP_INFO_RUNLOCK(&V_tcbinfo); 1498 *ti_locked = TI_UNLOCKED; 1499 } 1500 /* 1501 * Drop space held by incoming segment and return. 1502 */ 1503 if (tp != NULL) 1504 INP_WUNLOCK(tp->t_inpcb); 1505 if (m) 1506 m_freem(m); 1507 } 1508 1509 static void 1510 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen) 1511 { 1512 if (*ti_locked == TI_RLOCKED) { 1513 INP_INFO_RUNLOCK(&V_tcbinfo); 1514 *ti_locked = TI_UNLOCKED; 1515 } 1516 if (tp != NULL) { 1517 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1518 INP_WUNLOCK(tp->t_inpcb); 1519 } else 1520 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1521 } 1522 1523 /* 1524 * The value in ret_val informs the caller 1525 * if we dropped the tcb (and lock) or not. 1526 * 1 = we dropped it, 0 = the TCB is still locked 1527 * and valid. 1528 */ 1529 static void 1530 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val) 1531 { 1532 /* 1533 * Generate an ACK dropping incoming segment if it occupies sequence 1534 * space, where the ACK reflects our state. 1535 * 1536 * We can now skip the test for the RST flag since all paths to this 1537 * code happen after packets containing RST have been dropped. 1538 * 1539 * In the SYN-RECEIVED state, don't send an ACK unless the segment 1540 * we received passes the SYN-RECEIVED ACK test. If it fails send a 1541 * RST. This breaks the loop in the "LAND" DoS attack, and also 1542 * prevents an ACK storm between two listening ports that have been 1543 * sent forged SYN segments, each with the source address of the 1544 * other. 1545 */ 1546 struct tcp_rack *rack; 1547 1548 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1549 (SEQ_GT(tp->snd_una, th->th_ack) || 1550 SEQ_GT(th->th_ack, tp->snd_max))) { 1551 *ret_val = 1; 1552 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 1553 return; 1554 } else 1555 *ret_val = 0; 1556 if (*ti_locked == TI_RLOCKED) { 1557 INP_INFO_RUNLOCK(&V_tcbinfo); 1558 *ti_locked = TI_UNLOCKED; 1559 } 1560 rack = (struct tcp_rack *)tp->t_fb_ptr; 1561 rack->r_wanted_output++; 1562 tp->t_flags |= TF_ACKNOW; 1563 if (m) 1564 m_freem(m); 1565 } 1566 1567 1568 static int 1569 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked) 1570 { 1571 /* 1572 * RFC5961 Section 3.2 1573 * 1574 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in 1575 * window, we send challenge ACK. 1576 * 1577 * Note: to take into account delayed ACKs, we should test against 1578 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case 1579 * of closed window, not covered by the RFC. 1580 */ 1581 int dropped = 0; 1582 1583 if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && 1584 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 1585 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 1586 1587 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1588 KASSERT(*ti_locked == TI_RLOCKED, 1589 ("%s: TH_RST ti_locked %d, th %p tp %p", 1590 __func__, *ti_locked, th, tp)); 1591 KASSERT(tp->t_state != TCPS_SYN_SENT, 1592 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 1593 __func__, th, tp)); 1594 1595 if (V_tcp_insecure_rst || 1596 (tp->last_ack_sent == th->th_seq) || 1597 (tp->rcv_nxt == th->th_seq) || 1598 ((tp->last_ack_sent - 1) == th->th_seq)) { 1599 TCPSTAT_INC(tcps_drops); 1600 /* Drop the connection. */ 1601 switch (tp->t_state) { 1602 case TCPS_SYN_RECEIVED: 1603 so->so_error = ECONNREFUSED; 1604 goto close; 1605 case TCPS_ESTABLISHED: 1606 case TCPS_FIN_WAIT_1: 1607 case TCPS_FIN_WAIT_2: 1608 case TCPS_CLOSE_WAIT: 1609 case TCPS_CLOSING: 1610 case TCPS_LAST_ACK: 1611 so->so_error = ECONNRESET; 1612 close: 1613 tcp_state_change(tp, TCPS_CLOSED); 1614 /* FALLTHROUGH */ 1615 default: 1616 tp = tcp_close(tp); 1617 } 1618 dropped = 1; 1619 rack_do_drop(m, tp, ti_locked); 1620 } else { 1621 TCPSTAT_INC(tcps_badrst); 1622 /* Send challenge ACK. */ 1623 tcp_respond(tp, mtod(m, void *), th, m, 1624 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 1625 tp->last_ack_sent = tp->rcv_nxt; 1626 } 1627 } else { 1628 m_freem(m); 1629 } 1630 return (dropped); 1631 } 1632 1633 /* 1634 * The value in ret_val informs the caller 1635 * if we dropped the tcb (and lock) or not. 1636 * 1 = we dropped it, 0 = the TCB is still locked 1637 * and valid. 1638 */ 1639 static void 1640 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) 1641 { 1642 KASSERT(*ti_locked == TI_RLOCKED, 1643 ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); 1644 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1645 1646 TCPSTAT_INC(tcps_badsyn); 1647 if (V_tcp_insecure_syn && 1648 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1649 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1650 tp = tcp_drop(tp, ECONNRESET); 1651 *ret_val = 1; 1652 rack_do_drop(m, tp, ti_locked); 1653 } else { 1654 /* Send challenge ACK. */ 1655 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 1656 tp->snd_nxt, TH_ACK); 1657 tp->last_ack_sent = tp->rcv_nxt; 1658 m = NULL; 1659 *ret_val = 0; 1660 rack_do_drop(m, NULL, ti_locked); 1661 } 1662 } 1663 1664 /* 1665 * rack_ts_check returns 1 for you should not proceed. It places 1666 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1667 * that the TCB is unlocked and probably dropped. The 0 indicates the 1668 * TCB is still valid and locked. 1669 */ 1670 static int 1671 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val) 1672 { 1673 1674 /* Check to see if ts_recent is over 24 days old. */ 1675 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 1676 /* 1677 * Invalidate ts_recent. If this segment updates ts_recent, 1678 * the age will be reset later and ts_recent will get a 1679 * valid value. If it does not, setting ts_recent to zero 1680 * will at least satisfy the requirement that zero be placed 1681 * in the timestamp echo reply when ts_recent isn't valid. 1682 * The age isn't reset until we get a valid ts_recent 1683 * because we don't want out-of-order segments to be dropped 1684 * when ts_recent is old. 1685 */ 1686 tp->ts_recent = 0; 1687 } else { 1688 TCPSTAT_INC(tcps_rcvduppack); 1689 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 1690 TCPSTAT_INC(tcps_pawsdrop); 1691 *ret_val = 0; 1692 if (tlen) { 1693 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1694 } else { 1695 rack_do_drop(m, NULL, ti_locked); 1696 } 1697 return (1); 1698 } 1699 return (0); 1700 } 1701 1702 /* 1703 * rack_drop_checks returns 1 for you should not proceed. It places 1704 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1705 * that the TCB is unlocked and probably dropped. The 0 indicates the 1706 * TCB is still valid and locked. 1707 */ 1708 static int 1709 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 1710 { 1711 int32_t todrop; 1712 int32_t thflags; 1713 int32_t tlen; 1714 1715 thflags = *thf; 1716 tlen = *tlenp; 1717 todrop = tp->rcv_nxt - th->th_seq; 1718 if (todrop > 0) { 1719 if (thflags & TH_SYN) { 1720 thflags &= ~TH_SYN; 1721 th->th_seq++; 1722 if (th->th_urp > 1) 1723 th->th_urp--; 1724 else 1725 thflags &= ~TH_URG; 1726 todrop--; 1727 } 1728 /* 1729 * Following if statement from Stevens, vol. 2, p. 960. 1730 */ 1731 if (todrop > tlen 1732 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1733 /* 1734 * Any valid FIN must be to the left of the window. 1735 * At this point the FIN must be a duplicate or out 1736 * of sequence; drop it. 1737 */ 1738 thflags &= ~TH_FIN; 1739 /* 1740 * Send an ACK to resynchronize and drop any data. 1741 * But keep on processing for RST or ACK. 1742 */ 1743 tp->t_flags |= TF_ACKNOW; 1744 todrop = tlen; 1745 TCPSTAT_INC(tcps_rcvduppack); 1746 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 1747 } else { 1748 TCPSTAT_INC(tcps_rcvpartduppack); 1749 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 1750 } 1751 *drop_hdrlen += todrop; /* drop from the top afterwards */ 1752 th->th_seq += todrop; 1753 tlen -= todrop; 1754 if (th->th_urp > todrop) 1755 th->th_urp -= todrop; 1756 else { 1757 thflags &= ~TH_URG; 1758 th->th_urp = 0; 1759 } 1760 } 1761 /* 1762 * If segment ends after window, drop trailing data (and PUSH and 1763 * FIN); if nothing left, just ACK. 1764 */ 1765 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1766 if (todrop > 0) { 1767 TCPSTAT_INC(tcps_rcvpackafterwin); 1768 if (todrop >= tlen) { 1769 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 1770 /* 1771 * If window is closed can only take segments at 1772 * window edge, and have to drop data and PUSH from 1773 * incoming segments. Continue processing, but 1774 * remember to ack. Otherwise, drop segment and 1775 * ack. 1776 */ 1777 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1778 tp->t_flags |= TF_ACKNOW; 1779 TCPSTAT_INC(tcps_rcvwinprobe); 1780 } else { 1781 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1782 return (1); 1783 } 1784 } else 1785 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1786 m_adj(m, -todrop); 1787 tlen -= todrop; 1788 thflags &= ~(TH_PUSH | TH_FIN); 1789 } 1790 *thf = thflags; 1791 *tlenp = tlen; 1792 return (0); 1793 } 1794 1795 static struct rack_sendmap * 1796 rack_find_lowest_rsm(struct tcp_rack *rack) 1797 { 1798 struct rack_sendmap *rsm; 1799 1800 /* 1801 * Walk the time-order transmitted list looking for an rsm that is 1802 * not acked. This will be the one that was sent the longest time 1803 * ago that is still outstanding. 1804 */ 1805 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 1806 if (rsm->r_flags & RACK_ACKED) { 1807 continue; 1808 } 1809 goto finish; 1810 } 1811 finish: 1812 return (rsm); 1813 } 1814 1815 static struct rack_sendmap * 1816 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 1817 { 1818 struct rack_sendmap *prsm; 1819 1820 /* 1821 * Walk the sequence order list backward until we hit and arrive at 1822 * the highest seq not acked. In theory when this is called it 1823 * should be the last segment (which it was not). 1824 */ 1825 counter_u64_add(rack_find_high, 1); 1826 prsm = rsm; 1827 TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { 1828 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 1829 continue; 1830 } 1831 return (prsm); 1832 } 1833 return (NULL); 1834 } 1835 1836 1837 static uint32_t 1838 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 1839 { 1840 int32_t lro; 1841 uint32_t thresh; 1842 1843 /* 1844 * lro is the flag we use to determine if we have seen reordering. 1845 * If it gets set we have seen reordering. The reorder logic either 1846 * works in one of two ways: 1847 * 1848 * If reorder-fade is configured, then we track the last time we saw 1849 * re-ordering occur. If we reach the point where enough time as 1850 * passed we no longer consider reordering has occuring. 1851 * 1852 * Or if reorder-face is 0, then once we see reordering we consider 1853 * the connection to alway be subject to reordering and just set lro 1854 * to 1. 1855 * 1856 * In the end if lro is non-zero we add the extra time for 1857 * reordering in. 1858 */ 1859 if (srtt == 0) 1860 srtt = 1; 1861 if (rack->r_ctl.rc_reorder_ts) { 1862 if (rack->r_ctl.rc_reorder_fade) { 1863 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 1864 lro = cts - rack->r_ctl.rc_reorder_ts; 1865 if (lro == 0) { 1866 /* 1867 * No time as passed since the last 1868 * reorder, mark it as reordering. 1869 */ 1870 lro = 1; 1871 } 1872 } else { 1873 /* Negative time? */ 1874 lro = 0; 1875 } 1876 if (lro > rack->r_ctl.rc_reorder_fade) { 1877 /* Turn off reordering seen too */ 1878 rack->r_ctl.rc_reorder_ts = 0; 1879 lro = 0; 1880 } 1881 } else { 1882 /* Reodering does not fade */ 1883 lro = 1; 1884 } 1885 } else { 1886 lro = 0; 1887 } 1888 thresh = srtt + rack->r_ctl.rc_pkt_delay; 1889 if (lro) { 1890 /* It must be set, if not you get 1/4 rtt */ 1891 if (rack->r_ctl.rc_reorder_shift) 1892 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 1893 else 1894 thresh += (srtt >> 2); 1895 } else { 1896 thresh += 1; 1897 } 1898 /* We don't let the rack timeout be above a RTO */ 1899 1900 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 1901 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 1902 } 1903 /* And we don't want it above the RTO max either */ 1904 if (thresh > rack_rto_max) { 1905 thresh = rack_rto_max; 1906 } 1907 return (thresh); 1908 } 1909 1910 static uint32_t 1911 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 1912 struct rack_sendmap *rsm, uint32_t srtt) 1913 { 1914 struct rack_sendmap *prsm; 1915 uint32_t thresh, len; 1916 int maxseg; 1917 1918 if (srtt == 0) 1919 srtt = 1; 1920 if (rack->r_ctl.rc_tlp_threshold) 1921 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 1922 else 1923 thresh = (srtt * 2); 1924 1925 /* Get the previous sent packet, if any */ 1926 maxseg = tcp_maxseg(tp); 1927 counter_u64_add(rack_enter_tlp_calc, 1); 1928 len = rsm->r_end - rsm->r_start; 1929 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 1930 /* Exactly like the ID */ 1931 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { 1932 uint32_t alt_thresh; 1933 /* 1934 * Compensate for delayed-ack with the d-ack time. 1935 */ 1936 counter_u64_add(rack_used_tlpmethod, 1); 1937 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1938 if (alt_thresh > thresh) 1939 thresh = alt_thresh; 1940 } 1941 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 1942 /* 2.1 behavior */ 1943 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 1944 if (prsm && (len <= maxseg)) { 1945 /* 1946 * Two packets outstanding, thresh should be (2*srtt) + 1947 * possible inter-packet delay (if any). 1948 */ 1949 uint32_t inter_gap = 0; 1950 int idx, nidx; 1951 1952 counter_u64_add(rack_used_tlpmethod, 1); 1953 idx = rsm->r_rtr_cnt - 1; 1954 nidx = prsm->r_rtr_cnt - 1; 1955 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 1956 /* Yes it was sent later (or at the same time) */ 1957 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 1958 } 1959 thresh += inter_gap; 1960 } else if (len <= maxseg) { 1961 /* 1962 * Possibly compensate for delayed-ack. 1963 */ 1964 uint32_t alt_thresh; 1965 1966 counter_u64_add(rack_used_tlpmethod2, 1); 1967 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1968 if (alt_thresh > thresh) 1969 thresh = alt_thresh; 1970 } 1971 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 1972 /* 2.2 behavior */ 1973 if (len <= maxseg) { 1974 uint32_t alt_thresh; 1975 /* 1976 * Compensate for delayed-ack with the d-ack time. 1977 */ 1978 counter_u64_add(rack_used_tlpmethod, 1); 1979 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1980 if (alt_thresh > thresh) 1981 thresh = alt_thresh; 1982 } 1983 } 1984 /* Not above an RTO */ 1985 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 1986 thresh = TICKS_2_MSEC(tp->t_rxtcur); 1987 } 1988 /* Not above a RTO max */ 1989 if (thresh > rack_rto_max) { 1990 thresh = rack_rto_max; 1991 } 1992 /* Apply user supplied min TLP */ 1993 if (thresh < rack_tlp_min) { 1994 thresh = rack_tlp_min; 1995 } 1996 return (thresh); 1997 } 1998 1999 static struct rack_sendmap * 2000 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 2001 { 2002 /* 2003 * Check to see that we don't need to fall into recovery. We will 2004 * need to do so if our oldest transmit is past the time we should 2005 * have had an ack. 2006 */ 2007 struct tcp_rack *rack; 2008 struct rack_sendmap *rsm; 2009 int32_t idx; 2010 uint32_t srtt_cur, srtt, thresh; 2011 2012 rack = (struct tcp_rack *)tp->t_fb_ptr; 2013 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 2014 return (NULL); 2015 } 2016 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 2017 srtt = TICKS_2_MSEC(srtt_cur); 2018 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 2019 srtt = rack->rc_rack_rtt; 2020 2021 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2022 if (rsm == NULL) 2023 return (NULL); 2024 2025 if (rsm->r_flags & RACK_ACKED) { 2026 rsm = rack_find_lowest_rsm(rack); 2027 if (rsm == NULL) 2028 return (NULL); 2029 } 2030 idx = rsm->r_rtr_cnt - 1; 2031 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2032 if (tsused < rsm->r_tim_lastsent[idx]) { 2033 return (NULL); 2034 } 2035 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2036 return (NULL); 2037 } 2038 /* Ok if we reach here we are over-due */ 2039 rack->r_ctl.rc_rsm_start = rsm->r_start; 2040 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2041 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2042 rack_cong_signal(tp, NULL, CC_NDUPACK); 2043 return (rsm); 2044 } 2045 2046 static uint32_t 2047 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2048 { 2049 int32_t t; 2050 int32_t tt; 2051 uint32_t ret_val; 2052 2053 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2054 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2055 tcp_persmin, tcp_persmax); 2056 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2057 tp->t_rxtshift++; 2058 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2059 ret_val = (uint32_t)tt; 2060 return (ret_val); 2061 } 2062 2063 static uint32_t 2064 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2065 { 2066 /* 2067 * Start the FR timer, we do this based on getting the first one in 2068 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2069 * events we need to stop the running timer (if its running) before 2070 * starting the new one. 2071 */ 2072 uint32_t thresh, exp, to, srtt, time_since_sent; 2073 uint32_t srtt_cur; 2074 int32_t idx; 2075 int32_t is_tlp_timer = 0; 2076 struct rack_sendmap *rsm; 2077 2078 if (rack->t_timers_stopped) { 2079 /* All timers have been stopped none are to run */ 2080 return (0); 2081 } 2082 if (rack->rc_in_persist) { 2083 /* We can't start any timer in persists */ 2084 return (rack_get_persists_timer_val(tp, rack)); 2085 } 2086 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2087 if (rsm == NULL) { 2088 /* Nothing on the send map */ 2089 activate_rxt: 2090 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2091 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2092 to = TICKS_2_MSEC(tp->t_rxtcur); 2093 if (to == 0) 2094 to = 1; 2095 return (to); 2096 } 2097 return (0); 2098 } 2099 if (rsm->r_flags & RACK_ACKED) { 2100 rsm = rack_find_lowest_rsm(rack); 2101 if (rsm == NULL) { 2102 /* No lowest? */ 2103 goto activate_rxt; 2104 } 2105 } 2106 /* Convert from ms to usecs */ 2107 if (rsm->r_flags & RACK_SACK_PASSED) { 2108 if ((tp->t_flags & TF_SENTFIN) && 2109 ((tp->snd_max - tp->snd_una) == 1) && 2110 (rsm->r_flags & RACK_HAS_FIN)) { 2111 /* 2112 * We don't start a rack timer if all we have is a 2113 * FIN outstanding. 2114 */ 2115 goto activate_rxt; 2116 } 2117 if (tp->t_srtt) { 2118 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2119 srtt = TICKS_2_MSEC(srtt_cur); 2120 } else 2121 srtt = RACK_INITIAL_RTO; 2122 2123 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2124 idx = rsm->r_rtr_cnt - 1; 2125 exp = rsm->r_tim_lastsent[idx] + thresh; 2126 if (SEQ_GEQ(exp, cts)) { 2127 to = exp - cts; 2128 if (to < rack->r_ctl.rc_min_to) { 2129 to = rack->r_ctl.rc_min_to; 2130 } 2131 } else { 2132 to = rack->r_ctl.rc_min_to; 2133 } 2134 } else { 2135 /* Ok we need to do a TLP not RACK */ 2136 if ((rack->rc_tlp_in_progress != 0) || 2137 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2138 /* 2139 * The previous send was a TLP or a tlp_rtx is in 2140 * process. 2141 */ 2142 goto activate_rxt; 2143 } 2144 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2145 if (rsm == NULL) { 2146 /* We found no rsm to TLP with. */ 2147 goto activate_rxt; 2148 } 2149 if (rsm->r_flags & RACK_HAS_FIN) { 2150 /* If its a FIN we dont do TLP */ 2151 rsm = NULL; 2152 goto activate_rxt; 2153 } 2154 idx = rsm->r_rtr_cnt - 1; 2155 if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) 2156 time_since_sent = cts - rsm->r_tim_lastsent[idx]; 2157 else 2158 time_since_sent = 0; 2159 is_tlp_timer = 1; 2160 if (tp->t_srtt) { 2161 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2162 srtt = TICKS_2_MSEC(srtt_cur); 2163 } else 2164 srtt = RACK_INITIAL_RTO; 2165 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2166 if (thresh > time_since_sent) 2167 to = thresh - time_since_sent; 2168 else 2169 to = rack->r_ctl.rc_min_to; 2170 if (to > TCPTV_REXMTMAX) { 2171 /* 2172 * If the TLP time works out to larger than the max 2173 * RTO lets not do TLP.. just RTO. 2174 */ 2175 goto activate_rxt; 2176 } 2177 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2178 /* 2179 * The tail is no longer the last one I did a probe 2180 * on 2181 */ 2182 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2183 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2184 } 2185 } 2186 if (is_tlp_timer == 0) { 2187 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2188 } else { 2189 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2190 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2191 /* 2192 * We have exceeded how many times we can retran the 2193 * current TLP timer, switch to the RTO timer. 2194 */ 2195 goto activate_rxt; 2196 } else { 2197 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2198 } 2199 } 2200 if (to == 0) 2201 to = 1; 2202 return (to); 2203 } 2204 2205 static void 2206 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2207 { 2208 if (rack->rc_in_persist == 0) { 2209 if (((tp->t_flags & TF_SENTFIN) == 0) && 2210 (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) 2211 /* Must need to send more data to enter persist */ 2212 return; 2213 rack->r_ctl.rc_went_idle_time = cts; 2214 rack_timer_cancel(tp, rack, cts, __LINE__); 2215 tp->t_rxtshift = 0; 2216 rack->rc_in_persist = 1; 2217 } 2218 } 2219 2220 static void 2221 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2222 { 2223 if (rack->rc_inp->inp_in_hpts) { 2224 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2225 rack->r_ctl.rc_hpts_flags = 0; 2226 } 2227 rack->rc_in_persist = 0; 2228 rack->r_ctl.rc_went_idle_time = 0; 2229 tp->t_flags &= ~TF_FORCEDATA; 2230 tp->t_rxtshift = 0; 2231 } 2232 2233 static void 2234 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, 2235 int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) 2236 { 2237 struct inpcb *inp; 2238 uint32_t delayed_ack = 0; 2239 uint32_t hpts_timeout; 2240 uint8_t stopped; 2241 uint32_t left = 0; 2242 2243 inp = tp->t_inpcb; 2244 if (inp->inp_in_hpts) { 2245 /* A previous call is already set up */ 2246 return; 2247 } 2248 if (tp->t_state == TCPS_CLOSED) { 2249 return; 2250 } 2251 stopped = rack->rc_tmr_stopped; 2252 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2253 left = rack->r_ctl.rc_timer_exp - cts; 2254 } 2255 rack->r_ctl.rc_timer_exp = 0; 2256 if (rack->rc_inp->inp_in_hpts == 0) { 2257 rack->r_ctl.rc_hpts_flags = 0; 2258 } 2259 if (slot) { 2260 /* We are hptsi too */ 2261 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2262 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2263 /* 2264 * We are still left on the hpts when the to goes 2265 * it will be for output. 2266 */ 2267 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) 2268 slot = cts - rack->r_ctl.rc_last_output_to; 2269 else 2270 slot = 1; 2271 } 2272 if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2273 /* No send window.. we must enter persist */ 2274 rack_enter_persist(tp, rack, cts); 2275 } else if ((frm_out_sbavail && 2276 (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && 2277 (tp->snd_wnd < tp->t_maxseg)) && 2278 TCPS_HAVEESTABLISHED(tp->t_state)) { 2279 /* 2280 * If we have no window or we can't send a segment (and have 2281 * data to send.. we cheat here and frm_out_sbavail is 2282 * passed in with the sbavail(sb) only from bbr_output) and 2283 * we are established, then we must enter persits (if not 2284 * already in persits). 2285 */ 2286 rack_enter_persist(tp, rack, cts); 2287 } 2288 hpts_timeout = rack_timer_start(tp, rack, cts); 2289 if (tp->t_flags & TF_DELACK) { 2290 delayed_ack = tcp_delacktime; 2291 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2292 } 2293 if (delayed_ack && ((hpts_timeout == 0) || 2294 (delayed_ack < hpts_timeout))) 2295 hpts_timeout = delayed_ack; 2296 else 2297 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2298 /* 2299 * If no timers are going to run and we will fall off the hptsi 2300 * wheel, we resort to a keep-alive timer if its configured. 2301 */ 2302 if ((hpts_timeout == 0) && 2303 (slot == 0)) { 2304 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2305 (tp->t_state <= TCPS_CLOSING)) { 2306 /* 2307 * Ok we have no timer (persists, rack, tlp, rxt or 2308 * del-ack), we don't have segments being paced. So 2309 * all that is left is the keepalive timer. 2310 */ 2311 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2312 /* Get the established keep-alive time */ 2313 hpts_timeout = TP_KEEPIDLE(tp); 2314 } else { 2315 /* Get the initial setup keep-alive time */ 2316 hpts_timeout = TP_KEEPINIT(tp); 2317 } 2318 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2319 } 2320 } 2321 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2322 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2323 /* 2324 * RACK, TLP, persists and RXT timers all are restartable 2325 * based on actions input .. i.e we received a packet (ack 2326 * or sack) and that changes things (rw, or snd_una etc). 2327 * Thus we can restart them with a new value. For 2328 * keep-alive, delayed_ack we keep track of what was left 2329 * and restart the timer with a smaller value. 2330 */ 2331 if (left < hpts_timeout) 2332 hpts_timeout = left; 2333 } 2334 if (hpts_timeout) { 2335 /* 2336 * Hack alert for now we can't time-out over 2,147,483 2337 * seconds (a bit more than 596 hours), which is probably ok 2338 * :). 2339 */ 2340 if (hpts_timeout > 0x7ffffffe) 2341 hpts_timeout = 0x7ffffffe; 2342 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2343 } 2344 if (slot) { 2345 rack->r_ctl.rc_last_output_to = cts + slot; 2346 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2347 if (rack->rc_inp->inp_in_hpts == 0) 2348 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2349 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2350 } else { 2351 /* 2352 * Arrange for the hpts to kick back in after the 2353 * t-o if the t-o does not cause a send. 2354 */ 2355 if (rack->rc_inp->inp_in_hpts == 0) 2356 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2357 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2358 } 2359 } else if (hpts_timeout) { 2360 if (rack->rc_inp->inp_in_hpts == 0) 2361 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2362 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2363 } else { 2364 /* No timer starting */ 2365 #ifdef INVARIANTS 2366 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2367 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2368 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2369 } 2370 #endif 2371 } 2372 rack->rc_tmr_stopped = 0; 2373 if (slot) 2374 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2375 } 2376 2377 /* 2378 * RACK Timer, here we simply do logging and house keeping. 2379 * the normal rack_output() function will call the 2380 * appropriate thing to check if we need to do a RACK retransmit. 2381 * We return 1, saying don't proceed with rack_output only 2382 * when all timers have been stopped (destroyed PCB?). 2383 */ 2384 static int 2385 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2386 { 2387 /* 2388 * This timer simply provides an internal trigger to send out data. 2389 * The check_recovery_mode call will see if there are needed 2390 * retransmissions, if so we will enter fast-recovery. The output 2391 * call may or may not do the same thing depending on sysctl 2392 * settings. 2393 */ 2394 struct rack_sendmap *rsm; 2395 int32_t recovery; 2396 2397 if (tp->t_timers->tt_flags & TT_STOPPED) { 2398 return (1); 2399 } 2400 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2401 /* Its not time yet */ 2402 return (0); 2403 } 2404 rack_log_to_event(rack, RACK_TO_FRM_RACK); 2405 recovery = IN_RECOVERY(tp->t_flags); 2406 counter_u64_add(rack_to_tot, 1); 2407 if (rack->r_state && (rack->r_state != tp->t_state)) 2408 rack_set_state(tp, rack); 2409 rsm = rack_check_recovery_mode(tp, cts); 2410 if (rsm) { 2411 uint32_t rtt; 2412 2413 rtt = rack->rc_rack_rtt; 2414 if (rtt == 0) 2415 rtt = 1; 2416 if ((recovery == 0) && 2417 (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { 2418 /* 2419 * The rack-timeout that enter's us into recovery 2420 * will force out one MSS and set us up so that we 2421 * can do one more send in 2*rtt (transitioning the 2422 * rack timeout into a rack-tlp). 2423 */ 2424 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2425 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && 2426 ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { 2427 /* 2428 * When a rack timer goes, we have to send at 2429 * least one segment. They will be paced a min of 1ms 2430 * apart via the next rack timer (or further 2431 * if the rack timer dictates it). 2432 */ 2433 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2434 } 2435 } else { 2436 /* This is a case that should happen rarely if ever */ 2437 counter_u64_add(rack_tlp_does_nada, 1); 2438 #ifdef TCP_BLACKBOX 2439 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2440 #endif 2441 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2442 } 2443 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2444 return (0); 2445 } 2446 2447 /* 2448 * TLP Timer, here we simply setup what segment we want to 2449 * have the TLP expire on, the normal rack_output() will then 2450 * send it out. 2451 * 2452 * We return 1, saying don't proceed with rack_output only 2453 * when all timers have been stopped (destroyed PCB?). 2454 */ 2455 static int 2456 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2457 { 2458 /* 2459 * Tail Loss Probe. 2460 */ 2461 struct rack_sendmap *rsm = NULL; 2462 struct socket *so; 2463 uint32_t amm, old_prr_snd = 0; 2464 uint32_t out, avail; 2465 2466 if (tp->t_timers->tt_flags & TT_STOPPED) { 2467 return (1); 2468 } 2469 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2470 /* Its not time yet */ 2471 return (0); 2472 } 2473 if (rack_progress_timeout_check(tp)) { 2474 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2475 return (1); 2476 } 2477 /* 2478 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2479 * need to figure out how to force a full MSS segment out. 2480 */ 2481 rack_log_to_event(rack, RACK_TO_FRM_TLP); 2482 counter_u64_add(rack_tlp_tot, 1); 2483 if (rack->r_state && (rack->r_state != tp->t_state)) 2484 rack_set_state(tp, rack); 2485 so = tp->t_inpcb->inp_socket; 2486 avail = sbavail(&so->so_snd); 2487 out = tp->snd_max - tp->snd_una; 2488 rack->rc_timer_up = 1; 2489 /* 2490 * If we are in recovery we can jazz out a segment if new data is 2491 * present simply by setting rc_prr_sndcnt to a segment. 2492 */ 2493 if ((avail > out) && 2494 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2495 /* New data is available */ 2496 amm = avail - out; 2497 if (amm > tp->t_maxseg) { 2498 amm = tp->t_maxseg; 2499 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 2500 /* not enough to fill a MTU and no-delay is off */ 2501 goto need_retran; 2502 } 2503 if (IN_RECOVERY(tp->t_flags)) { 2504 /* Unlikely */ 2505 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2506 if (out + amm <= tp->snd_wnd) 2507 rack->r_ctl.rc_prr_sndcnt = amm; 2508 else 2509 goto need_retran; 2510 } else { 2511 /* Set the send-new override */ 2512 if (out + amm <= tp->snd_wnd) 2513 rack->r_ctl.rc_tlp_new_data = amm; 2514 else 2515 goto need_retran; 2516 } 2517 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2518 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2519 rack->r_ctl.rc_tlpsend = NULL; 2520 counter_u64_add(rack_tlp_newdata, 1); 2521 goto send; 2522 } 2523 need_retran: 2524 /* 2525 * Ok we need to arrange the last un-acked segment to be re-sent, or 2526 * optionally the first un-acked segment. 2527 */ 2528 if (rack_always_send_oldest) 2529 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2530 else { 2531 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 2532 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2533 rsm = rack_find_high_nonack(rack, rsm); 2534 } 2535 } 2536 if (rsm == NULL) { 2537 counter_u64_add(rack_tlp_does_nada, 1); 2538 #ifdef TCP_BLACKBOX 2539 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2540 #endif 2541 goto out; 2542 } 2543 if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { 2544 /* 2545 * We need to split this the last segment in two. 2546 */ 2547 int32_t idx; 2548 struct rack_sendmap *nrsm; 2549 2550 nrsm = rack_alloc(rack); 2551 if (nrsm == NULL) { 2552 /* 2553 * No memory to split, we will just exit and punt 2554 * off to the RXT timer. 2555 */ 2556 counter_u64_add(rack_tlp_does_nada, 1); 2557 goto out; 2558 } 2559 nrsm->r_start = (rsm->r_end - tp->t_maxseg); 2560 nrsm->r_end = rsm->r_end; 2561 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2562 nrsm->r_flags = rsm->r_flags; 2563 nrsm->r_sndcnt = rsm->r_sndcnt; 2564 nrsm->r_rtr_bytes = 0; 2565 rsm->r_end = nrsm->r_start; 2566 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2567 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2568 } 2569 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 2570 if (rsm->r_in_tmap) { 2571 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2572 nrsm->r_in_tmap = 1; 2573 } 2574 rsm->r_flags &= (~RACK_HAS_FIN); 2575 rsm = nrsm; 2576 } 2577 rack->r_ctl.rc_tlpsend = rsm; 2578 rack->r_ctl.rc_tlp_rtx_out = 1; 2579 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2580 rack->r_ctl.rc_tlp_seg_send_cnt++; 2581 tp->t_rxtshift++; 2582 } else { 2583 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2584 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2585 } 2586 send: 2587 rack->r_ctl.rc_tlp_send_cnt++; 2588 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2589 /* 2590 * Can't [re]/transmit a segment we have not heard from the 2591 * peer in max times. We need the retransmit timer to take 2592 * over. 2593 */ 2594 restore: 2595 rack->r_ctl.rc_tlpsend = NULL; 2596 if (rsm) 2597 rsm->r_flags &= ~RACK_TLP; 2598 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2599 counter_u64_add(rack_tlp_retran_fail, 1); 2600 goto out; 2601 } else if (rsm) { 2602 rsm->r_flags |= RACK_TLP; 2603 } 2604 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2605 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2606 /* 2607 * We don't want to send a single segment more than the max 2608 * either. 2609 */ 2610 goto restore; 2611 } 2612 rack->r_timer_override = 1; 2613 rack->r_tlp_running = 1; 2614 rack->rc_tlp_in_progress = 1; 2615 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2616 return (0); 2617 out: 2618 rack->rc_timer_up = 0; 2619 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2620 return (0); 2621 } 2622 2623 /* 2624 * Delayed ack Timer, here we simply need to setup the 2625 * ACK_NOW flag and remove the DELACK flag. From there 2626 * the output routine will send the ack out. 2627 * 2628 * We only return 1, saying don't proceed, if all timers 2629 * are stopped (destroyed PCB?). 2630 */ 2631 static int 2632 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2633 { 2634 if (tp->t_timers->tt_flags & TT_STOPPED) { 2635 return (1); 2636 } 2637 rack_log_to_event(rack, RACK_TO_FRM_DELACK); 2638 tp->t_flags &= ~TF_DELACK; 2639 tp->t_flags |= TF_ACKNOW; 2640 TCPSTAT_INC(tcps_delack); 2641 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2642 return (0); 2643 } 2644 2645 /* 2646 * Persists timer, here we simply need to setup the 2647 * FORCE-DATA flag the output routine will send 2648 * the one byte send. 2649 * 2650 * We only return 1, saying don't proceed, if all timers 2651 * are stopped (destroyed PCB?). 2652 */ 2653 static int 2654 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2655 { 2656 struct inpcb *inp; 2657 int32_t retval = 0; 2658 2659 inp = tp->t_inpcb; 2660 2661 if (tp->t_timers->tt_flags & TT_STOPPED) { 2662 return (1); 2663 } 2664 if (rack->rc_in_persist == 0) 2665 return (0); 2666 if (rack_progress_timeout_check(tp)) { 2667 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2668 return (1); 2669 } 2670 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2671 /* 2672 * Persistence timer into zero window. Force a byte to be output, if 2673 * possible. 2674 */ 2675 TCPSTAT_INC(tcps_persisttimeo); 2676 /* 2677 * Hack: if the peer is dead/unreachable, we do not time out if the 2678 * window is closed. After a full backoff, drop the connection if 2679 * the idle time (no responses to probes) reaches the maximum 2680 * backoff that we would use if retransmitting. 2681 */ 2682 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2683 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2684 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2685 TCPSTAT_INC(tcps_persistdrop); 2686 retval = 1; 2687 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2688 goto out; 2689 } 2690 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2691 tp->snd_una == tp->snd_max) 2692 rack_exit_persist(tp, rack); 2693 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2694 /* 2695 * If the user has closed the socket then drop a persisting 2696 * connection after a much reduced timeout. 2697 */ 2698 if (tp->t_state > TCPS_CLOSE_WAIT && 2699 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2700 retval = 1; 2701 TCPSTAT_INC(tcps_persistdrop); 2702 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2703 goto out; 2704 } 2705 tp->t_flags |= TF_FORCEDATA; 2706 out: 2707 rack_log_to_event(rack, RACK_TO_FRM_PERSIST); 2708 return (retval); 2709 } 2710 2711 /* 2712 * If a keepalive goes off, we had no other timers 2713 * happening. We always return 1 here since this 2714 * routine either drops the connection or sends 2715 * out a segment with respond. 2716 */ 2717 static int 2718 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2719 { 2720 struct tcptemp *t_template; 2721 struct inpcb *inp; 2722 2723 if (tp->t_timers->tt_flags & TT_STOPPED) { 2724 return (1); 2725 } 2726 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 2727 inp = tp->t_inpcb; 2728 rack_log_to_event(rack, RACK_TO_FRM_KEEP); 2729 /* 2730 * Keep-alive timer went off; send something or drop connection if 2731 * idle for too long. 2732 */ 2733 TCPSTAT_INC(tcps_keeptimeo); 2734 if (tp->t_state < TCPS_ESTABLISHED) 2735 goto dropit; 2736 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2737 tp->t_state <= TCPS_CLOSING) { 2738 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 2739 goto dropit; 2740 /* 2741 * Send a packet designed to force a response if the peer is 2742 * up and reachable: either an ACK if the connection is 2743 * still alive, or an RST if the peer has closed the 2744 * connection due to timeout or reboot. Using sequence 2745 * number tp->snd_una-1 causes the transmitted zero-length 2746 * segment to lie outside the receive window; by the 2747 * protocol spec, this requires the correspondent TCP to 2748 * respond. 2749 */ 2750 TCPSTAT_INC(tcps_keepprobe); 2751 t_template = tcpip_maketemplate(inp); 2752 if (t_template) { 2753 tcp_respond(tp, t_template->tt_ipgen, 2754 &t_template->tt_t, (struct mbuf *)NULL, 2755 tp->rcv_nxt, tp->snd_una - 1, 0); 2756 free(t_template, M_TEMP); 2757 } 2758 } 2759 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 2760 return (1); 2761 dropit: 2762 TCPSTAT_INC(tcps_keepdrops); 2763 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2764 return (1); 2765 } 2766 2767 /* 2768 * Retransmit helper function, clear up all the ack 2769 * flags and take care of important book keeping. 2770 */ 2771 static void 2772 rack_remxt_tmr(struct tcpcb *tp) 2773 { 2774 /* 2775 * The retransmit timer went off, all sack'd blocks must be 2776 * un-acked. 2777 */ 2778 struct rack_sendmap *rsm, *trsm = NULL; 2779 struct tcp_rack *rack; 2780 int32_t cnt = 0; 2781 2782 rack = (struct tcp_rack *)tp->t_fb_ptr; 2783 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 2784 rack_log_to_event(rack, RACK_TO_FRM_TMR); 2785 if (rack->r_state && (rack->r_state != tp->t_state)) 2786 rack_set_state(tp, rack); 2787 /* 2788 * Ideally we would like to be able to 2789 * mark SACK-PASS on anything not acked here. 2790 * However, if we do that we would burst out 2791 * all that data 1ms apart. This would be unwise, 2792 * so for now we will just let the normal rxt timer 2793 * and tlp timer take care of it. 2794 */ 2795 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 2796 if (rsm->r_flags & RACK_ACKED) { 2797 cnt++; 2798 rsm->r_sndcnt = 0; 2799 if (rsm->r_in_tmap == 0) { 2800 /* We must re-add it back to the tlist */ 2801 if (trsm == NULL) { 2802 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 2803 } else { 2804 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 2805 } 2806 rsm->r_in_tmap = 1; 2807 trsm = rsm; 2808 } 2809 } 2810 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 2811 } 2812 /* Clear the count (we just un-acked them) */ 2813 rack->r_ctl.rc_sacked = 0; 2814 /* Clear the tlp rtx mark */ 2815 rack->r_ctl.rc_tlp_rtx_out = 0; 2816 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2817 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); 2818 /* Setup so we send one segment */ 2819 if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) 2820 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2821 rack->r_timer_override = 1; 2822 } 2823 2824 /* 2825 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 2826 * we will setup to retransmit the lowest seq number outstanding. 2827 */ 2828 static int 2829 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2830 { 2831 int32_t rexmt; 2832 struct inpcb *inp; 2833 int32_t retval = 0; 2834 2835 inp = tp->t_inpcb; 2836 if (tp->t_timers->tt_flags & TT_STOPPED) { 2837 return (1); 2838 } 2839 if (rack_progress_timeout_check(tp)) { 2840 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2841 return (1); 2842 } 2843 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 2844 if (TCPS_HAVEESTABLISHED(tp->t_state) && 2845 (tp->snd_una == tp->snd_max)) { 2846 /* Nothing outstanding .. nothing to do */ 2847 return (0); 2848 } 2849 /* 2850 * Retransmission timer went off. Message has not been acked within 2851 * retransmit interval. Back off to a longer retransmit interval 2852 * and retransmit one segment. 2853 */ 2854 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 2855 tp->t_rxtshift = TCP_MAXRXTSHIFT; 2856 TCPSTAT_INC(tcps_timeoutdrop); 2857 retval = 1; 2858 tcp_set_inp_to_drop(rack->rc_inp, 2859 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 2860 goto out; 2861 } 2862 rack_remxt_tmr(tp); 2863 if (tp->t_state == TCPS_SYN_SENT) { 2864 /* 2865 * If the SYN was retransmitted, indicate CWND to be limited 2866 * to 1 segment in cc_conn_init(). 2867 */ 2868 tp->snd_cwnd = 1; 2869 } else if (tp->t_rxtshift == 1) { 2870 /* 2871 * first retransmit; record ssthresh and cwnd so they can be 2872 * recovered if this turns out to be a "bad" retransmit. A 2873 * retransmit is considered "bad" if an ACK for this segment 2874 * is received within RTT/2 interval; the assumption here is 2875 * that the ACK was already in flight. See "On Estimating 2876 * End-to-End Network Path Properties" by Allman and Paxson 2877 * for more details. 2878 */ 2879 tp->snd_cwnd_prev = tp->snd_cwnd; 2880 tp->snd_ssthresh_prev = tp->snd_ssthresh; 2881 tp->snd_recover_prev = tp->snd_recover; 2882 if (IN_FASTRECOVERY(tp->t_flags)) 2883 tp->t_flags |= TF_WASFRECOVERY; 2884 else 2885 tp->t_flags &= ~TF_WASFRECOVERY; 2886 if (IN_CONGRECOVERY(tp->t_flags)) 2887 tp->t_flags |= TF_WASCRECOVERY; 2888 else 2889 tp->t_flags &= ~TF_WASCRECOVERY; 2890 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 2891 tp->t_flags |= TF_PREVVALID; 2892 } else 2893 tp->t_flags &= ~TF_PREVVALID; 2894 TCPSTAT_INC(tcps_rexmttimeo); 2895 if ((tp->t_state == TCPS_SYN_SENT) || 2896 (tp->t_state == TCPS_SYN_RECEIVED)) 2897 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); 2898 else 2899 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 2900 TCPT_RANGESET(tp->t_rxtcur, rexmt, 2901 max(MSEC_2_TICKS(rack_rto_min), rexmt), 2902 MSEC_2_TICKS(rack_rto_max)); 2903 /* 2904 * We enter the path for PLMTUD if connection is established or, if 2905 * connection is FIN_WAIT_1 status, reason for the last is that if 2906 * amount of data we send is very small, we could send it in couple 2907 * of packets and process straight to FIN. In that case we won't 2908 * catch ESTABLISHED state. 2909 */ 2910 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 2911 || (tp->t_state == TCPS_FIN_WAIT_1))) { 2912 #ifdef INET6 2913 int32_t isipv6; 2914 #endif 2915 2916 /* 2917 * Idea here is that at each stage of mtu probe (usually, 2918 * 1448 -> 1188 -> 524) should be given 2 chances to recover 2919 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 2920 * should take care of that. 2921 */ 2922 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 2923 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 2924 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 2925 tp->t_rxtshift % 2 == 0)) { 2926 /* 2927 * Enter Path MTU Black-hole Detection mechanism: - 2928 * Disable Path MTU Discovery (IP "DF" bit). - 2929 * Reduce MTU to lower value than what we negotiated 2930 * with peer. 2931 */ 2932 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 2933 /* Record that we may have found a black hole. */ 2934 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 2935 /* Keep track of previous MSS. */ 2936 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 2937 } 2938 2939 /* 2940 * Reduce the MSS to blackhole value or to the 2941 * default in an attempt to retransmit. 2942 */ 2943 #ifdef INET6 2944 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 2945 if (isipv6 && 2946 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 2947 /* Use the sysctl tuneable blackhole MSS. */ 2948 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 2949 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2950 } else if (isipv6) { 2951 /* Use the default MSS. */ 2952 tp->t_maxseg = V_tcp_v6mssdflt; 2953 /* 2954 * Disable Path MTU Discovery when we switch 2955 * to minmss. 2956 */ 2957 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2958 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2959 } 2960 #endif 2961 #if defined(INET6) && defined(INET) 2962 else 2963 #endif 2964 #ifdef INET 2965 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 2966 /* Use the sysctl tuneable blackhole MSS. */ 2967 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 2968 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2969 } else { 2970 /* Use the default MSS. */ 2971 tp->t_maxseg = V_tcp_mssdflt; 2972 /* 2973 * Disable Path MTU Discovery when we switch 2974 * to minmss. 2975 */ 2976 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2977 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2978 } 2979 #endif 2980 } else { 2981 /* 2982 * If further retransmissions are still unsuccessful 2983 * with a lowered MTU, maybe this isn't a blackhole 2984 * and we restore the previous MSS and blackhole 2985 * detection flags. The limit '6' is determined by 2986 * giving each probe stage (1448, 1188, 524) 2 2987 * chances to recover. 2988 */ 2989 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 2990 (tp->t_rxtshift >= 6)) { 2991 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 2992 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 2993 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 2994 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 2995 } 2996 } 2997 } 2998 /* 2999 * Disable RFC1323 and SACK if we haven't got any response to our 3000 * third SYN to work-around some broken terminal servers (most of 3001 * which have hopefully been retired) that have bad VJ header 3002 * compression code which trashes TCP segments containing 3003 * unknown-to-them TCP options. 3004 */ 3005 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 3006 (tp->t_rxtshift == 3)) 3007 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 3008 /* 3009 * If we backed off this far, our srtt estimate is probably bogus. 3010 * Clobber it so we'll take the next rtt measurement as our srtt; 3011 * move the current srtt into rttvar to keep the current retransmit 3012 * times until then. 3013 */ 3014 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3015 #ifdef INET6 3016 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3017 in6_losing(tp->t_inpcb); 3018 else 3019 #endif 3020 in_losing(tp->t_inpcb); 3021 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3022 tp->t_srtt = 0; 3023 } 3024 if (rack_use_sack_filter) 3025 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3026 tp->snd_recover = tp->snd_max; 3027 tp->t_flags |= TF_ACKNOW; 3028 tp->t_rtttime = 0; 3029 rack_cong_signal(tp, NULL, CC_RTO); 3030 out: 3031 return (retval); 3032 } 3033 3034 static int 3035 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3036 { 3037 int32_t ret = 0; 3038 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3039 3040 if (timers == 0) { 3041 return (0); 3042 } 3043 if (tp->t_state == TCPS_LISTEN) { 3044 /* no timers on listen sockets */ 3045 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3046 return (0); 3047 return (1); 3048 } 3049 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3050 uint32_t left; 3051 3052 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3053 ret = -1; 3054 rack_log_to_processing(rack, cts, ret, 0); 3055 return (0); 3056 } 3057 if (hpts_calling == 0) { 3058 ret = -2; 3059 rack_log_to_processing(rack, cts, ret, 0); 3060 return (0); 3061 } 3062 /* 3063 * Ok our timer went off early and we are not paced false 3064 * alarm, go back to sleep. 3065 */ 3066 ret = -3; 3067 left = rack->r_ctl.rc_timer_exp - cts; 3068 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3069 rack_log_to_processing(rack, cts, ret, left); 3070 rack->rc_last_pto_set = 0; 3071 return (1); 3072 } 3073 rack->rc_tmr_stopped = 0; 3074 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3075 if (timers & PACE_TMR_DELACK) { 3076 ret = rack_timeout_delack(tp, rack, cts); 3077 } else if (timers & PACE_TMR_RACK) { 3078 ret = rack_timeout_rack(tp, rack, cts); 3079 } else if (timers & PACE_TMR_TLP) { 3080 ret = rack_timeout_tlp(tp, rack, cts); 3081 } else if (timers & PACE_TMR_RXT) { 3082 ret = rack_timeout_rxt(tp, rack, cts); 3083 } else if (timers & PACE_TMR_PERSIT) { 3084 ret = rack_timeout_persist(tp, rack, cts); 3085 } else if (timers & PACE_TMR_KEEP) { 3086 ret = rack_timeout_keepalive(tp, rack, cts); 3087 } 3088 rack_log_to_processing(rack, cts, ret, timers); 3089 return (ret); 3090 } 3091 3092 static void 3093 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3094 { 3095 uint8_t hpts_removed = 0; 3096 3097 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3098 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3099 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3100 hpts_removed = 1; 3101 } 3102 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3103 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3104 if (rack->rc_inp->inp_in_hpts && 3105 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3106 /* 3107 * Canceling timer's when we have no output being 3108 * paced. We also must remove ourselves from the 3109 * hpts. 3110 */ 3111 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3112 hpts_removed = 1; 3113 } 3114 rack_log_to_cancel(rack, hpts_removed, line); 3115 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3116 } 3117 } 3118 3119 static void 3120 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3121 { 3122 return; 3123 } 3124 3125 static int 3126 rack_stopall(struct tcpcb *tp) 3127 { 3128 struct tcp_rack *rack; 3129 rack = (struct tcp_rack *)tp->t_fb_ptr; 3130 rack->t_timers_stopped = 1; 3131 return (0); 3132 } 3133 3134 static void 3135 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3136 { 3137 return; 3138 } 3139 3140 static int 3141 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3142 { 3143 return (0); 3144 } 3145 3146 static void 3147 rack_stop_all_timers(struct tcpcb *tp) 3148 { 3149 struct tcp_rack *rack; 3150 3151 /* 3152 * Assure no timers are running. 3153 */ 3154 if (tcp_timer_active(tp, TT_PERSIST)) { 3155 /* We enter in persists, set the flag appropriately */ 3156 rack = (struct tcp_rack *)tp->t_fb_ptr; 3157 rack->rc_in_persist = 1; 3158 } 3159 tcp_timer_suspend(tp, TT_PERSIST); 3160 tcp_timer_suspend(tp, TT_REXMT); 3161 tcp_timer_suspend(tp, TT_KEEP); 3162 tcp_timer_suspend(tp, TT_DELACK); 3163 } 3164 3165 static void 3166 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3167 struct rack_sendmap *rsm, uint32_t ts) 3168 { 3169 int32_t idx; 3170 3171 rsm->r_rtr_cnt++; 3172 rsm->r_sndcnt++; 3173 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3174 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3175 rsm->r_flags |= RACK_OVERMAX; 3176 } 3177 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3178 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3179 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3180 } 3181 idx = rsm->r_rtr_cnt - 1; 3182 rsm->r_tim_lastsent[idx] = ts; 3183 if (rsm->r_flags & RACK_ACKED) { 3184 /* Problably MTU discovery messing with us */ 3185 rsm->r_flags &= ~RACK_ACKED; 3186 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3187 } 3188 if (rsm->r_in_tmap) { 3189 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3190 } 3191 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3192 rsm->r_in_tmap = 1; 3193 if (rsm->r_flags & RACK_SACK_PASSED) { 3194 /* We have retransmitted due to the SACK pass */ 3195 rsm->r_flags &= ~RACK_SACK_PASSED; 3196 rsm->r_flags |= RACK_WAS_SACKPASS; 3197 } 3198 /* Update memory for next rtr */ 3199 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3200 } 3201 3202 3203 static uint32_t 3204 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3205 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) 3206 { 3207 /* 3208 * We (re-)transmitted starting at rsm->r_start for some length 3209 * (possibly less than r_end. 3210 */ 3211 struct rack_sendmap *nrsm; 3212 uint32_t c_end; 3213 int32_t len; 3214 int32_t idx; 3215 3216 len = *lenp; 3217 c_end = rsm->r_start + len; 3218 if (SEQ_GEQ(c_end, rsm->r_end)) { 3219 /* 3220 * We retransmitted the whole piece or more than the whole 3221 * slopping into the next rsm. 3222 */ 3223 rack_update_rsm(tp, rack, rsm, ts); 3224 if (c_end == rsm->r_end) { 3225 *lenp = 0; 3226 return (0); 3227 } else { 3228 int32_t act_len; 3229 3230 /* Hangs over the end return whats left */ 3231 act_len = rsm->r_end - rsm->r_start; 3232 *lenp = (len - act_len); 3233 return (rsm->r_end); 3234 } 3235 /* We don't get out of this block. */ 3236 } 3237 /* 3238 * Here we retransmitted less than the whole thing which means we 3239 * have to split this into what was transmitted and what was not. 3240 */ 3241 nrsm = rack_alloc(rack); 3242 if (nrsm == NULL) { 3243 /* 3244 * We can't get memory, so lets not proceed. 3245 */ 3246 *lenp = 0; 3247 return (0); 3248 } 3249 /* 3250 * So here we are going to take the original rsm and make it what we 3251 * retransmitted. nrsm will be the tail portion we did not 3252 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3253 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3254 * 1, 6 and the new piece will be 6, 11. 3255 */ 3256 nrsm->r_start = c_end; 3257 nrsm->r_end = rsm->r_end; 3258 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3259 nrsm->r_flags = rsm->r_flags; 3260 nrsm->r_sndcnt = rsm->r_sndcnt; 3261 nrsm->r_rtr_bytes = 0; 3262 rsm->r_end = c_end; 3263 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3264 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3265 } 3266 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3267 if (rsm->r_in_tmap) { 3268 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3269 nrsm->r_in_tmap = 1; 3270 } 3271 rsm->r_flags &= (~RACK_HAS_FIN); 3272 rack_update_rsm(tp, rack, rsm, ts); 3273 *lenp = 0; 3274 return (0); 3275 } 3276 3277 3278 static void 3279 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3280 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3281 uint8_t pass, struct rack_sendmap *hintrsm) 3282 { 3283 struct tcp_rack *rack; 3284 struct rack_sendmap *rsm, *nrsm; 3285 register uint32_t snd_max, snd_una; 3286 int32_t idx; 3287 3288 /* 3289 * Add to the RACK log of packets in flight or retransmitted. If 3290 * there is a TS option we will use the TS echoed, if not we will 3291 * grab a TS. 3292 * 3293 * Retransmissions will increment the count and move the ts to its 3294 * proper place. Note that if options do not include TS's then we 3295 * won't be able to effectively use the ACK for an RTT on a retran. 3296 * 3297 * Notes about r_start and r_end. Lets consider a send starting at 3298 * sequence 1 for 10 bytes. In such an example the r_start would be 3299 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3300 * This means that r_end is actually the first sequence for the next 3301 * slot (11). 3302 * 3303 */ 3304 /* 3305 * If err is set what do we do XXXrrs? should we not add the thing? 3306 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3307 * i.e. proceed with add ** do this for now. 3308 */ 3309 INP_WLOCK_ASSERT(tp->t_inpcb); 3310 if (err) 3311 /* 3312 * We don't log errors -- we could but snd_max does not 3313 * advance in this case either. 3314 */ 3315 return; 3316 3317 if (th_flags & TH_RST) { 3318 /* 3319 * We don't log resets and we return immediately from 3320 * sending 3321 */ 3322 return; 3323 } 3324 rack = (struct tcp_rack *)tp->t_fb_ptr; 3325 snd_una = tp->snd_una; 3326 if (SEQ_LEQ((seq_out + len), snd_una)) { 3327 /* Are sending an old segment to induce an ack (keep-alive)? */ 3328 return; 3329 } 3330 if (SEQ_LT(seq_out, snd_una)) { 3331 /* huh? should we panic? */ 3332 uint32_t end; 3333 3334 end = seq_out + len; 3335 seq_out = snd_una; 3336 len = end - seq_out; 3337 } 3338 snd_max = tp->snd_max; 3339 if (th_flags & (TH_SYN | TH_FIN)) { 3340 /* 3341 * The call to rack_log_output is made before bumping 3342 * snd_max. This means we can record one extra byte on a SYN 3343 * or FIN if seq_out is adding more on and a FIN is present 3344 * (and we are not resending). 3345 */ 3346 if (th_flags & TH_SYN) 3347 len++; 3348 if (th_flags & TH_FIN) 3349 len++; 3350 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3351 /* 3352 * The add/update as not been done for the FIN/SYN 3353 * yet. 3354 */ 3355 snd_max = tp->snd_nxt; 3356 } 3357 } 3358 if (len == 0) { 3359 /* We don't log zero window probes */ 3360 return; 3361 } 3362 rack->r_ctl.rc_time_last_sent = ts; 3363 if (IN_RECOVERY(tp->t_flags)) { 3364 rack->r_ctl.rc_prr_out += len; 3365 } 3366 /* First question is it a retransmission? */ 3367 if (seq_out == snd_max) { 3368 again: 3369 rsm = rack_alloc(rack); 3370 if (rsm == NULL) { 3371 /* 3372 * Hmm out of memory and the tcb got destroyed while 3373 * we tried to wait. 3374 */ 3375 #ifdef INVARIANTS 3376 panic("Out of memory when we should not be rack:%p", rack); 3377 #endif 3378 return; 3379 } 3380 if (th_flags & TH_FIN) { 3381 rsm->r_flags = RACK_HAS_FIN; 3382 } else { 3383 rsm->r_flags = 0; 3384 } 3385 rsm->r_tim_lastsent[0] = ts; 3386 rsm->r_rtr_cnt = 1; 3387 rsm->r_rtr_bytes = 0; 3388 rsm->r_start = seq_out; 3389 rsm->r_end = rsm->r_start + len; 3390 rsm->r_sndcnt = 0; 3391 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 3392 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3393 rsm->r_in_tmap = 1; 3394 return; 3395 } 3396 /* 3397 * If we reach here its a retransmission and we need to find it. 3398 */ 3399 more: 3400 if (hintrsm && (hintrsm->r_start == seq_out)) { 3401 rsm = hintrsm; 3402 hintrsm = NULL; 3403 } else if (rack->r_ctl.rc_next) { 3404 /* We have a hint from a previous run */ 3405 rsm = rack->r_ctl.rc_next; 3406 } else { 3407 /* No hints sorry */ 3408 rsm = NULL; 3409 } 3410 if ((rsm) && (rsm->r_start == seq_out)) { 3411 /* 3412 * We used rc_next or hintrsm to retransmit, hopefully the 3413 * likely case. 3414 */ 3415 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3416 if (len == 0) { 3417 return; 3418 } else { 3419 goto more; 3420 } 3421 } 3422 /* Ok it was not the last pointer go through it the hard way. */ 3423 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3424 if (rsm->r_start == seq_out) { 3425 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3426 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3427 if (len == 0) { 3428 return; 3429 } else { 3430 continue; 3431 } 3432 } 3433 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3434 /* Transmitted within this piece */ 3435 /* 3436 * Ok we must split off the front and then let the 3437 * update do the rest 3438 */ 3439 nrsm = rack_alloc(rack); 3440 if (nrsm == NULL) { 3441 #ifdef INVARIANTS 3442 panic("Ran out of memory that was preallocated? rack:%p", rack); 3443 #endif 3444 rack_update_rsm(tp, rack, rsm, ts); 3445 return; 3446 } 3447 /* 3448 * copy rsm to nrsm and then trim the front of rsm 3449 * to not include this part. 3450 */ 3451 nrsm->r_start = seq_out; 3452 nrsm->r_end = rsm->r_end; 3453 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3454 nrsm->r_flags = rsm->r_flags; 3455 nrsm->r_sndcnt = rsm->r_sndcnt; 3456 nrsm->r_rtr_bytes = 0; 3457 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3458 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3459 } 3460 rsm->r_end = nrsm->r_start; 3461 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3462 if (rsm->r_in_tmap) { 3463 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3464 nrsm->r_in_tmap = 1; 3465 } 3466 rsm->r_flags &= (~RACK_HAS_FIN); 3467 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3468 if (len == 0) { 3469 return; 3470 } 3471 } 3472 } 3473 /* 3474 * Hmm not found in map did they retransmit both old and on into the 3475 * new? 3476 */ 3477 if (seq_out == tp->snd_max) { 3478 goto again; 3479 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3480 #ifdef INVARIANTS 3481 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3482 seq_out, len, tp->snd_una, tp->snd_max); 3483 printf("Starting Dump of all rack entries\n"); 3484 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3485 printf("rsm:%p start:%u end:%u\n", 3486 rsm, rsm->r_start, rsm->r_end); 3487 } 3488 printf("Dump complete\n"); 3489 panic("seq_out not found rack:%p tp:%p", 3490 rack, tp); 3491 #endif 3492 } else { 3493 #ifdef INVARIANTS 3494 /* 3495 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3496 * flag) 3497 */ 3498 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3499 seq_out, len, tp->snd_max, tp); 3500 #endif 3501 } 3502 } 3503 3504 /* 3505 * Record one of the RTT updates from an ack into 3506 * our sample structure. 3507 */ 3508 static void 3509 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3510 { 3511 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3512 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3513 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3514 } 3515 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3516 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3517 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3518 } 3519 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3520 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3521 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3522 } 3523 3524 /* 3525 * Collect new round-trip time estimate 3526 * and update averages and current timeout. 3527 */ 3528 static void 3529 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3530 { 3531 int32_t delta; 3532 uint32_t o_srtt, o_var; 3533 int32_t rtt; 3534 3535 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3536 /* No valid sample */ 3537 return; 3538 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3539 /* We are to use the lowest RTT seen in a single ack */ 3540 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3541 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3542 /* We are to use the highest RTT seen in a single ack */ 3543 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3544 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3545 /* We are to use the average RTT seen in a single ack */ 3546 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3547 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3548 } else { 3549 #ifdef INVARIANTS 3550 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3551 #endif 3552 return; 3553 } 3554 if (rtt == 0) 3555 rtt = 1; 3556 rack_log_rtt_sample(rack, rtt); 3557 o_srtt = tp->t_srtt; 3558 o_var = tp->t_rttvar; 3559 rack = (struct tcp_rack *)tp->t_fb_ptr; 3560 if (tp->t_srtt != 0) { 3561 /* 3562 * srtt is stored as fixed point with 5 bits after the 3563 * binary point (i.e., scaled by 8). The following magic is 3564 * equivalent to the smoothing algorithm in rfc793 with an 3565 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3566 * Adjust rtt to origin 0. 3567 */ 3568 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3569 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3570 3571 tp->t_srtt += delta; 3572 if (tp->t_srtt <= 0) 3573 tp->t_srtt = 1; 3574 3575 /* 3576 * We accumulate a smoothed rtt variance (actually, a 3577 * smoothed mean difference), then set the retransmit timer 3578 * to smoothed rtt + 4 times the smoothed variance. rttvar 3579 * is stored as fixed point with 4 bits after the binary 3580 * point (scaled by 16). The following is equivalent to 3581 * rfc793 smoothing with an alpha of .75 (rttvar = 3582 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3583 * wired-in beta. 3584 */ 3585 if (delta < 0) 3586 delta = -delta; 3587 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3588 tp->t_rttvar += delta; 3589 if (tp->t_rttvar <= 0) 3590 tp->t_rttvar = 1; 3591 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3592 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3593 } else { 3594 /* 3595 * No rtt measurement yet - use the unsmoothed rtt. Set the 3596 * variance to half the rtt (so our first retransmit happens 3597 * at 3*rtt). 3598 */ 3599 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3600 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3601 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3602 } 3603 TCPSTAT_INC(tcps_rttupdated); 3604 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3605 tp->t_rttupdated++; 3606 #ifdef NETFLIX_STATS 3607 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3608 #endif 3609 tp->t_rxtshift = 0; 3610 3611 /* 3612 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3613 * way we do the smoothing, srtt and rttvar will each average +1/2 3614 * tick of bias. When we compute the retransmit timer, we want 1/2 3615 * tick of rounding and 1 extra tick because of +-1/2 tick 3616 * uncertainty in the firing of the timer. The bias will give us 3617 * exactly the 1.5 tick we need. But, because the bias is 3618 * statistical, we have to test that we don't drop below the minimum 3619 * feasible timer (which is 2 ticks). 3620 */ 3621 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3622 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3623 tp->t_softerror = 0; 3624 } 3625 3626 static void 3627 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3628 uint32_t t, uint32_t cts) 3629 { 3630 /* 3631 * For this RSM, we acknowledged the data from a previous 3632 * transmission, not the last one we made. This means we did a false 3633 * retransmit. 3634 */ 3635 struct tcp_rack *rack; 3636 3637 if (rsm->r_flags & RACK_HAS_FIN) { 3638 /* 3639 * The sending of the FIN often is multiple sent when we 3640 * have everything outstanding ack'd. We ignore this case 3641 * since its over now. 3642 */ 3643 return; 3644 } 3645 if (rsm->r_flags & RACK_TLP) { 3646 /* 3647 * We expect TLP's to have this occur. 3648 */ 3649 return; 3650 } 3651 rack = (struct tcp_rack *)tp->t_fb_ptr; 3652 /* should we undo cc changes and exit recovery? */ 3653 if (IN_RECOVERY(tp->t_flags)) { 3654 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3655 /* 3656 * Undo what we ratched down and exit recovery if 3657 * possible 3658 */ 3659 EXIT_RECOVERY(tp->t_flags); 3660 tp->snd_recover = tp->snd_una; 3661 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3662 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3663 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3664 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3665 } 3666 } 3667 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3668 /* 3669 * We retransmitted based on a sack and the earlier 3670 * retransmission ack'd it - re-ordering is occuring. 3671 */ 3672 counter_u64_add(rack_reorder_seen, 1); 3673 rack->r_ctl.rc_reorder_ts = cts; 3674 } 3675 counter_u64_add(rack_badfr, 1); 3676 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3677 } 3678 3679 3680 static int 3681 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3682 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3683 { 3684 int32_t i; 3685 uint32_t t; 3686 3687 if (rsm->r_flags & RACK_ACKED) 3688 /* Already done */ 3689 return (0); 3690 3691 3692 if ((rsm->r_rtr_cnt == 1) || 3693 ((ack_type == CUM_ACKED) && 3694 (to->to_flags & TOF_TS) && 3695 (to->to_tsecr) && 3696 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 3697 ) { 3698 /* 3699 * We will only find a matching timestamp if its cum-acked. 3700 * But if its only one retransmission its for-sure matching 3701 * :-) 3702 */ 3703 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3704 if ((int)t <= 0) 3705 t = 1; 3706 if (!tp->t_rttlow || tp->t_rttlow > t) 3707 tp->t_rttlow = t; 3708 if (!rack->r_ctl.rc_rack_min_rtt || 3709 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3710 rack->r_ctl.rc_rack_min_rtt = t; 3711 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3712 rack->r_ctl.rc_rack_min_rtt = 1; 3713 } 3714 } 3715 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); 3716 if ((rsm->r_flags & RACK_TLP) && 3717 (!IN_RECOVERY(tp->t_flags))) { 3718 /* Segment was a TLP and our retrans matched */ 3719 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 3720 rack->r_ctl.rc_rsm_start = tp->snd_max; 3721 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 3722 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 3723 rack_cong_signal(tp, NULL, CC_NDUPACK); 3724 /* 3725 * When we enter recovery we need to assure 3726 * we send one packet. 3727 */ 3728 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 3729 } else 3730 rack->r_ctl.rc_tlp_rtx_out = 0; 3731 } 3732 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3733 /* New more recent rack_tmit_time */ 3734 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3735 rack->rc_rack_rtt = t; 3736 } 3737 return (1); 3738 } 3739 /* 3740 * We clear the soft/rxtshift since we got an ack. 3741 * There is no assurance we will call the commit() function 3742 * so we need to clear these to avoid incorrect handling. 3743 */ 3744 tp->t_rxtshift = 0; 3745 tp->t_softerror = 0; 3746 if ((to->to_flags & TOF_TS) && 3747 (ack_type == CUM_ACKED) && 3748 (to->to_tsecr) && 3749 ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { 3750 /* 3751 * Now which timestamp does it match? In this block the ACK 3752 * must be coming from a previous transmission. 3753 */ 3754 for (i = 0; i < rsm->r_rtr_cnt; i++) { 3755 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 3756 t = cts - rsm->r_tim_lastsent[i]; 3757 if ((int)t <= 0) 3758 t = 1; 3759 if ((i + 1) < rsm->r_rtr_cnt) { 3760 /* Likely */ 3761 rack_earlier_retran(tp, rsm, t, cts); 3762 } 3763 if (!tp->t_rttlow || tp->t_rttlow > t) 3764 tp->t_rttlow = t; 3765 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3766 rack->r_ctl.rc_rack_min_rtt = t; 3767 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3768 rack->r_ctl.rc_rack_min_rtt = 1; 3769 } 3770 } 3771 /* 3772 * Note the following calls to 3773 * tcp_rack_xmit_timer() are being commented 3774 * out for now. They give us no more accuracy 3775 * and often lead to a wrong choice. We have 3776 * enough samples that have not been 3777 * retransmitted. I leave the commented out 3778 * code in here in case in the future we 3779 * decide to add it back (though I can't forsee 3780 * doing that). That way we will easily see 3781 * where they need to be placed. 3782 */ 3783 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 3784 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3785 /* New more recent rack_tmit_time */ 3786 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3787 rack->rc_rack_rtt = t; 3788 } 3789 return (1); 3790 } 3791 } 3792 goto ts_not_found; 3793 } else { 3794 /* 3795 * Ok its a SACK block that we retransmitted. or a windows 3796 * machine without timestamps. We can tell nothing from the 3797 * time-stamp since its not there or the time the peer last 3798 * recieved a segment that moved forward its cum-ack point. 3799 */ 3800 ts_not_found: 3801 i = rsm->r_rtr_cnt - 1; 3802 t = cts - rsm->r_tim_lastsent[i]; 3803 if ((int)t <= 0) 3804 t = 1; 3805 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3806 /* 3807 * We retransmitted and the ack came back in less 3808 * than the smallest rtt we have observed. We most 3809 * likey did an improper retransmit as outlined in 3810 * 4.2 Step 3 point 2 in the rack-draft. 3811 */ 3812 i = rsm->r_rtr_cnt - 2; 3813 t = cts - rsm->r_tim_lastsent[i]; 3814 rack_earlier_retran(tp, rsm, t, cts); 3815 } else if (rack->r_ctl.rc_rack_min_rtt) { 3816 /* 3817 * We retransmitted it and the retransmit did the 3818 * job. 3819 */ 3820 if (!rack->r_ctl.rc_rack_min_rtt || 3821 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3822 rack->r_ctl.rc_rack_min_rtt = t; 3823 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3824 rack->r_ctl.rc_rack_min_rtt = 1; 3825 } 3826 } 3827 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 3828 /* New more recent rack_tmit_time */ 3829 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 3830 rack->rc_rack_rtt = t; 3831 } 3832 return (1); 3833 } 3834 } 3835 return (0); 3836 } 3837 3838 /* 3839 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 3840 */ 3841 static void 3842 rack_log_sack_passed(struct tcpcb *tp, 3843 struct tcp_rack *rack, struct rack_sendmap *rsm) 3844 { 3845 struct rack_sendmap *nrsm; 3846 uint32_t ts; 3847 int32_t idx; 3848 3849 idx = rsm->r_rtr_cnt - 1; 3850 ts = rsm->r_tim_lastsent[idx]; 3851 nrsm = rsm; 3852 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 3853 rack_head, r_tnext) { 3854 if (nrsm == rsm) { 3855 /* Skip orginal segment he is acked */ 3856 continue; 3857 } 3858 if (nrsm->r_flags & RACK_ACKED) { 3859 /* Skip ack'd segments */ 3860 continue; 3861 } 3862 idx = nrsm->r_rtr_cnt - 1; 3863 if (ts == nrsm->r_tim_lastsent[idx]) { 3864 /* 3865 * For this case lets use seq no, if we sent in a 3866 * big block (TSO) we would have a bunch of segments 3867 * sent at the same time. 3868 * 3869 * We would only get a report if its SEQ is earlier. 3870 * If we have done multiple retransmits the times 3871 * would not be equal. 3872 */ 3873 if (SEQ_LT(nrsm->r_start, rsm->r_start)) { 3874 nrsm->r_flags |= RACK_SACK_PASSED; 3875 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3876 } 3877 } else { 3878 /* 3879 * Here they were sent at different times, not a big 3880 * block. Since we transmitted this one later and 3881 * see it sack'd then this must also be missing (or 3882 * we would have gotten a sack block for it) 3883 */ 3884 nrsm->r_flags |= RACK_SACK_PASSED; 3885 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3886 } 3887 } 3888 } 3889 3890 static uint32_t 3891 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 3892 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) 3893 { 3894 int32_t idx; 3895 int32_t times = 0; 3896 uint32_t start, end, changed = 0; 3897 struct rack_sendmap *rsm, *nrsm; 3898 int32_t used_ref = 1; 3899 3900 start = sack->start; 3901 end = sack->end; 3902 rsm = *prsm; 3903 if (rsm && SEQ_LT(start, rsm->r_start)) { 3904 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { 3905 if (SEQ_GEQ(start, rsm->r_start) && 3906 SEQ_LT(start, rsm->r_end)) { 3907 goto do_rest_ofb; 3908 } 3909 } 3910 } 3911 if (rsm == NULL) { 3912 start_at_beginning: 3913 rsm = NULL; 3914 used_ref = 0; 3915 } 3916 /* First lets locate the block where this guy is */ 3917 TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { 3918 if (SEQ_GEQ(start, rsm->r_start) && 3919 SEQ_LT(start, rsm->r_end)) { 3920 break; 3921 } 3922 } 3923 do_rest_ofb: 3924 if (rsm == NULL) { 3925 /* 3926 * This happens when we get duplicate sack blocks with the 3927 * same end. For example SACK 4: 100 SACK 3: 100 The sort 3928 * will not change there location so we would just start at 3929 * the end of the first one and get lost. 3930 */ 3931 if (tp->t_flags & TF_SENTFIN) { 3932 /* 3933 * Check to see if we have not logged the FIN that 3934 * went out. 3935 */ 3936 nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 3937 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 3938 /* 3939 * Ok we did not get the FIN logged. 3940 */ 3941 nrsm->r_end++; 3942 rsm = nrsm; 3943 goto do_rest_ofb; 3944 } 3945 } 3946 if (times == 1) { 3947 #ifdef INVARIANTS 3948 panic("tp:%p rack:%p sack:%p to:%p prsm:%p", 3949 tp, rack, sack, to, prsm); 3950 #else 3951 goto out; 3952 #endif 3953 } 3954 times++; 3955 counter_u64_add(rack_sack_proc_restart, 1); 3956 goto start_at_beginning; 3957 } 3958 /* Ok we have an ACK for some piece of rsm */ 3959 if (rsm->r_start != start) { 3960 /* 3961 * Need to split this in two pieces the before and after. 3962 */ 3963 nrsm = rack_alloc(rack); 3964 if (nrsm == NULL) { 3965 /* 3966 * failed XXXrrs what can we do but loose the sack 3967 * info? 3968 */ 3969 goto out; 3970 } 3971 nrsm->r_start = start; 3972 nrsm->r_rtr_bytes = 0; 3973 nrsm->r_end = rsm->r_end; 3974 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3975 nrsm->r_flags = rsm->r_flags; 3976 nrsm->r_sndcnt = rsm->r_sndcnt; 3977 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3978 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3979 } 3980 rsm->r_end = nrsm->r_start; 3981 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3982 if (rsm->r_in_tmap) { 3983 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3984 nrsm->r_in_tmap = 1; 3985 } 3986 rsm->r_flags &= (~RACK_HAS_FIN); 3987 rsm = nrsm; 3988 } 3989 if (SEQ_GEQ(end, rsm->r_end)) { 3990 /* 3991 * The end of this block is either beyond this guy or right 3992 * at this guy. 3993 */ 3994 3995 if ((rsm->r_flags & RACK_ACKED) == 0) { 3996 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 3997 changed += (rsm->r_end - rsm->r_start); 3998 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 3999 rack_log_sack_passed(tp, rack, rsm); 4000 /* Is Reordering occuring? */ 4001 if (rsm->r_flags & RACK_SACK_PASSED) { 4002 counter_u64_add(rack_reorder_seen, 1); 4003 rack->r_ctl.rc_reorder_ts = cts; 4004 } 4005 rsm->r_flags |= RACK_ACKED; 4006 rsm->r_flags &= ~RACK_TLP; 4007 if (rsm->r_in_tmap) { 4008 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4009 rsm->r_in_tmap = 0; 4010 } 4011 } 4012 if (end == rsm->r_end) { 4013 /* This block only - done */ 4014 goto out; 4015 } 4016 /* There is more not coverend by this rsm move on */ 4017 start = rsm->r_end; 4018 nrsm = TAILQ_NEXT(rsm, r_next); 4019 rsm = nrsm; 4020 times = 0; 4021 goto do_rest_ofb; 4022 } 4023 /* Ok we need to split off this one at the tail */ 4024 nrsm = rack_alloc(rack); 4025 if (nrsm == NULL) { 4026 /* failed rrs what can we do but loose the sack info? */ 4027 goto out; 4028 } 4029 /* Clone it */ 4030 nrsm->r_start = end; 4031 nrsm->r_end = rsm->r_end; 4032 nrsm->r_rtr_bytes = 0; 4033 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4034 nrsm->r_flags = rsm->r_flags; 4035 nrsm->r_sndcnt = rsm->r_sndcnt; 4036 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4037 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4038 } 4039 /* The sack block does not cover this guy fully */ 4040 rsm->r_flags &= (~RACK_HAS_FIN); 4041 rsm->r_end = end; 4042 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4043 if (rsm->r_in_tmap) { 4044 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4045 nrsm->r_in_tmap = 1; 4046 } 4047 if (rsm->r_flags & RACK_ACKED) { 4048 /* Been here done that */ 4049 goto out; 4050 } 4051 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4052 changed += (rsm->r_end - rsm->r_start); 4053 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4054 rack_log_sack_passed(tp, rack, rsm); 4055 /* Is Reordering occuring? */ 4056 if (rsm->r_flags & RACK_SACK_PASSED) { 4057 counter_u64_add(rack_reorder_seen, 1); 4058 rack->r_ctl.rc_reorder_ts = cts; 4059 } 4060 rsm->r_flags |= RACK_ACKED; 4061 rsm->r_flags &= ~RACK_TLP; 4062 if (rsm->r_in_tmap) { 4063 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4064 rsm->r_in_tmap = 0; 4065 } 4066 out: 4067 if (used_ref == 0) { 4068 counter_u64_add(rack_sack_proc_all, 1); 4069 } else { 4070 counter_u64_add(rack_sack_proc_short, 1); 4071 } 4072 /* Save off where we last were */ 4073 if (rsm) 4074 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 4075 else 4076 rack->r_ctl.rc_sacklast = NULL; 4077 *prsm = rsm; 4078 return (changed); 4079 } 4080 4081 static void inline 4082 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4083 { 4084 struct rack_sendmap *tmap; 4085 4086 tmap = NULL; 4087 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4088 /* Its no longer sacked, mark it so */ 4089 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4090 #ifdef INVARIANTS 4091 if (rsm->r_in_tmap) { 4092 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4093 rack, rsm, rsm->r_flags); 4094 } 4095 #endif 4096 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4097 /* Rebuild it into our tmap */ 4098 if (tmap == NULL) { 4099 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4100 tmap = rsm; 4101 } else { 4102 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4103 tmap = rsm; 4104 } 4105 tmap->r_in_tmap = 1; 4106 rsm = TAILQ_NEXT(rsm, r_next); 4107 } 4108 /* 4109 * Now lets possibly clear the sack filter so we start 4110 * recognizing sacks that cover this area. 4111 */ 4112 if (rack_use_sack_filter) 4113 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4114 4115 } 4116 4117 static void 4118 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4119 { 4120 uint32_t changed, last_seq, entered_recovery = 0; 4121 struct tcp_rack *rack; 4122 struct rack_sendmap *rsm; 4123 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4124 register uint32_t th_ack; 4125 int32_t i, j, k, num_sack_blks = 0; 4126 uint32_t cts, acked, ack_point, sack_changed = 0; 4127 4128 INP_WLOCK_ASSERT(tp->t_inpcb); 4129 if (th->th_flags & TH_RST) { 4130 /* We don't log resets */ 4131 return; 4132 } 4133 rack = (struct tcp_rack *)tp->t_fb_ptr; 4134 cts = tcp_ts_getticks(); 4135 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4136 changed = 0; 4137 th_ack = th->th_ack; 4138 4139 if (SEQ_GT(th_ack, tp->snd_una)) { 4140 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4141 tp->t_acktime = ticks; 4142 } 4143 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4144 changed = th_ack - rsm->r_start; 4145 if (changed) { 4146 /* 4147 * The ACK point is advancing to th_ack, we must drop off 4148 * the packets in the rack log and calculate any eligble 4149 * RTT's. 4150 */ 4151 rack->r_wanted_output++; 4152 more: 4153 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4154 if (rsm == NULL) { 4155 if ((th_ack - 1) == tp->iss) { 4156 /* 4157 * For the SYN incoming case we will not 4158 * have called tcp_output for the sending of 4159 * the SYN, so there will be no map. All 4160 * other cases should probably be a panic. 4161 */ 4162 goto proc_sack; 4163 } 4164 if (tp->t_flags & TF_SENTFIN) { 4165 /* if we send a FIN we will not hav a map */ 4166 goto proc_sack; 4167 } 4168 #ifdef INVARIANTS 4169 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4170 tp, 4171 th, tp->t_state, rack, 4172 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4173 #endif 4174 goto proc_sack; 4175 } 4176 if (SEQ_LT(th_ack, rsm->r_start)) { 4177 /* Huh map is missing this */ 4178 #ifdef INVARIANTS 4179 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 4180 rsm->r_start, 4181 th_ack, tp->t_state, rack->r_state); 4182 #endif 4183 goto proc_sack; 4184 } 4185 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4186 /* Now do we consume the whole thing? */ 4187 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4188 /* Its all consumed. */ 4189 uint32_t left; 4190 4191 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4192 rsm->r_rtr_bytes = 0; 4193 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 4194 if (rsm->r_in_tmap) { 4195 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4196 rsm->r_in_tmap = 0; 4197 } 4198 if (rack->r_ctl.rc_next == rsm) { 4199 /* scoot along the marker */ 4200 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); 4201 } 4202 if (rsm->r_flags & RACK_ACKED) { 4203 /* 4204 * It was acked on the scoreboard -- remove 4205 * it from total 4206 */ 4207 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4208 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4209 /* 4210 * There are acked segments ACKED on the 4211 * scoreboard further up. We are seeing 4212 * reordering. 4213 */ 4214 counter_u64_add(rack_reorder_seen, 1); 4215 rsm->r_flags |= RACK_ACKED; 4216 rack->r_ctl.rc_reorder_ts = cts; 4217 } 4218 left = th_ack - rsm->r_end; 4219 if (rsm->r_rtr_cnt > 1) { 4220 /* 4221 * Technically we should make r_rtr_cnt be 4222 * monotonicly increasing and just mod it to 4223 * the timestamp it is replacing.. that way 4224 * we would have the last 3 retransmits. Now 4225 * rc_loss_count will be wrong if we 4226 * retransmit something more than 2 times in 4227 * recovery :( 4228 */ 4229 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4230 } 4231 /* Free back to zone */ 4232 rack_free(rack, rsm); 4233 if (left) { 4234 goto more; 4235 } 4236 goto proc_sack; 4237 } 4238 if (rsm->r_flags & RACK_ACKED) { 4239 /* 4240 * It was acked on the scoreboard -- remove it from 4241 * total for the part being cum-acked. 4242 */ 4243 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4244 } 4245 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4246 rsm->r_rtr_bytes = 0; 4247 rsm->r_start = th_ack; 4248 } 4249 proc_sack: 4250 /* Check for reneging */ 4251 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4252 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4253 /* 4254 * The peer has moved snd_una up to 4255 * the edge of this send, i.e. one 4256 * that it had previously acked. The only 4257 * way that can be true if the peer threw 4258 * away data (space issues) that it had 4259 * previously sacked (else it would have 4260 * given us snd_una up to (rsm->r_end). 4261 * We need to undo the acked markings here. 4262 * 4263 * Note we have to look to make sure th_ack is 4264 * our rsm->r_start in case we get an old ack 4265 * where th_ack is behind snd_una. 4266 */ 4267 rack_peer_reneges(rack, rsm, th->th_ack); 4268 } 4269 if ((to->to_flags & TOF_SACK) == 0) { 4270 /* We are done nothing left to log */ 4271 goto out; 4272 } 4273 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4274 if (rsm) { 4275 last_seq = rsm->r_end; 4276 } else { 4277 last_seq = tp->snd_max; 4278 } 4279 /* Sack block processing */ 4280 if (SEQ_GT(th_ack, tp->snd_una)) 4281 ack_point = th_ack; 4282 else 4283 ack_point = tp->snd_una; 4284 for (i = 0; i < to->to_nsacks; i++) { 4285 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4286 &sack, sizeof(sack)); 4287 sack.start = ntohl(sack.start); 4288 sack.end = ntohl(sack.end); 4289 if (SEQ_GT(sack.end, sack.start) && 4290 SEQ_GT(sack.start, ack_point) && 4291 SEQ_LT(sack.start, tp->snd_max) && 4292 SEQ_GT(sack.end, ack_point) && 4293 SEQ_LEQ(sack.end, tp->snd_max)) { 4294 if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && 4295 (SEQ_LT(sack.end, last_seq)) && 4296 ((sack.end - sack.start) < (tp->t_maxseg / 8))) { 4297 /* 4298 * Not the last piece and its smaller than 4299 * 1/8th of a MSS. We ignore this. 4300 */ 4301 counter_u64_add(rack_runt_sacks, 1); 4302 continue; 4303 } 4304 sack_blocks[num_sack_blks] = sack; 4305 num_sack_blks++; 4306 #ifdef NETFLIX_STATS 4307 } else if (SEQ_LEQ(sack.start, th_ack) && 4308 SEQ_LEQ(sack.end, th_ack)) { 4309 /* 4310 * Its a D-SACK block. 4311 */ 4312 tcp_record_dsack(sack.start, sack.end); 4313 #endif 4314 } 4315 4316 } 4317 if (num_sack_blks == 0) 4318 goto out; 4319 /* 4320 * Sort the SACK blocks so we can update the rack scoreboard with 4321 * just one pass. 4322 */ 4323 if (rack_use_sack_filter) { 4324 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); 4325 } 4326 if (num_sack_blks < 2) { 4327 goto do_sack_work; 4328 } 4329 /* Sort the sacks */ 4330 for (i = 0; i < num_sack_blks; i++) { 4331 for (j = i + 1; j < num_sack_blks; j++) { 4332 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4333 sack = sack_blocks[i]; 4334 sack_blocks[i] = sack_blocks[j]; 4335 sack_blocks[j] = sack; 4336 } 4337 } 4338 } 4339 /* 4340 * Now are any of the sack block ends the same (yes some 4341 * implememtations send these)? 4342 */ 4343 again: 4344 if (num_sack_blks > 1) { 4345 for (i = 0; i < num_sack_blks; i++) { 4346 for (j = i + 1; j < num_sack_blks; j++) { 4347 if (sack_blocks[i].end == sack_blocks[j].end) { 4348 /* 4349 * Ok these two have the same end we 4350 * want the smallest end and then 4351 * throw away the larger and start 4352 * again. 4353 */ 4354 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4355 /* 4356 * The second block covers 4357 * more area use that 4358 */ 4359 sack_blocks[i].start = sack_blocks[j].start; 4360 } 4361 /* 4362 * Now collapse out the dup-sack and 4363 * lower the count 4364 */ 4365 for (k = (j + 1); k < num_sack_blks; k++) { 4366 sack_blocks[j].start = sack_blocks[k].start; 4367 sack_blocks[j].end = sack_blocks[k].end; 4368 j++; 4369 } 4370 num_sack_blks--; 4371 goto again; 4372 } 4373 } 4374 } 4375 } 4376 do_sack_work: 4377 rsm = rack->r_ctl.rc_sacklast; 4378 for (i = 0; i < num_sack_blks; i++) { 4379 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); 4380 if (acked) { 4381 rack->r_wanted_output++; 4382 changed += acked; 4383 sack_changed += acked; 4384 } 4385 } 4386 out: 4387 if (changed) { 4388 /* Something changed cancel the rack timer */ 4389 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4390 } 4391 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 4392 /* 4393 * Ok we have a high probability that we need to go in to 4394 * recovery since we have data sack'd 4395 */ 4396 struct rack_sendmap *rsm; 4397 uint32_t tsused; 4398 4399 tsused = tcp_ts_getticks(); 4400 rsm = tcp_rack_output(tp, rack, tsused); 4401 if (rsm) { 4402 /* Enter recovery */ 4403 rack->r_ctl.rc_rsm_start = rsm->r_start; 4404 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4405 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4406 entered_recovery = 1; 4407 rack_cong_signal(tp, NULL, CC_NDUPACK); 4408 /* 4409 * When we enter recovery we need to assure we send 4410 * one packet. 4411 */ 4412 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 4413 rack->r_timer_override = 1; 4414 } 4415 } 4416 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 4417 /* Deal with changed an PRR here (in recovery only) */ 4418 uint32_t pipe, snd_una; 4419 4420 rack->r_ctl.rc_prr_delivered += changed; 4421 /* Compute prr_sndcnt */ 4422 if (SEQ_GT(tp->snd_una, th_ack)) { 4423 snd_una = tp->snd_una; 4424 } else { 4425 snd_una = th_ack; 4426 } 4427 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 4428 if (pipe > tp->snd_ssthresh) { 4429 long sndcnt; 4430 4431 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 4432 if (rack->r_ctl.rc_prr_recovery_fs > 0) 4433 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 4434 else { 4435 rack->r_ctl.rc_prr_sndcnt = 0; 4436 sndcnt = 0; 4437 } 4438 sndcnt++; 4439 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 4440 sndcnt -= rack->r_ctl.rc_prr_out; 4441 else 4442 sndcnt = 0; 4443 rack->r_ctl.rc_prr_sndcnt = sndcnt; 4444 } else { 4445 uint32_t limit; 4446 4447 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 4448 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 4449 else 4450 limit = 0; 4451 if (changed > limit) 4452 limit = changed; 4453 limit += tp->t_maxseg; 4454 if (tp->snd_ssthresh > pipe) { 4455 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 4456 } else { 4457 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 4458 } 4459 } 4460 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { 4461 rack->r_timer_override = 1; 4462 } 4463 } 4464 } 4465 4466 /* 4467 * Return value of 1, we do not need to call rack_process_data(). 4468 * return value of 0, rack_process_data can be called. 4469 * For ret_val if its 0 the TCP is locked, if its non-zero 4470 * its unlocked and probably unsafe to touch the TCB. 4471 */ 4472 static int 4473 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 4474 struct tcpcb *tp, struct tcpopt *to, 4475 int32_t * ti_locked, uint32_t tiwin, int32_t tlen, 4476 int32_t * ofia, int32_t thflags, int32_t * ret_val) 4477 { 4478 int32_t ourfinisacked = 0; 4479 int32_t nsegs, acked_amount; 4480 int32_t acked; 4481 struct mbuf *mfree; 4482 struct tcp_rack *rack; 4483 int32_t recovery = 0; 4484 4485 rack = (struct tcp_rack *)tp->t_fb_ptr; 4486 if (SEQ_GT(th->th_ack, tp->snd_max)) { 4487 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 4488 return (1); 4489 } 4490 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 4491 rack_log_ack(tp, to, th); 4492 } 4493 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 4494 /* 4495 * Old ack, behind (or duplicate to) the last one rcv'd 4496 * Note: Should mark reordering is occuring! We should also 4497 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 4498 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 4499 * retran and> ack 3 4500 */ 4501 return (0); 4502 } 4503 /* 4504 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 4505 * something we sent. 4506 */ 4507 if (tp->t_flags & TF_NEEDSYN) { 4508 /* 4509 * T/TCP: Connection was half-synchronized, and our SYN has 4510 * been ACK'd (so connection is now fully synchronized). Go 4511 * to non-starred state, increment snd_una for ACK of SYN, 4512 * and check if we can do window scaling. 4513 */ 4514 tp->t_flags &= ~TF_NEEDSYN; 4515 tp->snd_una++; 4516 /* Do window scaling? */ 4517 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 4518 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 4519 tp->rcv_scale = tp->request_r_scale; 4520 /* Send window already scaled. */ 4521 } 4522 } 4523 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4524 INP_WLOCK_ASSERT(tp->t_inpcb); 4525 4526 acked = BYTES_THIS_ACK(tp, th); 4527 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 4528 TCPSTAT_ADD(tcps_rcvackbyte, acked); 4529 4530 /* 4531 * If we just performed our first retransmit, and the ACK arrives 4532 * within our recovery window, then it was a mistake to do the 4533 * retransmit in the first place. Recover our original cwnd and 4534 * ssthresh, and proceed to transmit where we left off. 4535 */ 4536 if (tp->t_flags & TF_PREVVALID) { 4537 tp->t_flags &= ~TF_PREVVALID; 4538 if (tp->t_rxtshift == 1 && 4539 (int)(ticks - tp->t_badrxtwin) < 0) 4540 rack_cong_signal(tp, th, CC_RTO_ERR); 4541 } 4542 /* 4543 * If we have a timestamp reply, update smoothed round trip time. If 4544 * no timestamp is present but transmit timer is running and timed 4545 * sequence number was acked, update smoothed round trip time. Since 4546 * we now have an rtt measurement, cancel the timer backoff (cf., 4547 * Phil Karn's retransmit alg.). Recompute the initial retransmit 4548 * timer. 4549 * 4550 * Some boxes send broken timestamp replies during the SYN+ACK 4551 * phase, ignore timestamps of 0 or we could calculate a huge RTT 4552 * and blow up the retransmit timer. 4553 */ 4554 /* 4555 * If all outstanding data is acked, stop retransmit timer and 4556 * remember to restart (more output or persist). If there is more 4557 * data to be acked, restart retransmit timer, using current 4558 * (possibly backed-off) value. 4559 */ 4560 if (th->th_ack == tp->snd_max) { 4561 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4562 rack->r_wanted_output++; 4563 } 4564 /* 4565 * If no data (only SYN) was ACK'd, skip rest of ACK processing. 4566 */ 4567 if (acked == 0) { 4568 if (ofia) 4569 *ofia = ourfinisacked; 4570 return (0); 4571 } 4572 if (rack->r_ctl.rc_early_recovery) { 4573 if (IN_FASTRECOVERY(tp->t_flags)) { 4574 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4575 tcp_rack_partialack(tp, th); 4576 } else { 4577 rack_post_recovery(tp, th); 4578 recovery = 1; 4579 } 4580 } 4581 } 4582 /* 4583 * Let the congestion control algorithm update congestion control 4584 * related information. This typically means increasing the 4585 * congestion window. 4586 */ 4587 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 4588 SOCKBUF_LOCK(&so->so_snd); 4589 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 4590 tp->snd_wnd -= acked_amount; 4591 mfree = sbcut_locked(&so->so_snd, acked_amount); 4592 if ((sbused(&so->so_snd) == 0) && 4593 (acked > acked_amount) && 4594 (tp->t_state >= TCPS_FIN_WAIT_1)) { 4595 ourfinisacked = 1; 4596 } 4597 /* NB: sowwakeup_locked() does an implicit unlock. */ 4598 sowwakeup_locked(so); 4599 m_freem(mfree); 4600 if (rack->r_ctl.rc_early_recovery == 0) { 4601 if (IN_FASTRECOVERY(tp->t_flags)) { 4602 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4603 tcp_rack_partialack(tp, th); 4604 } else { 4605 rack_post_recovery(tp, th); 4606 } 4607 } 4608 } 4609 tp->snd_una = th->th_ack; 4610 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 4611 tp->snd_recover = tp->snd_una; 4612 4613 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 4614 tp->snd_nxt = tp->snd_una; 4615 } 4616 if (tp->snd_una == tp->snd_max) { 4617 /* Nothing left outstanding */ 4618 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 4619 tp->t_acktime = 0; 4620 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4621 /* Set need output so persist might get set */ 4622 rack->r_wanted_output++; 4623 if (rack_use_sack_filter) 4624 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 4625 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 4626 (sbavail(&so->so_snd) == 0) && 4627 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 4628 /* 4629 * The socket was gone and the 4630 * peer sent data, time to 4631 * reset him. 4632 */ 4633 *ret_val = 1; 4634 tp = tcp_close(tp); 4635 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen); 4636 return (1); 4637 } 4638 } 4639 if (ofia) 4640 *ofia = ourfinisacked; 4641 return (0); 4642 } 4643 4644 4645 /* 4646 * Return value of 1, the TCB is unlocked and most 4647 * likely gone, return value of 0, the TCP is still 4648 * locked. 4649 */ 4650 static int 4651 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 4652 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 4653 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 4654 { 4655 /* 4656 * Update window information. Don't look at window if no ACK: TAC's 4657 * send garbage on first SYN. 4658 */ 4659 int32_t nsegs; 4660 #ifdef TCP_RFC7413 4661 int32_t tfo_syn; 4662 #else 4663 #define tfo_syn (FALSE) 4664 #endif 4665 struct tcp_rack *rack; 4666 4667 rack = (struct tcp_rack *)tp->t_fb_ptr; 4668 INP_WLOCK_ASSERT(tp->t_inpcb); 4669 4670 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4671 if ((thflags & TH_ACK) && 4672 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4673 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4674 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4675 /* keep track of pure window updates */ 4676 if (tlen == 0 && 4677 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4678 TCPSTAT_INC(tcps_rcvwinupd); 4679 tp->snd_wnd = tiwin; 4680 tp->snd_wl1 = th->th_seq; 4681 tp->snd_wl2 = th->th_ack; 4682 if (tp->snd_wnd > tp->max_sndwnd) 4683 tp->max_sndwnd = tp->snd_wnd; 4684 rack->r_wanted_output++; 4685 } else if (thflags & TH_ACK) { 4686 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 4687 tp->snd_wnd = tiwin; 4688 tp->snd_wl1 = th->th_seq; 4689 tp->snd_wl2 = th->th_ack; 4690 } 4691 } 4692 /* Was persist timer active and now we have window space? */ 4693 if ((rack->rc_in_persist != 0) && tp->snd_wnd) { 4694 rack_exit_persist(tp, rack); 4695 tp->snd_nxt = tp->snd_max; 4696 /* Make sure we output to start the timer */ 4697 rack->r_wanted_output++; 4698 } 4699 /* 4700 * Process segments with URG. 4701 */ 4702 if ((thflags & TH_URG) && th->th_urp && 4703 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4704 /* 4705 * This is a kludge, but if we receive and accept random 4706 * urgent pointers, we'll crash in soreceive. It's hard to 4707 * imagine someone actually wanting to send this much urgent 4708 * data. 4709 */ 4710 SOCKBUF_LOCK(&so->so_rcv); 4711 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 4712 th->th_urp = 0; /* XXX */ 4713 thflags &= ~TH_URG; /* XXX */ 4714 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 4715 goto dodata; /* XXX */ 4716 } 4717 /* 4718 * If this segment advances the known urgent pointer, then 4719 * mark the data stream. This should not happen in 4720 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 4721 * FIN has been received from the remote side. In these 4722 * states we ignore the URG. 4723 * 4724 * According to RFC961 (Assigned Protocols), the urgent 4725 * pointer points to the last octet of urgent data. We 4726 * continue, however, to consider it to indicate the first 4727 * octet of data past the urgent section as the original 4728 * spec states (in one of two places). 4729 */ 4730 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 4731 tp->rcv_up = th->th_seq + th->th_urp; 4732 so->so_oobmark = sbavail(&so->so_rcv) + 4733 (tp->rcv_up - tp->rcv_nxt) - 1; 4734 if (so->so_oobmark == 0) 4735 so->so_rcv.sb_state |= SBS_RCVATMARK; 4736 sohasoutofband(so); 4737 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4738 } 4739 SOCKBUF_UNLOCK(&so->so_rcv); 4740 /* 4741 * Remove out of band data so doesn't get presented to user. 4742 * This can happen independent of advancing the URG pointer, 4743 * but if two URG's are pending at once, some out-of-band 4744 * data may creep in... ick. 4745 */ 4746 if (th->th_urp <= (uint32_t) tlen && 4747 !(so->so_options & SO_OOBINLINE)) { 4748 /* hdr drop is delayed */ 4749 tcp_pulloutofband(so, th, m, drop_hdrlen); 4750 } 4751 } else { 4752 /* 4753 * If no out of band data is expected, pull receive urgent 4754 * pointer along with the receive window. 4755 */ 4756 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4757 tp->rcv_up = tp->rcv_nxt; 4758 } 4759 dodata: /* XXX */ 4760 INP_WLOCK_ASSERT(tp->t_inpcb); 4761 4762 /* 4763 * Process the segment text, merging it into the TCP sequencing 4764 * queue, and arranging for acknowledgment of receipt if necessary. 4765 * This process logically involves adjusting tp->rcv_wnd as data is 4766 * presented to the user (this happens in tcp_usrreq.c, case 4767 * PRU_RCVD). If a FIN has already been received on this connection 4768 * then we just ignore the text. 4769 */ 4770 #ifdef TCP_RFC7413 4771 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 4772 (tp->t_flags & TF_FASTOPEN)); 4773 #endif 4774 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 4775 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4776 tcp_seq save_start = th->th_seq; 4777 4778 m_adj(m, drop_hdrlen); /* delayed header drop */ 4779 /* 4780 * Insert segment which includes th into TCP reassembly 4781 * queue with control block tp. Set thflags to whether 4782 * reassembly now includes a segment with FIN. This handles 4783 * the common case inline (segment is the next to be 4784 * received on an established connection, and the queue is 4785 * empty), avoiding linkage into and removal from the queue 4786 * and repetition of various conversions. Set DELACK for 4787 * segments received in order, but ack immediately when 4788 * segments are out of order (so fast retransmit can work). 4789 */ 4790 if (th->th_seq == tp->rcv_nxt && 4791 LIST_EMPTY(&tp->t_segq) && 4792 (TCPS_HAVEESTABLISHED(tp->t_state) || 4793 tfo_syn)) { 4794 if (DELAY_ACK(tp, tlen) || tfo_syn) { 4795 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4796 tp->t_flags |= TF_DELACK; 4797 } else { 4798 rack->r_wanted_output++; 4799 tp->t_flags |= TF_ACKNOW; 4800 } 4801 tp->rcv_nxt += tlen; 4802 thflags = th->th_flags & TH_FIN; 4803 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4804 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4805 SOCKBUF_LOCK(&so->so_rcv); 4806 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4807 m_freem(m); 4808 else 4809 sbappendstream_locked(&so->so_rcv, m, 0); 4810 /* NB: sorwakeup_locked() does an implicit unlock. */ 4811 sorwakeup_locked(so); 4812 } else { 4813 /* 4814 * XXX: Due to the header drop above "th" is 4815 * theoretically invalid by now. Fortunately 4816 * m_adj() doesn't actually frees any mbufs when 4817 * trimming from the head. 4818 */ 4819 thflags = tcp_reass(tp, th, &tlen, m); 4820 tp->t_flags |= TF_ACKNOW; 4821 } 4822 if (tlen > 0) 4823 tcp_update_sack_list(tp, save_start, save_start + tlen); 4824 } else { 4825 m_freem(m); 4826 thflags &= ~TH_FIN; 4827 } 4828 4829 /* 4830 * If FIN is received ACK the FIN and let the user know that the 4831 * connection is closing. 4832 */ 4833 if (thflags & TH_FIN) { 4834 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4835 socantrcvmore(so); 4836 /* 4837 * If connection is half-synchronized (ie NEEDSYN 4838 * flag on) then delay ACK, so it may be piggybacked 4839 * when SYN is sent. Otherwise, since we received a 4840 * FIN then no more input can be expected, send ACK 4841 * now. 4842 */ 4843 if (tp->t_flags & TF_NEEDSYN) { 4844 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4845 tp->t_flags |= TF_DELACK; 4846 } else { 4847 tp->t_flags |= TF_ACKNOW; 4848 } 4849 tp->rcv_nxt++; 4850 } 4851 switch (tp->t_state) { 4852 4853 /* 4854 * In SYN_RECEIVED and ESTABLISHED STATES enter the 4855 * CLOSE_WAIT state. 4856 */ 4857 case TCPS_SYN_RECEIVED: 4858 tp->t_starttime = ticks; 4859 /* FALLTHROUGH */ 4860 case TCPS_ESTABLISHED: 4861 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4862 tcp_state_change(tp, TCPS_CLOSE_WAIT); 4863 break; 4864 4865 /* 4866 * If still in FIN_WAIT_1 STATE FIN has not been 4867 * acked so enter the CLOSING state. 4868 */ 4869 case TCPS_FIN_WAIT_1: 4870 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4871 tcp_state_change(tp, TCPS_CLOSING); 4872 break; 4873 4874 /* 4875 * In FIN_WAIT_2 state enter the TIME_WAIT state, 4876 * starting the time-wait timer, turning off the 4877 * other standard timers. 4878 */ 4879 case TCPS_FIN_WAIT_2: 4880 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4881 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 4882 KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata " 4883 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 4884 *ti_locked)); 4885 tcp_twstart(tp); 4886 *ti_locked = TI_UNLOCKED; 4887 INP_INFO_RUNLOCK(&V_tcbinfo); 4888 return (1); 4889 } 4890 } 4891 if (*ti_locked == TI_RLOCKED) { 4892 INP_INFO_RUNLOCK(&V_tcbinfo); 4893 *ti_locked = TI_UNLOCKED; 4894 } 4895 /* 4896 * Return any desired output. 4897 */ 4898 if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 4899 rack->r_wanted_output++; 4900 } 4901 KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 4902 __func__, *ti_locked)); 4903 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 4904 INP_WLOCK_ASSERT(tp->t_inpcb); 4905 return (0); 4906 } 4907 4908 /* 4909 * Here nothing is really faster, its just that we 4910 * have broken out the fast-data path also just like 4911 * the fast-ack. 4912 */ 4913 static int 4914 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 4915 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 4916 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt) 4917 { 4918 int32_t nsegs; 4919 int32_t newsize = 0; /* automatic sockbuf scaling */ 4920 struct tcp_rack *rack; 4921 #ifdef TCPDEBUG 4922 /* 4923 * The size of tcp_saveipgen must be the size of the max ip header, 4924 * now IPv6. 4925 */ 4926 u_char tcp_saveipgen[IP6_HDR_LEN]; 4927 struct tcphdr tcp_savetcp; 4928 short ostate = 0; 4929 4930 #endif 4931 /* 4932 * If last ACK falls within this segment's sequence numbers, record 4933 * the timestamp. NOTE that the test is modified according to the 4934 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 4935 */ 4936 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 4937 return (0); 4938 } 4939 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 4940 return (0); 4941 } 4942 if (tiwin && tiwin != tp->snd_wnd) { 4943 return (0); 4944 } 4945 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 4946 return (0); 4947 } 4948 if (__predict_false((to->to_flags & TOF_TS) && 4949 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 4950 return (0); 4951 } 4952 if (__predict_false((th->th_ack != tp->snd_una))) { 4953 return (0); 4954 } 4955 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 4956 return (0); 4957 } 4958 if ((to->to_flags & TOF_TS) != 0 && 4959 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 4960 tp->ts_recent_age = tcp_ts_getticks(); 4961 tp->ts_recent = to->to_tsval; 4962 } 4963 rack = (struct tcp_rack *)tp->t_fb_ptr; 4964 /* 4965 * This is a pure, in-sequence data packet with nothing on the 4966 * reassembly queue and we have enough buffer space to take it. 4967 */ 4968 if (*ti_locked == TI_RLOCKED) { 4969 INP_INFO_RUNLOCK(&V_tcbinfo); 4970 *ti_locked = TI_UNLOCKED; 4971 } 4972 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4973 4974 4975 /* Clean receiver SACK report if present */ 4976 if (tp->rcv_numsacks) 4977 tcp_clean_sackreport(tp); 4978 TCPSTAT_INC(tcps_preddat); 4979 tp->rcv_nxt += tlen; 4980 /* 4981 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 4982 */ 4983 tp->snd_wl1 = th->th_seq; 4984 /* 4985 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 4986 */ 4987 tp->rcv_up = tp->rcv_nxt; 4988 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4989 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4990 #ifdef TCPDEBUG 4991 if (so->so_options & SO_DEBUG) 4992 tcp_trace(TA_INPUT, ostate, tp, 4993 (void *)tcp_saveipgen, &tcp_savetcp, 0); 4994 #endif 4995 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 4996 4997 /* Add data to socket buffer. */ 4998 SOCKBUF_LOCK(&so->so_rcv); 4999 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5000 m_freem(m); 5001 } else { 5002 /* 5003 * Set new socket buffer size. Give up when limit is 5004 * reached. 5005 */ 5006 if (newsize) 5007 if (!sbreserve_locked(&so->so_rcv, 5008 newsize, so, NULL)) 5009 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 5010 m_adj(m, drop_hdrlen); /* delayed header drop */ 5011 sbappendstream_locked(&so->so_rcv, m, 0); 5012 rack_calc_rwin(so, tp); 5013 } 5014 /* NB: sorwakeup_locked() does an implicit unlock. */ 5015 sorwakeup_locked(so); 5016 if (DELAY_ACK(tp, tlen)) { 5017 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5018 tp->t_flags |= TF_DELACK; 5019 } else { 5020 tp->t_flags |= TF_ACKNOW; 5021 rack->r_wanted_output++; 5022 } 5023 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 5024 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5025 return (1); 5026 } 5027 5028 /* 5029 * This subfunction is used to try to highly optimize the 5030 * fast path. We again allow window updates that are 5031 * in sequence to remain in the fast-path. We also add 5032 * in the __predict's to attempt to help the compiler. 5033 * Note that if we return a 0, then we can *not* process 5034 * it and the caller should push the packet into the 5035 * slow-path. 5036 */ 5037 static int 5038 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5039 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5040 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 5041 { 5042 int32_t acked; 5043 int32_t nsegs; 5044 5045 #ifdef TCPDEBUG 5046 /* 5047 * The size of tcp_saveipgen must be the size of the max ip header, 5048 * now IPv6. 5049 */ 5050 u_char tcp_saveipgen[IP6_HDR_LEN]; 5051 struct tcphdr tcp_savetcp; 5052 short ostate = 0; 5053 5054 #endif 5055 struct tcp_rack *rack; 5056 5057 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5058 /* Old ack, behind (or duplicate to) the last one rcv'd */ 5059 return (0); 5060 } 5061 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 5062 /* Above what we have sent? */ 5063 return (0); 5064 } 5065 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5066 /* We are retransmitting */ 5067 return (0); 5068 } 5069 if (__predict_false(tiwin == 0)) { 5070 /* zero window */ 5071 return (0); 5072 } 5073 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 5074 /* We need a SYN or a FIN, unlikely.. */ 5075 return (0); 5076 } 5077 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 5078 /* Timestamp is behind .. old ack with seq wrap? */ 5079 return (0); 5080 } 5081 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 5082 /* Still recovering */ 5083 return (0); 5084 } 5085 rack = (struct tcp_rack *)tp->t_fb_ptr; 5086 if (rack->r_ctl.rc_sacked) { 5087 /* We have sack holes on our scoreboard */ 5088 return (0); 5089 } 5090 /* Ok if we reach here, we can process a fast-ack */ 5091 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5092 rack_log_ack(tp, to, th); 5093 /* Did the window get updated? */ 5094 if (tiwin != tp->snd_wnd) { 5095 tp->snd_wnd = tiwin; 5096 tp->snd_wl1 = th->th_seq; 5097 if (tp->snd_wnd > tp->max_sndwnd) 5098 tp->max_sndwnd = tp->snd_wnd; 5099 } 5100 if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { 5101 rack_exit_persist(tp, rack); 5102 } 5103 /* 5104 * If last ACK falls within this segment's sequence numbers, record 5105 * the timestamp. NOTE that the test is modified according to the 5106 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 5107 */ 5108 if ((to->to_flags & TOF_TS) != 0 && 5109 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5110 tp->ts_recent_age = tcp_ts_getticks(); 5111 tp->ts_recent = to->to_tsval; 5112 } 5113 /* 5114 * This is a pure ack for outstanding data. 5115 */ 5116 if (*ti_locked == TI_RLOCKED) { 5117 INP_INFO_RUNLOCK(&V_tcbinfo); 5118 *ti_locked = TI_UNLOCKED; 5119 } 5120 TCPSTAT_INC(tcps_predack); 5121 5122 /* 5123 * "bad retransmit" recovery. 5124 */ 5125 if (tp->t_flags & TF_PREVVALID) { 5126 tp->t_flags &= ~TF_PREVVALID; 5127 if (tp->t_rxtshift == 1 && 5128 (int)(ticks - tp->t_badrxtwin) < 0) 5129 rack_cong_signal(tp, th, CC_RTO_ERR); 5130 } 5131 /* 5132 * Recalculate the transmit timer / rtt. 5133 * 5134 * Some boxes send broken timestamp replies during the SYN+ACK 5135 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5136 * and blow up the retransmit timer. 5137 */ 5138 acked = BYTES_THIS_ACK(tp, th); 5139 5140 #ifdef TCP_HHOOK 5141 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 5142 hhook_run_tcp_est_in(tp, th, to); 5143 #endif 5144 5145 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5146 TCPSTAT_ADD(tcps_rcvackbyte, acked); 5147 sbdrop(&so->so_snd, acked); 5148 /* 5149 * Let the congestion control algorithm update congestion control 5150 * related information. This typically means increasing the 5151 * congestion window. 5152 */ 5153 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 5154 5155 tp->snd_una = th->th_ack; 5156 /* 5157 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 5158 */ 5159 tp->snd_wl2 = th->th_ack; 5160 tp->t_dupacks = 0; 5161 m_freem(m); 5162 /* ND6_HINT(tp); *//* Some progress has been made. */ 5163 5164 /* 5165 * If all outstanding data are acked, stop retransmit timer, 5166 * otherwise restart timer using current (possibly backed-off) 5167 * value. If process is waiting for space, wakeup/selwakeup/signal. 5168 * If data are ready to send, let tcp_output decide between more 5169 * output or persist. 5170 */ 5171 #ifdef TCPDEBUG 5172 if (so->so_options & SO_DEBUG) 5173 tcp_trace(TA_INPUT, ostate, tp, 5174 (void *)tcp_saveipgen, 5175 &tcp_savetcp, 0); 5176 #endif 5177 if (tp->snd_una == tp->snd_max) { 5178 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5179 tp->t_acktime = 0; 5180 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5181 } 5182 /* Wake up the socket if we have room to write more */ 5183 sowwakeup(so); 5184 if (sbavail(&so->so_snd)) { 5185 rack->r_wanted_output++; 5186 } 5187 return (1); 5188 } 5189 5190 /* 5191 * Return value of 1, the TCB is unlocked and most 5192 * likely gone, return value of 0, the TCP is still 5193 * locked. 5194 */ 5195 static int 5196 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 5197 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5198 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5199 { 5200 int32_t ret_val = 0; 5201 int32_t todrop; 5202 int32_t ourfinisacked = 0; 5203 5204 rack_calc_rwin(so, tp); 5205 /* 5206 * If the state is SYN_SENT: if seg contains an ACK, but not for our 5207 * SYN, drop the input. if seg contains a RST, then drop the 5208 * connection. if seg does not contain SYN, then drop it. Otherwise 5209 * this is an acceptable SYN segment initialize tp->rcv_nxt and 5210 * tp->irs if seg contains ack then advance tp->snd_una if seg 5211 * contains an ECE and ECN support is enabled, the stream is ECN 5212 * capable. if SYN has been acked change to ESTABLISHED else 5213 * SYN_RCVD state arrange for segment to be acked (eventually) 5214 * continue processing rest of data/controls, beginning with URG 5215 */ 5216 if ((thflags & TH_ACK) && 5217 (SEQ_LEQ(th->th_ack, tp->iss) || 5218 SEQ_GT(th->th_ack, tp->snd_max))) { 5219 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5220 return (1); 5221 } 5222 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 5223 TCP_PROBE5(connect__refused, NULL, tp, 5224 mtod(m, const char *), tp, th); 5225 tp = tcp_drop(tp, ECONNREFUSED); 5226 rack_do_drop(m, tp, ti_locked); 5227 return (1); 5228 } 5229 if (thflags & TH_RST) { 5230 rack_do_drop(m, tp, ti_locked); 5231 return (1); 5232 } 5233 if (!(thflags & TH_SYN)) { 5234 rack_do_drop(m, tp, ti_locked); 5235 return (1); 5236 } 5237 tp->irs = th->th_seq; 5238 tcp_rcvseqinit(tp); 5239 if (thflags & TH_ACK) { 5240 TCPSTAT_INC(tcps_connects); 5241 soisconnected(so); 5242 #ifdef MAC 5243 mac_socketpeer_set_from_mbuf(m, so); 5244 #endif 5245 /* Do window scaling on this connection? */ 5246 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5247 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5248 tp->rcv_scale = tp->request_r_scale; 5249 } 5250 tp->rcv_adv += min(tp->rcv_wnd, 5251 TCP_MAXWIN << tp->rcv_scale); 5252 /* 5253 * If there's data, delay ACK; if there's also a FIN ACKNOW 5254 * will be turned on later. 5255 */ 5256 if (DELAY_ACK(tp, tlen) && tlen != 0) { 5257 rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, 5258 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); 5259 tp->t_flags |= TF_DELACK; 5260 } else { 5261 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 5262 tp->t_flags |= TF_ACKNOW; 5263 } 5264 5265 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 5266 tp->t_flags |= TF_ECN_PERMIT; 5267 TCPSTAT_INC(tcps_ecn_shs); 5268 } 5269 /* 5270 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 5271 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 5272 */ 5273 tp->t_starttime = ticks; 5274 if (tp->t_flags & TF_NEEDFIN) { 5275 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5276 tp->t_flags &= ~TF_NEEDFIN; 5277 thflags &= ~TH_SYN; 5278 } else { 5279 tcp_state_change(tp, TCPS_ESTABLISHED); 5280 TCP_PROBE5(connect__established, NULL, tp, 5281 mtod(m, const char *), tp, th); 5282 cc_conn_init(tp); 5283 } 5284 } else { 5285 /* 5286 * Received initial SYN in SYN-SENT[*] state => simultaneous 5287 * open. If segment contains CC option and there is a 5288 * cached CC, apply TAO test. If it succeeds, connection is * 5289 * half-synchronized. Otherwise, do 3-way handshake: 5290 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 5291 * there was no CC option, clear cached CC value. 5292 */ 5293 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 5294 tcp_state_change(tp, TCPS_SYN_RECEIVED); 5295 } 5296 KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 5297 "ti_locked %d", __func__, *ti_locked)); 5298 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5299 INP_WLOCK_ASSERT(tp->t_inpcb); 5300 /* 5301 * Advance th->th_seq to correspond to first data byte. If data, 5302 * trim to stay within window, dropping FIN if necessary. 5303 */ 5304 th->th_seq++; 5305 if (tlen > tp->rcv_wnd) { 5306 todrop = tlen - tp->rcv_wnd; 5307 m_adj(m, -todrop); 5308 tlen = tp->rcv_wnd; 5309 thflags &= ~TH_FIN; 5310 TCPSTAT_INC(tcps_rcvpackafterwin); 5311 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 5312 } 5313 tp->snd_wl1 = th->th_seq - 1; 5314 tp->rcv_up = th->th_seq; 5315 /* 5316 * Client side of transaction: already sent SYN and data. If the 5317 * remote host used T/TCP to validate the SYN, our data will be 5318 * ACK'd; if so, enter normal data segment processing in the middle 5319 * of step 5, ack processing. Otherwise, goto step 6. 5320 */ 5321 if (thflags & TH_ACK) { 5322 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 5323 return (ret_val); 5324 /* We may have changed to FIN_WAIT_1 above */ 5325 if (tp->t_state == TCPS_FIN_WAIT_1) { 5326 /* 5327 * In FIN_WAIT_1 STATE in addition to the processing 5328 * for the ESTABLISHED state if our FIN is now 5329 * acknowledged then enter FIN_WAIT_2. 5330 */ 5331 if (ourfinisacked) { 5332 /* 5333 * If we can't receive any more data, then 5334 * closing user can proceed. Starting the 5335 * timer is contrary to the specification, 5336 * but if we don't get a FIN we'll hang 5337 * forever. 5338 * 5339 * XXXjl: we should release the tp also, and 5340 * use a compressed state. 5341 */ 5342 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5343 soisdisconnected(so); 5344 tcp_timer_activate(tp, TT_2MSL, 5345 (tcp_fast_finwait2_recycle ? 5346 tcp_finwait2_timeout : 5347 TP_MAXIDLE(tp))); 5348 } 5349 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5350 } 5351 } 5352 } 5353 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5354 ti_locked, tiwin, thflags, nxt_pkt)); 5355 } 5356 5357 /* 5358 * Return value of 1, the TCB is unlocked and most 5359 * likely gone, return value of 0, the TCP is still 5360 * locked. 5361 */ 5362 static int 5363 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 5364 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5365 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5366 { 5367 int32_t ret_val = 0; 5368 int32_t ourfinisacked = 0; 5369 5370 rack_calc_rwin(so, tp); 5371 5372 if ((thflags & TH_ACK) && 5373 (SEQ_LEQ(th->th_ack, tp->snd_una) || 5374 SEQ_GT(th->th_ack, tp->snd_max))) { 5375 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5376 return (1); 5377 } 5378 #ifdef TCP_RFC7413 5379 if (tp->t_flags & TF_FASTOPEN) { 5380 /* 5381 * When a TFO connection is in SYN_RECEIVED, the only valid 5382 * packets are the initial SYN, a retransmit/copy of the 5383 * initial SYN (possibly with a subset of the original 5384 * data), a valid ACK, a FIN, or a RST. 5385 */ 5386 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 5387 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5388 return (1); 5389 } else if (thflags & TH_SYN) { 5390 /* non-initial SYN is ignored */ 5391 struct tcp_rack *rack; 5392 5393 rack = (struct tcp_rack *)tp->t_fb_ptr; 5394 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 5395 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 5396 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 5397 rack_do_drop(m, NULL, ti_locked); 5398 return (0); 5399 } 5400 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 5401 rack_do_drop(m, NULL, ti_locked); 5402 return (0); 5403 } 5404 } 5405 #endif 5406 if (thflags & TH_RST) 5407 return (rack_process_rst(m, th, so, tp, ti_locked)); 5408 /* 5409 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5410 * synchronized state. 5411 */ 5412 if (thflags & TH_SYN) { 5413 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5414 return (ret_val); 5415 } 5416 /* 5417 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5418 * it's less than ts_recent, drop it. 5419 */ 5420 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5421 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5422 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5423 return (ret_val); 5424 } 5425 /* 5426 * In the SYN-RECEIVED state, validate that the packet belongs to 5427 * this connection before trimming the data to fit the receive 5428 * window. Check the sequence number versus IRS since we know the 5429 * sequence numbers haven't wrapped. This is a partial fix for the 5430 * "LAND" DoS attack. 5431 */ 5432 if (SEQ_LT(th->th_seq, tp->irs)) { 5433 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5434 return (1); 5435 } 5436 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5437 return (ret_val); 5438 } 5439 /* 5440 * If last ACK falls within this segment's sequence numbers, record 5441 * its timestamp. NOTE: 1) That the test incorporates suggestions 5442 * from the latest proposal of the tcplw@cray.com list (Braden 5443 * 1993/04/26). 2) That updating only on newer timestamps interferes 5444 * with our earlier PAWS tests, so this check should be solely 5445 * predicated on the sequence space of this segment. 3) That we 5446 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5447 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5448 * SEG.Len, This modified check allows us to overcome RFC1323's 5449 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5450 * p.869. In such cases, we can still calculate the RTT correctly 5451 * when RCV.NXT == Last.ACK.Sent. 5452 */ 5453 if ((to->to_flags & TOF_TS) != 0 && 5454 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5455 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5456 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5457 tp->ts_recent_age = tcp_ts_getticks(); 5458 tp->ts_recent = to->to_tsval; 5459 } 5460 /* 5461 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5462 * is on (half-synchronized state), then queue data for later 5463 * processing; else drop segment and return. 5464 */ 5465 if ((thflags & TH_ACK) == 0) { 5466 #ifdef TCP_RFC7413 5467 if (tp->t_flags & TF_FASTOPEN) { 5468 tp->snd_wnd = tiwin; 5469 cc_conn_init(tp); 5470 } 5471 #endif 5472 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5473 ti_locked, tiwin, thflags, nxt_pkt)); 5474 } 5475 TCPSTAT_INC(tcps_connects); 5476 soisconnected(so); 5477 /* Do window scaling? */ 5478 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5479 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5480 tp->rcv_scale = tp->request_r_scale; 5481 tp->snd_wnd = tiwin; 5482 } 5483 /* 5484 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 5485 * FIN-WAIT-1 5486 */ 5487 tp->t_starttime = ticks; 5488 if (tp->t_flags & TF_NEEDFIN) { 5489 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5490 tp->t_flags &= ~TF_NEEDFIN; 5491 } else { 5492 tcp_state_change(tp, TCPS_ESTABLISHED); 5493 TCP_PROBE5(accept__established, NULL, tp, 5494 mtod(m, const char *), tp, th); 5495 #ifdef TCP_RFC7413 5496 if (tp->t_tfo_pending) { 5497 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 5498 tp->t_tfo_pending = NULL; 5499 5500 /* 5501 * Account for the ACK of our SYN prior to regular 5502 * ACK processing below. 5503 */ 5504 tp->snd_una++; 5505 } 5506 /* 5507 * TFO connections call cc_conn_init() during SYN 5508 * processing. Calling it again here for such connections 5509 * is not harmless as it would undo the snd_cwnd reduction 5510 * that occurs when a TFO SYN|ACK is retransmitted. 5511 */ 5512 if (!(tp->t_flags & TF_FASTOPEN)) 5513 #endif 5514 cc_conn_init(tp); 5515 } 5516 /* 5517 * If segment contains data or ACK, will call tcp_reass() later; if 5518 * not, do so now to pass queued data to user. 5519 */ 5520 if (tlen == 0 && (thflags & TH_FIN) == 0) 5521 (void)tcp_reass(tp, (struct tcphdr *)0, 0, 5522 (struct mbuf *)0); 5523 tp->snd_wl1 = th->th_seq - 1; 5524 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5525 return (ret_val); 5526 } 5527 if (tp->t_state == TCPS_FIN_WAIT_1) { 5528 /* We could have went to FIN_WAIT_1 (or EST) above */ 5529 /* 5530 * In FIN_WAIT_1 STATE in addition to the processing for the 5531 * ESTABLISHED state if our FIN is now acknowledged then 5532 * enter FIN_WAIT_2. 5533 */ 5534 if (ourfinisacked) { 5535 /* 5536 * If we can't receive any more data, then closing 5537 * user can proceed. Starting the timer is contrary 5538 * to the specification, but if we don't get a FIN 5539 * we'll hang forever. 5540 * 5541 * XXXjl: we should release the tp also, and use a 5542 * compressed state. 5543 */ 5544 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5545 soisdisconnected(so); 5546 tcp_timer_activate(tp, TT_2MSL, 5547 (tcp_fast_finwait2_recycle ? 5548 tcp_finwait2_timeout : 5549 TP_MAXIDLE(tp))); 5550 } 5551 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5552 } 5553 } 5554 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5555 ti_locked, tiwin, thflags, nxt_pkt)); 5556 } 5557 5558 /* 5559 * Return value of 1, the TCB is unlocked and most 5560 * likely gone, return value of 0, the TCP is still 5561 * locked. 5562 */ 5563 static int 5564 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 5565 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5566 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5567 { 5568 int32_t ret_val = 0; 5569 5570 /* 5571 * Header prediction: check for the two common cases of a 5572 * uni-directional data xfer. If the packet has no control flags, 5573 * is in-sequence, the window didn't change and we're not 5574 * retransmitting, it's a candidate. If the length is zero and the 5575 * ack moved forward, we're the sender side of the xfer. Just free 5576 * the data acked & wake any higher level process that was blocked 5577 * waiting for space. If the length is non-zero and the ack didn't 5578 * move, we're the receiver side. If we're getting packets in-order 5579 * (the reassembly queue is empty), add the data toc The socket 5580 * buffer and note that we need a delayed ack. Make sure that the 5581 * hidden state-flags are also off. Since we check for 5582 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 5583 */ 5584 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 5585 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 5586 __predict_true(LIST_EMPTY(&tp->t_segq)) && 5587 __predict_true(th->th_seq == tp->rcv_nxt)) { 5588 struct tcp_rack *rack; 5589 5590 rack = (struct tcp_rack *)tp->t_fb_ptr; 5591 if (tlen == 0) { 5592 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 5593 ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 5594 return (0); 5595 } 5596 } else { 5597 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 5598 ti_locked, tiwin, nxt_pkt)) { 5599 return (0); 5600 } 5601 } 5602 } 5603 rack_calc_rwin(so, tp); 5604 5605 if (thflags & TH_RST) 5606 return (rack_process_rst(m, th, so, tp, ti_locked)); 5607 5608 /* 5609 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5610 * synchronized state. 5611 */ 5612 if (thflags & TH_SYN) { 5613 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5614 return (ret_val); 5615 } 5616 /* 5617 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5618 * it's less than ts_recent, drop it. 5619 */ 5620 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5621 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5622 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5623 return (ret_val); 5624 } 5625 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5626 return (ret_val); 5627 } 5628 /* 5629 * If last ACK falls within this segment's sequence numbers, record 5630 * its timestamp. NOTE: 1) That the test incorporates suggestions 5631 * from the latest proposal of the tcplw@cray.com list (Braden 5632 * 1993/04/26). 2) That updating only on newer timestamps interferes 5633 * with our earlier PAWS tests, so this check should be solely 5634 * predicated on the sequence space of this segment. 3) That we 5635 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5636 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5637 * SEG.Len, This modified check allows us to overcome RFC1323's 5638 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5639 * p.869. In such cases, we can still calculate the RTT correctly 5640 * when RCV.NXT == Last.ACK.Sent. 5641 */ 5642 if ((to->to_flags & TOF_TS) != 0 && 5643 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5644 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5645 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5646 tp->ts_recent_age = tcp_ts_getticks(); 5647 tp->ts_recent = to->to_tsval; 5648 } 5649 /* 5650 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5651 * is on (half-synchronized state), then queue data for later 5652 * processing; else drop segment and return. 5653 */ 5654 if ((thflags & TH_ACK) == 0) { 5655 if (tp->t_flags & TF_NEEDSYN) { 5656 5657 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5658 ti_locked, tiwin, thflags, nxt_pkt)); 5659 5660 } else if (tp->t_flags & TF_ACKNOW) { 5661 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5662 return (ret_val); 5663 } else { 5664 rack_do_drop(m, NULL, ti_locked); 5665 return (0); 5666 } 5667 } 5668 /* 5669 * Ack processing. 5670 */ 5671 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5672 return (ret_val); 5673 } 5674 if (sbavail(&so->so_snd)) { 5675 if (rack_progress_timeout_check(tp)) { 5676 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5677 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5678 return (1); 5679 } 5680 } 5681 /* State changes only happen in rack_process_data() */ 5682 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5683 ti_locked, tiwin, thflags, nxt_pkt)); 5684 } 5685 5686 /* 5687 * Return value of 1, the TCB is unlocked and most 5688 * likely gone, return value of 0, the TCP is still 5689 * locked. 5690 */ 5691 static int 5692 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 5693 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5694 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5695 { 5696 int32_t ret_val = 0; 5697 5698 rack_calc_rwin(so, tp); 5699 if (thflags & TH_RST) 5700 return (rack_process_rst(m, th, so, tp, ti_locked)); 5701 /* 5702 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5703 * synchronized state. 5704 */ 5705 if (thflags & TH_SYN) { 5706 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5707 return (ret_val); 5708 } 5709 /* 5710 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5711 * it's less than ts_recent, drop it. 5712 */ 5713 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5714 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5715 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5716 return (ret_val); 5717 } 5718 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5719 return (ret_val); 5720 } 5721 /* 5722 * If last ACK falls within this segment's sequence numbers, record 5723 * its timestamp. NOTE: 1) That the test incorporates suggestions 5724 * from the latest proposal of the tcplw@cray.com list (Braden 5725 * 1993/04/26). 2) That updating only on newer timestamps interferes 5726 * with our earlier PAWS tests, so this check should be solely 5727 * predicated on the sequence space of this segment. 3) That we 5728 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5729 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5730 * SEG.Len, This modified check allows us to overcome RFC1323's 5731 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5732 * p.869. In such cases, we can still calculate the RTT correctly 5733 * when RCV.NXT == Last.ACK.Sent. 5734 */ 5735 if ((to->to_flags & TOF_TS) != 0 && 5736 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5737 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5738 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5739 tp->ts_recent_age = tcp_ts_getticks(); 5740 tp->ts_recent = to->to_tsval; 5741 } 5742 /* 5743 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5744 * is on (half-synchronized state), then queue data for later 5745 * processing; else drop segment and return. 5746 */ 5747 if ((thflags & TH_ACK) == 0) { 5748 if (tp->t_flags & TF_NEEDSYN) { 5749 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5750 ti_locked, tiwin, thflags, nxt_pkt)); 5751 5752 } else if (tp->t_flags & TF_ACKNOW) { 5753 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5754 return (ret_val); 5755 } else { 5756 rack_do_drop(m, NULL, ti_locked); 5757 return (0); 5758 } 5759 } 5760 /* 5761 * Ack processing. 5762 */ 5763 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5764 return (ret_val); 5765 } 5766 if (sbavail(&so->so_snd)) { 5767 if (rack_progress_timeout_check(tp)) { 5768 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5769 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5770 return (1); 5771 } 5772 } 5773 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5774 ti_locked, tiwin, thflags, nxt_pkt)); 5775 } 5776 5777 static int 5778 rack_check_data_after_close(struct mbuf *m, 5779 struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) 5780 { 5781 struct tcp_rack *rack; 5782 5783 KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 5784 "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); 5785 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5786 rack = (struct tcp_rack *)tp->t_fb_ptr; 5787 if (rack->rc_allow_data_af_clo == 0) { 5788 close_now: 5789 tp = tcp_close(tp); 5790 TCPSTAT_INC(tcps_rcvafterclose); 5791 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); 5792 return (1); 5793 } 5794 if (sbavail(&so->so_snd) == 0) 5795 goto close_now; 5796 /* Ok we allow data that is ignored and a followup reset */ 5797 tp->rcv_nxt = th->th_seq + *tlen; 5798 tp->t_flags2 |= TF2_DROP_AF_DATA; 5799 rack->r_wanted_output = 1; 5800 *tlen = 0; 5801 return (0); 5802 } 5803 5804 /* 5805 * Return value of 1, the TCB is unlocked and most 5806 * likely gone, return value of 0, the TCP is still 5807 * locked. 5808 */ 5809 static int 5810 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 5811 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5812 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5813 { 5814 int32_t ret_val = 0; 5815 int32_t ourfinisacked = 0; 5816 5817 rack_calc_rwin(so, tp); 5818 5819 if (thflags & TH_RST) 5820 return (rack_process_rst(m, th, so, tp, ti_locked)); 5821 /* 5822 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5823 * synchronized state. 5824 */ 5825 if (thflags & TH_SYN) { 5826 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5827 return (ret_val); 5828 } 5829 /* 5830 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5831 * it's less than ts_recent, drop it. 5832 */ 5833 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5834 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5835 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5836 return (ret_val); 5837 } 5838 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5839 return (ret_val); 5840 } 5841 /* 5842 * If new data are received on a connection after the user processes 5843 * are gone, then RST the other end. 5844 */ 5845 if ((so->so_state & SS_NOFDREF) && tlen) { 5846 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5847 return (1); 5848 } 5849 /* 5850 * If last ACK falls within this segment's sequence numbers, record 5851 * its timestamp. NOTE: 1) That the test incorporates suggestions 5852 * from the latest proposal of the tcplw@cray.com list (Braden 5853 * 1993/04/26). 2) That updating only on newer timestamps interferes 5854 * with our earlier PAWS tests, so this check should be solely 5855 * predicated on the sequence space of this segment. 3) That we 5856 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5857 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5858 * SEG.Len, This modified check allows us to overcome RFC1323's 5859 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5860 * p.869. In such cases, we can still calculate the RTT correctly 5861 * when RCV.NXT == Last.ACK.Sent. 5862 */ 5863 if ((to->to_flags & TOF_TS) != 0 && 5864 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5865 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5866 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5867 tp->ts_recent_age = tcp_ts_getticks(); 5868 tp->ts_recent = to->to_tsval; 5869 } 5870 /* 5871 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5872 * is on (half-synchronized state), then queue data for later 5873 * processing; else drop segment and return. 5874 */ 5875 if ((thflags & TH_ACK) == 0) { 5876 if (tp->t_flags & TF_NEEDSYN) { 5877 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5878 ti_locked, tiwin, thflags, nxt_pkt)); 5879 } else if (tp->t_flags & TF_ACKNOW) { 5880 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5881 return (ret_val); 5882 } else { 5883 rack_do_drop(m, NULL, ti_locked); 5884 return (0); 5885 } 5886 } 5887 /* 5888 * Ack processing. 5889 */ 5890 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5891 return (ret_val); 5892 } 5893 if (ourfinisacked) { 5894 /* 5895 * If we can't receive any more data, then closing user can 5896 * proceed. Starting the timer is contrary to the 5897 * specification, but if we don't get a FIN we'll hang 5898 * forever. 5899 * 5900 * XXXjl: we should release the tp also, and use a 5901 * compressed state. 5902 */ 5903 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5904 soisdisconnected(so); 5905 tcp_timer_activate(tp, TT_2MSL, 5906 (tcp_fast_finwait2_recycle ? 5907 tcp_finwait2_timeout : 5908 TP_MAXIDLE(tp))); 5909 } 5910 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5911 } 5912 if (sbavail(&so->so_snd)) { 5913 if (rack_progress_timeout_check(tp)) { 5914 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5915 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5916 return (1); 5917 } 5918 } 5919 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5920 ti_locked, tiwin, thflags, nxt_pkt)); 5921 } 5922 5923 /* 5924 * Return value of 1, the TCB is unlocked and most 5925 * likely gone, return value of 0, the TCP is still 5926 * locked. 5927 */ 5928 static int 5929 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 5930 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5931 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5932 { 5933 int32_t ret_val = 0; 5934 int32_t ourfinisacked = 0; 5935 5936 rack_calc_rwin(so, tp); 5937 5938 if (thflags & TH_RST) 5939 return (rack_process_rst(m, th, so, tp, ti_locked)); 5940 /* 5941 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5942 * synchronized state. 5943 */ 5944 if (thflags & TH_SYN) { 5945 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5946 return (ret_val); 5947 } 5948 /* 5949 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5950 * it's less than ts_recent, drop it. 5951 */ 5952 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5953 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5954 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5955 return (ret_val); 5956 } 5957 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5958 return (ret_val); 5959 } 5960 /* 5961 * If new data are received on a connection after the user processes 5962 * are gone, then RST the other end. 5963 */ 5964 if ((so->so_state & SS_NOFDREF) && tlen) { 5965 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5966 return (1); 5967 } 5968 /* 5969 * If last ACK falls within this segment's sequence numbers, record 5970 * its timestamp. NOTE: 1) That the test incorporates suggestions 5971 * from the latest proposal of the tcplw@cray.com list (Braden 5972 * 1993/04/26). 2) That updating only on newer timestamps interferes 5973 * with our earlier PAWS tests, so this check should be solely 5974 * predicated on the sequence space of this segment. 3) That we 5975 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5976 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5977 * SEG.Len, This modified check allows us to overcome RFC1323's 5978 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5979 * p.869. In such cases, we can still calculate the RTT correctly 5980 * when RCV.NXT == Last.ACK.Sent. 5981 */ 5982 if ((to->to_flags & TOF_TS) != 0 && 5983 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5984 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5985 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5986 tp->ts_recent_age = tcp_ts_getticks(); 5987 tp->ts_recent = to->to_tsval; 5988 } 5989 /* 5990 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5991 * is on (half-synchronized state), then queue data for later 5992 * processing; else drop segment and return. 5993 */ 5994 if ((thflags & TH_ACK) == 0) { 5995 if (tp->t_flags & TF_NEEDSYN) { 5996 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5997 ti_locked, tiwin, thflags, nxt_pkt)); 5998 } else if (tp->t_flags & TF_ACKNOW) { 5999 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6000 return (ret_val); 6001 } else { 6002 rack_do_drop(m, NULL, ti_locked); 6003 return (0); 6004 } 6005 } 6006 /* 6007 * Ack processing. 6008 */ 6009 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6010 return (ret_val); 6011 } 6012 if (ourfinisacked) { 6013 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6014 tcp_twstart(tp); 6015 INP_INFO_RUNLOCK(&V_tcbinfo); 6016 *ti_locked = TI_UNLOCKED; 6017 m_freem(m); 6018 return (1); 6019 } 6020 if (sbavail(&so->so_snd)) { 6021 if (rack_progress_timeout_check(tp)) { 6022 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6023 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6024 return (1); 6025 } 6026 } 6027 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6028 ti_locked, tiwin, thflags, nxt_pkt)); 6029 } 6030 6031 /* 6032 * Return value of 1, the TCB is unlocked and most 6033 * likely gone, return value of 0, the TCP is still 6034 * locked. 6035 */ 6036 static int 6037 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6038 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6039 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6040 { 6041 int32_t ret_val = 0; 6042 int32_t ourfinisacked = 0; 6043 6044 rack_calc_rwin(so, tp); 6045 6046 if (thflags & TH_RST) 6047 return (rack_process_rst(m, th, so, tp, ti_locked)); 6048 /* 6049 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6050 * synchronized state. 6051 */ 6052 if (thflags & TH_SYN) { 6053 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6054 return (ret_val); 6055 } 6056 /* 6057 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6058 * it's less than ts_recent, drop it. 6059 */ 6060 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6061 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6062 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6063 return (ret_val); 6064 } 6065 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6066 return (ret_val); 6067 } 6068 /* 6069 * If new data are received on a connection after the user processes 6070 * are gone, then RST the other end. 6071 */ 6072 if ((so->so_state & SS_NOFDREF) && tlen) { 6073 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6074 return (1); 6075 } 6076 /* 6077 * If last ACK falls within this segment's sequence numbers, record 6078 * its timestamp. NOTE: 1) That the test incorporates suggestions 6079 * from the latest proposal of the tcplw@cray.com list (Braden 6080 * 1993/04/26). 2) That updating only on newer timestamps interferes 6081 * with our earlier PAWS tests, so this check should be solely 6082 * predicated on the sequence space of this segment. 3) That we 6083 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6084 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6085 * SEG.Len, This modified check allows us to overcome RFC1323's 6086 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6087 * p.869. In such cases, we can still calculate the RTT correctly 6088 * when RCV.NXT == Last.ACK.Sent. 6089 */ 6090 if ((to->to_flags & TOF_TS) != 0 && 6091 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6092 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6093 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6094 tp->ts_recent_age = tcp_ts_getticks(); 6095 tp->ts_recent = to->to_tsval; 6096 } 6097 /* 6098 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6099 * is on (half-synchronized state), then queue data for later 6100 * processing; else drop segment and return. 6101 */ 6102 if ((thflags & TH_ACK) == 0) { 6103 if (tp->t_flags & TF_NEEDSYN) { 6104 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6105 ti_locked, tiwin, thflags, nxt_pkt)); 6106 } else if (tp->t_flags & TF_ACKNOW) { 6107 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6108 return (ret_val); 6109 } else { 6110 rack_do_drop(m, NULL, ti_locked); 6111 return (0); 6112 } 6113 } 6114 /* 6115 * case TCPS_LAST_ACK: Ack processing. 6116 */ 6117 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6118 return (ret_val); 6119 } 6120 if (ourfinisacked) { 6121 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6122 tp = tcp_close(tp); 6123 rack_do_drop(m, tp, ti_locked); 6124 return (1); 6125 } 6126 if (sbavail(&so->so_snd)) { 6127 if (rack_progress_timeout_check(tp)) { 6128 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6129 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6130 return (1); 6131 } 6132 } 6133 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6134 ti_locked, tiwin, thflags, nxt_pkt)); 6135 } 6136 6137 6138 /* 6139 * Return value of 1, the TCB is unlocked and most 6140 * likely gone, return value of 0, the TCP is still 6141 * locked. 6142 */ 6143 static int 6144 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 6145 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6146 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6147 { 6148 int32_t ret_val = 0; 6149 int32_t ourfinisacked = 0; 6150 6151 rack_calc_rwin(so, tp); 6152 6153 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 6154 if (thflags & TH_RST) 6155 return (rack_process_rst(m, th, so, tp, ti_locked)); 6156 /* 6157 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6158 * synchronized state. 6159 */ 6160 if (thflags & TH_SYN) { 6161 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6162 return (ret_val); 6163 } 6164 /* 6165 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6166 * it's less than ts_recent, drop it. 6167 */ 6168 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6169 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6170 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6171 return (ret_val); 6172 } 6173 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6174 return (ret_val); 6175 } 6176 /* 6177 * If new data are received on a connection after the user processes 6178 * are gone, then RST the other end. 6179 */ 6180 if ((so->so_state & SS_NOFDREF) && 6181 tlen) { 6182 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6183 return (1); 6184 } 6185 /* 6186 * If last ACK falls within this segment's sequence numbers, record 6187 * its timestamp. NOTE: 1) That the test incorporates suggestions 6188 * from the latest proposal of the tcplw@cray.com list (Braden 6189 * 1993/04/26). 2) That updating only on newer timestamps interferes 6190 * with our earlier PAWS tests, so this check should be solely 6191 * predicated on the sequence space of this segment. 3) That we 6192 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6193 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6194 * SEG.Len, This modified check allows us to overcome RFC1323's 6195 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6196 * p.869. In such cases, we can still calculate the RTT correctly 6197 * when RCV.NXT == Last.ACK.Sent. 6198 */ 6199 if ((to->to_flags & TOF_TS) != 0 && 6200 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6201 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6202 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6203 tp->ts_recent_age = tcp_ts_getticks(); 6204 tp->ts_recent = to->to_tsval; 6205 } 6206 /* 6207 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6208 * is on (half-synchronized state), then queue data for later 6209 * processing; else drop segment and return. 6210 */ 6211 if ((thflags & TH_ACK) == 0) { 6212 if (tp->t_flags & TF_NEEDSYN) { 6213 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6214 ti_locked, tiwin, thflags, nxt_pkt)); 6215 } else if (tp->t_flags & TF_ACKNOW) { 6216 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6217 return (ret_val); 6218 } else { 6219 rack_do_drop(m, NULL, ti_locked); 6220 return (0); 6221 } 6222 } 6223 /* 6224 * Ack processing. 6225 */ 6226 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6227 return (ret_val); 6228 } 6229 if (sbavail(&so->so_snd)) { 6230 if (rack_progress_timeout_check(tp)) { 6231 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6232 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6233 return (1); 6234 } 6235 } 6236 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6237 ti_locked, tiwin, thflags, nxt_pkt)); 6238 } 6239 6240 6241 static void inline 6242 rack_clear_rate_sample(struct tcp_rack *rack) 6243 { 6244 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 6245 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 6246 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 6247 } 6248 6249 static int 6250 rack_init(struct tcpcb *tp) 6251 { 6252 struct tcp_rack *rack = NULL; 6253 6254 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 6255 if (tp->t_fb_ptr == NULL) { 6256 /* 6257 * We need to allocate memory but cant. The INP and INP_INFO 6258 * locks and they are recusive (happens during setup. So a 6259 * scheme to drop the locks fails :( 6260 * 6261 */ 6262 return (ENOMEM); 6263 } 6264 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 6265 6266 rack = (struct tcp_rack *)tp->t_fb_ptr; 6267 TAILQ_INIT(&rack->r_ctl.rc_map); 6268 TAILQ_INIT(&rack->r_ctl.rc_free); 6269 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6270 rack->rc_tp = tp; 6271 if (tp->t_inpcb) { 6272 rack->rc_inp = tp->t_inpcb; 6273 } 6274 /* Probably not needed but lets be sure */ 6275 rack_clear_rate_sample(rack); 6276 rack->r_cpu = 0; 6277 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 6278 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 6279 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 6280 rack->rc_pace_reduce = rack_slot_reduction; 6281 if (V_tcp_delack_enabled) 6282 tp->t_delayed_ack = 1; 6283 else 6284 tp->t_delayed_ack = 0; 6285 rack->rc_pace_max_segs = rack_hptsi_segments; 6286 rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; 6287 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 6288 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 6289 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 6290 rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; 6291 rack->r_enforce_min_pace = rack_min_pace_time; 6292 rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; 6293 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 6294 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 6295 rack->r_ctl.rc_early_recovery = rack_early_recovery; 6296 rack->rc_always_pace = rack_pace_every_seg; 6297 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 6298 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 6299 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 6300 rack->r_ctl.rc_min_to = rack_min_to; 6301 rack->r_ctl.rc_prr_inc_var = rack_inc_var; 6302 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6303 if (tp->snd_una != tp->snd_max) { 6304 /* Create a send map for the current outstanding data */ 6305 struct rack_sendmap *rsm; 6306 6307 rsm = rack_alloc(rack); 6308 if (rsm == NULL) { 6309 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6310 tp->t_fb_ptr = NULL; 6311 return (ENOMEM); 6312 } 6313 rsm->r_flags = RACK_OVERMAX; 6314 rsm->r_tim_lastsent[0] = tcp_ts_getticks(); 6315 rsm->r_rtr_cnt = 1; 6316 rsm->r_rtr_bytes = 0; 6317 rsm->r_start = tp->snd_una; 6318 rsm->r_end = tp->snd_max; 6319 rsm->r_sndcnt = 0; 6320 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 6321 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6322 rsm->r_in_tmap = 1; 6323 } 6324 return (0); 6325 } 6326 6327 static int 6328 rack_handoff_ok(struct tcpcb *tp) 6329 { 6330 if ((tp->t_state == TCPS_CLOSED) || 6331 (tp->t_state == TCPS_LISTEN)) { 6332 /* Sure no problem though it may not stick */ 6333 return (0); 6334 } 6335 if ((tp->t_state == TCPS_SYN_SENT) || 6336 (tp->t_state == TCPS_SYN_RECEIVED)) { 6337 /* 6338 * We really don't know you have to get to ESTAB or beyond 6339 * to tell. 6340 */ 6341 return (EAGAIN); 6342 } 6343 if (tp->t_flags & TF_SACK_PERMIT) { 6344 return (0); 6345 } 6346 /* 6347 * If we reach here we don't do SACK on this connection so we can 6348 * never do rack. 6349 */ 6350 return (EINVAL); 6351 } 6352 6353 static void 6354 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 6355 { 6356 if (tp->t_fb_ptr) { 6357 struct tcp_rack *rack; 6358 struct rack_sendmap *rsm; 6359 6360 rack = (struct tcp_rack *)tp->t_fb_ptr; 6361 #ifdef TCP_BLACKBOX 6362 tcp_log_flowend(tp); 6363 #endif 6364 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6365 while (rsm) { 6366 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 6367 uma_zfree(rack_zone, rsm); 6368 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6369 } 6370 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6371 while (rsm) { 6372 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 6373 uma_zfree(rack_zone, rsm); 6374 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6375 } 6376 rack->rc_free_cnt = 0; 6377 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6378 tp->t_fb_ptr = NULL; 6379 } 6380 } 6381 6382 static void 6383 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 6384 { 6385 switch (tp->t_state) { 6386 case TCPS_SYN_SENT: 6387 rack->r_state = TCPS_SYN_SENT; 6388 rack->r_substate = rack_do_syn_sent; 6389 break; 6390 case TCPS_SYN_RECEIVED: 6391 rack->r_state = TCPS_SYN_RECEIVED; 6392 rack->r_substate = rack_do_syn_recv; 6393 break; 6394 case TCPS_ESTABLISHED: 6395 rack->r_state = TCPS_ESTABLISHED; 6396 rack->r_substate = rack_do_established; 6397 break; 6398 case TCPS_CLOSE_WAIT: 6399 rack->r_state = TCPS_CLOSE_WAIT; 6400 rack->r_substate = rack_do_close_wait; 6401 break; 6402 case TCPS_FIN_WAIT_1: 6403 rack->r_state = TCPS_FIN_WAIT_1; 6404 rack->r_substate = rack_do_fin_wait_1; 6405 break; 6406 case TCPS_CLOSING: 6407 rack->r_state = TCPS_CLOSING; 6408 rack->r_substate = rack_do_closing; 6409 break; 6410 case TCPS_LAST_ACK: 6411 rack->r_state = TCPS_LAST_ACK; 6412 rack->r_substate = rack_do_lastack; 6413 break; 6414 case TCPS_FIN_WAIT_2: 6415 rack->r_state = TCPS_FIN_WAIT_2; 6416 rack->r_substate = rack_do_fin_wait_2; 6417 break; 6418 case TCPS_LISTEN: 6419 case TCPS_CLOSED: 6420 case TCPS_TIME_WAIT: 6421 default: 6422 #ifdef INVARIANTS 6423 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); 6424 #endif 6425 break; 6426 }; 6427 } 6428 6429 6430 static void 6431 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 6432 { 6433 /* 6434 * We received an ack, and then did not 6435 * call send or were bounced out due to the 6436 * hpts was running. Now a timer is up as well, is 6437 * it the right timer? 6438 */ 6439 struct rack_sendmap *rsm; 6440 int tmr_up; 6441 6442 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 6443 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 6444 return; 6445 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6446 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 6447 (tmr_up == PACE_TMR_RXT)) { 6448 /* Should be an RXT */ 6449 return; 6450 } 6451 if (rsm == NULL) { 6452 /* Nothing outstanding? */ 6453 if (tp->t_flags & TF_DELACK) { 6454 if (tmr_up == PACE_TMR_DELACK) 6455 /* We are supposed to have delayed ack up and we do */ 6456 return; 6457 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 6458 /* 6459 * if we hit enobufs then we would expect the possiblity 6460 * of nothing outstanding and the RXT up (and the hptsi timer). 6461 */ 6462 return; 6463 } else if (((tcp_always_keepalive || 6464 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6465 (tp->t_state <= TCPS_CLOSING)) && 6466 (tmr_up == PACE_TMR_KEEP) && 6467 (tp->snd_max == tp->snd_una)) { 6468 /* We should have keep alive up and we do */ 6469 return; 6470 } 6471 } 6472 if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { 6473 if ((tp->t_flags & TF_SENTFIN) && 6474 ((tp->snd_max - tp->snd_una) == 1) && 6475 (rsm->r_flags & RACK_HAS_FIN)) { 6476 /* needs to be a RXT */ 6477 if (tmr_up == PACE_TMR_RXT) 6478 return; 6479 } else if (tmr_up == PACE_TMR_RACK) 6480 return; 6481 } else if (SEQ_GT(tp->snd_max,tp->snd_una) && 6482 ((tmr_up == PACE_TMR_TLP) || 6483 (tmr_up == PACE_TMR_RXT))) { 6484 /* 6485 * Either a TLP or RXT is fine if no sack-passed 6486 * is in place and data is outstanding. 6487 */ 6488 return; 6489 } else if (tmr_up == PACE_TMR_DELACK) { 6490 /* 6491 * If the delayed ack was going to go off 6492 * before the rtx/tlp/rack timer were going to 6493 * expire, then that would be the timer in control. 6494 * Note we don't check the time here trusting the 6495 * code is correct. 6496 */ 6497 return; 6498 } 6499 /* 6500 * Ok the timer originally started is not what we want now. 6501 * We will force the hpts to be stopped if any, and restart 6502 * with the slot set to what was in the saved slot. 6503 */ 6504 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6505 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6506 } 6507 6508 static void 6509 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6510 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6511 int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv) 6512 { 6513 int32_t thflags, retval, did_out = 0; 6514 int32_t way_out = 0; 6515 uint32_t cts; 6516 uint32_t tiwin; 6517 struct tcpopt to; 6518 struct tcp_rack *rack; 6519 struct rack_sendmap *rsm; 6520 int32_t prev_state = 0; 6521 6522 cts = tcp_tv_to_mssectick(tv); 6523 rack = (struct tcp_rack *)tp->t_fb_ptr; 6524 6525 kern_prefetch(rack, &prev_state); 6526 prev_state = 0; 6527 thflags = th->th_flags; 6528 /* 6529 * If this is either a state-changing packet or current state isn't 6530 * established, we require a read lock on tcbinfo. Otherwise, we 6531 * allow the tcbinfo to be in either locked or unlocked, as the 6532 * caller may have unnecessarily acquired a lock due to a race. 6533 */ 6534 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 6535 tp->t_state != TCPS_ESTABLISHED) { 6536 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 6537 "SYN/FIN/RST/!EST", __func__, ti_locked)); 6538 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6539 } else { 6540 #ifdef INVARIANTS 6541 if (ti_locked == TI_RLOCKED) { 6542 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6543 } else { 6544 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 6545 "ti_locked: %d", __func__, ti_locked)); 6546 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 6547 } 6548 #endif 6549 } 6550 INP_WLOCK_ASSERT(tp->t_inpcb); 6551 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 6552 __func__)); 6553 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 6554 __func__)); 6555 { 6556 union tcp_log_stackspecific log; 6557 6558 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6559 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 6560 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 6561 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 6562 tlen, &log, true); 6563 } 6564 /* 6565 * Segment received on connection. Reset idle time and keep-alive 6566 * timer. XXX: This should be done after segment validation to 6567 * ignore broken/spoofed segs. 6568 */ 6569 if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { 6570 #ifdef NETFLIX_CWV 6571 if ((tp->cwv_enabled) && 6572 ((tp->cwv_cwnd_valid == 0) && 6573 TCPS_HAVEESTABLISHED(tp->t_state) && 6574 (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { 6575 tcp_newcwv_nvp_closedown(tp); 6576 } else 6577 #endif 6578 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 6579 counter_u64_add(rack_input_idle_reduces, 1); 6580 rack_cc_after_idle(tp, 6581 (rack->r_idle_reduce_largest ? 1 :0)); 6582 } 6583 } 6584 rack->r_ctl.rc_rcvtime = cts; 6585 tp->t_rcvtime = ticks; 6586 6587 #ifdef NETFLIX_CWV 6588 if (tp->cwv_enabled) { 6589 if ((tp->cwv_cwnd_valid == 0) && 6590 TCPS_HAVEESTABLISHED(tp->t_state) && 6591 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 6592 tcp_newcwv_nvp_closedown(tp); 6593 } 6594 #endif 6595 /* 6596 * Unscale the window into a 32-bit value. For the SYN_SENT state 6597 * the scale is zero. 6598 */ 6599 tiwin = th->th_win << tp->snd_scale; 6600 #ifdef NETFLIX_STATS 6601 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 6602 #endif 6603 /* 6604 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 6605 * this to occur after we've validated the segment. 6606 */ 6607 if (tp->t_flags & TF_ECN_PERMIT) { 6608 if (thflags & TH_CWR) 6609 tp->t_flags &= ~TF_ECN_SND_ECE; 6610 switch (iptos & IPTOS_ECN_MASK) { 6611 case IPTOS_ECN_CE: 6612 tp->t_flags |= TF_ECN_SND_ECE; 6613 TCPSTAT_INC(tcps_ecn_ce); 6614 break; 6615 case IPTOS_ECN_ECT0: 6616 TCPSTAT_INC(tcps_ecn_ect0); 6617 break; 6618 case IPTOS_ECN_ECT1: 6619 TCPSTAT_INC(tcps_ecn_ect1); 6620 break; 6621 } 6622 /* Congestion experienced. */ 6623 if (thflags & TH_ECE) { 6624 rack_cong_signal(tp, th, CC_ECN); 6625 } 6626 } 6627 /* 6628 * Parse options on any incoming segment. 6629 */ 6630 tcp_dooptions(&to, (u_char *)(th + 1), 6631 (th->th_off << 2) - sizeof(struct tcphdr), 6632 (thflags & TH_SYN) ? TO_SYN : 0); 6633 6634 /* 6635 * If echoed timestamp is later than the current time, fall back to 6636 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 6637 * were used when this connection was established. 6638 */ 6639 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 6640 to.to_tsecr -= tp->ts_offset; 6641 if (TSTMP_GT(to.to_tsecr, cts)) 6642 to.to_tsecr = 0; 6643 } 6644 /* 6645 * If its the first time in we need to take care of options and 6646 * verify we can do SACK for rack! 6647 */ 6648 if (rack->r_state == 0) { 6649 /* Should be init'd by rack_init() */ 6650 KASSERT(rack->rc_inp != NULL, 6651 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 6652 if (rack->rc_inp == NULL) { 6653 rack->rc_inp = tp->t_inpcb; 6654 } 6655 6656 /* 6657 * Process options only when we get SYN/ACK back. The SYN 6658 * case for incoming connections is handled in tcp_syncache. 6659 * According to RFC1323 the window field in a SYN (i.e., a 6660 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 6661 * this is traditional behavior, may need to be cleaned up. 6662 */ 6663 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 6664 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 6665 if ((to.to_flags & TOF_SCALE) && 6666 (tp->t_flags & TF_REQ_SCALE)) { 6667 tp->t_flags |= TF_RCVD_SCALE; 6668 tp->snd_scale = to.to_wscale; 6669 } 6670 /* 6671 * Initial send window. It will be updated with the 6672 * next incoming segment to the scaled value. 6673 */ 6674 tp->snd_wnd = th->th_win; 6675 if (to.to_flags & TOF_TS) { 6676 tp->t_flags |= TF_RCVD_TSTMP; 6677 tp->ts_recent = to.to_tsval; 6678 tp->ts_recent_age = cts; 6679 } 6680 if (to.to_flags & TOF_MSS) 6681 tcp_mss(tp, to.to_mss); 6682 if ((tp->t_flags & TF_SACK_PERMIT) && 6683 (to.to_flags & TOF_SACKPERM) == 0) 6684 tp->t_flags &= ~TF_SACK_PERMIT; 6685 } 6686 /* 6687 * At this point we are at the initial call. Here we decide 6688 * if we are doing RACK or not. We do this by seeing if 6689 * TF_SACK_PERMIT is set, if not rack is *not* possible and 6690 * we switch to the default code. 6691 */ 6692 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 6693 tcp_switch_back_to_default(tp); 6694 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 6695 tlen, iptos, ti_locked); 6696 return; 6697 } 6698 /* Set the flag */ 6699 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 6700 tcp_set_hpts(tp->t_inpcb); 6701 rack_stop_all_timers(tp); 6702 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 6703 } 6704 /* 6705 * This is the one exception case where we set the rack state 6706 * always. All other times (timers etc) we must have a rack-state 6707 * set (so we assure we have done the checks above for SACK). 6708 */ 6709 if (rack->r_state != tp->t_state) 6710 rack_set_state(tp, rack); 6711 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) 6712 kern_prefetch(rsm, &prev_state); 6713 prev_state = rack->r_state; 6714 rack->r_ctl.rc_tlp_send_cnt = 0; 6715 rack_clear_rate_sample(rack); 6716 retval = (*rack->r_substate) (m, th, so, 6717 tp, &to, drop_hdrlen, 6718 tlen, &ti_locked, tiwin, thflags, nxt_pkt); 6719 #ifdef INVARIANTS 6720 if ((retval == 0) && 6721 (tp->t_inpcb == NULL)) { 6722 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 6723 retval, tp, prev_state); 6724 } 6725 #endif 6726 if (ti_locked != TI_UNLOCKED) { 6727 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6728 INP_INFO_RUNLOCK(&V_tcbinfo); 6729 ti_locked = TI_UNLOCKED; 6730 } 6731 if (retval == 0) { 6732 /* 6733 * If retval is 1 the tcb is unlocked and most likely the tp 6734 * is gone. 6735 */ 6736 INP_WLOCK_ASSERT(tp->t_inpcb); 6737 tcp_rack_xmit_timer_commit(rack, tp); 6738 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && 6739 (rack->rc_in_persist == 0)){ 6740 /* 6741 * The peer shrunk its window on us to the point 6742 * where we have sent too much. The only thing 6743 * we can do here is stop any timers and 6744 * enter persist. We most likely lost the last 6745 * bytes we sent but oh well, we will have to 6746 * retransmit them after the peer is caught up. 6747 */ 6748 if (rack->rc_inp->inp_in_hpts) 6749 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6750 rack_timer_cancel(tp, rack, cts, __LINE__); 6751 rack_enter_persist(tp, rack, cts); 6752 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6753 way_out = 3; 6754 goto done_with_input; 6755 } 6756 if (nxt_pkt == 0) { 6757 if (rack->r_wanted_output != 0) { 6758 did_out = 1; 6759 (void)tp->t_fb->tfb_tcp_output(tp); 6760 } 6761 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 6762 } 6763 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 6764 (SEQ_GT(tp->snd_max, tp->snd_una) || 6765 (tp->t_flags & TF_DELACK) || 6766 ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6767 (tp->t_state <= TCPS_CLOSING)))) { 6768 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 6769 if ((tp->snd_max == tp->snd_una) && 6770 ((tp->t_flags & TF_DELACK) == 0) && 6771 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 6772 /* keep alive not needed if we are hptsi output yet */ 6773 ; 6774 } else { 6775 if (rack->rc_inp->inp_in_hpts) 6776 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6777 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6778 } 6779 way_out = 1; 6780 } else { 6781 /* Do we have the correct timer running? */ 6782 rack_timer_audit(tp, rack, &so->so_snd); 6783 way_out = 2; 6784 } 6785 done_with_input: 6786 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 6787 if (did_out) 6788 rack->r_wanted_output = 0; 6789 #ifdef INVARIANTS 6790 if (tp->t_inpcb == NULL) { 6791 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 6792 did_out, 6793 retval, tp, prev_state); 6794 } 6795 #endif 6796 INP_WUNLOCK(tp->t_inpcb); 6797 } 6798 } 6799 6800 void 6801 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6802 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6803 int32_t ti_locked) 6804 { 6805 struct timeval tv; 6806 #ifdef RSS 6807 struct tcp_function_block *tfb; 6808 struct tcp_rack *rack; 6809 struct inpcb *inp; 6810 6811 rack = (struct tcp_rack *)tp->t_fb_ptr; 6812 if (rack->r_state == 0) { 6813 /* 6814 * Initial input (ACK to SYN-ACK etc)lets go ahead and get 6815 * it processed 6816 */ 6817 if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo)) 6818 ti_locked = TI_RLOCKED; 6819 if (ti_locked != TI_RLOCKED) { 6820 inp = tp->t_inpcb; 6821 tfb = tp->t_fb; 6822 in_pcbref(inp); 6823 INP_WUNLOCK(inp); 6824 INP_INFO_RLOCK(&V_tcbinfo); 6825 ti_locked = TI_RLOCKED; 6826 INP_WLOCK(inp); 6827 if (in_pcbrele_wlocked(inp)) 6828 inp = NULL; 6829 if (inp == NULL || (inp->inp_flags2 & INP_FREED) || 6830 (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { 6831 /* The TCPCB went away. Free the packet. */ 6832 INP_INFO_RUNLOCK(&V_tcbinfo); 6833 if (inp) 6834 INP_WUNLOCK(inp); 6835 m_freem(m); 6836 return; 6837 } 6838 /* If the stack changed, call the correct stack. */ 6839 if (tp->t_fb != tfb) { 6840 tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, 6841 drop_hdrlen, tlen, iptos, ti_locked); 6842 return; 6843 } 6844 } 6845 tcp_get_usecs(&tv); 6846 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6847 tlen, iptos, ti_locked, 0, &tv); 6848 return; 6849 } 6850 if (ti_locked == TI_RLOCKED) 6851 INP_INFO_RUNLOCK(&V_tcbinfo); 6852 tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked); 6853 INP_WUNLOCK(tp->t_inpcb); 6854 #else 6855 tcp_get_usecs(&tv); 6856 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6857 tlen, iptos, ti_locked, 0, &tv); 6858 #endif 6859 } 6860 6861 struct rack_sendmap * 6862 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 6863 { 6864 struct rack_sendmap *rsm = NULL; 6865 int32_t idx; 6866 uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; 6867 6868 /* Return the next guy to be re-transmitted */ 6869 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 6870 return (NULL); 6871 } 6872 if (tp->t_flags & TF_SENTFIN) { 6873 /* retran the end FIN? */ 6874 return (NULL); 6875 } 6876 /* ok lets look at this one */ 6877 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6878 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 6879 goto check_it; 6880 } 6881 rsm = rack_find_lowest_rsm(rack); 6882 if (rsm == NULL) { 6883 return (NULL); 6884 } 6885 check_it: 6886 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 6887 srtt = TICKS_2_MSEC(srtt_cur); 6888 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 6889 srtt = rack->rc_rack_rtt; 6890 if (rsm->r_flags & RACK_ACKED) { 6891 return (NULL); 6892 } 6893 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 6894 /* Its not yet ready */ 6895 return (NULL); 6896 } 6897 idx = rsm->r_rtr_cnt - 1; 6898 ts_low = rsm->r_tim_lastsent[idx]; 6899 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6900 if (tsused <= ts_low) { 6901 return (NULL); 6902 } 6903 if ((tsused - ts_low) >= thresh) { 6904 return (rsm); 6905 } 6906 return (NULL); 6907 } 6908 6909 static int 6910 rack_output(struct tcpcb *tp) 6911 { 6912 struct socket *so; 6913 uint32_t recwin, sendwin; 6914 uint32_t sb_offset; 6915 int32_t len, flags, error = 0; 6916 struct mbuf *m; 6917 struct mbuf *mb; 6918 uint32_t if_hw_tsomaxsegcount = 0; 6919 uint32_t if_hw_tsomaxsegsize; 6920 long tot_len_this_send = 0; 6921 struct ip *ip = NULL; 6922 #ifdef TCPDEBUG 6923 struct ipovly *ipov = NULL; 6924 #endif 6925 struct udphdr *udp = NULL; 6926 struct tcp_rack *rack; 6927 struct tcphdr *th; 6928 uint8_t pass = 0; 6929 u_char opt[TCP_MAXOLEN]; 6930 unsigned ipoptlen, optlen, hdrlen, ulen=0; 6931 uint32_t rack_seq; 6932 6933 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 6934 unsigned ipsec_optlen = 0; 6935 6936 #endif 6937 int32_t idle, sendalot; 6938 int32_t sub_from_prr = 0; 6939 volatile int32_t sack_rxmit; 6940 struct rack_sendmap *rsm = NULL; 6941 int32_t tso, mtu, would_have_fin = 0; 6942 struct tcpopt to; 6943 int32_t slot = 0; 6944 uint32_t cts; 6945 uint8_t hpts_calling, doing_tlp = 0; 6946 int32_t do_a_prefetch; 6947 int32_t prefetch_rsm = 0; 6948 int32_t prefetch_so_done = 0; 6949 struct tcp_log_buffer *lgb = NULL; 6950 struct inpcb *inp; 6951 struct sockbuf *sb; 6952 #ifdef INET6 6953 struct ip6_hdr *ip6 = NULL; 6954 int32_t isipv6; 6955 #endif 6956 /* setup and take the cache hits here */ 6957 rack = (struct tcp_rack *)tp->t_fb_ptr; 6958 inp = rack->rc_inp; 6959 so = inp->inp_socket; 6960 sb = &so->so_snd; 6961 kern_prefetch(sb, &do_a_prefetch); 6962 do_a_prefetch = 1; 6963 6964 INP_WLOCK_ASSERT(inp); 6965 #ifdef TCP_OFFLOAD 6966 if (tp->t_flags & TF_TOE) 6967 return (tcp_offload_output(tp)); 6968 #endif 6969 6970 #ifdef TCP_RFC7413 6971 /* 6972 * For TFO connections in SYN_RECEIVED, only allow the initial 6973 * SYN|ACK and those sent by the retransmit timer. 6974 */ 6975 if ((tp->t_flags & TF_FASTOPEN) && 6976 (tp->t_state == TCPS_SYN_RECEIVED) && 6977 SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ 6978 (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ 6979 return (0); 6980 #endif 6981 #ifdef INET6 6982 if (rack->r_state) { 6983 /* Use the cache line loaded if possible */ 6984 isipv6 = rack->r_is_v6; 6985 } else { 6986 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 6987 } 6988 #endif 6989 cts = tcp_ts_getticks(); 6990 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 6991 inp->inp_in_hpts) { 6992 /* 6993 * We are on the hpts for some timer but not hptsi output. 6994 * Remove from the hpts unconditionally. 6995 */ 6996 rack_timer_cancel(tp, rack, cts, __LINE__); 6997 } 6998 /* Mark that we have called rack_output(). */ 6999 if ((rack->r_timer_override) || 7000 (tp->t_flags & TF_FORCEDATA) || 7001 (tp->t_state < TCPS_ESTABLISHED)) { 7002 if (tp->t_inpcb->inp_in_hpts) 7003 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 7004 } else if (tp->t_inpcb->inp_in_hpts) { 7005 /* 7006 * On the hpts you can't pass even if ACKNOW is on, we will 7007 * when the hpts fires. 7008 */ 7009 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 7010 return (0); 7011 } 7012 hpts_calling = inp->inp_hpts_calls; 7013 inp->inp_hpts_calls = 0; 7014 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7015 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 7016 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 7017 return (0); 7018 } 7019 } 7020 rack->r_wanted_output = 0; 7021 rack->r_timer_override = 0; 7022 /* 7023 * Determine length of data that should be transmitted, and flags 7024 * that will be used. If there is some data or critical controls 7025 * (SYN, RST) to send, then transmit; otherwise, investigate 7026 * further. 7027 */ 7028 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 7029 #ifdef NETFLIX_CWV 7030 if (tp->cwv_enabled) { 7031 if ((tp->cwv_cwnd_valid == 0) && 7032 TCPS_HAVEESTABLISHED(tp->t_state) && 7033 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 7034 tcp_newcwv_nvp_closedown(tp); 7035 } else 7036 #endif 7037 if (tp->t_idle_reduce) { 7038 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 7039 rack_cc_after_idle(tp, 7040 (rack->r_idle_reduce_largest ? 1 :0)); 7041 } 7042 tp->t_flags &= ~TF_LASTIDLE; 7043 if (idle) { 7044 if (tp->t_flags & TF_MORETOCOME) { 7045 tp->t_flags |= TF_LASTIDLE; 7046 idle = 0; 7047 } 7048 } 7049 again: 7050 /* 7051 * If we've recently taken a timeout, snd_max will be greater than 7052 * snd_nxt. There may be SACK information that allows us to avoid 7053 * resending already delivered data. Adjust snd_nxt accordingly. 7054 */ 7055 sendalot = 0; 7056 cts = tcp_ts_getticks(); 7057 tso = 0; 7058 mtu = 0; 7059 sb_offset = tp->snd_max - tp->snd_una; 7060 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 7061 7062 flags = tcp_outflags[tp->t_state]; 7063 /* 7064 * Send any SACK-generated retransmissions. If we're explicitly 7065 * trying to send out new data (when sendalot is 1), bypass this 7066 * function. If we retransmit in fast recovery mode, decrement 7067 * snd_cwnd, since we're replacing a (future) new transmission with 7068 * a retransmission now, and we previously incremented snd_cwnd in 7069 * tcp_input(). 7070 */ 7071 /* 7072 * Still in sack recovery , reset rxmit flag to zero. 7073 */ 7074 while (rack->rc_free_cnt < rack_free_cache) { 7075 rsm = rack_alloc(rack); 7076 if (rsm == NULL) { 7077 if (inp->inp_hpts_calls) 7078 /* Retry in a ms */ 7079 slot = 1; 7080 goto just_return_nolock; 7081 } 7082 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 7083 rack->rc_free_cnt++; 7084 rsm = NULL; 7085 } 7086 if (inp->inp_hpts_calls) 7087 inp->inp_hpts_calls = 0; 7088 sack_rxmit = 0; 7089 len = 0; 7090 rsm = NULL; 7091 if (flags & TH_RST) { 7092 SOCKBUF_LOCK(sb); 7093 goto send; 7094 } 7095 if (rack->r_ctl.rc_tlpsend) { 7096 /* Tail loss probe */ 7097 long cwin; 7098 long tlen; 7099 7100 doing_tlp = 1; 7101 rsm = rack->r_ctl.rc_tlpsend; 7102 rack->r_ctl.rc_tlpsend = NULL; 7103 sack_rxmit = 1; 7104 tlen = rsm->r_end - rsm->r_start; 7105 if (tlen > tp->t_maxseg) 7106 tlen = tp->t_maxseg; 7107 #ifdef INVARIANTS 7108 if (SEQ_GT(tp->snd_una, rsm->r_start)) { 7109 panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", 7110 tp, rack, tp->snd_una, rsm, rsm->r_start); 7111 } 7112 #endif 7113 sb_offset = rsm->r_start - tp->snd_una; 7114 cwin = min(tp->snd_wnd, tlen); 7115 len = cwin; 7116 } else if (rack->r_ctl.rc_resend) { 7117 /* Retransmit timer */ 7118 rsm = rack->r_ctl.rc_resend; 7119 rack->r_ctl.rc_resend = NULL; 7120 len = rsm->r_end - rsm->r_start; 7121 sack_rxmit = 1; 7122 sendalot = 0; 7123 sb_offset = rsm->r_start - tp->snd_una; 7124 if (len >= tp->t_maxseg) { 7125 len = tp->t_maxseg; 7126 } 7127 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7128 __func__, sb_offset)); 7129 } else if ((rack->rc_in_persist == 0) && 7130 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 7131 long tlen; 7132 7133 if ((!IN_RECOVERY(tp->t_flags)) && 7134 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 7135 /* Enter recovery if not induced by a time-out */ 7136 rack->r_ctl.rc_rsm_start = rsm->r_start; 7137 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7138 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7139 rack_cong_signal(tp, NULL, CC_NDUPACK); 7140 /* 7141 * When we enter recovery we need to assure we send 7142 * one packet. 7143 */ 7144 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 7145 } 7146 #ifdef INVARIANTS 7147 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 7148 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 7149 tp, rack, rsm, rsm->r_start, tp->snd_una); 7150 } 7151 #endif 7152 tlen = rsm->r_end - rsm->r_start; 7153 sb_offset = rsm->r_start - tp->snd_una; 7154 if (tlen > rack->r_ctl.rc_prr_sndcnt) { 7155 len = rack->r_ctl.rc_prr_sndcnt; 7156 } else { 7157 len = tlen; 7158 } 7159 if (len >= tp->t_maxseg) { 7160 sendalot = 1; 7161 len = tp->t_maxseg; 7162 } else { 7163 sendalot = 0; 7164 if ((rack->rc_timer_up == 0) && 7165 (len < tlen)) { 7166 /* 7167 * If its not a timer don't send a partial 7168 * segment. 7169 */ 7170 len = 0; 7171 goto just_return_nolock; 7172 } 7173 } 7174 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7175 __func__, sb_offset)); 7176 if (len > 0) { 7177 sub_from_prr = 1; 7178 sack_rxmit = 1; 7179 TCPSTAT_INC(tcps_sack_rexmits); 7180 TCPSTAT_ADD(tcps_sack_rexmit_bytes, 7181 min(len, tp->t_maxseg)); 7182 counter_u64_add(rack_rtm_prr_retran, 1); 7183 } 7184 } 7185 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 7186 /* we are retransmitting the fin */ 7187 len--; 7188 if (len) { 7189 /* 7190 * When retransmitting data do *not* include the 7191 * FIN. This could happen from a TLP probe. 7192 */ 7193 flags &= ~TH_FIN; 7194 } 7195 } 7196 #ifdef INVARIANTS 7197 /* For debugging */ 7198 rack->r_ctl.rc_rsm_at_retran = rsm; 7199 #endif 7200 /* 7201 * Get standard flags, and add SYN or FIN if requested by 'hidden' 7202 * state flags. 7203 */ 7204 if (tp->t_flags & TF_NEEDFIN) 7205 flags |= TH_FIN; 7206 if (tp->t_flags & TF_NEEDSYN) 7207 flags |= TH_SYN; 7208 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 7209 void *end_rsm; 7210 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 7211 if (end_rsm) 7212 kern_prefetch(end_rsm, &prefetch_rsm); 7213 prefetch_rsm = 1; 7214 } 7215 SOCKBUF_LOCK(sb); 7216 /* 7217 * If in persist timeout with window of 0, send 1 byte. Otherwise, 7218 * if window is small but nonzero and time TF_SENTFIN expired, we 7219 * will send what we can and go to transmit state. 7220 */ 7221 if (tp->t_flags & TF_FORCEDATA) { 7222 if (sendwin == 0) { 7223 /* 7224 * If we still have some data to send, then clear 7225 * the FIN bit. Usually this would happen below 7226 * when it realizes that we aren't sending all the 7227 * data. However, if we have exactly 1 byte of 7228 * unsent data, then it won't clear the FIN bit 7229 * below, and if we are in persist state, we wind up 7230 * sending the packet without recording that we sent 7231 * the FIN bit. 7232 * 7233 * We can't just blindly clear the FIN bit, because 7234 * if we don't have any more data to send then the 7235 * probe will be the FIN itself. 7236 */ 7237 if (sb_offset < sbused(sb)) 7238 flags &= ~TH_FIN; 7239 sendwin = 1; 7240 } else { 7241 if (rack->rc_in_persist) 7242 rack_exit_persist(tp, rack); 7243 /* 7244 * If we are dropping persist mode then we need to 7245 * correct snd_nxt/snd_max and off. 7246 */ 7247 tp->snd_nxt = tp->snd_max; 7248 sb_offset = tp->snd_nxt - tp->snd_una; 7249 } 7250 } 7251 /* 7252 * If snd_nxt == snd_max and we have transmitted a FIN, the 7253 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 7254 * negative length. This can also occur when TCP opens up its 7255 * congestion window while receiving additional duplicate acks after 7256 * fast-retransmit because TCP will reset snd_nxt to snd_max after 7257 * the fast-retransmit. 7258 * 7259 * In the normal retransmit-FIN-only case, however, snd_nxt will be 7260 * set to snd_una, the sb_offset will be 0, and the length may wind 7261 * up 0. 7262 * 7263 * If sack_rxmit is true we are retransmitting from the scoreboard 7264 * in which case len is already set. 7265 */ 7266 if (sack_rxmit == 0) { 7267 uint32_t avail; 7268 7269 avail = sbavail(sb); 7270 if (SEQ_GT(tp->snd_nxt, tp->snd_una)) 7271 sb_offset = tp->snd_nxt - tp->snd_una; 7272 else 7273 sb_offset = 0; 7274 if (IN_RECOVERY(tp->t_flags) == 0) { 7275 if (rack->r_ctl.rc_tlp_new_data) { 7276 /* TLP is forcing out new data */ 7277 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 7278 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 7279 } 7280 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 7281 len = tp->snd_wnd; 7282 else 7283 len = rack->r_ctl.rc_tlp_new_data; 7284 rack->r_ctl.rc_tlp_new_data = 0; 7285 doing_tlp = 1; 7286 } else { 7287 if (sendwin > avail) { 7288 /* use the available */ 7289 if (avail > sb_offset) { 7290 len = (int32_t)(avail - sb_offset); 7291 } else { 7292 len = 0; 7293 } 7294 } else { 7295 if (sendwin > sb_offset) { 7296 len = (int32_t)(sendwin - sb_offset); 7297 } else { 7298 len = 0; 7299 } 7300 } 7301 } 7302 } else { 7303 uint32_t outstanding; 7304 7305 /* 7306 * We are inside of a SACK recovery episode and are 7307 * sending new data, having retransmitted all the 7308 * data possible so far in the scoreboard. 7309 */ 7310 outstanding = tp->snd_max - tp->snd_una; 7311 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) 7312 len = 0; 7313 else if (avail > sb_offset) 7314 len = avail - sb_offset; 7315 else 7316 len = 0; 7317 if (len > 0) { 7318 if (len > rack->r_ctl.rc_prr_sndcnt) 7319 len = rack->r_ctl.rc_prr_sndcnt; 7320 7321 if (len > 0) { 7322 sub_from_prr = 1; 7323 counter_u64_add(rack_rtm_prr_newdata, 1); 7324 } 7325 } 7326 if (len > tp->t_maxseg) { 7327 /* 7328 * We should never send more than a MSS when 7329 * retransmitting or sending new data in prr 7330 * mode unless the override flag is on. Most 7331 * likely the PRR algorithm is not going to 7332 * let us send a lot as well :-) 7333 */ 7334 if (rack->r_ctl.rc_prr_sendalot == 0) 7335 len = tp->t_maxseg; 7336 } else if (len < tp->t_maxseg) { 7337 /* 7338 * Do we send any? The idea here is if the 7339 * send empty's the socket buffer we want to 7340 * do it. However if not then lets just wait 7341 * for our prr_sndcnt to get bigger. 7342 */ 7343 long leftinsb; 7344 7345 leftinsb = sbavail(sb) - sb_offset; 7346 if (leftinsb > len) { 7347 /* This send does not empty the sb */ 7348 len = 0; 7349 } 7350 } 7351 } 7352 } 7353 if (prefetch_so_done == 0) { 7354 kern_prefetch(so, &prefetch_so_done); 7355 prefetch_so_done = 1; 7356 } 7357 /* 7358 * Lop off SYN bit if it has already been sent. However, if this is 7359 * SYN-SENT state and if segment contains data and if we don't know 7360 * that foreign host supports TAO, suppress sending segment. 7361 */ 7362 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 7363 if ((tp->t_state != TCPS_SYN_RECEIVED) && 7364 (tp->t_state != TCPS_SYN_SENT)) 7365 flags &= ~TH_SYN; 7366 #ifdef TCP_RFC7413 7367 /* 7368 * When sending additional segments following a TFO SYN|ACK, 7369 * do not include the SYN bit. 7370 */ 7371 if ((tp->t_flags & TF_FASTOPEN) && 7372 (tp->t_state == TCPS_SYN_RECEIVED)) 7373 flags &= ~TH_SYN; 7374 #endif 7375 sb_offset--, len++; 7376 if (sbavail(sb) == 0) 7377 len = 0; 7378 } 7379 /* 7380 * Be careful not to send data and/or FIN on SYN segments. This 7381 * measure is needed to prevent interoperability problems with not 7382 * fully conformant TCP implementations. 7383 */ 7384 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 7385 len = 0; 7386 flags &= ~TH_FIN; 7387 } 7388 #ifdef TCP_RFC7413 7389 /* 7390 * When retransmitting SYN|ACK on a passively-created TFO socket, 7391 * don't include data, as the presence of data may have caused the 7392 * original SYN|ACK to have been dropped by a middlebox. 7393 */ 7394 if ((tp->t_flags & TF_FASTOPEN) && 7395 ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) 7396 len = 0; 7397 #endif 7398 if (len <= 0) { 7399 /* 7400 * If FIN has been sent but not acked, but we haven't been 7401 * called to retransmit, len will be < 0. Otherwise, window 7402 * shrank after we sent into it. If window shrank to 0, 7403 * cancel pending retransmit, pull snd_nxt back to (closed) 7404 * window, and set the persist timer if it isn't already 7405 * going. If the window didn't close completely, just wait 7406 * for an ACK. 7407 * 7408 * We also do a general check here to ensure that we will 7409 * set the persist timer when we have data to send, but a 7410 * 0-byte window. This makes sure the persist timer is set 7411 * even if the packet hits one of the "goto send" lines 7412 * below. 7413 */ 7414 len = 0; 7415 if ((tp->snd_wnd == 0) && 7416 (TCPS_HAVEESTABLISHED(tp->t_state)) && 7417 (sb_offset < (int)sbavail(sb))) { 7418 tp->snd_nxt = tp->snd_una; 7419 rack_enter_persist(tp, rack, cts); 7420 } 7421 } 7422 /* len will be >= 0 after this point. */ 7423 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7424 tcp_sndbuf_autoscale(tp, so, sendwin); 7425 /* 7426 * Decide if we can use TCP Segmentation Offloading (if supported by 7427 * hardware). 7428 * 7429 * TSO may only be used if we are in a pure bulk sending state. The 7430 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 7431 * options prevent using TSO. With TSO the TCP header is the same 7432 * (except for the sequence number) for all generated packets. This 7433 * makes it impossible to transmit any options which vary per 7434 * generated segment or packet. 7435 * 7436 * IPv4 handling has a clear separation of ip options and ip header 7437 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 7438 * the right thing below to provide length of just ip options and thus 7439 * checking for ipoptlen is enough to decide if ip options are present. 7440 */ 7441 7442 #ifdef INET6 7443 if (isipv6) 7444 ipoptlen = ip6_optlen(tp->t_inpcb); 7445 else 7446 #endif 7447 if (tp->t_inpcb->inp_options) 7448 ipoptlen = tp->t_inpcb->inp_options->m_len - 7449 offsetof(struct ipoption, ipopt_list); 7450 else 7451 ipoptlen = 0; 7452 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7453 /* 7454 * Pre-calculate here as we save another lookup into the darknesses 7455 * of IPsec that way and can actually decide if TSO is ok. 7456 */ 7457 #ifdef INET6 7458 if (isipv6 && IPSEC_ENABLED(ipv6)) 7459 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 7460 #ifdef INET 7461 else 7462 #endif 7463 #endif /* INET6 */ 7464 #ifdef INET 7465 if (IPSEC_ENABLED(ipv4)) 7466 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 7467 #endif /* INET */ 7468 #endif 7469 7470 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7471 ipoptlen += ipsec_optlen; 7472 #endif 7473 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 7474 (tp->t_port == 0) && 7475 ((tp->t_flags & TF_SIGNATURE) == 0) && 7476 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 7477 ipoptlen == 0) 7478 tso = 1; 7479 { 7480 uint32_t outstanding; 7481 7482 outstanding = tp->snd_max - tp->snd_una; 7483 if (tp->t_flags & TF_SENTFIN) { 7484 /* 7485 * If we sent a fin, snd_max is 1 higher than 7486 * snd_una 7487 */ 7488 outstanding--; 7489 } 7490 if (outstanding > 0) { 7491 /* 7492 * This is sub-optimal. We only send a stand alone 7493 * FIN on its own segment. 7494 */ 7495 if (flags & TH_FIN) { 7496 flags &= ~TH_FIN; 7497 would_have_fin = 1; 7498 } 7499 } else if (sack_rxmit) { 7500 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 7501 flags &= ~TH_FIN; 7502 } else { 7503 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 7504 sbused(sb))) 7505 flags &= ~TH_FIN; 7506 } 7507 } 7508 recwin = sbspace(&so->so_rcv); 7509 7510 /* 7511 * Sender silly window avoidance. We transmit under the following 7512 * conditions when len is non-zero: 7513 * 7514 * - We have a full segment (or more with TSO) - This is the last 7515 * buffer in a write()/send() and we are either idle or running 7516 * NODELAY - we've timed out (e.g. persist timer) - we have more 7517 * then 1/2 the maximum send window's worth of data (receiver may be 7518 * limited the window size) - we need to retransmit 7519 */ 7520 if (len) { 7521 if (len >= tp->t_maxseg) { 7522 pass = 1; 7523 goto send; 7524 } 7525 /* 7526 * NOTE! on localhost connections an 'ack' from the remote 7527 * end may occur synchronously with the output and cause us 7528 * to flush a buffer queued with moretocome. XXX 7529 * 7530 */ 7531 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 7532 (idle || (tp->t_flags & TF_NODELAY)) && 7533 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 7534 (tp->t_flags & TF_NOPUSH) == 0) { 7535 pass = 2; 7536 goto send; 7537 } 7538 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ 7539 pass = 3; 7540 goto send; 7541 } 7542 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 7543 goto send; 7544 } 7545 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 7546 pass = 4; 7547 goto send; 7548 } 7549 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 7550 pass = 5; 7551 goto send; 7552 } 7553 if (sack_rxmit) { 7554 pass = 6; 7555 goto send; 7556 } 7557 } 7558 /* 7559 * Sending of standalone window updates. 7560 * 7561 * Window updates are important when we close our window due to a 7562 * full socket buffer and are opening it again after the application 7563 * reads data from it. Once the window has opened again and the 7564 * remote end starts to send again the ACK clock takes over and 7565 * provides the most current window information. 7566 * 7567 * We must avoid the silly window syndrome whereas every read from 7568 * the receive buffer, no matter how small, causes a window update 7569 * to be sent. We also should avoid sending a flurry of window 7570 * updates when the socket buffer had queued a lot of data and the 7571 * application is doing small reads. 7572 * 7573 * Prevent a flurry of pointless window updates by only sending an 7574 * update when we can increase the advertized window by more than 7575 * 1/4th of the socket buffer capacity. When the buffer is getting 7576 * full or is very small be more aggressive and send an update 7577 * whenever we can increase by two mss sized segments. In all other 7578 * situations the ACK's to new incoming data will carry further 7579 * window increases. 7580 * 7581 * Don't send an independent window update if a delayed ACK is 7582 * pending (it will get piggy-backed on it) or the remote side 7583 * already has done a half-close and won't send more data. Skip 7584 * this if the connection is in T/TCP half-open state. 7585 */ 7586 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 7587 !(tp->t_flags & TF_DELACK) && 7588 !TCPS_HAVERCVDFIN(tp->t_state)) { 7589 /* 7590 * "adv" is the amount we could increase the window, taking 7591 * into account that we are limited by TCP_MAXWIN << 7592 * tp->rcv_scale. 7593 */ 7594 int32_t adv; 7595 int oldwin; 7596 7597 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 7598 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 7599 oldwin = (tp->rcv_adv - tp->rcv_nxt); 7600 adv -= oldwin; 7601 } else 7602 oldwin = 0; 7603 7604 /* 7605 * If the new window size ends up being the same as the old 7606 * size when it is scaled, then don't force a window update. 7607 */ 7608 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 7609 goto dontupdate; 7610 7611 if (adv >= (int32_t)(2 * tp->t_maxseg) && 7612 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 7613 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 7614 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { 7615 pass = 7; 7616 goto send; 7617 } 7618 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 7619 goto send; 7620 } 7621 dontupdate: 7622 7623 /* 7624 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 7625 * is also a catch-all for the retransmit timer timeout case. 7626 */ 7627 if (tp->t_flags & TF_ACKNOW) { 7628 pass = 8; 7629 goto send; 7630 } 7631 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 7632 pass = 9; 7633 goto send; 7634 } 7635 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 7636 pass = 10; 7637 goto send; 7638 } 7639 /* 7640 * If our state indicates that FIN should be sent and we have not 7641 * yet done so, then we need to send. 7642 */ 7643 if (flags & TH_FIN) { 7644 if ((tp->t_flags & TF_SENTFIN) || 7645 (((tp->t_flags & TF_SENTFIN) == 0) && 7646 (tp->snd_nxt == tp->snd_una))) { 7647 pass = 11; 7648 goto send; 7649 } 7650 } 7651 /* 7652 * No reason to send a segment, just return. 7653 */ 7654 just_return: 7655 SOCKBUF_UNLOCK(sb); 7656 just_return_nolock: 7657 if (tot_len_this_send == 0) 7658 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 7659 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 7660 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 7661 tp->t_flags &= ~TF_FORCEDATA; 7662 return (0); 7663 7664 send: 7665 if (doing_tlp == 0) { 7666 /* 7667 * Data not a TLP, and its not the rxt firing. If it is the 7668 * rxt firing, we want to leave the tlp_in_progress flag on 7669 * so we don't send another TLP. It has to be a rack timer 7670 * or normal send (response to acked data) to clear the tlp 7671 * in progress flag. 7672 */ 7673 rack->rc_tlp_in_progress = 0; 7674 } 7675 SOCKBUF_LOCK_ASSERT(sb); 7676 if (len > 0) { 7677 if (len >= tp->t_maxseg) 7678 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 7679 else 7680 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 7681 } 7682 /* 7683 * Before ESTABLISHED, force sending of initial options unless TCP 7684 * set not to do any options. NOTE: we assume that the IP/TCP header 7685 * plus TCP options always fit in a single mbuf, leaving room for a 7686 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 7687 * + optlen <= MCLBYTES 7688 */ 7689 optlen = 0; 7690 #ifdef INET6 7691 if (isipv6) 7692 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 7693 else 7694 #endif 7695 hdrlen = sizeof(struct tcpiphdr); 7696 7697 /* 7698 * Compute options for segment. We only have to care about SYN and 7699 * established connection segments. Options for SYN-ACK segments 7700 * are handled in TCP syncache. 7701 */ 7702 to.to_flags = 0; 7703 if ((tp->t_flags & TF_NOOPT) == 0) { 7704 /* Maximum segment size. */ 7705 if (flags & TH_SYN) { 7706 tp->snd_nxt = tp->iss; 7707 to.to_mss = tcp_mssopt(&inp->inp_inc); 7708 #ifdef NETFLIX_TCPOUDP 7709 if (tp->t_port) 7710 to.to_mss -= V_tcp_udp_tunneling_overhead; 7711 #endif 7712 to.to_flags |= TOF_MSS; 7713 #ifdef TCP_RFC7413 7714 /* 7715 * Only include the TFO option on the first 7716 * transmission of the SYN|ACK on a 7717 * passively-created TFO socket, as the presence of 7718 * the TFO option may have caused the original 7719 * SYN|ACK to have been dropped by a middlebox. 7720 */ 7721 if ((tp->t_flags & TF_FASTOPEN) && 7722 (tp->t_state == TCPS_SYN_RECEIVED) && 7723 (tp->t_rxtshift == 0)) { 7724 to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; 7725 to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; 7726 to.to_flags |= TOF_FASTOPEN; 7727 } 7728 #endif 7729 } 7730 /* Window scaling. */ 7731 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 7732 to.to_wscale = tp->request_r_scale; 7733 to.to_flags |= TOF_SCALE; 7734 } 7735 /* Timestamps. */ 7736 if ((tp->t_flags & TF_RCVD_TSTMP) || 7737 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 7738 to.to_tsval = cts + tp->ts_offset; 7739 to.to_tsecr = tp->ts_recent; 7740 to.to_flags |= TOF_TS; 7741 } 7742 /* Set receive buffer autosizing timestamp. */ 7743 if (tp->rfbuf_ts == 0 && 7744 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 7745 tp->rfbuf_ts = tcp_ts_getticks(); 7746 /* Selective ACK's. */ 7747 if (flags & TH_SYN) 7748 to.to_flags |= TOF_SACKPERM; 7749 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 7750 tp->rcv_numsacks > 0) { 7751 to.to_flags |= TOF_SACK; 7752 to.to_nsacks = tp->rcv_numsacks; 7753 to.to_sacks = (u_char *)tp->sackblks; 7754 } 7755 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 7756 /* TCP-MD5 (RFC2385). */ 7757 if (tp->t_flags & TF_SIGNATURE) 7758 to.to_flags |= TOF_SIGNATURE; 7759 #endif /* TCP_SIGNATURE */ 7760 7761 /* Processing the options. */ 7762 hdrlen += optlen = tcp_addoptions(&to, opt); 7763 } 7764 #ifdef NETFLIX_TCPOUDP 7765 if (tp->t_port) { 7766 if (V_tcp_udp_tunneling_port == 0) { 7767 /* The port was removed?? */ 7768 SOCKBUF_UNLOCK(&so->so_snd); 7769 return (EHOSTUNREACH); 7770 } 7771 hdrlen += sizeof(struct udphdr); 7772 } 7773 #endif 7774 ipoptlen = 0; 7775 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7776 ipoptlen += ipsec_optlen; 7777 #endif 7778 7779 /* 7780 * Adjust data length if insertion of options will bump the packet 7781 * length beyond the t_maxseg length. Clear the FIN bit because we 7782 * cut off the tail of the segment. 7783 */ 7784 if (len + optlen + ipoptlen > tp->t_maxseg) { 7785 if (flags & TH_FIN) { 7786 would_have_fin = 1; 7787 flags &= ~TH_FIN; 7788 } 7789 if (tso) { 7790 uint32_t if_hw_tsomax; 7791 uint32_t moff; 7792 int32_t max_len; 7793 7794 /* extract TSO information */ 7795 if_hw_tsomax = tp->t_tsomax; 7796 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 7797 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 7798 KASSERT(ipoptlen == 0, 7799 ("%s: TSO can't do IP options", __func__)); 7800 7801 /* 7802 * Check if we should limit by maximum payload 7803 * length: 7804 */ 7805 if (if_hw_tsomax != 0) { 7806 /* compute maximum TSO length */ 7807 max_len = (if_hw_tsomax - hdrlen - 7808 max_linkhdr); 7809 if (max_len <= 0) { 7810 len = 0; 7811 } else if (len > max_len) { 7812 sendalot = 1; 7813 len = max_len; 7814 } 7815 } 7816 /* 7817 * Prevent the last segment from being fractional 7818 * unless the send sockbuf can be emptied: 7819 */ 7820 max_len = (tp->t_maxseg - optlen); 7821 if ((sb_offset + len) < sbavail(sb)) { 7822 moff = len % (u_int)max_len; 7823 if (moff != 0) { 7824 len -= moff; 7825 sendalot = 1; 7826 } 7827 } 7828 /* 7829 * In case there are too many small fragments don't 7830 * use TSO: 7831 */ 7832 if (len <= max_len) { 7833 len = max_len; 7834 sendalot = 1; 7835 tso = 0; 7836 } 7837 /* 7838 * Send the FIN in a separate segment after the bulk 7839 * sending is done. We don't trust the TSO 7840 * implementations to clear the FIN flag on all but 7841 * the last segment. 7842 */ 7843 if (tp->t_flags & TF_NEEDFIN) 7844 sendalot = 1; 7845 7846 } else { 7847 len = tp->t_maxseg - optlen - ipoptlen; 7848 sendalot = 1; 7849 } 7850 } else 7851 tso = 0; 7852 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 7853 ("%s: len > IP_MAXPACKET", __func__)); 7854 #ifdef DIAGNOSTIC 7855 #ifdef INET6 7856 if (max_linkhdr + hdrlen > MCLBYTES) 7857 #else 7858 if (max_linkhdr + hdrlen > MHLEN) 7859 #endif 7860 panic("tcphdr too big"); 7861 #endif 7862 7863 /* 7864 * This KASSERT is here to catch edge cases at a well defined place. 7865 * Before, those had triggered (random) panic conditions further 7866 * down. 7867 */ 7868 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7869 if ((len == 0) && 7870 (flags & TH_FIN) && 7871 (sbused(sb))) { 7872 /* 7873 * We have outstanding data, don't send a fin by itself!. 7874 */ 7875 goto just_return; 7876 } 7877 /* 7878 * Grab a header mbuf, attaching a copy of data to be transmitted, 7879 * and initialize the header from the template for sends on this 7880 * connection. 7881 */ 7882 if (len) { 7883 uint32_t max_val; 7884 uint32_t moff; 7885 7886 if (rack->rc_pace_max_segs) 7887 max_val = rack->rc_pace_max_segs * tp->t_maxseg; 7888 else 7889 max_val = len; 7890 /* 7891 * We allow a limit on sending with hptsi. 7892 */ 7893 if (len > max_val) { 7894 len = max_val; 7895 } 7896 #ifdef INET6 7897 if (MHLEN < hdrlen + max_linkhdr) 7898 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 7899 else 7900 #endif 7901 m = m_gethdr(M_NOWAIT, MT_DATA); 7902 7903 if (m == NULL) { 7904 SOCKBUF_UNLOCK(sb); 7905 error = ENOBUFS; 7906 sack_rxmit = 0; 7907 goto out; 7908 } 7909 m->m_data += max_linkhdr; 7910 m->m_len = hdrlen; 7911 7912 /* 7913 * Start the m_copy functions from the closest mbuf to the 7914 * sb_offset in the socket buffer chain. 7915 */ 7916 mb = sbsndptr_noadv(sb, sb_offset, &moff); 7917 if (len <= MHLEN - hdrlen - max_linkhdr) { 7918 m_copydata(mb, moff, (int)len, 7919 mtod(m, caddr_t)+hdrlen); 7920 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7921 sbsndptr_adv(sb, mb, len); 7922 m->m_len += len; 7923 } else { 7924 struct sockbuf *msb; 7925 7926 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7927 msb = NULL; 7928 else 7929 msb = sb; 7930 m->m_next = tcp_m_copym(mb, moff, &len, 7931 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); 7932 if (len <= (tp->t_maxseg - optlen)) { 7933 /* 7934 * Must have ran out of mbufs for the copy 7935 * shorten it to no longer need tso. Lets 7936 * not put on sendalot since we are low on 7937 * mbufs. 7938 */ 7939 tso = 0; 7940 } 7941 if (m->m_next == NULL) { 7942 SOCKBUF_UNLOCK(sb); 7943 (void)m_free(m); 7944 error = ENOBUFS; 7945 sack_rxmit = 0; 7946 goto out; 7947 } 7948 } 7949 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 7950 TCPSTAT_INC(tcps_sndprobe); 7951 #ifdef NETFLIX_STATS 7952 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7953 stats_voi_update_abs_u32(tp->t_stats, 7954 VOI_TCP_RETXPB, len); 7955 else 7956 stats_voi_update_abs_u64(tp->t_stats, 7957 VOI_TCP_TXPB, len); 7958 #endif 7959 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 7960 if (rsm && (rsm->r_flags & RACK_TLP)) { 7961 /* 7962 * TLP should not count in retran count, but 7963 * in its own bin 7964 */ 7965 counter_u64_add(rack_tlp_retran, 1); 7966 counter_u64_add(rack_tlp_retran_bytes, len); 7967 } else { 7968 tp->t_sndrexmitpack++; 7969 TCPSTAT_INC(tcps_sndrexmitpack); 7970 TCPSTAT_ADD(tcps_sndrexmitbyte, len); 7971 } 7972 #ifdef NETFLIX_STATS 7973 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 7974 len); 7975 #endif 7976 } else { 7977 TCPSTAT_INC(tcps_sndpack); 7978 TCPSTAT_ADD(tcps_sndbyte, len); 7979 #ifdef NETFLIX_STATS 7980 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 7981 len); 7982 #endif 7983 } 7984 /* 7985 * If we're sending everything we've got, set PUSH. (This 7986 * will keep happy those implementations which only give 7987 * data to the user when a buffer fills or a PUSH comes in.) 7988 */ 7989 if (sb_offset + len == sbused(sb) && 7990 sbused(sb) && 7991 !(flags & TH_SYN)) 7992 flags |= TH_PUSH; 7993 7994 /* 7995 * Are we doing hptsi, if so we must calculate the slot. We 7996 * only do hptsi in ESTABLISHED and with no RESET being 7997 * sent where we have data to send. 7998 */ 7999 if (((tp->t_state == TCPS_ESTABLISHED) || 8000 (tp->t_state == TCPS_CLOSE_WAIT) || 8001 ((tp->t_state == TCPS_FIN_WAIT_1) && 8002 ((tp->t_flags & TF_SENTFIN) == 0) && 8003 ((flags & TH_FIN) == 0))) && 8004 ((flags & TH_RST) == 0) && 8005 (rack->rc_always_pace)) { 8006 /* 8007 * We use the most optimistic possible cwnd/srtt for 8008 * sending calculations. This will make our 8009 * calculation anticipate getting more through 8010 * quicker then possible. But thats ok we don't want 8011 * the peer to have a gap in data sending. 8012 */ 8013 uint32_t srtt, cwnd, tr_perms = 0; 8014 8015 if (rack->r_ctl.rc_rack_min_rtt) 8016 srtt = rack->r_ctl.rc_rack_min_rtt; 8017 else 8018 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8019 if (rack->r_ctl.rc_rack_largest_cwnd) 8020 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8021 else 8022 cwnd = tp->snd_cwnd; 8023 tr_perms = cwnd / srtt; 8024 if (tr_perms == 0) { 8025 tr_perms = tp->t_maxseg; 8026 } 8027 tot_len_this_send += len; 8028 /* 8029 * Calculate how long this will take to drain, if 8030 * the calculation comes out to zero, thats ok we 8031 * will use send_a_lot to possibly spin around for 8032 * more increasing tot_len_this_send to the point 8033 * that its going to require a pace, or we hit the 8034 * cwnd. Which in that case we are just waiting for 8035 * a ACK. 8036 */ 8037 slot = tot_len_this_send / tr_perms; 8038 /* Now do we reduce the time so we don't run dry? */ 8039 if (slot && rack->rc_pace_reduce) { 8040 int32_t reduce; 8041 8042 reduce = (slot / rack->rc_pace_reduce); 8043 if (reduce < slot) { 8044 slot -= reduce; 8045 } else 8046 slot = 0; 8047 } 8048 if (rack->r_enforce_min_pace && 8049 (slot == 0) && 8050 (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { 8051 /* We are enforcing a minimum pace time of 1ms */ 8052 slot = rack->r_enforce_min_pace; 8053 } 8054 } 8055 SOCKBUF_UNLOCK(sb); 8056 } else { 8057 SOCKBUF_UNLOCK(sb); 8058 if (tp->t_flags & TF_ACKNOW) 8059 TCPSTAT_INC(tcps_sndacks); 8060 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 8061 TCPSTAT_INC(tcps_sndctrl); 8062 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 8063 TCPSTAT_INC(tcps_sndurg); 8064 else 8065 TCPSTAT_INC(tcps_sndwinup); 8066 8067 m = m_gethdr(M_NOWAIT, MT_DATA); 8068 if (m == NULL) { 8069 error = ENOBUFS; 8070 sack_rxmit = 0; 8071 goto out; 8072 } 8073 #ifdef INET6 8074 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 8075 MHLEN >= hdrlen) { 8076 M_ALIGN(m, hdrlen); 8077 } else 8078 #endif 8079 m->m_data += max_linkhdr; 8080 m->m_len = hdrlen; 8081 } 8082 SOCKBUF_UNLOCK_ASSERT(sb); 8083 m->m_pkthdr.rcvif = (struct ifnet *)0; 8084 #ifdef MAC 8085 mac_inpcb_create_mbuf(inp, m); 8086 #endif 8087 #ifdef INET6 8088 if (isipv6) { 8089 ip6 = mtod(m, struct ip6_hdr *); 8090 #ifdef NETFLIX_TCPOUDP 8091 if (tp->t_port) { 8092 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 8093 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8094 udp->uh_dport = tp->t_port; 8095 ulen = hdrlen + len - sizeof(struct ip6_hdr); 8096 udp->uh_ulen = htons(ulen); 8097 th = (struct tcphdr *)(udp + 1); 8098 } else 8099 #endif 8100 th = (struct tcphdr *)(ip6 + 1); 8101 tcpip_fillheaders(inp, ip6, th); 8102 } else 8103 #endif /* INET6 */ 8104 { 8105 ip = mtod(m, struct ip *); 8106 #ifdef TCPDEBUG 8107 ipov = (struct ipovly *)ip; 8108 #endif 8109 #ifdef NETFLIX_TCPOUDP 8110 if (tp->t_port) { 8111 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 8112 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8113 udp->uh_dport = tp->t_port; 8114 ulen = hdrlen + len - sizeof(struct ip); 8115 udp->uh_ulen = htons(ulen); 8116 th = (struct tcphdr *)(udp + 1); 8117 } else 8118 #endif 8119 th = (struct tcphdr *)(ip + 1); 8120 tcpip_fillheaders(inp, ip, th); 8121 } 8122 /* 8123 * Fill in fields, remembering maximum advertised window for use in 8124 * delaying messages about window sizes. If resending a FIN, be sure 8125 * not to use a new sequence number. 8126 */ 8127 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 8128 tp->snd_nxt == tp->snd_max) 8129 tp->snd_nxt--; 8130 /* 8131 * If we are starting a connection, send ECN setup SYN packet. If we 8132 * are on a retransmit, we may resend those bits a number of times 8133 * as per RFC 3168. 8134 */ 8135 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 8136 if (tp->t_rxtshift >= 1) { 8137 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 8138 flags |= TH_ECE | TH_CWR; 8139 } else 8140 flags |= TH_ECE | TH_CWR; 8141 } 8142 if (tp->t_state == TCPS_ESTABLISHED && 8143 (tp->t_flags & TF_ECN_PERMIT)) { 8144 /* 8145 * If the peer has ECN, mark data packets with ECN capable 8146 * transmission (ECT). Ignore pure ack packets, 8147 * retransmissions and window probes. 8148 */ 8149 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 8150 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 8151 #ifdef INET6 8152 if (isipv6) 8153 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 8154 else 8155 #endif 8156 ip->ip_tos |= IPTOS_ECN_ECT0; 8157 TCPSTAT_INC(tcps_ecn_ect0); 8158 } 8159 /* 8160 * Reply with proper ECN notifications. 8161 */ 8162 if (tp->t_flags & TF_ECN_SND_CWR) { 8163 flags |= TH_CWR; 8164 tp->t_flags &= ~TF_ECN_SND_CWR; 8165 } 8166 if (tp->t_flags & TF_ECN_SND_ECE) 8167 flags |= TH_ECE; 8168 } 8169 /* 8170 * If we are doing retransmissions, then snd_nxt will not reflect 8171 * the first unsent octet. For ACK only packets, we do not want the 8172 * sequence number of the retransmitted packet, we want the sequence 8173 * number of the next unsent octet. So, if there is no data (and no 8174 * SYN or FIN), use snd_max instead of snd_nxt when filling in 8175 * ti_seq. But if we are in persist state, snd_max might reflect 8176 * one byte beyond the right edge of the window, so use snd_nxt in 8177 * that case, since we know we aren't doing a retransmission. 8178 * (retransmit and persist are mutually exclusive...) 8179 */ 8180 if (sack_rxmit == 0) { 8181 if (len || (flags & (TH_SYN | TH_FIN)) || 8182 rack->rc_in_persist) { 8183 th->th_seq = htonl(tp->snd_nxt); 8184 rack_seq = tp->snd_nxt; 8185 } else if (flags & TH_RST) { 8186 /* 8187 * For a Reset send the last cum ack in sequence 8188 * (this like any other choice may still generate a 8189 * challenge ack, if a ack-update packet is in 8190 * flight). 8191 */ 8192 th->th_seq = htonl(tp->snd_una); 8193 rack_seq = tp->snd_una; 8194 } else { 8195 th->th_seq = htonl(tp->snd_max); 8196 rack_seq = tp->snd_max; 8197 } 8198 } else { 8199 th->th_seq = htonl(rsm->r_start); 8200 rack_seq = rsm->r_start; 8201 } 8202 th->th_ack = htonl(tp->rcv_nxt); 8203 if (optlen) { 8204 bcopy(opt, th + 1, optlen); 8205 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 8206 } 8207 th->th_flags = flags; 8208 /* 8209 * Calculate receive window. Don't shrink window, but avoid silly 8210 * window syndrome. 8211 */ 8212 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 8213 recwin < (long)tp->t_maxseg) 8214 recwin = 0; 8215 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 8216 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 8217 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 8218 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 8219 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 8220 8221 /* 8222 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 8223 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 8224 * handled in syncache. 8225 */ 8226 if (flags & TH_SYN) 8227 th->th_win = htons((u_short) 8228 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 8229 else 8230 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 8231 /* 8232 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 8233 * window. This may cause the remote transmitter to stall. This 8234 * flag tells soreceive() to disable delayed acknowledgements when 8235 * draining the buffer. This can occur if the receiver is 8236 * attempting to read more data than can be buffered prior to 8237 * transmitting on the connection. 8238 */ 8239 if (th->th_win == 0) { 8240 tp->t_sndzerowin++; 8241 tp->t_flags |= TF_RXWIN0SENT; 8242 } else 8243 tp->t_flags &= ~TF_RXWIN0SENT; 8244 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 8245 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 8246 th->th_flags |= TH_URG; 8247 } else 8248 /* 8249 * If no urgent pointer to send, then we pull the urgent 8250 * pointer to the left edge of the send window so that it 8251 * doesn't drift into the send window on sequence number 8252 * wraparound. 8253 */ 8254 tp->snd_up = tp->snd_una; /* drag it along */ 8255 8256 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 8257 if (to.to_flags & TOF_SIGNATURE) { 8258 /* 8259 * Calculate MD5 signature and put it into the place 8260 * determined before. 8261 * NOTE: since TCP options buffer doesn't point into 8262 * mbuf's data, calculate offset and use it. 8263 */ 8264 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 8265 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 8266 /* 8267 * Do not send segment if the calculation of MD5 8268 * digest has failed. 8269 */ 8270 goto out; 8271 } 8272 } 8273 #endif 8274 8275 /* 8276 * Put TCP length in extended header, and then checksum extended 8277 * header and data. 8278 */ 8279 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 8280 #ifdef INET6 8281 if (isipv6) { 8282 /* 8283 * ip6_plen is not need to be filled now, and will be filled 8284 * in ip6_output. 8285 */ 8286 if (tp->t_port) { 8287 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 8288 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8289 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 8290 th->th_sum = htons(0); 8291 } else { 8292 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 8293 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8294 th->th_sum = in6_cksum_pseudo(ip6, 8295 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 8296 0); 8297 } 8298 } 8299 #endif 8300 #if defined(INET6) && defined(INET) 8301 else 8302 #endif 8303 #ifdef INET 8304 { 8305 if (tp->t_port) { 8306 m->m_pkthdr.csum_flags = CSUM_UDP; 8307 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8308 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 8309 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 8310 th->th_sum = htons(0); 8311 } else { 8312 m->m_pkthdr.csum_flags = CSUM_TCP; 8313 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8314 th->th_sum = in_pseudo(ip->ip_src.s_addr, 8315 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 8316 IPPROTO_TCP + len + optlen)); 8317 } 8318 /* IP version must be set here for ipv4/ipv6 checking later */ 8319 KASSERT(ip->ip_v == IPVERSION, 8320 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 8321 } 8322 #endif 8323 8324 /* 8325 * Enable TSO and specify the size of the segments. The TCP pseudo 8326 * header checksum is always provided. XXX: Fixme: This is currently 8327 * not the case for IPv6. 8328 */ 8329 if (tso) { 8330 KASSERT(len > tp->t_maxseg - optlen, 8331 ("%s: len <= tso_segsz", __func__)); 8332 m->m_pkthdr.csum_flags |= CSUM_TSO; 8333 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 8334 } 8335 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8336 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 8337 ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", 8338 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 8339 #else 8340 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 8341 ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", 8342 __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 8343 #endif 8344 8345 #ifdef TCP_HHOOK 8346 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 8347 hhook_run_tcp_est_out(tp, th, &to, len, tso); 8348 #endif 8349 8350 #ifdef TCPDEBUG 8351 /* 8352 * Trace. 8353 */ 8354 if (so->so_options & SO_DEBUG) { 8355 u_short save = 0; 8356 8357 #ifdef INET6 8358 if (!isipv6) 8359 #endif 8360 { 8361 save = ipov->ih_len; 8362 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 8363 * (th->th_off << 2) */ ); 8364 } 8365 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 8366 #ifdef INET6 8367 if (!isipv6) 8368 #endif 8369 ipov->ih_len = save; 8370 } 8371 #endif /* TCPDEBUG */ 8372 8373 /* We're getting ready to send; log now. */ 8374 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 8375 union tcp_log_stackspecific log; 8376 8377 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 8378 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 8379 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 8380 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 8381 if (rsm || sack_rxmit) { 8382 log.u_bbr.flex8 = 1; 8383 } else { 8384 log.u_bbr.flex8 = 0; 8385 } 8386 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 8387 len, &log, false, NULL, NULL, 0, NULL); 8388 } else 8389 lgb = NULL; 8390 8391 /* 8392 * Fill in IP length and desired time to live and send to IP level. 8393 * There should be a better way to handle ttl and tos; we could keep 8394 * them in the template, but need a way to checksum without them. 8395 */ 8396 /* 8397 * m->m_pkthdr.len should have been set before cksum calcuration, 8398 * because in6_cksum() need it. 8399 */ 8400 #ifdef INET6 8401 if (isipv6) { 8402 /* 8403 * we separately set hoplimit for every segment, since the 8404 * user might want to change the value via setsockopt. Also, 8405 * desired default hop limit might be changed via Neighbor 8406 * Discovery. 8407 */ 8408 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 8409 8410 /* 8411 * Set the packet size here for the benefit of DTrace 8412 * probes. ip6_output() will set it properly; it's supposed 8413 * to include the option header lengths as well. 8414 */ 8415 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 8416 8417 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 8418 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8419 else 8420 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8421 8422 if (tp->t_state == TCPS_SYN_SENT) 8423 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 8424 8425 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 8426 /* TODO: IPv6 IP6TOS_ECT bit on */ 8427 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 8428 &inp->inp_route6, 8429 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 8430 NULL, NULL, inp); 8431 8432 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) 8433 mtu = inp->inp_route6.ro_rt->rt_mtu; 8434 } 8435 #endif /* INET6 */ 8436 #if defined(INET) && defined(INET6) 8437 else 8438 #endif 8439 #ifdef INET 8440 { 8441 ip->ip_len = htons(m->m_pkthdr.len); 8442 #ifdef INET6 8443 if (inp->inp_vflag & INP_IPV6PROTO) 8444 ip->ip_ttl = in6_selecthlim(inp, NULL); 8445 #endif /* INET6 */ 8446 /* 8447 * If we do path MTU discovery, then we set DF on every 8448 * packet. This might not be the best thing to do according 8449 * to RFC3390 Section 2. However the tcp hostcache migitates 8450 * the problem so it affects only the first tcp connection 8451 * with a host. 8452 * 8453 * NB: Don't set DF on small MTU/MSS to have a safe 8454 * fallback. 8455 */ 8456 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 8457 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8458 if (tp->t_port == 0 || len < V_tcp_minmss) { 8459 ip->ip_off |= htons(IP_DF); 8460 } 8461 } else { 8462 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8463 } 8464 8465 if (tp->t_state == TCPS_SYN_SENT) 8466 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 8467 8468 TCP_PROBE5(send, NULL, tp, ip, tp, th); 8469 8470 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 8471 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 8472 inp); 8473 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) 8474 mtu = inp->inp_route.ro_rt->rt_mtu; 8475 } 8476 #endif /* INET */ 8477 8478 out: 8479 if (lgb) { 8480 lgb->tlb_errno = error; 8481 lgb = NULL; 8482 } 8483 /* 8484 * In transmit state, time the transmission and arrange for the 8485 * retransmit. In persist state, just set snd_max. 8486 */ 8487 if (error == 0) { 8488 if (len == 0) 8489 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 8490 else if (len == 1) { 8491 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8492 } else if (len > 1) { 8493 int idx; 8494 8495 idx = (len / tp->t_maxseg) + 3; 8496 if (idx >= TCP_MSS_ACCT_ATIMER) 8497 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 8498 else 8499 counter_u64_add(rack_out_size[idx], 1); 8500 } 8501 } 8502 if (sub_from_prr && (error == 0)) { 8503 rack->r_ctl.rc_prr_sndcnt -= len; 8504 } 8505 sub_from_prr = 0; 8506 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 8507 pass, rsm); 8508 if ((tp->t_flags & TF_FORCEDATA) == 0 || 8509 (rack->rc_in_persist == 0)) { 8510 #ifdef NETFLIX_STATS 8511 tcp_seq startseq = tp->snd_nxt; 8512 #endif 8513 8514 /* 8515 * Advance snd_nxt over sequence space of this segment. 8516 */ 8517 if (error) 8518 /* We don't log or do anything with errors */ 8519 goto timer; 8520 8521 if (flags & (TH_SYN | TH_FIN)) { 8522 if (flags & TH_SYN) 8523 tp->snd_nxt++; 8524 if (flags & TH_FIN) { 8525 tp->snd_nxt++; 8526 tp->t_flags |= TF_SENTFIN; 8527 } 8528 } 8529 /* In the ENOBUFS case we do *not* update snd_max */ 8530 if (sack_rxmit) 8531 goto timer; 8532 8533 tp->snd_nxt += len; 8534 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 8535 if (tp->snd_una == tp->snd_max) { 8536 /* 8537 * Update the time we just added data since 8538 * none was outstanding. 8539 */ 8540 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8541 tp->t_acktime = ticks; 8542 } 8543 tp->snd_max = tp->snd_nxt; 8544 #ifdef NETFLIX_STATS 8545 if (!(tp->t_flags & TF_GPUTINPROG) && len) { 8546 tp->t_flags |= TF_GPUTINPROG; 8547 tp->gput_seq = startseq; 8548 tp->gput_ack = startseq + 8549 ulmin(sbavail(sb) - sb_offset, sendwin); 8550 tp->gput_ts = tcp_ts_getticks(); 8551 } 8552 #endif 8553 } 8554 /* 8555 * Set retransmit timer if not currently set, and not doing 8556 * a pure ack or a keep-alive probe. Initial value for 8557 * retransmit timer is smoothed round-trip time + 2 * 8558 * round-trip time variance. Initialize shift counter which 8559 * is used for backoff of retransmit time. 8560 */ 8561 timer: 8562 if ((tp->snd_wnd == 0) && 8563 TCPS_HAVEESTABLISHED(tp->t_state)) { 8564 /* 8565 * If the persists timer was set above (right before 8566 * the goto send), and still needs to be on. Lets 8567 * make sure all is canceled. If the persist timer 8568 * is not running, we want to get it up. 8569 */ 8570 if (rack->rc_in_persist == 0) { 8571 rack_enter_persist(tp, rack, cts); 8572 } 8573 } 8574 } else { 8575 /* 8576 * Persist case, update snd_max but since we are in persist 8577 * mode (no window) we do not update snd_nxt. 8578 */ 8579 int32_t xlen = len; 8580 8581 if (error) 8582 goto nomore; 8583 8584 if (flags & TH_SYN) 8585 ++xlen; 8586 if (flags & TH_FIN) { 8587 ++xlen; 8588 tp->t_flags |= TF_SENTFIN; 8589 } 8590 /* In the ENOBUFS case we do *not* update snd_max */ 8591 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 8592 if (tp->snd_una == tp->snd_max) { 8593 /* 8594 * Update the time we just added data since 8595 * none was outstanding. 8596 */ 8597 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8598 tp->t_acktime = ticks; 8599 } 8600 tp->snd_max = tp->snd_nxt + len; 8601 } 8602 } 8603 nomore: 8604 if (error) { 8605 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 8606 /* 8607 * Failures do not advance the seq counter above. For the 8608 * case of ENOBUFS we will fall out and retry in 1ms with 8609 * the hpts. Everything else will just have to retransmit 8610 * with the timer. 8611 * 8612 * In any case, we do not want to loop around for another 8613 * send without a good reason. 8614 */ 8615 sendalot = 0; 8616 switch (error) { 8617 case EPERM: 8618 tp->t_flags &= ~TF_FORCEDATA; 8619 tp->t_softerror = error; 8620 return (error); 8621 case ENOBUFS: 8622 if (slot == 0) { 8623 /* 8624 * Pace us right away to retry in a some 8625 * time 8626 */ 8627 slot = 1 + rack->rc_enobuf; 8628 if (rack->rc_enobuf < 255) 8629 rack->rc_enobuf++; 8630 if (slot > (rack->rc_rack_rtt / 2)) { 8631 slot = rack->rc_rack_rtt / 2; 8632 } 8633 if (slot < 10) 8634 slot = 10; 8635 } 8636 counter_u64_add(rack_saw_enobuf, 1); 8637 error = 0; 8638 goto enobufs; 8639 case EMSGSIZE: 8640 /* 8641 * For some reason the interface we used initially 8642 * to send segments changed to another or lowered 8643 * its MTU. If TSO was active we either got an 8644 * interface without TSO capabilits or TSO was 8645 * turned off. If we obtained mtu from ip_output() 8646 * then update it and try again. 8647 */ 8648 if (tso) 8649 tp->t_flags &= ~TF_TSO; 8650 if (mtu != 0) { 8651 tcp_mss_update(tp, -1, mtu, NULL, NULL); 8652 goto again; 8653 } 8654 slot = 10; 8655 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8656 tp->t_flags &= ~TF_FORCEDATA; 8657 return (error); 8658 case ENETUNREACH: 8659 counter_u64_add(rack_saw_enetunreach, 1); 8660 case EHOSTDOWN: 8661 case EHOSTUNREACH: 8662 case ENETDOWN: 8663 if (TCPS_HAVERCVDSYN(tp->t_state)) { 8664 tp->t_softerror = error; 8665 } 8666 /* FALLTHROUGH */ 8667 default: 8668 slot = 10; 8669 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8670 tp->t_flags &= ~TF_FORCEDATA; 8671 return (error); 8672 } 8673 } else { 8674 rack->rc_enobuf = 0; 8675 } 8676 TCPSTAT_INC(tcps_sndtotal); 8677 8678 /* 8679 * Data sent (as far as we can tell). If this advertises a larger 8680 * window than any other segment, then remember the size of the 8681 * advertised window. Any pending ACK has now been sent. 8682 */ 8683 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 8684 tp->rcv_adv = tp->rcv_nxt + recwin; 8685 tp->last_ack_sent = tp->rcv_nxt; 8686 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 8687 enobufs: 8688 rack->r_tlp_running = 0; 8689 if ((flags & TH_RST) || (would_have_fin == 1)) { 8690 /* 8691 * We don't send again after a RST. We also do *not* send 8692 * again if we would have had a find, but now have 8693 * outstanding data. 8694 */ 8695 slot = 0; 8696 sendalot = 0; 8697 } 8698 if (slot) { 8699 /* set the rack tcb into the slot N */ 8700 counter_u64_add(rack_paced_segments, 1); 8701 } else if (sendalot) { 8702 if (len) 8703 counter_u64_add(rack_unpaced_segments, 1); 8704 sack_rxmit = 0; 8705 tp->t_flags &= ~TF_FORCEDATA; 8706 goto again; 8707 } else if (len) { 8708 counter_u64_add(rack_unpaced_segments, 1); 8709 } 8710 tp->t_flags &= ~TF_FORCEDATA; 8711 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 8712 return (error); 8713 } 8714 8715 /* 8716 * rack_ctloutput() must drop the inpcb lock before performing copyin on 8717 * socket option arguments. When it re-acquires the lock after the copy, it 8718 * has to revalidate that the connection is still valid for the socket 8719 * option. 8720 */ 8721 static int 8722 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 8723 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8724 { 8725 int32_t error = 0, optval; 8726 8727 switch (sopt->sopt_name) { 8728 case TCP_RACK_PROP_RATE: 8729 case TCP_RACK_PROP: 8730 case TCP_RACK_TLP_REDUCE: 8731 case TCP_RACK_EARLY_RECOV: 8732 case TCP_RACK_PACE_ALWAYS: 8733 case TCP_DELACK: 8734 case TCP_RACK_PACE_REDUCE: 8735 case TCP_RACK_PACE_MAX_SEG: 8736 case TCP_RACK_PRR_SENDALOT: 8737 case TCP_RACK_MIN_TO: 8738 case TCP_RACK_EARLY_SEG: 8739 case TCP_RACK_REORD_THRESH: 8740 case TCP_RACK_REORD_FADE: 8741 case TCP_RACK_TLP_THRESH: 8742 case TCP_RACK_PKT_DELAY: 8743 case TCP_RACK_TLP_USE: 8744 case TCP_RACK_TLP_INC_VAR: 8745 case TCP_RACK_IDLE_REDUCE_HIGH: 8746 case TCP_RACK_MIN_PACE: 8747 case TCP_RACK_MIN_PACE_SEG: 8748 case TCP_BBR_RACK_RTT_USE: 8749 case TCP_DATA_AFTER_CLOSE: 8750 break; 8751 default: 8752 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8753 break; 8754 } 8755 INP_WUNLOCK(inp); 8756 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 8757 if (error) 8758 return (error); 8759 INP_WLOCK(inp); 8760 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 8761 INP_WUNLOCK(inp); 8762 return (ECONNRESET); 8763 } 8764 tp = intotcpcb(inp); 8765 rack = (struct tcp_rack *)tp->t_fb_ptr; 8766 switch (sopt->sopt_name) { 8767 case TCP_RACK_PROP_RATE: 8768 if ((optval <= 0) || (optval >= 100)) { 8769 error = EINVAL; 8770 break; 8771 } 8772 RACK_OPTS_INC(tcp_rack_prop_rate); 8773 rack->r_ctl.rc_prop_rate = optval; 8774 break; 8775 case TCP_RACK_TLP_USE: 8776 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 8777 error = EINVAL; 8778 break; 8779 } 8780 RACK_OPTS_INC(tcp_tlp_use); 8781 rack->rack_tlp_threshold_use = optval; 8782 break; 8783 case TCP_RACK_PROP: 8784 /* RACK proportional rate reduction (bool) */ 8785 RACK_OPTS_INC(tcp_rack_prop); 8786 rack->r_ctl.rc_prop_reduce = optval; 8787 break; 8788 case TCP_RACK_TLP_REDUCE: 8789 /* RACK TLP cwnd reduction (bool) */ 8790 RACK_OPTS_INC(tcp_rack_tlp_reduce); 8791 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 8792 break; 8793 case TCP_RACK_EARLY_RECOV: 8794 /* Should recovery happen early (bool) */ 8795 RACK_OPTS_INC(tcp_rack_early_recov); 8796 rack->r_ctl.rc_early_recovery = optval; 8797 break; 8798 case TCP_RACK_PACE_ALWAYS: 8799 /* Use the always pace method (bool) */ 8800 RACK_OPTS_INC(tcp_rack_pace_always); 8801 if (optval > 0) 8802 rack->rc_always_pace = 1; 8803 else 8804 rack->rc_always_pace = 0; 8805 break; 8806 case TCP_RACK_PACE_REDUCE: 8807 /* RACK Hptsi reduction factor (divisor) */ 8808 RACK_OPTS_INC(tcp_rack_pace_reduce); 8809 if (optval) 8810 /* Must be non-zero */ 8811 rack->rc_pace_reduce = optval; 8812 else 8813 error = EINVAL; 8814 break; 8815 case TCP_RACK_PACE_MAX_SEG: 8816 /* Max segments in a pace */ 8817 RACK_OPTS_INC(tcp_rack_max_seg); 8818 rack->rc_pace_max_segs = optval; 8819 break; 8820 case TCP_RACK_PRR_SENDALOT: 8821 /* Allow PRR to send more than one seg */ 8822 RACK_OPTS_INC(tcp_rack_prr_sendalot); 8823 rack->r_ctl.rc_prr_sendalot = optval; 8824 break; 8825 case TCP_RACK_MIN_TO: 8826 /* Minimum time between rack t-o's in ms */ 8827 RACK_OPTS_INC(tcp_rack_min_to); 8828 rack->r_ctl.rc_min_to = optval; 8829 break; 8830 case TCP_RACK_EARLY_SEG: 8831 /* If early recovery max segments */ 8832 RACK_OPTS_INC(tcp_rack_early_seg); 8833 rack->r_ctl.rc_early_recovery_segs = optval; 8834 break; 8835 case TCP_RACK_REORD_THRESH: 8836 /* RACK reorder threshold (shift amount) */ 8837 RACK_OPTS_INC(tcp_rack_reord_thresh); 8838 if ((optval > 0) && (optval < 31)) 8839 rack->r_ctl.rc_reorder_shift = optval; 8840 else 8841 error = EINVAL; 8842 break; 8843 case TCP_RACK_REORD_FADE: 8844 /* Does reordering fade after ms time */ 8845 RACK_OPTS_INC(tcp_rack_reord_fade); 8846 rack->r_ctl.rc_reorder_fade = optval; 8847 break; 8848 case TCP_RACK_TLP_THRESH: 8849 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 8850 RACK_OPTS_INC(tcp_rack_tlp_thresh); 8851 if (optval) 8852 rack->r_ctl.rc_tlp_threshold = optval; 8853 else 8854 error = EINVAL; 8855 break; 8856 case TCP_RACK_PKT_DELAY: 8857 /* RACK added ms i.e. rack-rtt + reord + N */ 8858 RACK_OPTS_INC(tcp_rack_pkt_delay); 8859 rack->r_ctl.rc_pkt_delay = optval; 8860 break; 8861 case TCP_RACK_TLP_INC_VAR: 8862 /* Does TLP include rtt variance in t-o */ 8863 RACK_OPTS_INC(tcp_rack_tlp_inc_var); 8864 rack->r_ctl.rc_prr_inc_var = optval; 8865 break; 8866 case TCP_RACK_IDLE_REDUCE_HIGH: 8867 RACK_OPTS_INC(tcp_rack_idle_reduce_high); 8868 if (optval) 8869 rack->r_idle_reduce_largest = 1; 8870 else 8871 rack->r_idle_reduce_largest = 0; 8872 break; 8873 case TCP_DELACK: 8874 if (optval == 0) 8875 tp->t_delayed_ack = 0; 8876 else 8877 tp->t_delayed_ack = 1; 8878 if (tp->t_flags & TF_DELACK) { 8879 tp->t_flags &= ~TF_DELACK; 8880 tp->t_flags |= TF_ACKNOW; 8881 rack_output(tp); 8882 } 8883 break; 8884 case TCP_RACK_MIN_PACE: 8885 RACK_OPTS_INC(tcp_rack_min_pace); 8886 if (optval > 3) 8887 rack->r_enforce_min_pace = 3; 8888 else 8889 rack->r_enforce_min_pace = optval; 8890 break; 8891 case TCP_RACK_MIN_PACE_SEG: 8892 RACK_OPTS_INC(tcp_rack_min_pace_seg); 8893 if (optval >= 16) 8894 rack->r_min_pace_seg_thresh = 15; 8895 else 8896 rack->r_min_pace_seg_thresh = optval; 8897 break; 8898 case TCP_BBR_RACK_RTT_USE: 8899 if ((optval != USE_RTT_HIGH) && 8900 (optval != USE_RTT_LOW) && 8901 (optval != USE_RTT_AVG)) 8902 error = EINVAL; 8903 else 8904 rack->r_ctl.rc_rate_sample_method = optval; 8905 break; 8906 case TCP_DATA_AFTER_CLOSE: 8907 if (optval) 8908 rack->rc_allow_data_af_clo = 1; 8909 else 8910 rack->rc_allow_data_af_clo = 0; 8911 break; 8912 default: 8913 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8914 break; 8915 } 8916 #ifdef NETFLIX_STATS 8917 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 8918 #endif 8919 INP_WUNLOCK(inp); 8920 return (error); 8921 } 8922 8923 static int 8924 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 8925 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8926 { 8927 int32_t error, optval; 8928 8929 /* 8930 * Because all our options are either boolean or an int, we can just 8931 * pull everything into optval and then unlock and copy. If we ever 8932 * add a option that is not a int, then this will have quite an 8933 * impact to this routine. 8934 */ 8935 switch (sopt->sopt_name) { 8936 case TCP_RACK_PROP_RATE: 8937 optval = rack->r_ctl.rc_prop_rate; 8938 break; 8939 case TCP_RACK_PROP: 8940 /* RACK proportional rate reduction (bool) */ 8941 optval = rack->r_ctl.rc_prop_reduce; 8942 break; 8943 case TCP_RACK_TLP_REDUCE: 8944 /* RACK TLP cwnd reduction (bool) */ 8945 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 8946 break; 8947 case TCP_RACK_EARLY_RECOV: 8948 /* Should recovery happen early (bool) */ 8949 optval = rack->r_ctl.rc_early_recovery; 8950 break; 8951 case TCP_RACK_PACE_REDUCE: 8952 /* RACK Hptsi reduction factor (divisor) */ 8953 optval = rack->rc_pace_reduce; 8954 break; 8955 case TCP_RACK_PACE_MAX_SEG: 8956 /* Max segments in a pace */ 8957 optval = rack->rc_pace_max_segs; 8958 break; 8959 case TCP_RACK_PACE_ALWAYS: 8960 /* Use the always pace method */ 8961 optval = rack->rc_always_pace; 8962 break; 8963 case TCP_RACK_PRR_SENDALOT: 8964 /* Allow PRR to send more than one seg */ 8965 optval = rack->r_ctl.rc_prr_sendalot; 8966 break; 8967 case TCP_RACK_MIN_TO: 8968 /* Minimum time between rack t-o's in ms */ 8969 optval = rack->r_ctl.rc_min_to; 8970 break; 8971 case TCP_RACK_EARLY_SEG: 8972 /* If early recovery max segments */ 8973 optval = rack->r_ctl.rc_early_recovery_segs; 8974 break; 8975 case TCP_RACK_REORD_THRESH: 8976 /* RACK reorder threshold (shift amount) */ 8977 optval = rack->r_ctl.rc_reorder_shift; 8978 break; 8979 case TCP_RACK_REORD_FADE: 8980 /* Does reordering fade after ms time */ 8981 optval = rack->r_ctl.rc_reorder_fade; 8982 break; 8983 case TCP_RACK_TLP_THRESH: 8984 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 8985 optval = rack->r_ctl.rc_tlp_threshold; 8986 break; 8987 case TCP_RACK_PKT_DELAY: 8988 /* RACK added ms i.e. rack-rtt + reord + N */ 8989 optval = rack->r_ctl.rc_pkt_delay; 8990 break; 8991 case TCP_RACK_TLP_USE: 8992 optval = rack->rack_tlp_threshold_use; 8993 break; 8994 case TCP_RACK_TLP_INC_VAR: 8995 /* Does TLP include rtt variance in t-o */ 8996 optval = rack->r_ctl.rc_prr_inc_var; 8997 break; 8998 case TCP_RACK_IDLE_REDUCE_HIGH: 8999 optval = rack->r_idle_reduce_largest; 9000 break; 9001 case TCP_RACK_MIN_PACE: 9002 optval = rack->r_enforce_min_pace; 9003 break; 9004 case TCP_RACK_MIN_PACE_SEG: 9005 optval = rack->r_min_pace_seg_thresh; 9006 break; 9007 case TCP_BBR_RACK_RTT_USE: 9008 optval = rack->r_ctl.rc_rate_sample_method; 9009 break; 9010 case TCP_DELACK: 9011 optval = tp->t_delayed_ack; 9012 break; 9013 case TCP_DATA_AFTER_CLOSE: 9014 optval = rack->rc_allow_data_af_clo; 9015 break; 9016 default: 9017 return (tcp_default_ctloutput(so, sopt, inp, tp)); 9018 break; 9019 } 9020 INP_WUNLOCK(inp); 9021 error = sooptcopyout(sopt, &optval, sizeof optval); 9022 return (error); 9023 } 9024 9025 static int 9026 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 9027 { 9028 int32_t error = EINVAL; 9029 struct tcp_rack *rack; 9030 9031 rack = (struct tcp_rack *)tp->t_fb_ptr; 9032 if (rack == NULL) { 9033 /* Huh? */ 9034 goto out; 9035 } 9036 if (sopt->sopt_dir == SOPT_SET) { 9037 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 9038 } else if (sopt->sopt_dir == SOPT_GET) { 9039 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 9040 } 9041 out: 9042 INP_WUNLOCK(inp); 9043 return (error); 9044 } 9045 9046 9047 struct tcp_function_block __tcp_rack = { 9048 .tfb_tcp_block_name = __XSTRING(STACKNAME), 9049 .tfb_tcp_output = rack_output, 9050 .tfb_tcp_do_segment = rack_do_segment, 9051 .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, 9052 .tfb_tcp_ctloutput = rack_ctloutput, 9053 .tfb_tcp_fb_init = rack_init, 9054 .tfb_tcp_fb_fini = rack_fini, 9055 .tfb_tcp_timer_stop_all = rack_stopall, 9056 .tfb_tcp_timer_activate = rack_timer_activate, 9057 .tfb_tcp_timer_active = rack_timer_active, 9058 .tfb_tcp_timer_stop = rack_timer_stop, 9059 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 9060 .tfb_tcp_handoff_ok = rack_handoff_ok 9061 }; 9062 9063 static const char *rack_stack_names[] = { 9064 __XSTRING(STACKNAME), 9065 #ifdef STACKALIAS 9066 __XSTRING(STACKALIAS), 9067 #endif 9068 }; 9069 9070 static int 9071 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 9072 { 9073 memset(mem, 0, size); 9074 return (0); 9075 } 9076 9077 static void 9078 rack_dtor(void *mem, int32_t size, void *arg) 9079 { 9080 9081 } 9082 9083 static bool rack_mod_inited = false; 9084 9085 static int 9086 tcp_addrack(module_t mod, int32_t type, void *data) 9087 { 9088 int32_t err = 0; 9089 int num_stacks; 9090 9091 switch (type) { 9092 case MOD_LOAD: 9093 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 9094 sizeof(struct rack_sendmap), 9095 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 9096 9097 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 9098 sizeof(struct tcp_rack), 9099 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 9100 9101 sysctl_ctx_init(&rack_sysctl_ctx); 9102 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 9103 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 9104 OID_AUTO, 9105 __XSTRING(STACKNAME), 9106 CTLFLAG_RW, 0, 9107 ""); 9108 if (rack_sysctl_root == NULL) { 9109 printf("Failed to add sysctl node\n"); 9110 err = EFAULT; 9111 goto free_uma; 9112 } 9113 rack_init_sysctls(); 9114 num_stacks = nitems(rack_stack_names); 9115 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 9116 rack_stack_names, &num_stacks); 9117 if (err) { 9118 printf("Failed to register %s stack name for " 9119 "%s module\n", rack_stack_names[num_stacks], 9120 __XSTRING(MODNAME)); 9121 sysctl_ctx_free(&rack_sysctl_ctx); 9122 free_uma: 9123 uma_zdestroy(rack_zone); 9124 uma_zdestroy(rack_pcb_zone); 9125 rack_counter_destroy(); 9126 printf("Failed to register rack module -- err:%d\n", err); 9127 return (err); 9128 } 9129 rack_mod_inited = true; 9130 break; 9131 case MOD_QUIESCE: 9132 err = deregister_tcp_functions(&__tcp_rack, true, false); 9133 break; 9134 case MOD_UNLOAD: 9135 err = deregister_tcp_functions(&__tcp_rack, false, true); 9136 if (err == EBUSY) 9137 break; 9138 if (rack_mod_inited) { 9139 uma_zdestroy(rack_zone); 9140 uma_zdestroy(rack_pcb_zone); 9141 sysctl_ctx_free(&rack_sysctl_ctx); 9142 rack_counter_destroy(); 9143 rack_mod_inited = false; 9144 } 9145 err = 0; 9146 break; 9147 default: 9148 return (EOPNOTSUPP); 9149 } 9150 return (err); 9151 } 9152 9153 static moduledata_t tcp_rack = { 9154 .name = __XSTRING(MODNAME), 9155 .evhand = tcp_addrack, 9156 .priv = 0 9157 }; 9158 9159 MODULE_VERSION(MODNAME, 1); 9160 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 9161 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 9162