/*-
 * Copyright (c) 2016-2018
 *	Netflix Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef NETFLIX_STATS
#include <sys/stats.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#ifdef NETFLIX_CWV
#include <netinet/tcp_newcwv.h>
#endif
#include <netinet/tcp_fastopen.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>
#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving, which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named), which
 *   stops us from using the number of dup acks and instead
 *   uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss Probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
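/*
 * Illustrative sketch only (not code used below): the core RACK decision
 * replaces "three dup acks" with a time test.  Assuming a send-map entry
 * rsm stamped with its last transmit time and the measured rack RTT, the
 * idea is roughly:
 *
 *	exp = last_tx_time(rsm) + rack_rtt + reorder_window;
 *	if (TSTMP_GT(cts, exp))
 *		mark rsm lost and retransmit it;
 *	else
 *		arm a timer for (exp - cts) and wait.
 *
 * The real field and helper names live in tcp_rack.h and later in this
 * file; the snippet above is only meant to convey the shape of the test.
 */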
static int32_t rack_precache = 1;
static int32_t rack_tlp_thresh = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
static int32_t rack_pkt_delay = 1;
static int32_t rack_inc_var = 0;	/* For TLP */
static int32_t rack_reduce_largest_on_idle = 0;
static int32_t rack_min_pace_time = 0;
static int32_t rack_min_pace_time_seg_req = 6;
static int32_t rack_early_recovery = 1;
static int32_t rack_early_recovery_max_seg = 6;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
/*
 * Currently regular tcp has a rto_min of 30ms; the backoff goes 12 times,
 * so that ends up being a total of 122.850 seconds before a
 * connection is killed.
 */
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 30000;	/* 30 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 1;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_sack_block_limit = 128;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_runt_sacks;
counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_tlp_does_nada;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t *ti_locked,
    uint32_t tiwin, int32_t tlen, int32_t *ofia, int32_t thflags, int32_t *ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos, int32_t ti_locked);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
static int32_t rack_output(struct tcpcb *tp);
static void
rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static void
rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t *ti_locked, int32_t *ret_val);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t *ti_locked);
static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
    struct tcphdr *th, int32_t *ti_locked, int32_t thflags, int32_t tlen, int32_t *ret_val);
static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
    struct tcphdr *th, int32_t *ti_locked, int32_t rstreason, int32_t tlen);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t nxt_pkt);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, int32_t *ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_drop_checks(struct tcpopt *to, struct mbuf *m,
    struct tcphdr *th, struct tcpcb *tp, int32_t *tlenp, int32_t *ti_locked, int32_t *thf,
    int32_t *drop_hdrlen, int32_t *ret_val);
static int
rack_process_rst(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t *ti_locked);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

static int
rack_ts_check(struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t *ti_locked, int32_t tlen, int32_t thflags, int32_t *ret_val);

int32_t rack_clear_counter = 0;


static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
    uint32_t stat;
    int32_t error;

    error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
    if (error || req->newptr == NULL)
        return (error);

    error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
    if (error)
        return (error);
    if (stat == 1) {
#ifdef INVARIANTS
        printf("Clearing RACK counters\n");
#endif
        counter_u64_zero(rack_badfr);
        counter_u64_zero(rack_badfr_bytes);
        counter_u64_zero(rack_rtm_prr_retran);
        counter_u64_zero(rack_rtm_prr_newdata);
        counter_u64_zero(rack_timestamp_mismatch);
        counter_u64_zero(rack_reorder_seen);
        counter_u64_zero(rack_tlp_tot);
        counter_u64_zero(rack_tlp_newdata);
        counter_u64_zero(rack_tlp_retran);
        counter_u64_zero(rack_tlp_retran_bytes);
        counter_u64_zero(rack_tlp_retran_fail);
        counter_u64_zero(rack_to_tot);
        counter_u64_zero(rack_to_arm_rack);
        counter_u64_zero(rack_to_arm_tlp);
        counter_u64_zero(rack_paced_segments);
        counter_u64_zero(rack_unpaced_segments);
        counter_u64_zero(rack_saw_enobuf);
        counter_u64_zero(rack_saw_enetunreach);
        counter_u64_zero(rack_to_alloc_hard);
        counter_u64_zero(rack_to_alloc_emerg);
        counter_u64_zero(rack_sack_proc_all);
        counter_u64_zero(rack_sack_proc_short);
        counter_u64_zero(rack_sack_proc_restart);
        counter_u64_zero(rack_to_alloc);
        counter_u64_zero(rack_find_high);
        counter_u64_zero(rack_runt_sacks);
        counter_u64_zero(rack_used_tlpmethod);
        counter_u64_zero(rack_used_tlpmethod2);
        counter_u64_zero(rack_enter_tlp_calc);
        counter_u64_zero(rack_progress_drops);
        counter_u64_zero(rack_tlp_does_nada);
    }
    rack_clear_counter = 0;
    return (0);
}
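/*
 * Usage note (assuming the usual sysctl layout for this stack): the
 * counters above can be reset from userland by writing 1 to the "clear"
 * node registered in rack_init_sysctls() below, e.g. something like
 *
 *	sysctl net.inet.tcp.<rack-root>.clear=1
 *
 * where <rack-root> is whatever OID this module attaches rack_sysctl_root
 * under.  Reading the node returns rack_clear_counter, which is reset to
 * 0 on every pass through the handler.
 */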
counters\n"); 421 #endif 422 counter_u64_zero(rack_badfr); 423 counter_u64_zero(rack_badfr_bytes); 424 counter_u64_zero(rack_rtm_prr_retran); 425 counter_u64_zero(rack_rtm_prr_newdata); 426 counter_u64_zero(rack_timestamp_mismatch); 427 counter_u64_zero(rack_reorder_seen); 428 counter_u64_zero(rack_tlp_tot); 429 counter_u64_zero(rack_tlp_newdata); 430 counter_u64_zero(rack_tlp_retran); 431 counter_u64_zero(rack_tlp_retran_bytes); 432 counter_u64_zero(rack_tlp_retran_fail); 433 counter_u64_zero(rack_to_tot); 434 counter_u64_zero(rack_to_arm_rack); 435 counter_u64_zero(rack_to_arm_tlp); 436 counter_u64_zero(rack_paced_segments); 437 counter_u64_zero(rack_unpaced_segments); 438 counter_u64_zero(rack_saw_enobuf); 439 counter_u64_zero(rack_saw_enetunreach); 440 counter_u64_zero(rack_to_alloc_hard); 441 counter_u64_zero(rack_to_alloc_emerg); 442 counter_u64_zero(rack_sack_proc_all); 443 counter_u64_zero(rack_sack_proc_short); 444 counter_u64_zero(rack_sack_proc_restart); 445 counter_u64_zero(rack_to_alloc); 446 counter_u64_zero(rack_find_high); 447 counter_u64_zero(rack_runt_sacks); 448 counter_u64_zero(rack_used_tlpmethod); 449 counter_u64_zero(rack_used_tlpmethod2); 450 counter_u64_zero(rack_enter_tlp_calc); 451 counter_u64_zero(rack_progress_drops); 452 counter_u64_zero(rack_tlp_does_nada); 453 } 454 rack_clear_counter = 0; 455 return (0); 456 } 457 458 459 460 static void 461 rack_init_sysctls() 462 { 463 SYSCTL_ADD_S32(&rack_sysctl_ctx, 464 SYSCTL_CHILDREN(rack_sysctl_root), 465 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 466 &rack_rate_sample_method , USE_RTT_LOW, 467 "What method should we use for rate sampling 0=high, 1=low "); 468 SYSCTL_ADD_S32(&rack_sysctl_ctx, 469 SYSCTL_CHILDREN(rack_sysctl_root), 470 OID_AUTO, "data_after_close", CTLFLAG_RW, 471 &rack_ignore_data_after_close, 0, 472 "Do we hold off sending a RST until all pending data is ack'd"); 473 SYSCTL_ADD_S32(&rack_sysctl_ctx, 474 SYSCTL_CHILDREN(rack_sysctl_root), 475 OID_AUTO, "tlpmethod", CTLFLAG_RW, 476 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 477 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 478 SYSCTL_ADD_S32(&rack_sysctl_ctx, 479 SYSCTL_CHILDREN(rack_sysctl_root), 480 OID_AUTO, "min_pace_time", CTLFLAG_RW, 481 &rack_min_pace_time, 0, 482 "Should we enforce a minimum pace time of 1ms"); 483 SYSCTL_ADD_S32(&rack_sysctl_ctx, 484 SYSCTL_CHILDREN(rack_sysctl_root), 485 OID_AUTO, "min_pace_segs", CTLFLAG_RW, 486 &rack_min_pace_time_seg_req, 6, 487 "How many segments have to be in the len to enforce min-pace-time"); 488 SYSCTL_ADD_S32(&rack_sysctl_ctx, 489 SYSCTL_CHILDREN(rack_sysctl_root), 490 OID_AUTO, "idle_reduce_high", CTLFLAG_RW, 491 &rack_reduce_largest_on_idle, 0, 492 "Should we reduce the largest cwnd seen to IW on idle reduction"); 493 SYSCTL_ADD_S32(&rack_sysctl_ctx, 494 SYSCTL_CHILDREN(rack_sysctl_root), 495 OID_AUTO, "bb_verbose", CTLFLAG_RW, 496 &rack_verbose_logging, 0, 497 "Should RACK black box logging be verbose"); 498 SYSCTL_ADD_S32(&rack_sysctl_ctx, 499 SYSCTL_CHILDREN(rack_sysctl_root), 500 OID_AUTO, "sackfiltering", CTLFLAG_RW, 501 &rack_use_sack_filter, 1, 502 "Do we use sack filtering?"); 503 SYSCTL_ADD_S32(&rack_sysctl_ctx, 504 SYSCTL_CHILDREN(rack_sysctl_root), 505 OID_AUTO, "delayed_ack", CTLFLAG_RW, 506 &rack_delayed_ack_time, 200, 507 "Delayed ack time (200ms)"); 508 SYSCTL_ADD_S32(&rack_sysctl_ctx, 509 SYSCTL_CHILDREN(rack_sysctl_root), 510 OID_AUTO, "tlpminto", CTLFLAG_RW, 511 &rack_tlp_min, 10, 512 "TLP minimum timeout per the specification (10ms)"); 513 
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "precache", CTLFLAG_RW,
        &rack_precache, 0,
        "Where should we precache the mcopy (0 is not at all)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "sblklimit", CTLFLAG_RW,
        &rack_sack_block_limit, 128,
        "When do we start paying attention to small sack blocks");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "send_oldest", CTLFLAG_RW,
        &rack_always_send_oldest, 1,
        "Should we always send the oldest TLP and RACK-TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
        &rack_tlp_in_recovery, 1,
        "Can we do a TLP during recovery?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rack_tlimit", CTLFLAG_RW,
        &rack_limited_retran, 0,
        "How many times can a rack timeout drive out sends");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "minrto", CTLFLAG_RW,
        &rack_rto_min, 0,
        "Minimum RTO in ms -- set with caution below 1000 due to TLP");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "maxrto", CTLFLAG_RW,
        &rack_rto_max, 0,
        "Maximum RTO in ms -- should be at least as large as min_rto");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_retry", CTLFLAG_RW,
        &rack_tlp_max_resend, 2,
        "How many times does TLP retry a single segment or multiple with no ACK");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
        &rack_use_proportional_reduce, 0,
        "Should we proportionally reduce cwnd based on the number of losses");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "recovery_prop", CTLFLAG_RW,
        &rack_proportional_rate, 10,
        "What percent reduction per loss");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
        &rack_lower_cwnd_at_tlp, 0,
        "When a TLP completes a retran should we enter recovery?");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
        &rack_slot_reduction, 4,
        "When setting a slot should we reduce by divisor");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
        &rack_pace_every_seg, 1,
        "Should we pace out every segment hptsi");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
        &rack_hptsi_segments, 6,
        "Should we pace out only a limited size of segments");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "prr_sendalot", CTLFLAG_RW,
        &rack_send_a_lot_in_prr, 1,
        "Send a lot in prr");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "minto", CTLFLAG_RW,
        &rack_min_to, 1,
        "Minimum rack timeout in milliseconds");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
        &rack_early_recovery_max_seg, 6,
        "Max segments in early recovery");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "earlyrecovery", CTLFLAG_RW,
        &rack_early_recovery, 1,
        "Do we do early recovery with rack");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "reorder_thresh", CTLFLAG_RW,
        &rack_reorder_thresh, 2,
        "What factor for rack will be added when seeing reordering (shift right)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
        &rack_tlp_thresh, 1,
        "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "reorder_fade", CTLFLAG_RW,
        &rack_reorder_fade, 0,
        "Does reorder detection fade, if so how many ms (0 means never)");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "pktdelay", CTLFLAG_RW,
        &rack_pkt_delay, 1,
        "Extra RACK time (in ms) besides reordering thresh");
    SYSCTL_ADD_S32(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "inc_var", CTLFLAG_RW,
        &rack_inc_var, 0,
        "Should rack add to the TLP timer the variance in rtt calculation");
    rack_badfr = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "badfr", CTLFLAG_RD,
        &rack_badfr, "Total number of bad FRs");
    rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "badfr_bytes", CTLFLAG_RD,
        &rack_badfr_bytes, "Total bytes of bad FRs");
    rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "prrsndret", CTLFLAG_RD,
        &rack_rtm_prr_retran,
        "Total number of prr based retransmits");
    rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "prrsndnew", CTLFLAG_RD,
        &rack_rtm_prr_newdata,
        "Total number of prr based new transmits");
    rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tsnf", CTLFLAG_RD,
        &rack_timestamp_mismatch,
        "Total number of times we could not find the reported timestamp");
    rack_find_high = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "findhigh", CTLFLAG_RD,
        &rack_find_high,
        "Total number of FIN causing find-high");
    rack_reorder_seen = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "reordering", CTLFLAG_RD,
        &rack_reorder_seen,
        "Total number of times we added delay due to reordering");
    rack_tlp_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_to_total", CTLFLAG_RD,
        &rack_tlp_tot,
        "Total number of tail loss probe expirations");
    rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_new", CTLFLAG_RD,
        &rack_tlp_newdata,
        "Total number of tail loss probe sending new data");

    rack_tlp_retran = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_retran", CTLFLAG_RD,
        &rack_tlp_retran,
        "Total number of tail loss probe sending retransmitted data");
    rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
        &rack_tlp_retran_bytes,
        "Total bytes of tail loss probe sending retransmitted data");
    rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
        &rack_tlp_retran_fail,
        "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
    rack_to_tot = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "rack_to_tot", CTLFLAG_RD,
        &rack_to_tot,
        "Total number of times the rack timeout expired");
    rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "arm_rack", CTLFLAG_RD,
        &rack_to_arm_rack,
        "Total number of times the rack timer was armed");
    rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "arm_tlp", CTLFLAG_RD,
        &rack_to_arm_tlp,
        "Total number of times the tlp timer was armed");
    rack_paced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "paced", CTLFLAG_RD,
        &rack_paced_segments,
        "Total number of times a segment send caused hptsi");
    rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "unpaced", CTLFLAG_RD,
        &rack_unpaced_segments,
        "Total number of times a segment did not cause hptsi");
    rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "saw_enobufs", CTLFLAG_RD,
        &rack_saw_enobuf,
        "Total number of times we saw ENOBUFS on output");
    rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
        &rack_saw_enetunreach,
        "Total number of times we saw ENETUNREACH on output");
    rack_to_alloc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "allocs", CTLFLAG_RD,
        &rack_to_alloc,
        "Total allocations of tracking structures");
    rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "allochard", CTLFLAG_RD,
        &rack_to_alloc_hard,
        "Total allocations done with sleeping the hard way");
    rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "allocemerg", CTLFLAG_RD,
        &rack_to_alloc_emerg,
        "Total allocations done from emergency cache");
    rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "sack_long", CTLFLAG_RD,
        &rack_sack_proc_all,
        "Total times we had to walk whole list for sack processing");

    rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "sack_restart", CTLFLAG_RD,
        &rack_sack_proc_restart,
        "Total times we had to walk whole list due to a restart");
    rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "sack_short", CTLFLAG_RD,
        &rack_sack_proc_short,
        "Total times we took shortcut for sack processing");
    rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
        &rack_enter_tlp_calc,
        "Total times we called calc-tlp");
    rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
        &rack_used_tlpmethod,
        "Total number of times we hit TLP method 1");
    rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
        &rack_used_tlpmethod2,
        "Total number of times we hit TLP method 2");
    rack_runt_sacks = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "runtsacks", CTLFLAG_RD,
        &rack_runt_sacks,
        "Total number of runt sacks");
    rack_progress_drops = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "prog_drops", CTLFLAG_RD,
        &rack_progress_drops,
        "Total number of progress drops");
    rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
        &rack_input_idle_reduces,
        "Total number of idle reductions on input");
    rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
    SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "tlp_nada", CTLFLAG_RD,
        &rack_tlp_does_nada,
        "Total number of nada tlp calls");
    COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
    SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "outsize", CTLFLAG_RD,
        rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
    COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
    SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "opts", CTLFLAG_RD,
        rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
    SYSCTL_ADD_PROC(&rack_sysctl_ctx,
        SYSCTL_CHILDREN(rack_sysctl_root),
        OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
        &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}
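/*
 * Progress timeout: if a maximum un-acked time is configured
 * (t_maxunacktime) and the connection has not seen forward progress
 * within that many ticks of t_acktime, the check below reports a
 * timeout; the caller is expected to drop the connection, and here we
 * only log the event and bump rack_progress_drops.
 */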
static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
    if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
        if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
            /*
             * There is an assumption that the caller
             * will drop the connection so we will
             * increment the counters here.
             */
            struct tcp_rack *rack;
            rack = (struct tcp_rack *)tp->t_fb_ptr;
            counter_u64_add(rack_progress_drops, 1);
#ifdef NETFLIX_STATS
            TCPSTAT_INC(tcps_progdrops);
#endif
            rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
            return (1);
        }
    }
    return (0);
}


static void
rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
        log.u_bbr.flex2 = to;
        log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex4 = slot;
        log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
        log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
        log.u_bbr.flex8 = which;
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TIMERSTAR, 0,
            0, &log, false);
    }
}

static void
rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex8 = to_num;
        log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
        log.u_bbr.flex2 = rack->rc_rack_rtt;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_RTO, 0,
            0, &log, false);
    }
}

static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
    uint32_t o_srtt, uint32_t o_var)
{
    if (tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = t;
        log.u_bbr.flex2 = o_srtt;
        log.u_bbr.flex3 = o_var;
        log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
        log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
        log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
        log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
        log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
        TCP_LOG_EVENT(tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRRTT, 0,
            0, &log, false);
    }
}
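/*
 * Note on the rack_log_* helpers above and below: they all feed the TCP
 * black-box logger (tcp_log_buf) through the stack-specific u_bbr union,
 * packing values into the generic flex1..flex8 slots and tagging each
 * record with a BBR_LOG_* / TCP_LOG_* event id so post-processing tools
 * can decode them.
 */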
static void
rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
{
    /*
     * Log the rtt sample we are
     * applying to the srtt algorithm in
     * useconds.
     */
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        /* Convert our ms to a microsecond */
        log.u_bbr.flex1 = rtt * 1000;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_LOG_RTT, 0,
            0, &log, false, &tv);
    }
}


static inline void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
{
    if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = tick;
        log.u_bbr.flex3 = tp->t_maxunacktime;
        log.u_bbr.flex4 = tp->t_acktime;
        log.u_bbr.flex8 = event;
        TCP_LOG_EVENT(tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_PROGRESS, 0,
            0, &log, false);
    }
}

static void
rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = slot;
        log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
        log.u_bbr.flex8 = rack->rc_in_persist;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRSND, 0,
            0, &log, false);
    }
}

static void
rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        log.u_bbr.flex1 = did_out;
        log.u_bbr.flex2 = nxt_pkt;
        log.u_bbr.flex3 = way_out;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex7 = rack->r_wanted_output;
        log.u_bbr.flex8 = rack->rc_in_persist;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_DOSEG_DONE, 0,
            0, &log, false);
    }
}


static void
rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = slot;
        log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex7 = hpts_calling;
        log.u_bbr.flex8 = rack->rc_in_persist;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_JUSTRET, 0,
            tlen, &log, false);
    }
}
static void
rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
        log.u_bbr.ininput = rack->rc_inp->inp_in_input;
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = 0;
        log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex4 = 0;
        log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
        log.u_bbr.flex8 = hpts_removed;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TIMERCANC, 0,
            0, &log, false);
    }
}

static void
rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
{
    if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
        union tcp_log_stackspecific log;

        memset(&log.u_bbr, 0, sizeof(log.u_bbr));
        log.u_bbr.flex1 = timers;
        log.u_bbr.flex2 = ret;
        log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex5 = cts;
        TCP_LOG_EVENT(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TO_PROCESS, 0,
            0, &log, false);
    }
}

static void
rack_counter_destroy()
{
    counter_u64_free(rack_badfr);
    counter_u64_free(rack_badfr_bytes);
    counter_u64_free(rack_rtm_prr_retran);
    counter_u64_free(rack_rtm_prr_newdata);
    counter_u64_free(rack_timestamp_mismatch);
    counter_u64_free(rack_reorder_seen);
    counter_u64_free(rack_tlp_tot);
    counter_u64_free(rack_tlp_newdata);
    counter_u64_free(rack_tlp_retran);
    counter_u64_free(rack_tlp_retran_bytes);
    counter_u64_free(rack_tlp_retran_fail);
    counter_u64_free(rack_to_tot);
    counter_u64_free(rack_to_arm_rack);
    counter_u64_free(rack_to_arm_tlp);
    counter_u64_free(rack_paced_segments);
    counter_u64_free(rack_unpaced_segments);
    counter_u64_free(rack_saw_enobuf);
    counter_u64_free(rack_saw_enetunreach);
    counter_u64_free(rack_to_alloc_hard);
    counter_u64_free(rack_to_alloc_emerg);
    counter_u64_free(rack_sack_proc_all);
    counter_u64_free(rack_sack_proc_short);
    counter_u64_free(rack_sack_proc_restart);
    counter_u64_free(rack_to_alloc);
    counter_u64_free(rack_find_high);
    counter_u64_free(rack_runt_sacks);
    counter_u64_free(rack_enter_tlp_calc);
    counter_u64_free(rack_used_tlpmethod);
    counter_u64_free(rack_used_tlpmethod2);
    counter_u64_free(rack_progress_drops);
    counter_u64_free(rack_input_idle_reduces);
    counter_u64_free(rack_tlp_does_nada);
    COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
    COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
}

static struct rack_sendmap *
rack_alloc(struct tcp_rack *rack)
{
    struct rack_sendmap *rsm;

    counter_u64_add(rack_to_alloc, 1);
    rack->r_ctl.rc_num_maps_alloced++;
    rsm = uma_zalloc(rack_zone, M_NOWAIT);
    if (rsm) {
        return (rsm);
    }
    if (rack->rc_free_cnt) {
        counter_u64_add(rack_to_alloc_emerg, 1);
        rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
        TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
        rack->rc_free_cnt--;
        return (rsm);
    }
    return (NULL);
}

static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
    rack->r_ctl.rc_num_maps_alloced--;
    if (rack->r_ctl.rc_tlpsend == rsm)
        rack->r_ctl.rc_tlpsend = NULL;
    if (rack->r_ctl.rc_next == rsm)
        rack->r_ctl.rc_next = NULL;
    if (rack->r_ctl.rc_sacklast == rsm)
        rack->r_ctl.rc_sacklast = NULL;
    if (rack->rc_free_cnt < rack_free_cache) {
        memset(rsm, 0, sizeof(struct rack_sendmap));
        TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
        rack->rc_free_cnt++;
        return;
    }
    uma_zfree(rack_zone, rsm);
}
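/*
 * Note on the two helpers above: rack_alloc() falls back to a small
 * per-connection free list (rc_free, at most rack_free_cache entries)
 * when the UMA zone cannot supply a map entry without sleeping, and
 * rack_free() refills that list before handing entries back to the zone.
 */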
/*
 * CC wrapper hook functions
 */
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
    uint16_t type, int32_t recovery)
{
#ifdef NETFLIX_STATS
    int32_t gput;
#endif
#ifdef NETFLIX_CWV
    u_long old_cwnd = tp->snd_cwnd;
#endif

    INP_WLOCK_ASSERT(tp->t_inpcb);
    tp->ccv->nsegs = nsegs;
    tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
    if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
        uint32_t max;

        max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
        if (tp->ccv->bytes_this_ack > max) {
            tp->ccv->bytes_this_ack = max;
        }
    }
    if (tp->snd_cwnd <= tp->snd_wnd)
        tp->ccv->flags |= CCF_CWND_LIMITED;
    else
        tp->ccv->flags &= ~CCF_CWND_LIMITED;

    if (type == CC_ACK) {
#ifdef NETFLIX_STATS
        stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
            ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
        if ((tp->t_flags & TF_GPUTINPROG) &&
            SEQ_GEQ(th->th_ack, tp->gput_ack)) {
            gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) /
                max(1, tcp_ts_getticks() - tp->gput_ts);
            stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
                gput);
            /*
             * XXXLAS: This is a temporary hack, and should be
             * chained off VOI_TCP_GPUT when stats(9) grows an
             * API to deal with chained VOIs.
             */
            if (tp->t_stats_gput_prev > 0)
                stats_voi_update_abs_s32(tp->t_stats,
                    VOI_TCP_GPUT_ND,
                    ((gput - tp->t_stats_gput_prev) * 100) /
                    tp->t_stats_gput_prev);
            tp->t_flags &= ~TF_GPUTINPROG;
            tp->t_stats_gput_prev = gput;

            if (tp->t_maxpeakrate) {
                /*
                 * We update t_peakrate_thr. This gives us roughly
                 * one update per round trip time.
                 */
                tcp_update_peakrate_thr(tp);
            }
        }
#endif
        if (tp->snd_cwnd > tp->snd_ssthresh) {
            tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
                nsegs * V_tcp_abc_l_var * tp->t_maxseg);
            if (tp->t_bytes_acked >= tp->snd_cwnd) {
                tp->t_bytes_acked -= tp->snd_cwnd;
                tp->ccv->flags |= CCF_ABC_SENTAWND;
            }
        } else {
            tp->ccv->flags &= ~CCF_ABC_SENTAWND;
            tp->t_bytes_acked = 0;
        }
    }
    if (CC_ALGO(tp)->ack_received != NULL) {
        /* XXXLAS: Find a way to live without this */
        tp->ccv->curack = th->th_ack;
        CC_ALGO(tp)->ack_received(tp->ccv, type);
    }
#ifdef NETFLIX_STATS
    stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
    if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
        rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
    }
#ifdef NETFLIX_CWV
    if (tp->cwv_enabled) {
        /*
         * Per RFC 7661: The behaviour in the non-validated phase is
         * specified as: o A sender determines whether to increase
         * the cwnd based upon whether it is cwnd-limited (see
         * Section 4.5.3): * A sender that is cwnd-limited MAY use
         * the standard TCP method to increase cwnd (i.e., the
         * standard method permits a TCP sender that fully utilises
         * the cwnd to increase the cwnd each time it receives an
         * ACK). * A sender that is not cwnd-limited MUST NOT
         * increase the cwnd when ACK packets are received in this
         * phase (i.e., needs to avoid growing the cwnd when it has
         * not recently sent using the current size of cwnd).
         */
        if ((tp->snd_cwnd > old_cwnd) &&
            (tp->cwv_cwnd_valid == 0) &&
            (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
            tp->snd_cwnd = old_cwnd;
        }
        /* Try to update pipeAck and NCWV state */
        if (TCPS_HAVEESTABLISHED(tp->t_state) &&
            !IN_RECOVERY(tp->t_flags)) {
            uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));

            tcp_newcwv_update_pipeack(tp, data);
        }
    }
#endif
    /* we enforce max peak rate if it is set. */
    if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
        tp->snd_cwnd = tp->t_peakrate_thr;
    }
}
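/*
 * Worked example for the ABC limit applied in rack_ack_received() above:
 * assuming V_tcp_abc_l_var is 2 and t_maxseg is 1448, an ACK that covers
 * 10 segments can credit at most min(bytes_this_ack, 10 * 2 * 1448) =
 * 28960 bytes toward t_bytes_acked while above ssthresh.
 */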
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
    struct tcp_rack *rack;

    rack = (struct tcp_rack *)tp->t_fb_ptr;
    INP_WLOCK_ASSERT(tp->t_inpcb);
    if (rack->r_ctl.rc_prr_sndcnt > 0)
        rack->r_wanted_output++;
}

static void
rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
    struct tcp_rack *rack;

    INP_WLOCK_ASSERT(tp->t_inpcb);
    rack = (struct tcp_rack *)tp->t_fb_ptr;
    if (CC_ALGO(tp)->post_recovery != NULL) {
        tp->ccv->curack = th->th_ack;
        CC_ALGO(tp)->post_recovery(tp->ccv);
    }
    /*
     * Here we can in theory adjust cwnd to be based on the number of
     * losses in the window (rack->r_ctl.rc_loss_count). This is done
     * based on the rack_use_proportional flag.
     */
    if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
        int32_t reduce;

        reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
        if (reduce > 50) {
            reduce = 50;
        }
        tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
    } else {
        if (tp->snd_cwnd > tp->snd_ssthresh) {
            /* Drop us down to the ssthresh (1/2 cwnd at loss) */
            tp->snd_cwnd = tp->snd_ssthresh;
        }
    }
    if (rack->r_ctl.rc_prr_sndcnt > 0) {
        /* Suck the next prr cnt back into cwnd */
        tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
        rack->r_ctl.rc_prr_sndcnt = 0;
    }
    EXIT_RECOVERY(tp->t_flags);


#ifdef NETFLIX_CWV
    if (tp->cwv_enabled) {
        if ((tp->cwv_cwnd_valid == 0) &&
            (tp->snd_cwv.in_recovery))
            tcp_newcwv_end_recovery(tp);
    }
#endif
}

static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
    struct tcp_rack *rack;

    INP_WLOCK_ASSERT(tp->t_inpcb);

    rack = (struct tcp_rack *)tp->t_fb_ptr;
    switch (type) {
    case CC_NDUPACK:
        /* rack->r_ctl.rc_ssthresh_set = 1; */
        if (!IN_FASTRECOVERY(tp->t_flags)) {
            rack->r_ctl.rc_tlp_rtx_out = 0;
            rack->r_ctl.rc_prr_delivered = 0;
            rack->r_ctl.rc_prr_out = 0;
            rack->r_ctl.rc_loss_count = 0;
            rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
            rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
            tp->snd_recover = tp->snd_max;
            if (tp->t_flags & TF_ECN_PERMIT)
                tp->t_flags |= TF_ECN_SND_CWR;
        }
        break;
    case CC_ECN:
        if (!IN_CONGRECOVERY(tp->t_flags)) {
            TCPSTAT_INC(tcps_ecn_rcwnd);
            tp->snd_recover = tp->snd_max;
            if (tp->t_flags & TF_ECN_PERMIT)
                tp->t_flags |= TF_ECN_SND_CWR;
        }
        break;
    case CC_RTO:
        tp->t_dupacks = 0;
        tp->t_bytes_acked = 0;
        EXIT_RECOVERY(tp->t_flags);
        tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
            tp->t_maxseg) * tp->t_maxseg;
        tp->snd_cwnd = tp->t_maxseg;
        break;
    case CC_RTO_ERR:
        TCPSTAT_INC(tcps_sndrexmitbad);
        /* RTO was unnecessary, so reset everything. */
        tp->snd_cwnd = tp->snd_cwnd_prev;
        tp->snd_ssthresh = tp->snd_ssthresh_prev;
        tp->snd_recover = tp->snd_recover_prev;
        if (tp->t_flags & TF_WASFRECOVERY)
            ENTER_FASTRECOVERY(tp->t_flags);
        if (tp->t_flags & TF_WASCRECOVERY)
            ENTER_CONGRECOVERY(tp->t_flags);
        tp->snd_nxt = tp->snd_max;
        tp->t_badrxtwin = 0;
        break;
    }

    if (CC_ALGO(tp)->cong_signal != NULL) {
        if (th != NULL)
            tp->ccv->curack = th->th_ack;
        CC_ALGO(tp)->cong_signal(tp->ccv, type);
    }
#ifdef NETFLIX_CWV
    if (tp->cwv_enabled) {
        if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
            tcp_newcwv_enter_recovery(tp);
        }
        if (type == CC_RTO) {
            tcp_newcwv_reset(tp);
        }
    }
#endif
}
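/*
 * Worked example for the proportional reduction in rack_post_recovery()
 * above: assuming rc_prop_rate keeps the rack_proportional_rate default
 * of 10 (percent per loss), 3 losses in the window give reduce = 30, so
 * cwnd is cut by 30%; past 5 losses the value is clamped so the cut
 * never exceeds 50%.
 */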
static inline void
rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
{
    uint32_t i_cwnd;

    INP_WLOCK_ASSERT(tp->t_inpcb);

#ifdef NETFLIX_STATS
    TCPSTAT_INC(tcps_idle_restarts);
    if (tp->t_state == TCPS_ESTABLISHED)
        TCPSTAT_INC(tcps_idle_estrestarts);
#endif
    if (CC_ALGO(tp)->after_idle != NULL)
        CC_ALGO(tp)->after_idle(tp->ccv);

    if (tp->snd_cwnd == 1)
        i_cwnd = tp->t_maxseg;	/* SYN(-ACK) lost */
    else if (V_tcp_initcwnd_segments)
        i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
            max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
    else if (V_tcp_do_rfc3390)
        i_cwnd = min(4 * tp->t_maxseg,
            max(2 * tp->t_maxseg, 4380));
    else {
        /* Per RFC5681 Section 3.1 */
        if (tp->t_maxseg > 2190)
            i_cwnd = 2 * tp->t_maxseg;
        else if (tp->t_maxseg > 1095)
            i_cwnd = 3 * tp->t_maxseg;
        else
            i_cwnd = 4 * tp->t_maxseg;
    }
    if (reduce_largest) {
        /*
         * Do we reduce the largest cwnd to make
         * rack play nice on restart hptsi wise?
         */
        if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd)
            ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
    }
    /*
     * Being idle is no different than the initial window. If the cc
     * clamps it down below the initial window, raise it to the initial
     * window.
     */
    if (tp->snd_cwnd < i_cwnd) {
        tp->snd_cwnd = i_cwnd;
    }
}


/*
 * Indicate whether this ack should be delayed. We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window. We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment. We make sure by checking that the
 *	  segment size is not larger than the MSS.
 *	- Delayed acks are enabled or this is a half-synchronized T/TCP
 *	  connection.
 */
#define DELAY_ACK(tp, tlen)			 \
    (((tp->t_flags & TF_RXWIN0SENT) == 0) &&	 \
    ((tp->t_flags & TF_DELACK) == 0) &&		 \
    (tlen <= tp->t_maxseg) &&			 \
    (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
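/*
 * Example of the restart-window selection in rack_cc_after_idle() above
 * (RFC 5681 Section 3.1 branch, i.e. neither V_tcp_initcwnd_segments nor
 * V_tcp_do_rfc3390 set): a 1460-byte MSS is > 1095 but not > 2190, so
 * i_cwnd = 3 * 1460 = 4380 bytes, while a 536-byte MSS gets 4 * 536.
 */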
static inline void
rack_calc_rwin(struct socket *so, struct tcpcb *tp)
{
    int32_t win;

    /*
     * Calculate amount of space in receive window, and then do TCP
     * input processing.  Receive window is amount of space in rcv queue,
     * but not less than advertised window.
     */
    win = sbspace(&so->so_rcv);
    if (win < 0)
        win = 0;
    tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}

static void
rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t *ti_locked)
{
    if (*ti_locked == TI_RLOCKED) {
        INP_INFO_RUNLOCK(&V_tcbinfo);
        *ti_locked = TI_UNLOCKED;
    }
    /*
     * Drop space held by incoming segment and return.
     */
    if (tp != NULL)
        INP_WUNLOCK(tp->t_inpcb);
    if (m)
        m_freem(m);
}

static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t *ti_locked, int32_t rstreason, int32_t tlen)
{
    if (*ti_locked == TI_RLOCKED) {
        INP_INFO_RUNLOCK(&V_tcbinfo);
        *ti_locked = TI_UNLOCKED;
    }
    if (tp != NULL) {
        tcp_dropwithreset(m, th, tp, tlen, rstreason);
        INP_WUNLOCK(tp->t_inpcb);
    } else
        tcp_dropwithreset(m, th, NULL, tlen, rstreason);
}

/*
 * The value in ret_val informs the caller
 * if we dropped the tcb (and lock) or not.
 * 1 = we dropped it, 0 = the TCB is still locked
 * and valid.
 */
static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t *ti_locked, int32_t thflags, int32_t tlen, int32_t *ret_val)
{
    /*
     * Generate an ACK dropping incoming segment if it occupies sequence
     * space, where the ACK reflects our state.
     *
     * We can now skip the test for the RST flag since all paths to this
     * code happen after packets containing RST have been dropped.
     *
     * In the SYN-RECEIVED state, don't send an ACK unless the segment
     * we received passes the SYN-RECEIVED ACK test. If it fails send a
     * RST. This breaks the loop in the "LAND" DoS attack, and also
     * prevents an ACK storm between two listening ports that have been
     * sent forged SYN segments, each with the source address of the
     * other.
     */
    struct tcp_rack *rack;

    if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
        (SEQ_GT(tp->snd_una, th->th_ack) ||
        SEQ_GT(th->th_ack, tp->snd_max))) {
        *ret_val = 1;
        rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
        return;
    } else
        *ret_val = 0;
    if (*ti_locked == TI_RLOCKED) {
        INP_INFO_RUNLOCK(&V_tcbinfo);
        *ti_locked = TI_UNLOCKED;
    }
    rack = (struct tcp_rack *)tp->t_fb_ptr;
    rack->r_wanted_output++;
    tp->t_flags |= TF_ACKNOW;
    if (m)
        m_freem(m);
}
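/*
 * Shared convention for the drop helpers above: if *ti_locked shows the
 * global tcbinfo read lock is held it is released and *ti_locked is set
 * to TI_UNLOCKED, the mbuf (when present) is consumed, and where a
 * ret_val pointer exists it tells the caller whether the tcb (and its
 * lock) went away (1) or remains locked and valid (0).
 */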
1579 */ 1580 int dropped = 0; 1581 1582 if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && 1583 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 1584 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 1585 1586 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1587 KASSERT(*ti_locked == TI_RLOCKED, 1588 ("%s: TH_RST ti_locked %d, th %p tp %p", 1589 __func__, *ti_locked, th, tp)); 1590 KASSERT(tp->t_state != TCPS_SYN_SENT, 1591 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 1592 __func__, th, tp)); 1593 1594 if (V_tcp_insecure_rst || 1595 (tp->last_ack_sent == th->th_seq) || 1596 (tp->rcv_nxt == th->th_seq) || 1597 ((tp->last_ack_sent - 1) == th->th_seq)) { 1598 TCPSTAT_INC(tcps_drops); 1599 /* Drop the connection. */ 1600 switch (tp->t_state) { 1601 case TCPS_SYN_RECEIVED: 1602 so->so_error = ECONNREFUSED; 1603 goto close; 1604 case TCPS_ESTABLISHED: 1605 case TCPS_FIN_WAIT_1: 1606 case TCPS_FIN_WAIT_2: 1607 case TCPS_CLOSE_WAIT: 1608 case TCPS_CLOSING: 1609 case TCPS_LAST_ACK: 1610 so->so_error = ECONNRESET; 1611 close: 1612 tcp_state_change(tp, TCPS_CLOSED); 1613 /* FALLTHROUGH */ 1614 default: 1615 tp = tcp_close(tp); 1616 } 1617 dropped = 1; 1618 rack_do_drop(m, tp, ti_locked); 1619 } else { 1620 TCPSTAT_INC(tcps_badrst); 1621 /* Send challenge ACK. */ 1622 tcp_respond(tp, mtod(m, void *), th, m, 1623 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 1624 tp->last_ack_sent = tp->rcv_nxt; 1625 } 1626 } else { 1627 m_freem(m); 1628 } 1629 return (dropped); 1630 } 1631 1632 /* 1633 * The value in ret_val informs the caller 1634 * if we dropped the tcb (and lock) or not. 1635 * 1 = we dropped it, 0 = the TCB is still locked 1636 * and valid. 1637 */ 1638 static void 1639 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) 1640 { 1641 KASSERT(*ti_locked == TI_RLOCKED, 1642 ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); 1643 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1644 1645 TCPSTAT_INC(tcps_badsyn); 1646 if (V_tcp_insecure_syn && 1647 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1648 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1649 tp = tcp_drop(tp, ECONNRESET); 1650 *ret_val = 1; 1651 rack_do_drop(m, tp, ti_locked); 1652 } else { 1653 /* Send challenge ACK. */ 1654 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 1655 tp->snd_nxt, TH_ACK); 1656 tp->last_ack_sent = tp->rcv_nxt; 1657 m = NULL; 1658 *ret_val = 0; 1659 rack_do_drop(m, NULL, ti_locked); 1660 } 1661 } 1662 1663 /* 1664 * rack_ts_check returns 1 for you should not proceed. It places 1665 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1666 * that the TCB is unlocked and probably dropped. The 0 indicates the 1667 * TCB is still valid and locked. 1668 */ 1669 static int 1670 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val) 1671 { 1672 1673 /* Check to see if ts_recent is over 24 days old. */ 1674 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 1675 /* 1676 * Invalidate ts_recent. If this segment updates ts_recent, 1677 * the age will be reset later and ts_recent will get a 1678 * valid value. If it does not, setting ts_recent to zero 1679 * will at least satisfy the requirement that zero be placed 1680 * in the timestamp echo reply when ts_recent isn't valid. 1681 * The age isn't reset until we get a valid ts_recent 1682 * because we don't want out-of-order segments to be dropped 1683 * when ts_recent is old. 
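 *
 * As a rough illustration (assuming the millisecond timestamp clock
 * that tcp_ts_getticks() provides): TCP_PAWS_IDLE corresponds to about
 * 24 days, roughly the time a 1ms clock needs to advance through half
 * of the 32-bit timestamp space.  So a ts_recent last refreshed a month
 * ago trips the age check above and is simply invalidated here, while
 * one refreshed an hour ago that still loses the PAWS comparison is
 * treated as a genuine duplicate below.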
1684 */ 1685 tp->ts_recent = 0; 1686 } else { 1687 TCPSTAT_INC(tcps_rcvduppack); 1688 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 1689 TCPSTAT_INC(tcps_pawsdrop); 1690 *ret_val = 0; 1691 if (tlen) { 1692 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1693 } else { 1694 rack_do_drop(m, NULL, ti_locked); 1695 } 1696 return (1); 1697 } 1698 return (0); 1699 } 1700 1701 /* 1702 * rack_drop_checks returns 1 for you should not proceed. It places 1703 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1704 * that the TCB is unlocked and probably dropped. The 0 indicates the 1705 * TCB is still valid and locked. 1706 */ 1707 static int 1708 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 1709 { 1710 int32_t todrop; 1711 int32_t thflags; 1712 int32_t tlen; 1713 1714 thflags = *thf; 1715 tlen = *tlenp; 1716 todrop = tp->rcv_nxt - th->th_seq; 1717 if (todrop > 0) { 1718 if (thflags & TH_SYN) { 1719 thflags &= ~TH_SYN; 1720 th->th_seq++; 1721 if (th->th_urp > 1) 1722 th->th_urp--; 1723 else 1724 thflags &= ~TH_URG; 1725 todrop--; 1726 } 1727 /* 1728 * Following if statement from Stevens, vol. 2, p. 960. 1729 */ 1730 if (todrop > tlen 1731 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1732 /* 1733 * Any valid FIN must be to the left of the window. 1734 * At this point the FIN must be a duplicate or out 1735 * of sequence; drop it. 1736 */ 1737 thflags &= ~TH_FIN; 1738 /* 1739 * Send an ACK to resynchronize and drop any data. 1740 * But keep on processing for RST or ACK. 1741 */ 1742 tp->t_flags |= TF_ACKNOW; 1743 todrop = tlen; 1744 TCPSTAT_INC(tcps_rcvduppack); 1745 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 1746 } else { 1747 TCPSTAT_INC(tcps_rcvpartduppack); 1748 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 1749 } 1750 *drop_hdrlen += todrop; /* drop from the top afterwards */ 1751 th->th_seq += todrop; 1752 tlen -= todrop; 1753 if (th->th_urp > todrop) 1754 th->th_urp -= todrop; 1755 else { 1756 thflags &= ~TH_URG; 1757 th->th_urp = 0; 1758 } 1759 } 1760 /* 1761 * If segment ends after window, drop trailing data (and PUSH and 1762 * FIN); if nothing left, just ACK. 1763 */ 1764 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1765 if (todrop > 0) { 1766 TCPSTAT_INC(tcps_rcvpackafterwin); 1767 if (todrop >= tlen) { 1768 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 1769 /* 1770 * If window is closed can only take segments at 1771 * window edge, and have to drop data and PUSH from 1772 * incoming segments. Continue processing, but 1773 * remember to ack. Otherwise, drop segment and 1774 * ack. 1775 */ 1776 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1777 tp->t_flags |= TF_ACKNOW; 1778 TCPSTAT_INC(tcps_rcvwinprobe); 1779 } else { 1780 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1781 return (1); 1782 } 1783 } else 1784 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1785 m_adj(m, -todrop); 1786 tlen -= todrop; 1787 thflags &= ~(TH_PUSH | TH_FIN); 1788 } 1789 *thf = thflags; 1790 *tlenp = tlen; 1791 return (0); 1792 } 1793 1794 static struct rack_sendmap * 1795 rack_find_lowest_rsm(struct tcp_rack *rack) 1796 { 1797 struct rack_sendmap *rsm; 1798 1799 /* 1800 * Walk the time-order transmitted list looking for an rsm that is 1801 * not acked. This will be the one that was sent the longest time 1802 * ago that is still outstanding. 
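 *
 * For example (hypothetical map contents): if rc_tmap holds A, B, C in
 * the order they were last (re)transmitted and A is already marked
 * RACK_ACKED while B is not, the loop below returns B -- the oldest
 * transmission still unacknowledged.  If every entry is acked the loop
 * runs off the end and NULL is returned.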
1803 */ 1804 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 1805 if (rsm->r_flags & RACK_ACKED) { 1806 continue; 1807 } 1808 goto finish; 1809 } 1810 finish: 1811 return (rsm); 1812 } 1813 1814 static struct rack_sendmap * 1815 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 1816 { 1817 struct rack_sendmap *prsm; 1818 1819 /* 1820 * Walk the sequence order list backward until we arrive at 1821 * the highest seq not acked. In theory when this is called it 1822 * should be the last segment (though in practice it may not be). 1823 */ 1824 counter_u64_add(rack_find_high, 1); 1825 prsm = rsm; 1826 TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { 1827 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 1828 continue; 1829 } 1830 return (prsm); 1831 } 1832 return (NULL); 1833 } 1834 1835 1836 static uint32_t 1837 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 1838 { 1839 int32_t lro; 1840 uint32_t thresh; 1841 1842 /* 1843 * lro is the flag we use to determine if we have seen reordering. 1844 * If it gets set we have seen reordering. The reorder logic either 1845 * works in one of two ways: 1846 * 1847 * If reorder-fade is configured, then we track the last time we saw 1848 * re-ordering occur. If we reach the point where enough time has 1849 * passed we no longer consider reordering to be occurring. 1850 * 1851 * Or if reorder-fade is 0, then once we see reordering we consider 1852 * the connection to always be subject to reordering and just set lro 1853 * to 1. 1854 * 1855 * In the end if lro is non-zero we add the extra time for 1856 * reordering in. 1857 */ 1858 if (srtt == 0) 1859 srtt = 1; 1860 if (rack->r_ctl.rc_reorder_ts) { 1861 if (rack->r_ctl.rc_reorder_fade) { 1862 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 1863 lro = cts - rack->r_ctl.rc_reorder_ts; 1864 if (lro == 0) { 1865 /* 1866 * No time has passed since the last 1867 * reorder, mark it as reordering. 1868 */ 1869 lro = 1; 1870 } 1871 } else { 1872 /* Negative time?
*/ 1873 lro = 0; 1874 } 1875 if (lro > rack->r_ctl.rc_reorder_fade) { 1876 /* Turn off reordering seen too */ 1877 rack->r_ctl.rc_reorder_ts = 0; 1878 lro = 0; 1879 } 1880 } else { 1881 /* Reodering does not fade */ 1882 lro = 1; 1883 } 1884 } else { 1885 lro = 0; 1886 } 1887 thresh = srtt + rack->r_ctl.rc_pkt_delay; 1888 if (lro) { 1889 /* It must be set, if not you get 1/4 rtt */ 1890 if (rack->r_ctl.rc_reorder_shift) 1891 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 1892 else 1893 thresh += (srtt >> 2); 1894 } else { 1895 thresh += 1; 1896 } 1897 /* We don't let the rack timeout be above a RTO */ 1898 1899 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 1900 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 1901 } 1902 /* And we don't want it above the RTO max either */ 1903 if (thresh > rack_rto_max) { 1904 thresh = rack_rto_max; 1905 } 1906 return (thresh); 1907 } 1908 1909 static uint32_t 1910 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 1911 struct rack_sendmap *rsm, uint32_t srtt) 1912 { 1913 struct rack_sendmap *prsm; 1914 uint32_t thresh, len; 1915 int maxseg; 1916 1917 if (srtt == 0) 1918 srtt = 1; 1919 if (rack->r_ctl.rc_tlp_threshold) 1920 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 1921 else 1922 thresh = (srtt * 2); 1923 1924 /* Get the previous sent packet, if any */ 1925 maxseg = tcp_maxseg(tp); 1926 counter_u64_add(rack_enter_tlp_calc, 1); 1927 len = rsm->r_end - rsm->r_start; 1928 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 1929 /* Exactly like the ID */ 1930 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { 1931 uint32_t alt_thresh; 1932 /* 1933 * Compensate for delayed-ack with the d-ack time. 1934 */ 1935 counter_u64_add(rack_used_tlpmethod, 1); 1936 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1937 if (alt_thresh > thresh) 1938 thresh = alt_thresh; 1939 } 1940 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 1941 /* 2.1 behavior */ 1942 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 1943 if (prsm && (len <= maxseg)) { 1944 /* 1945 * Two packets outstanding, thresh should be (2*srtt) + 1946 * possible inter-packet delay (if any). 1947 */ 1948 uint32_t inter_gap = 0; 1949 int idx, nidx; 1950 1951 counter_u64_add(rack_used_tlpmethod, 1); 1952 idx = rsm->r_rtr_cnt - 1; 1953 nidx = prsm->r_rtr_cnt - 1; 1954 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 1955 /* Yes it was sent later (or at the same time) */ 1956 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 1957 } 1958 thresh += inter_gap; 1959 } else if (len <= maxseg) { 1960 /* 1961 * Possibly compensate for delayed-ack. 1962 */ 1963 uint32_t alt_thresh; 1964 1965 counter_u64_add(rack_used_tlpmethod2, 1); 1966 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1967 if (alt_thresh > thresh) 1968 thresh = alt_thresh; 1969 } 1970 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 1971 /* 2.2 behavior */ 1972 if (len <= maxseg) { 1973 uint32_t alt_thresh; 1974 /* 1975 * Compensate for delayed-ack with the d-ack time. 
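 *
 * A worked example (values are hypothetical): with srtt = 40ms, a
 * delayed-ack allowance (rack_delayed_ack_time) of 200ms and the
 * default base thresh of 2 * srtt = 80ms,
 *
 *	alt_thresh = 40 + 40/2 + 200 = 260ms
 *
 * so the probe waits long enough for a delayed ACK to arrive before
 * it fires.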
1976 */ 1977 counter_u64_add(rack_used_tlpmethod, 1); 1978 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1979 if (alt_thresh > thresh) 1980 thresh = alt_thresh; 1981 } 1982 } 1983 /* Not above an RTO */ 1984 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 1985 thresh = TICKS_2_MSEC(tp->t_rxtcur); 1986 } 1987 /* Not above a RTO max */ 1988 if (thresh > rack_rto_max) { 1989 thresh = rack_rto_max; 1990 } 1991 /* Apply user supplied min TLP */ 1992 if (thresh < rack_tlp_min) { 1993 thresh = rack_tlp_min; 1994 } 1995 return (thresh); 1996 } 1997 1998 static struct rack_sendmap * 1999 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 2000 { 2001 /* 2002 * Check to see that we don't need to fall into recovery. We will 2003 * need to do so if our oldest transmit is past the time we should 2004 * have had an ack. 2005 */ 2006 struct tcp_rack *rack; 2007 struct rack_sendmap *rsm; 2008 int32_t idx; 2009 uint32_t srtt_cur, srtt, thresh; 2010 2011 rack = (struct tcp_rack *)tp->t_fb_ptr; 2012 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 2013 return (NULL); 2014 } 2015 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 2016 srtt = TICKS_2_MSEC(srtt_cur); 2017 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 2018 srtt = rack->rc_rack_rtt; 2019 2020 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2021 if (rsm == NULL) 2022 return (NULL); 2023 2024 if (rsm->r_flags & RACK_ACKED) { 2025 rsm = rack_find_lowest_rsm(rack); 2026 if (rsm == NULL) 2027 return (NULL); 2028 } 2029 idx = rsm->r_rtr_cnt - 1; 2030 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2031 if (tsused < rsm->r_tim_lastsent[idx]) { 2032 return (NULL); 2033 } 2034 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2035 return (NULL); 2036 } 2037 /* Ok if we reach here we are over-due */ 2038 rack->r_ctl.rc_rsm_start = rsm->r_start; 2039 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2040 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2041 rack_cong_signal(tp, NULL, CC_NDUPACK); 2042 return (rsm); 2043 } 2044 2045 static uint32_t 2046 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2047 { 2048 int32_t t; 2049 int32_t tt; 2050 uint32_t ret_val; 2051 2052 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2053 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2054 tcp_persmin, tcp_persmax); 2055 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2056 tp->t_rxtshift++; 2057 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2058 ret_val = (uint32_t)tt; 2059 return (ret_val); 2060 } 2061 2062 static uint32_t 2063 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2064 { 2065 /* 2066 * Start the FR timer, we do this based on getting the first one in 2067 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2068 * events we need to stop the running timer (if its running) before 2069 * starting the new one. 
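 *
 * Roughly, the selection below works out to this (a sketch, not an
 * exhaustive decision table):
 *
 *	- connection not yet established, or rc_tmap empty -> RXT timer
 *	- oldest unacked rsm has RACK_SACK_PASSED set      -> RACK timer,
 *	  expiring at its last send time plus the rack threshold
 *	- otherwise, when no TLP is already outstanding    -> TLP timer,
 *	  expiring at the TLP threshold less the time already elapsed
 *	- TLP resend budget exhausted (or TLP too far out) -> back to RXT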
2070 */ 2071 uint32_t thresh, exp, to, srtt, time_since_sent; 2072 uint32_t srtt_cur; 2073 int32_t idx; 2074 int32_t is_tlp_timer = 0; 2075 struct rack_sendmap *rsm; 2076 2077 if (rack->t_timers_stopped) { 2078 /* All timers have been stopped none are to run */ 2079 return (0); 2080 } 2081 if (rack->rc_in_persist) { 2082 /* We can't start any timer in persists */ 2083 return (rack_get_persists_timer_val(tp, rack)); 2084 } 2085 if (tp->t_state < TCPS_ESTABLISHED) 2086 goto activate_rxt; 2087 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2088 if (rsm == NULL) { 2089 /* Nothing on the send map */ 2090 activate_rxt: 2091 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2092 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2093 to = TICKS_2_MSEC(tp->t_rxtcur); 2094 if (to == 0) 2095 to = 1; 2096 return (to); 2097 } 2098 return (0); 2099 } 2100 if (rsm->r_flags & RACK_ACKED) { 2101 rsm = rack_find_lowest_rsm(rack); 2102 if (rsm == NULL) { 2103 /* No lowest? */ 2104 goto activate_rxt; 2105 } 2106 } 2107 /* Convert from ms to usecs */ 2108 if (rsm->r_flags & RACK_SACK_PASSED) { 2109 if ((tp->t_flags & TF_SENTFIN) && 2110 ((tp->snd_max - tp->snd_una) == 1) && 2111 (rsm->r_flags & RACK_HAS_FIN)) { 2112 /* 2113 * We don't start a rack timer if all we have is a 2114 * FIN outstanding. 2115 */ 2116 goto activate_rxt; 2117 } 2118 if (tp->t_srtt) { 2119 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2120 srtt = TICKS_2_MSEC(srtt_cur); 2121 } else 2122 srtt = RACK_INITIAL_RTO; 2123 2124 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2125 idx = rsm->r_rtr_cnt - 1; 2126 exp = rsm->r_tim_lastsent[idx] + thresh; 2127 if (SEQ_GEQ(exp, cts)) { 2128 to = exp - cts; 2129 if (to < rack->r_ctl.rc_min_to) { 2130 to = rack->r_ctl.rc_min_to; 2131 } 2132 } else { 2133 to = rack->r_ctl.rc_min_to; 2134 } 2135 } else { 2136 /* Ok we need to do a TLP not RACK */ 2137 if ((rack->rc_tlp_in_progress != 0) || 2138 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2139 /* 2140 * The previous send was a TLP or a tlp_rtx is in 2141 * process. 2142 */ 2143 goto activate_rxt; 2144 } 2145 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2146 if (rsm == NULL) { 2147 /* We found no rsm to TLP with. */ 2148 goto activate_rxt; 2149 } 2150 if (rsm->r_flags & RACK_HAS_FIN) { 2151 /* If its a FIN we dont do TLP */ 2152 rsm = NULL; 2153 goto activate_rxt; 2154 } 2155 idx = rsm->r_rtr_cnt - 1; 2156 if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) 2157 time_since_sent = cts - rsm->r_tim_lastsent[idx]; 2158 else 2159 time_since_sent = 0; 2160 is_tlp_timer = 1; 2161 if (tp->t_srtt) { 2162 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2163 srtt = TICKS_2_MSEC(srtt_cur); 2164 } else 2165 srtt = RACK_INITIAL_RTO; 2166 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2167 if (thresh > time_since_sent) 2168 to = thresh - time_since_sent; 2169 else 2170 to = rack->r_ctl.rc_min_to; 2171 if (to > TCPTV_REXMTMAX) { 2172 /* 2173 * If the TLP time works out to larger than the max 2174 * RTO lets not do TLP.. just RTO. 
2175 */ 2176 goto activate_rxt; 2177 } 2178 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2179 /* 2180 * The tail is no longer the last one I did a probe 2181 * on 2182 */ 2183 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2184 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2185 } 2186 } 2187 if (is_tlp_timer == 0) { 2188 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2189 } else { 2190 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2191 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2192 /* 2193 * We have exceeded how many times we can retran the 2194 * current TLP timer, switch to the RTO timer. 2195 */ 2196 goto activate_rxt; 2197 } else { 2198 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2199 } 2200 } 2201 if (to == 0) 2202 to = 1; 2203 return (to); 2204 } 2205 2206 static void 2207 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2208 { 2209 if (rack->rc_in_persist == 0) { 2210 if (((tp->t_flags & TF_SENTFIN) == 0) && 2211 (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) 2212 /* Must need to send more data to enter persist */ 2213 return; 2214 rack->r_ctl.rc_went_idle_time = cts; 2215 rack_timer_cancel(tp, rack, cts, __LINE__); 2216 tp->t_rxtshift = 0; 2217 rack->rc_in_persist = 1; 2218 } 2219 } 2220 2221 static void 2222 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2223 { 2224 if (rack->rc_inp->inp_in_hpts) { 2225 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2226 rack->r_ctl.rc_hpts_flags = 0; 2227 } 2228 rack->rc_in_persist = 0; 2229 rack->r_ctl.rc_went_idle_time = 0; 2230 tp->t_flags &= ~TF_FORCEDATA; 2231 tp->t_rxtshift = 0; 2232 } 2233 2234 static void 2235 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, 2236 int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) 2237 { 2238 struct inpcb *inp; 2239 uint32_t delayed_ack = 0; 2240 uint32_t hpts_timeout; 2241 uint8_t stopped; 2242 uint32_t left = 0; 2243 2244 inp = tp->t_inpcb; 2245 if (inp->inp_in_hpts) { 2246 /* A previous call is already set up */ 2247 return; 2248 } 2249 if (tp->t_state == TCPS_CLOSED) { 2250 return; 2251 } 2252 stopped = rack->rc_tmr_stopped; 2253 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2254 left = rack->r_ctl.rc_timer_exp - cts; 2255 } 2256 rack->r_ctl.rc_timer_exp = 0; 2257 if (rack->rc_inp->inp_in_hpts == 0) { 2258 rack->r_ctl.rc_hpts_flags = 0; 2259 } 2260 if (slot) { 2261 /* We are hptsi too */ 2262 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2263 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2264 /* 2265 * We are still left on the hpts when the to goes 2266 * it will be for output. 2267 */ 2268 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) 2269 slot = cts - rack->r_ctl.rc_last_output_to; 2270 else 2271 slot = 1; 2272 } 2273 if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2274 /* No send window.. we must enter persist */ 2275 rack_enter_persist(tp, rack, cts); 2276 } else if ((frm_out_sbavail && 2277 (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && 2278 (tp->snd_wnd < tp->t_maxseg)) && 2279 TCPS_HAVEESTABLISHED(tp->t_state)) { 2280 /* 2281 * If we have no window or we can't send a segment (and have 2282 * data to send.. we cheat here and frm_out_sbavail is 2283 * passed in with the sbavail(sb) only from bbr_output) and 2284 * we are established, then we must enter persits (if not 2285 * already in persits). 
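 *
 * Concretely (hypothetical numbers): with 4000 bytes sitting in the
 * socket buffer, nothing in flight and snd_wnd = 512 < t_maxseg, we
 * cannot send a full segment even though data is queued, so we drop
 * into persist and let the window-probe machinery take over.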
2286 */ 2287 rack_enter_persist(tp, rack, cts); 2288 } 2289 hpts_timeout = rack_timer_start(tp, rack, cts); 2290 if (tp->t_flags & TF_DELACK) { 2291 delayed_ack = tcp_delacktime; 2292 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2293 } 2294 if (delayed_ack && ((hpts_timeout == 0) || 2295 (delayed_ack < hpts_timeout))) 2296 hpts_timeout = delayed_ack; 2297 else 2298 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2299 /* 2300 * If no timers are going to run and we will fall off the hptsi 2301 * wheel, we resort to a keep-alive timer if its configured. 2302 */ 2303 if ((hpts_timeout == 0) && 2304 (slot == 0)) { 2305 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2306 (tp->t_state <= TCPS_CLOSING)) { 2307 /* 2308 * Ok we have no timer (persists, rack, tlp, rxt or 2309 * del-ack), we don't have segments being paced. So 2310 * all that is left is the keepalive timer. 2311 */ 2312 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2313 /* Get the established keep-alive time */ 2314 hpts_timeout = TP_KEEPIDLE(tp); 2315 } else { 2316 /* Get the initial setup keep-alive time */ 2317 hpts_timeout = TP_KEEPINIT(tp); 2318 } 2319 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2320 } 2321 } 2322 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2323 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2324 /* 2325 * RACK, TLP, persists and RXT timers all are restartable 2326 * based on actions input .. i.e we received a packet (ack 2327 * or sack) and that changes things (rw, or snd_una etc). 2328 * Thus we can restart them with a new value. For 2329 * keep-alive, delayed_ack we keep track of what was left 2330 * and restart the timer with a smaller value. 2331 */ 2332 if (left < hpts_timeout) 2333 hpts_timeout = left; 2334 } 2335 if (hpts_timeout) { 2336 /* 2337 * Hack alert for now we can't time-out over 2,147,483 2338 * seconds (a bit more than 596 hours), which is probably ok 2339 * :). 2340 */ 2341 if (hpts_timeout > 0x7ffffffe) 2342 hpts_timeout = 0x7ffffffe; 2343 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2344 } 2345 if (slot) { 2346 rack->r_ctl.rc_last_output_to = cts + slot; 2347 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2348 if (rack->rc_inp->inp_in_hpts == 0) 2349 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2350 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2351 } else { 2352 /* 2353 * Arrange for the hpts to kick back in after the 2354 * t-o if the t-o does not cause a send. 2355 */ 2356 if (rack->rc_inp->inp_in_hpts == 0) 2357 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2358 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2359 } 2360 } else if (hpts_timeout) { 2361 if (rack->rc_inp->inp_in_hpts == 0) 2362 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2363 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2364 } else { 2365 /* No timer starting */ 2366 #ifdef INVARIANTS 2367 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2368 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2369 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2370 } 2371 #endif 2372 } 2373 rack->rc_tmr_stopped = 0; 2374 if (slot) 2375 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2376 } 2377 2378 /* 2379 * RACK Timer, here we simply do logging and house keeping. 2380 * the normal rack_output() function will call the 2381 * appropriate thing to check if we need to do a RACK retransmit. 
2382 * We return 1, saying don't proceed with rack_output only 2383 * when all timers have been stopped (destroyed PCB?). 2384 */ 2385 static int 2386 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2387 { 2388 /* 2389 * This timer simply provides an internal trigger to send out data. 2390 * The check_recovery_mode call will see if there are needed 2391 * retransmissions, if so we will enter fast-recovery. The output 2392 * call may or may not do the same thing depending on sysctl 2393 * settings. 2394 */ 2395 struct rack_sendmap *rsm; 2396 int32_t recovery; 2397 2398 if (tp->t_timers->tt_flags & TT_STOPPED) { 2399 return (1); 2400 } 2401 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2402 /* Its not time yet */ 2403 return (0); 2404 } 2405 rack_log_to_event(rack, RACK_TO_FRM_RACK); 2406 recovery = IN_RECOVERY(tp->t_flags); 2407 counter_u64_add(rack_to_tot, 1); 2408 if (rack->r_state && (rack->r_state != tp->t_state)) 2409 rack_set_state(tp, rack); 2410 rsm = rack_check_recovery_mode(tp, cts); 2411 if (rsm) { 2412 uint32_t rtt; 2413 2414 rtt = rack->rc_rack_rtt; 2415 if (rtt == 0) 2416 rtt = 1; 2417 if ((recovery == 0) && 2418 (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { 2419 /* 2420 * The rack-timeout that enter's us into recovery 2421 * will force out one MSS and set us up so that we 2422 * can do one more send in 2*rtt (transitioning the 2423 * rack timeout into a rack-tlp). 2424 */ 2425 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2426 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && 2427 ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { 2428 /* 2429 * When a rack timer goes, we have to send at 2430 * least one segment. They will be paced a min of 1ms 2431 * apart via the next rack timer (or further 2432 * if the rack timer dictates it). 2433 */ 2434 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2435 } 2436 } else { 2437 /* This is a case that should happen rarely if ever */ 2438 counter_u64_add(rack_tlp_does_nada, 1); 2439 #ifdef TCP_BLACKBOX 2440 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2441 #endif 2442 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2443 } 2444 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2445 return (0); 2446 } 2447 2448 /* 2449 * TLP Timer, here we simply setup what segment we want to 2450 * have the TLP expire on, the normal rack_output() will then 2451 * send it out. 2452 * 2453 * We return 1, saying don't proceed with rack_output only 2454 * when all timers have been stopped (destroyed PCB?). 2455 */ 2456 static int 2457 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2458 { 2459 /* 2460 * Tail Loss Probe. 2461 */ 2462 struct rack_sendmap *rsm = NULL; 2463 struct socket *so; 2464 uint32_t amm, old_prr_snd = 0; 2465 uint32_t out, avail; 2466 2467 if (tp->t_timers->tt_flags & TT_STOPPED) { 2468 return (1); 2469 } 2470 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2471 /* Its not time yet */ 2472 return (0); 2473 } 2474 if (rack_progress_timeout_check(tp)) { 2475 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2476 return (1); 2477 } 2478 /* 2479 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2480 * need to figure out how to force a full MSS segment out. 
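 *
 * In outline, the code below either
 *  (a) primes rc_tlp_new_data (or rc_prr_sndcnt while in recovery) so
 *      that the output path sends one segment of new data, when unsent
 *      data exists and the send window allows it, or
 *  (b) points rc_tlpsend at the last (or, with rack_always_send_oldest,
 *      the first) unacked rsm so that segment is probed again, first
 *      splitting an over-sized rsm down to a single t_maxseg.
 * Either way rack_output() performs the actual transmission.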
2481 */ 2482 rack_log_to_event(rack, RACK_TO_FRM_TLP); 2483 counter_u64_add(rack_tlp_tot, 1); 2484 if (rack->r_state && (rack->r_state != tp->t_state)) 2485 rack_set_state(tp, rack); 2486 so = tp->t_inpcb->inp_socket; 2487 avail = sbavail(&so->so_snd); 2488 out = tp->snd_max - tp->snd_una; 2489 rack->rc_timer_up = 1; 2490 /* 2491 * If we are in recovery we can jazz out a segment if new data is 2492 * present simply by setting rc_prr_sndcnt to a segment. 2493 */ 2494 if ((avail > out) && 2495 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2496 /* New data is available */ 2497 amm = avail - out; 2498 if (amm > tp->t_maxseg) { 2499 amm = tp->t_maxseg; 2500 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 2501 /* not enough to fill a MTU and no-delay is off */ 2502 goto need_retran; 2503 } 2504 if (IN_RECOVERY(tp->t_flags)) { 2505 /* Unlikely */ 2506 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2507 if (out + amm <= tp->snd_wnd) 2508 rack->r_ctl.rc_prr_sndcnt = amm; 2509 else 2510 goto need_retran; 2511 } else { 2512 /* Set the send-new override */ 2513 if (out + amm <= tp->snd_wnd) 2514 rack->r_ctl.rc_tlp_new_data = amm; 2515 else 2516 goto need_retran; 2517 } 2518 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2519 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2520 rack->r_ctl.rc_tlpsend = NULL; 2521 counter_u64_add(rack_tlp_newdata, 1); 2522 goto send; 2523 } 2524 need_retran: 2525 /* 2526 * Ok we need to arrange the last un-acked segment to be re-sent, or 2527 * optionally the first un-acked segment. 2528 */ 2529 if (rack_always_send_oldest) 2530 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2531 else { 2532 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 2533 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2534 rsm = rack_find_high_nonack(rack, rsm); 2535 } 2536 } 2537 if (rsm == NULL) { 2538 counter_u64_add(rack_tlp_does_nada, 1); 2539 #ifdef TCP_BLACKBOX 2540 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2541 #endif 2542 goto out; 2543 } 2544 if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { 2545 /* 2546 * We need to split this the last segment in two. 2547 */ 2548 int32_t idx; 2549 struct rack_sendmap *nrsm; 2550 2551 nrsm = rack_alloc(rack); 2552 if (nrsm == NULL) { 2553 /* 2554 * No memory to split, we will just exit and punt 2555 * off to the RXT timer. 
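 *
 * When the allocation does succeed, the split looks like this (sequence
 * numbers are hypothetical): for an rsm covering [1000, 4000) with
 * t_maxseg = 1460, nrsm takes the tail [2540, 4000) and the original
 * rsm shrinks to [1000, 2540); only the tail, one MSS worth, is handed
 * to rc_tlpsend for the probe.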
2556 */ 2557 counter_u64_add(rack_tlp_does_nada, 1); 2558 goto out; 2559 } 2560 nrsm->r_start = (rsm->r_end - tp->t_maxseg); 2561 nrsm->r_end = rsm->r_end; 2562 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2563 nrsm->r_flags = rsm->r_flags; 2564 nrsm->r_sndcnt = rsm->r_sndcnt; 2565 nrsm->r_rtr_bytes = 0; 2566 rsm->r_end = nrsm->r_start; 2567 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2568 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2569 } 2570 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 2571 if (rsm->r_in_tmap) { 2572 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2573 nrsm->r_in_tmap = 1; 2574 } 2575 rsm->r_flags &= (~RACK_HAS_FIN); 2576 rsm = nrsm; 2577 } 2578 rack->r_ctl.rc_tlpsend = rsm; 2579 rack->r_ctl.rc_tlp_rtx_out = 1; 2580 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2581 rack->r_ctl.rc_tlp_seg_send_cnt++; 2582 tp->t_rxtshift++; 2583 } else { 2584 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2585 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2586 } 2587 send: 2588 rack->r_ctl.rc_tlp_send_cnt++; 2589 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2590 /* 2591 * Can't [re]/transmit a segment we have not heard from the 2592 * peer in max times. We need the retransmit timer to take 2593 * over. 2594 */ 2595 restore: 2596 rack->r_ctl.rc_tlpsend = NULL; 2597 if (rsm) 2598 rsm->r_flags &= ~RACK_TLP; 2599 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2600 counter_u64_add(rack_tlp_retran_fail, 1); 2601 goto out; 2602 } else if (rsm) { 2603 rsm->r_flags |= RACK_TLP; 2604 } 2605 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2606 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2607 /* 2608 * We don't want to send a single segment more than the max 2609 * either. 2610 */ 2611 goto restore; 2612 } 2613 rack->r_timer_override = 1; 2614 rack->r_tlp_running = 1; 2615 rack->rc_tlp_in_progress = 1; 2616 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2617 return (0); 2618 out: 2619 rack->rc_timer_up = 0; 2620 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2621 return (0); 2622 } 2623 2624 /* 2625 * Delayed ack Timer, here we simply need to setup the 2626 * ACK_NOW flag and remove the DELACK flag. From there 2627 * the output routine will send the ack out. 2628 * 2629 * We only return 1, saying don't proceed, if all timers 2630 * are stopped (destroyed PCB?). 2631 */ 2632 static int 2633 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2634 { 2635 if (tp->t_timers->tt_flags & TT_STOPPED) { 2636 return (1); 2637 } 2638 rack_log_to_event(rack, RACK_TO_FRM_DELACK); 2639 tp->t_flags &= ~TF_DELACK; 2640 tp->t_flags |= TF_ACKNOW; 2641 TCPSTAT_INC(tcps_delack); 2642 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2643 return (0); 2644 } 2645 2646 /* 2647 * Persists timer, here we simply need to setup the 2648 * FORCE-DATA flag the output routine will send 2649 * the one byte send. 2650 * 2651 * We only return 1, saying don't proceed, if all timers 2652 * are stopped (destroyed PCB?). 2653 */ 2654 static int 2655 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2656 { 2657 struct inpcb *inp; 2658 int32_t retval = 0; 2659 2660 inp = tp->t_inpcb; 2661 2662 if (tp->t_timers->tt_flags & TT_STOPPED) { 2663 return (1); 2664 } 2665 if (rack->rc_in_persist == 0) 2666 return (0); 2667 if (rack_progress_timeout_check(tp)) { 2668 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2669 return (1); 2670 } 2671 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2672 /* 2673 * Persistence timer into zero window. 
Force a byte to be output, if 2674 * possible. 2675 */ 2676 TCPSTAT_INC(tcps_persisttimeo); 2677 /* 2678 * Hack: if the peer is dead/unreachable, we do not time out if the 2679 * window is closed. After a full backoff, drop the connection if 2680 * the idle time (no responses to probes) reaches the maximum 2681 * backoff that we would use if retransmitting. 2682 */ 2683 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2684 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2685 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2686 TCPSTAT_INC(tcps_persistdrop); 2687 retval = 1; 2688 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2689 goto out; 2690 } 2691 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2692 tp->snd_una == tp->snd_max) 2693 rack_exit_persist(tp, rack); 2694 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2695 /* 2696 * If the user has closed the socket then drop a persisting 2697 * connection after a much reduced timeout. 2698 */ 2699 if (tp->t_state > TCPS_CLOSE_WAIT && 2700 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2701 retval = 1; 2702 TCPSTAT_INC(tcps_persistdrop); 2703 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2704 goto out; 2705 } 2706 tp->t_flags |= TF_FORCEDATA; 2707 out: 2708 rack_log_to_event(rack, RACK_TO_FRM_PERSIST); 2709 return (retval); 2710 } 2711 2712 /* 2713 * If a keepalive goes off, we had no other timers 2714 * happening. We always return 1 here since this 2715 * routine either drops the connection or sends 2716 * out a segment with respond. 2717 */ 2718 static int 2719 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2720 { 2721 struct tcptemp *t_template; 2722 struct inpcb *inp; 2723 2724 if (tp->t_timers->tt_flags & TT_STOPPED) { 2725 return (1); 2726 } 2727 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 2728 inp = tp->t_inpcb; 2729 rack_log_to_event(rack, RACK_TO_FRM_KEEP); 2730 /* 2731 * Keep-alive timer went off; send something or drop connection if 2732 * idle for too long. 2733 */ 2734 TCPSTAT_INC(tcps_keeptimeo); 2735 if (tp->t_state < TCPS_ESTABLISHED) 2736 goto dropit; 2737 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2738 tp->t_state <= TCPS_CLOSING) { 2739 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 2740 goto dropit; 2741 /* 2742 * Send a packet designed to force a response if the peer is 2743 * up and reachable: either an ACK if the connection is 2744 * still alive, or an RST if the peer has closed the 2745 * connection due to timeout or reboot. Using sequence 2746 * number tp->snd_una-1 causes the transmitted zero-length 2747 * segment to lie outside the receive window; by the 2748 * protocol spec, this requires the correspondent TCP to 2749 * respond. 2750 */ 2751 TCPSTAT_INC(tcps_keepprobe); 2752 t_template = tcpip_maketemplate(inp); 2753 if (t_template) { 2754 tcp_respond(tp, t_template->tt_ipgen, 2755 &t_template->tt_t, (struct mbuf *)NULL, 2756 tp->rcv_nxt, tp->snd_una - 1, 0); 2757 free(t_template, M_TEMP); 2758 } 2759 } 2760 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 2761 return (1); 2762 dropit: 2763 TCPSTAT_INC(tcps_keepdrops); 2764 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2765 return (1); 2766 } 2767 2768 /* 2769 * Retransmit helper function, clear up all the ack 2770 * flags and take care of important book keeping. 2771 */ 2772 static void 2773 rack_remxt_tmr(struct tcpcb *tp) 2774 { 2775 /* 2776 * The retransmit timer went off, all sack'd blocks must be 2777 * un-acked. 
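 *
 * Put differently: every rsm that was marked RACK_ACKED purely via SACK
 * is put back on the transmit map (rc_tmap), the ACKED/SACK-PASSED bits
 * are cleared on all entries, rc_sacked drops back to 0, and rc_resend
 * is pointed at the first entry so retransmission restarts from the
 * lowest outstanding sequence -- a conservative "treat previously
 * SACKed data as outstanding again after an RTO" posture.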
2778 */ 2779 struct rack_sendmap *rsm, *trsm = NULL; 2780 struct tcp_rack *rack; 2781 int32_t cnt = 0; 2782 2783 rack = (struct tcp_rack *)tp->t_fb_ptr; 2784 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 2785 rack_log_to_event(rack, RACK_TO_FRM_TMR); 2786 if (rack->r_state && (rack->r_state != tp->t_state)) 2787 rack_set_state(tp, rack); 2788 /* 2789 * Ideally we would like to be able to 2790 * mark SACK-PASS on anything not acked here. 2791 * However, if we do that we would burst out 2792 * all that data 1ms apart. This would be unwise, 2793 * so for now we will just let the normal rxt timer 2794 * and tlp timer take care of it. 2795 */ 2796 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 2797 if (rsm->r_flags & RACK_ACKED) { 2798 cnt++; 2799 rsm->r_sndcnt = 0; 2800 if (rsm->r_in_tmap == 0) { 2801 /* We must re-add it back to the tlist */ 2802 if (trsm == NULL) { 2803 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 2804 } else { 2805 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 2806 } 2807 rsm->r_in_tmap = 1; 2808 trsm = rsm; 2809 } 2810 } 2811 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 2812 } 2813 /* Clear the count (we just un-acked them) */ 2814 rack->r_ctl.rc_sacked = 0; 2815 /* Clear the tlp rtx mark */ 2816 rack->r_ctl.rc_tlp_rtx_out = 0; 2817 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2818 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); 2819 /* Setup so we send one segment */ 2820 if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) 2821 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2822 rack->r_timer_override = 1; 2823 } 2824 2825 /* 2826 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 2827 * we will setup to retransmit the lowest seq number outstanding. 2828 */ 2829 static int 2830 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2831 { 2832 int32_t rexmt; 2833 struct inpcb *inp; 2834 int32_t retval = 0; 2835 2836 inp = tp->t_inpcb; 2837 if (tp->t_timers->tt_flags & TT_STOPPED) { 2838 return (1); 2839 } 2840 if (rack_progress_timeout_check(tp)) { 2841 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2842 return (1); 2843 } 2844 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 2845 if (TCPS_HAVEESTABLISHED(tp->t_state) && 2846 (tp->snd_una == tp->snd_max)) { 2847 /* Nothing outstanding .. nothing to do */ 2848 return (0); 2849 } 2850 /* 2851 * Retransmission timer went off. Message has not been acked within 2852 * retransmit interval. Back off to a longer retransmit interval 2853 * and retransmit one segment. 2854 */ 2855 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 2856 tp->t_rxtshift = TCP_MAXRXTSHIFT; 2857 TCPSTAT_INC(tcps_timeoutdrop); 2858 retval = 1; 2859 tcp_set_inp_to_drop(rack->rc_inp, 2860 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 2861 goto out; 2862 } 2863 rack_remxt_tmr(tp); 2864 if (tp->t_state == TCPS_SYN_SENT) { 2865 /* 2866 * If the SYN was retransmitted, indicate CWND to be limited 2867 * to 1 segment in cc_conn_init(). 2868 */ 2869 tp->snd_cwnd = 1; 2870 } else if (tp->t_rxtshift == 1) { 2871 /* 2872 * first retransmit; record ssthresh and cwnd so they can be 2873 * recovered if this turns out to be a "bad" retransmit. A 2874 * retransmit is considered "bad" if an ACK for this segment 2875 * is received within RTT/2 interval; the assumption here is 2876 * that the ACK was already in flight. See "On Estimating 2877 * End-to-End Network Path Properties" by Allman and Paxson 2878 * for more details. 
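 *
 * A sketch of how the undo plays out (times are illustrative): with an
 * srtt of ~100ms, t_badrxtwin is set to now + ~50ms.  If the ACK
 * covering the retransmitted data arrives inside that window, the RTO
 * is judged spurious and the CC_RTO_ERR handling earlier in this file
 * restores the snd_cwnd_prev, snd_ssthresh_prev and snd_recover_prev
 * saved below.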
2879 */ 2880 tp->snd_cwnd_prev = tp->snd_cwnd; 2881 tp->snd_ssthresh_prev = tp->snd_ssthresh; 2882 tp->snd_recover_prev = tp->snd_recover; 2883 if (IN_FASTRECOVERY(tp->t_flags)) 2884 tp->t_flags |= TF_WASFRECOVERY; 2885 else 2886 tp->t_flags &= ~TF_WASFRECOVERY; 2887 if (IN_CONGRECOVERY(tp->t_flags)) 2888 tp->t_flags |= TF_WASCRECOVERY; 2889 else 2890 tp->t_flags &= ~TF_WASCRECOVERY; 2891 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 2892 tp->t_flags |= TF_PREVVALID; 2893 } else 2894 tp->t_flags &= ~TF_PREVVALID; 2895 TCPSTAT_INC(tcps_rexmttimeo); 2896 if ((tp->t_state == TCPS_SYN_SENT) || 2897 (tp->t_state == TCPS_SYN_RECEIVED)) 2898 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); 2899 else 2900 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 2901 TCPT_RANGESET(tp->t_rxtcur, rexmt, 2902 max(MSEC_2_TICKS(rack_rto_min), rexmt), 2903 MSEC_2_TICKS(rack_rto_max)); 2904 /* 2905 * We enter the path for PLMTUD if connection is established or, if 2906 * connection is FIN_WAIT_1 status, reason for the last is that if 2907 * amount of data we send is very small, we could send it in couple 2908 * of packets and process straight to FIN. In that case we won't 2909 * catch ESTABLISHED state. 2910 */ 2911 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 2912 || (tp->t_state == TCPS_FIN_WAIT_1))) { 2913 #ifdef INET6 2914 int32_t isipv6; 2915 #endif 2916 2917 /* 2918 * Idea here is that at each stage of mtu probe (usually, 2919 * 1448 -> 1188 -> 524) should be given 2 chances to recover 2920 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 2921 * should take care of that. 2922 */ 2923 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 2924 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 2925 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 2926 tp->t_rxtshift % 2 == 0)) { 2927 /* 2928 * Enter Path MTU Black-hole Detection mechanism: - 2929 * Disable Path MTU Discovery (IP "DF" bit). - 2930 * Reduce MTU to lower value than what we negotiated 2931 * with peer. 2932 */ 2933 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 2934 /* Record that we may have found a black hole. */ 2935 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 2936 /* Keep track of previous MSS. */ 2937 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 2938 } 2939 2940 /* 2941 * Reduce the MSS to blackhole value or to the 2942 * default in an attempt to retransmit. 2943 */ 2944 #ifdef INET6 2945 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 2946 if (isipv6 && 2947 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 2948 /* Use the sysctl tuneable blackhole MSS. */ 2949 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 2950 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2951 } else if (isipv6) { 2952 /* Use the default MSS. */ 2953 tp->t_maxseg = V_tcp_v6mssdflt; 2954 /* 2955 * Disable Path MTU Discovery when we switch 2956 * to minmss. 2957 */ 2958 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2959 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2960 } 2961 #endif 2962 #if defined(INET6) && defined(INET) 2963 else 2964 #endif 2965 #ifdef INET 2966 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 2967 /* Use the sysctl tuneable blackhole MSS. */ 2968 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 2969 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2970 } else { 2971 /* Use the default MSS. */ 2972 tp->t_maxseg = V_tcp_mssdflt; 2973 /* 2974 * Disable Path MTU Discovery when we switch 2975 * to minmss. 
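 *
 * (The IPv4 ladder mirrors the IPv6 one above: first clamp t_maxseg to
 * the V_tcp_pmtud_blackhole_mss tuneable and, only if probing still
 * fails on later timeouts, fall back to V_tcp_mssdflt with PMTUD turned
 * off as done here.)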
2976 */ 2977 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2978 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2979 } 2980 #endif 2981 } else { 2982 /* 2983 * If further retransmissions are still unsuccessful 2984 * with a lowered MTU, maybe this isn't a blackhole 2985 * and we restore the previous MSS and blackhole 2986 * detection flags. The limit '6' is determined by 2987 * giving each probe stage (1448, 1188, 524) 2 2988 * chances to recover. 2989 */ 2990 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 2991 (tp->t_rxtshift >= 6)) { 2992 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 2993 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 2994 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 2995 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 2996 } 2997 } 2998 } 2999 /* 3000 * Disable RFC1323 and SACK if we haven't got any response to our 3001 * third SYN to work-around some broken terminal servers (most of 3002 * which have hopefully been retired) that have bad VJ header 3003 * compression code which trashes TCP segments containing 3004 * unknown-to-them TCP options. 3005 */ 3006 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 3007 (tp->t_rxtshift == 3)) 3008 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 3009 /* 3010 * If we backed off this far, our srtt estimate is probably bogus. 3011 * Clobber it so we'll take the next rtt measurement as our srtt; 3012 * move the current srtt into rttvar to keep the current retransmit 3013 * times until then. 3014 */ 3015 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3016 #ifdef INET6 3017 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3018 in6_losing(tp->t_inpcb); 3019 else 3020 #endif 3021 in_losing(tp->t_inpcb); 3022 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3023 tp->t_srtt = 0; 3024 } 3025 if (rack_use_sack_filter) 3026 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3027 tp->snd_recover = tp->snd_max; 3028 tp->t_flags |= TF_ACKNOW; 3029 tp->t_rtttime = 0; 3030 rack_cong_signal(tp, NULL, CC_RTO); 3031 out: 3032 return (retval); 3033 } 3034 3035 static int 3036 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3037 { 3038 int32_t ret = 0; 3039 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3040 3041 if (timers == 0) { 3042 return (0); 3043 } 3044 if (tp->t_state == TCPS_LISTEN) { 3045 /* no timers on listen sockets */ 3046 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3047 return (0); 3048 return (1); 3049 } 3050 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3051 uint32_t left; 3052 3053 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3054 ret = -1; 3055 rack_log_to_processing(rack, cts, ret, 0); 3056 return (0); 3057 } 3058 if (hpts_calling == 0) { 3059 ret = -2; 3060 rack_log_to_processing(rack, cts, ret, 0); 3061 return (0); 3062 } 3063 /* 3064 * Ok our timer went off early and we are not paced false 3065 * alarm, go back to sleep. 
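 *
 * Example (hypothetical tick values): rc_timer_exp = 1040 but we were
 * called from the hpts at cts = 1025 with no output pacing pending, so
 * left = 15; we re-insert ourselves for those 15ms, log ret = -3 and
 * return without running any timer.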
3066 */ 3067 ret = -3; 3068 left = rack->r_ctl.rc_timer_exp - cts; 3069 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3070 rack_log_to_processing(rack, cts, ret, left); 3071 rack->rc_last_pto_set = 0; 3072 return (1); 3073 } 3074 rack->rc_tmr_stopped = 0; 3075 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3076 if (timers & PACE_TMR_DELACK) { 3077 ret = rack_timeout_delack(tp, rack, cts); 3078 } else if (timers & PACE_TMR_RACK) { 3079 ret = rack_timeout_rack(tp, rack, cts); 3080 } else if (timers & PACE_TMR_TLP) { 3081 ret = rack_timeout_tlp(tp, rack, cts); 3082 } else if (timers & PACE_TMR_RXT) { 3083 ret = rack_timeout_rxt(tp, rack, cts); 3084 } else if (timers & PACE_TMR_PERSIT) { 3085 ret = rack_timeout_persist(tp, rack, cts); 3086 } else if (timers & PACE_TMR_KEEP) { 3087 ret = rack_timeout_keepalive(tp, rack, cts); 3088 } 3089 rack_log_to_processing(rack, cts, ret, timers); 3090 return (ret); 3091 } 3092 3093 static void 3094 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3095 { 3096 uint8_t hpts_removed = 0; 3097 3098 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3099 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3100 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3101 hpts_removed = 1; 3102 } 3103 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3104 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3105 if (rack->rc_inp->inp_in_hpts && 3106 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3107 /* 3108 * Canceling timer's when we have no output being 3109 * paced. We also must remove ourselves from the 3110 * hpts. 3111 */ 3112 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3113 hpts_removed = 1; 3114 } 3115 rack_log_to_cancel(rack, hpts_removed, line); 3116 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3117 } 3118 } 3119 3120 static void 3121 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3122 { 3123 return; 3124 } 3125 3126 static int 3127 rack_stopall(struct tcpcb *tp) 3128 { 3129 struct tcp_rack *rack; 3130 rack = (struct tcp_rack *)tp->t_fb_ptr; 3131 rack->t_timers_stopped = 1; 3132 return (0); 3133 } 3134 3135 static void 3136 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3137 { 3138 return; 3139 } 3140 3141 static int 3142 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3143 { 3144 return (0); 3145 } 3146 3147 static void 3148 rack_stop_all_timers(struct tcpcb *tp) 3149 { 3150 struct tcp_rack *rack; 3151 3152 /* 3153 * Assure no timers are running. 
3154 */ 3155 if (tcp_timer_active(tp, TT_PERSIST)) { 3156 /* We enter in persists, set the flag appropriately */ 3157 rack = (struct tcp_rack *)tp->t_fb_ptr; 3158 rack->rc_in_persist = 1; 3159 } 3160 tcp_timer_suspend(tp, TT_PERSIST); 3161 tcp_timer_suspend(tp, TT_REXMT); 3162 tcp_timer_suspend(tp, TT_KEEP); 3163 tcp_timer_suspend(tp, TT_DELACK); 3164 } 3165 3166 static void 3167 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3168 struct rack_sendmap *rsm, uint32_t ts) 3169 { 3170 int32_t idx; 3171 3172 rsm->r_rtr_cnt++; 3173 rsm->r_sndcnt++; 3174 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3175 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3176 rsm->r_flags |= RACK_OVERMAX; 3177 } 3178 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3179 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3180 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3181 } 3182 idx = rsm->r_rtr_cnt - 1; 3183 rsm->r_tim_lastsent[idx] = ts; 3184 if (rsm->r_flags & RACK_ACKED) { 3185 /* Problably MTU discovery messing with us */ 3186 rsm->r_flags &= ~RACK_ACKED; 3187 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3188 } 3189 if (rsm->r_in_tmap) { 3190 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3191 } 3192 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3193 rsm->r_in_tmap = 1; 3194 if (rsm->r_flags & RACK_SACK_PASSED) { 3195 /* We have retransmitted due to the SACK pass */ 3196 rsm->r_flags &= ~RACK_SACK_PASSED; 3197 rsm->r_flags |= RACK_WAS_SACKPASS; 3198 } 3199 /* Update memory for next rtr */ 3200 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3201 } 3202 3203 3204 static uint32_t 3205 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3206 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) 3207 { 3208 /* 3209 * We (re-)transmitted starting at rsm->r_start for some length 3210 * (possibly less than r_end. 3211 */ 3212 struct rack_sendmap *nrsm; 3213 uint32_t c_end; 3214 int32_t len; 3215 int32_t idx; 3216 3217 len = *lenp; 3218 c_end = rsm->r_start + len; 3219 if (SEQ_GEQ(c_end, rsm->r_end)) { 3220 /* 3221 * We retransmitted the whole piece or more than the whole 3222 * slopping into the next rsm. 3223 */ 3224 rack_update_rsm(tp, rack, rsm, ts); 3225 if (c_end == rsm->r_end) { 3226 *lenp = 0; 3227 return (0); 3228 } else { 3229 int32_t act_len; 3230 3231 /* Hangs over the end return whats left */ 3232 act_len = rsm->r_end - rsm->r_start; 3233 *lenp = (len - act_len); 3234 return (rsm->r_end); 3235 } 3236 /* We don't get out of this block. */ 3237 } 3238 /* 3239 * Here we retransmitted less than the whole thing which means we 3240 * have to split this into what was transmitted and what was not. 3241 */ 3242 nrsm = rack_alloc(rack); 3243 if (nrsm == NULL) { 3244 /* 3245 * We can't get memory, so lets not proceed. 3246 */ 3247 *lenp = 0; 3248 return (0); 3249 } 3250 /* 3251 * So here we are going to take the original rsm and make it what we 3252 * retransmitted. nrsm will be the tail portion we did not 3253 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3254 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3255 * 1, 6 and the new piece will be 6, 11. 
3256 */ 3257 nrsm->r_start = c_end; 3258 nrsm->r_end = rsm->r_end; 3259 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3260 nrsm->r_flags = rsm->r_flags; 3261 nrsm->r_sndcnt = rsm->r_sndcnt; 3262 nrsm->r_rtr_bytes = 0; 3263 rsm->r_end = c_end; 3264 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3265 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3266 } 3267 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3268 if (rsm->r_in_tmap) { 3269 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3270 nrsm->r_in_tmap = 1; 3271 } 3272 rsm->r_flags &= (~RACK_HAS_FIN); 3273 rack_update_rsm(tp, rack, rsm, ts); 3274 *lenp = 0; 3275 return (0); 3276 } 3277 3278 3279 static void 3280 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3281 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3282 uint8_t pass, struct rack_sendmap *hintrsm) 3283 { 3284 struct tcp_rack *rack; 3285 struct rack_sendmap *rsm, *nrsm; 3286 register uint32_t snd_max, snd_una; 3287 int32_t idx; 3288 3289 /* 3290 * Add to the RACK log of packets in flight or retransmitted. If 3291 * there is a TS option we will use the TS echoed, if not we will 3292 * grab a TS. 3293 * 3294 * Retransmissions will increment the count and move the ts to its 3295 * proper place. Note that if options do not include TS's then we 3296 * won't be able to effectively use the ACK for an RTT on a retran. 3297 * 3298 * Notes about r_start and r_end. Lets consider a send starting at 3299 * sequence 1 for 10 bytes. In such an example the r_start would be 3300 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3301 * This means that r_end is actually the first sequence for the next 3302 * slot (11). 3303 * 3304 */ 3305 /* 3306 * If err is set what do we do XXXrrs? should we not add the thing? 3307 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3308 * i.e. proceed with add ** do this for now. 3309 */ 3310 INP_WLOCK_ASSERT(tp->t_inpcb); 3311 if (err) 3312 /* 3313 * We don't log errors -- we could but snd_max does not 3314 * advance in this case either. 3315 */ 3316 return; 3317 3318 if (th_flags & TH_RST) { 3319 /* 3320 * We don't log resets and we return immediately from 3321 * sending 3322 */ 3323 return; 3324 } 3325 rack = (struct tcp_rack *)tp->t_fb_ptr; 3326 snd_una = tp->snd_una; 3327 if (SEQ_LEQ((seq_out + len), snd_una)) { 3328 /* Are sending an old segment to induce an ack (keep-alive)? */ 3329 return; 3330 } 3331 if (SEQ_LT(seq_out, snd_una)) { 3332 /* huh? should we panic? */ 3333 uint32_t end; 3334 3335 end = seq_out + len; 3336 seq_out = snd_una; 3337 len = end - seq_out; 3338 } 3339 snd_max = tp->snd_max; 3340 if (th_flags & (TH_SYN | TH_FIN)) { 3341 /* 3342 * The call to rack_log_output is made before bumping 3343 * snd_max. This means we can record one extra byte on a SYN 3344 * or FIN if seq_out is adding more on and a FIN is present 3345 * (and we are not resending). 3346 */ 3347 if (th_flags & TH_SYN) 3348 len++; 3349 if (th_flags & TH_FIN) 3350 len++; 3351 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3352 /* 3353 * The add/update as not been done for the FIN/SYN 3354 * yet. 3355 */ 3356 snd_max = tp->snd_nxt; 3357 } 3358 } 3359 if (len == 0) { 3360 /* We don't log zero window probes */ 3361 return; 3362 } 3363 rack->r_ctl.rc_time_last_sent = ts; 3364 if (IN_RECOVERY(tp->t_flags)) { 3365 rack->r_ctl.rc_prr_out += len; 3366 } 3367 /* First question is it a retransmission? 
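 * In short: when seq_out == snd_max this is a brand new send, so an rsm
 * covering [seq_out, seq_out + len) is allocated and appended to both
 * rc_map and rc_tmap.  Otherwise it is a retransmission: we locate the
 * rsm that starts at seq_out (via the caller's hint, rc_next, or a full
 * walk of rc_map) and update it, splitting an entry first when the
 * retransmit starts in the middle of it.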
*/ 3368 if (seq_out == snd_max) { 3369 again: 3370 rsm = rack_alloc(rack); 3371 if (rsm == NULL) { 3372 /* 3373 * Hmm out of memory and the tcb got destroyed while 3374 * we tried to wait. 3375 */ 3376 #ifdef INVARIANTS 3377 panic("Out of memory when we should not be rack:%p", rack); 3378 #endif 3379 return; 3380 } 3381 if (th_flags & TH_FIN) { 3382 rsm->r_flags = RACK_HAS_FIN; 3383 } else { 3384 rsm->r_flags = 0; 3385 } 3386 rsm->r_tim_lastsent[0] = ts; 3387 rsm->r_rtr_cnt = 1; 3388 rsm->r_rtr_bytes = 0; 3389 if (th_flags & TH_SYN) { 3390 /* The data space is one beyond snd_una */ 3391 rsm->r_start = seq_out + 1; 3392 rsm->r_end = rsm->r_start + (len - 1); 3393 } else { 3394 /* Normal case */ 3395 rsm->r_start = seq_out; 3396 rsm->r_end = rsm->r_start + len; 3397 } 3398 rsm->r_sndcnt = 0; 3399 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 3400 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3401 rsm->r_in_tmap = 1; 3402 return; 3403 } 3404 /* 3405 * If we reach here its a retransmission and we need to find it. 3406 */ 3407 more: 3408 if (hintrsm && (hintrsm->r_start == seq_out)) { 3409 rsm = hintrsm; 3410 hintrsm = NULL; 3411 } else if (rack->r_ctl.rc_next) { 3412 /* We have a hint from a previous run */ 3413 rsm = rack->r_ctl.rc_next; 3414 } else { 3415 /* No hints sorry */ 3416 rsm = NULL; 3417 } 3418 if ((rsm) && (rsm->r_start == seq_out)) { 3419 /* 3420 * We used rc_next or hintrsm to retransmit, hopefully the 3421 * likely case. 3422 */ 3423 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3424 if (len == 0) { 3425 return; 3426 } else { 3427 goto more; 3428 } 3429 } 3430 /* Ok it was not the last pointer go through it the hard way. */ 3431 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3432 if (rsm->r_start == seq_out) { 3433 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3434 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3435 if (len == 0) { 3436 return; 3437 } else { 3438 continue; 3439 } 3440 } 3441 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3442 /* Transmitted within this piece */ 3443 /* 3444 * Ok we must split off the front and then let the 3445 * update do the rest 3446 */ 3447 nrsm = rack_alloc(rack); 3448 if (nrsm == NULL) { 3449 #ifdef INVARIANTS 3450 panic("Ran out of memory that was preallocated? rack:%p", rack); 3451 #endif 3452 rack_update_rsm(tp, rack, rsm, ts); 3453 return; 3454 } 3455 /* 3456 * copy rsm to nrsm and then trim the front of rsm 3457 * to not include this part. 3458 */ 3459 nrsm->r_start = seq_out; 3460 nrsm->r_end = rsm->r_end; 3461 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3462 nrsm->r_flags = rsm->r_flags; 3463 nrsm->r_sndcnt = rsm->r_sndcnt; 3464 nrsm->r_rtr_bytes = 0; 3465 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3466 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3467 } 3468 rsm->r_end = nrsm->r_start; 3469 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3470 if (rsm->r_in_tmap) { 3471 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3472 nrsm->r_in_tmap = 1; 3473 } 3474 rsm->r_flags &= (~RACK_HAS_FIN); 3475 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3476 if (len == 0) { 3477 return; 3478 } 3479 } 3480 } 3481 /* 3482 * Hmm not found in map did they retransmit both old and on into the 3483 * new? 
3484 */ 3485 if (seq_out == tp->snd_max) { 3486 goto again; 3487 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3488 #ifdef INVARIANTS 3489 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3490 seq_out, len, tp->snd_una, tp->snd_max); 3491 printf("Starting Dump of all rack entries\n"); 3492 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3493 printf("rsm:%p start:%u end:%u\n", 3494 rsm, rsm->r_start, rsm->r_end); 3495 } 3496 printf("Dump complete\n"); 3497 panic("seq_out not found rack:%p tp:%p", 3498 rack, tp); 3499 #endif 3500 } else { 3501 #ifdef INVARIANTS 3502 /* 3503 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3504 * flag) 3505 */ 3506 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3507 seq_out, len, tp->snd_max, tp); 3508 #endif 3509 } 3510 } 3511 3512 /* 3513 * Record one of the RTT updates from an ack into 3514 * our sample structure. 3515 */ 3516 static void 3517 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3518 { 3519 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3520 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3521 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3522 } 3523 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3524 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3525 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3526 } 3527 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3528 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3529 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3530 } 3531 3532 /* 3533 * Collect new round-trip time estimate 3534 * and update averages and current timeout. 3535 */ 3536 static void 3537 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3538 { 3539 int32_t delta; 3540 uint32_t o_srtt, o_var; 3541 int32_t rtt; 3542 3543 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3544 /* No valid sample */ 3545 return; 3546 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3547 /* We are to use the lowest RTT seen in a single ack */ 3548 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3549 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3550 /* We are to use the highest RTT seen in a single ack */ 3551 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3552 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3553 /* We are to use the average RTT seen in a single ack */ 3554 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3555 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3556 } else { 3557 #ifdef INVARIANTS 3558 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3559 #endif 3560 return; 3561 } 3562 if (rtt == 0) 3563 rtt = 1; 3564 rack_log_rtt_sample(rack, rtt); 3565 o_srtt = tp->t_srtt; 3566 o_var = tp->t_rttvar; 3567 rack = (struct tcp_rack *)tp->t_fb_ptr; 3568 if (tp->t_srtt != 0) { 3569 /* 3570 * srtt is stored as fixed point with 5 bits after the 3571 * binary point (i.e., scaled by 8). The following magic is 3572 * equivalent to the smoothing algorithm in rfc793 with an 3573 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3574 * Adjust rtt to origin 0. 3575 */ 3576 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3577 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3578 3579 tp->t_srtt += delta; 3580 if (tp->t_srtt <= 0) 3581 tp->t_srtt = 1; 3582 3583 /* 3584 * We accumulate a smoothed rtt variance (actually, a 3585 * smoothed mean difference), then set the retransmit timer 3586 * to smoothed rtt + 4 times the smoothed variance. rttvar 3587 * is stored as fixed point with 4 bits after the binary 3588 * point (scaled by 16). 
The following is equivalent to 3589 * rfc793 smoothing with an alpha of .75 (rttvar = 3590 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3591 * wired-in beta. 3592 */ 3593 if (delta < 0) 3594 delta = -delta; 3595 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3596 tp->t_rttvar += delta; 3597 if (tp->t_rttvar <= 0) 3598 tp->t_rttvar = 1; 3599 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3600 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3601 } else { 3602 /* 3603 * No rtt measurement yet - use the unsmoothed rtt. Set the 3604 * variance to half the rtt (so our first retransmit happens 3605 * at 3*rtt). 3606 */ 3607 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3608 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3609 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3610 } 3611 TCPSTAT_INC(tcps_rttupdated); 3612 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3613 tp->t_rttupdated++; 3614 #ifdef NETFLIX_STATS 3615 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3616 #endif 3617 tp->t_rxtshift = 0; 3618 3619 /* 3620 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3621 * way we do the smoothing, srtt and rttvar will each average +1/2 3622 * tick of bias. When we compute the retransmit timer, we want 1/2 3623 * tick of rounding and 1 extra tick because of +-1/2 tick 3624 * uncertainty in the firing of the timer. The bias will give us 3625 * exactly the 1.5 tick we need. But, because the bias is 3626 * statistical, we have to test that we don't drop below the minimum 3627 * feasible timer (which is 2 ticks). 3628 */ 3629 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3630 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3631 tp->t_softerror = 0; 3632 } 3633 3634 static void 3635 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3636 uint32_t t, uint32_t cts) 3637 { 3638 /* 3639 * For this RSM, we acknowledged the data from a previous 3640 * transmission, not the last one we made. This means we did a false 3641 * retransmit. 3642 */ 3643 struct tcp_rack *rack; 3644 3645 if (rsm->r_flags & RACK_HAS_FIN) { 3646 /* 3647 * The sending of the FIN often is multiple sent when we 3648 * have everything outstanding ack'd. We ignore this case 3649 * since its over now. 3650 */ 3651 return; 3652 } 3653 if (rsm->r_flags & RACK_TLP) { 3654 /* 3655 * We expect TLP's to have this occur. 3656 */ 3657 return; 3658 } 3659 rack = (struct tcp_rack *)tp->t_fb_ptr; 3660 /* should we undo cc changes and exit recovery? */ 3661 if (IN_RECOVERY(tp->t_flags)) { 3662 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3663 /* 3664 * Undo what we ratched down and exit recovery if 3665 * possible 3666 */ 3667 EXIT_RECOVERY(tp->t_flags); 3668 tp->snd_recover = tp->snd_una; 3669 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3670 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3671 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3672 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3673 } 3674 } 3675 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3676 /* 3677 * We retransmitted based on a sack and the earlier 3678 * retransmission ack'd it - re-ordering is occuring. 
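 * A hypothetical timeline of that case: the segment first went out at
 * t=100, a SACK hole prompted a retransmit at t=200, and the ACK being
 * processed matches the t=100 send. The retransmit was spurious, so
 * below we note the reordering time and count a bad fast retransmit.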
3679 */ 3680 counter_u64_add(rack_reorder_seen, 1); 3681 rack->r_ctl.rc_reorder_ts = cts; 3682 } 3683 counter_u64_add(rack_badfr, 1); 3684 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3685 } 3686 3687 3688 static int 3689 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3690 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3691 { 3692 int32_t i; 3693 uint32_t t; 3694 3695 if (rsm->r_flags & RACK_ACKED) 3696 /* Already done */ 3697 return (0); 3698 3699 3700 if ((rsm->r_rtr_cnt == 1) || 3701 ((ack_type == CUM_ACKED) && 3702 (to->to_flags & TOF_TS) && 3703 (to->to_tsecr) && 3704 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 3705 ) { 3706 /* 3707 * We will only find a matching timestamp if its cum-acked. 3708 * But if its only one retransmission its for-sure matching 3709 * :-) 3710 */ 3711 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3712 if ((int)t <= 0) 3713 t = 1; 3714 if (!tp->t_rttlow || tp->t_rttlow > t) 3715 tp->t_rttlow = t; 3716 if (!rack->r_ctl.rc_rack_min_rtt || 3717 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3718 rack->r_ctl.rc_rack_min_rtt = t; 3719 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3720 rack->r_ctl.rc_rack_min_rtt = 1; 3721 } 3722 } 3723 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); 3724 if ((rsm->r_flags & RACK_TLP) && 3725 (!IN_RECOVERY(tp->t_flags))) { 3726 /* Segment was a TLP and our retrans matched */ 3727 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 3728 rack->r_ctl.rc_rsm_start = tp->snd_max; 3729 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 3730 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 3731 rack_cong_signal(tp, NULL, CC_NDUPACK); 3732 /* 3733 * When we enter recovery we need to assure 3734 * we send one packet. 3735 */ 3736 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 3737 } else 3738 rack->r_ctl.rc_tlp_rtx_out = 0; 3739 } 3740 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3741 /* New more recent rack_tmit_time */ 3742 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3743 rack->rc_rack_rtt = t; 3744 } 3745 return (1); 3746 } 3747 /* 3748 * We clear the soft/rxtshift since we got an ack. 3749 * There is no assurance we will call the commit() function 3750 * so we need to clear these to avoid incorrect handling. 3751 */ 3752 tp->t_rxtshift = 0; 3753 tp->t_softerror = 0; 3754 if ((to->to_flags & TOF_TS) && 3755 (ack_type == CUM_ACKED) && 3756 (to->to_tsecr) && 3757 ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { 3758 /* 3759 * Now which timestamp does it match? In this block the ACK 3760 * must be coming from a previous transmission. 3761 */ 3762 for (i = 0; i < rsm->r_rtr_cnt; i++) { 3763 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 3764 t = cts - rsm->r_tim_lastsent[i]; 3765 if ((int)t <= 0) 3766 t = 1; 3767 if ((i + 1) < rsm->r_rtr_cnt) { 3768 /* Likely */ 3769 rack_earlier_retran(tp, rsm, t, cts); 3770 } 3771 if (!tp->t_rttlow || tp->t_rttlow > t) 3772 tp->t_rttlow = t; 3773 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3774 rack->r_ctl.rc_rack_min_rtt = t; 3775 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3776 rack->r_ctl.rc_rack_min_rtt = 1; 3777 } 3778 } 3779 /* 3780 * Note the following calls to 3781 * tcp_rack_xmit_timer() are being commented 3782 * out for now. They give us no more accuracy 3783 * and often lead to a wrong choice. We have 3784 * enough samples that have not been 3785 * retransmitted. 
I leave the commented out 3786 * code in here in case in the future we 3787 * decide to add it back (though I can't forsee 3788 * doing that). That way we will easily see 3789 * where they need to be placed. 3790 */ 3791 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 3792 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3793 /* New more recent rack_tmit_time */ 3794 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3795 rack->rc_rack_rtt = t; 3796 } 3797 return (1); 3798 } 3799 } 3800 goto ts_not_found; 3801 } else { 3802 /* 3803 * Ok its a SACK block that we retransmitted. or a windows 3804 * machine without timestamps. We can tell nothing from the 3805 * time-stamp since its not there or the time the peer last 3806 * recieved a segment that moved forward its cum-ack point. 3807 */ 3808 ts_not_found: 3809 i = rsm->r_rtr_cnt - 1; 3810 t = cts - rsm->r_tim_lastsent[i]; 3811 if ((int)t <= 0) 3812 t = 1; 3813 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3814 /* 3815 * We retransmitted and the ack came back in less 3816 * than the smallest rtt we have observed. We most 3817 * likey did an improper retransmit as outlined in 3818 * 4.2 Step 3 point 2 in the rack-draft. 3819 */ 3820 i = rsm->r_rtr_cnt - 2; 3821 t = cts - rsm->r_tim_lastsent[i]; 3822 rack_earlier_retran(tp, rsm, t, cts); 3823 } else if (rack->r_ctl.rc_rack_min_rtt) { 3824 /* 3825 * We retransmitted it and the retransmit did the 3826 * job. 3827 */ 3828 if (!rack->r_ctl.rc_rack_min_rtt || 3829 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3830 rack->r_ctl.rc_rack_min_rtt = t; 3831 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3832 rack->r_ctl.rc_rack_min_rtt = 1; 3833 } 3834 } 3835 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 3836 /* New more recent rack_tmit_time */ 3837 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 3838 rack->rc_rack_rtt = t; 3839 } 3840 return (1); 3841 } 3842 } 3843 return (0); 3844 } 3845 3846 /* 3847 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 3848 */ 3849 static void 3850 rack_log_sack_passed(struct tcpcb *tp, 3851 struct tcp_rack *rack, struct rack_sendmap *rsm) 3852 { 3853 struct rack_sendmap *nrsm; 3854 uint32_t ts; 3855 int32_t idx; 3856 3857 idx = rsm->r_rtr_cnt - 1; 3858 ts = rsm->r_tim_lastsent[idx]; 3859 nrsm = rsm; 3860 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 3861 rack_head, r_tnext) { 3862 if (nrsm == rsm) { 3863 /* Skip orginal segment he is acked */ 3864 continue; 3865 } 3866 if (nrsm->r_flags & RACK_ACKED) { 3867 /* Skip ack'd segments */ 3868 continue; 3869 } 3870 idx = nrsm->r_rtr_cnt - 1; 3871 if (ts == nrsm->r_tim_lastsent[idx]) { 3872 /* 3873 * For this case lets use seq no, if we sent in a 3874 * big block (TSO) we would have a bunch of segments 3875 * sent at the same time. 3876 * 3877 * We would only get a report if its SEQ is earlier. 3878 * If we have done multiple retransmits the times 3879 * would not be equal. 3880 */ 3881 if (SEQ_LT(nrsm->r_start, rsm->r_start)) { 3882 nrsm->r_flags |= RACK_SACK_PASSED; 3883 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3884 } 3885 } else { 3886 /* 3887 * Here they were sent at different times, not a big 3888 * block. 
Since we transmitted this one later and 3889 * see it sack'd then this must also be missing (or 3890 * we would have gotten a sack block for it) 3891 */ 3892 nrsm->r_flags |= RACK_SACK_PASSED; 3893 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3894 } 3895 } 3896 } 3897 3898 static uint32_t 3899 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 3900 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) 3901 { 3902 int32_t idx; 3903 int32_t times = 0; 3904 uint32_t start, end, changed = 0; 3905 struct rack_sendmap *rsm, *nrsm; 3906 int32_t used_ref = 1; 3907 3908 start = sack->start; 3909 end = sack->end; 3910 rsm = *prsm; 3911 if (rsm && SEQ_LT(start, rsm->r_start)) { 3912 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { 3913 if (SEQ_GEQ(start, rsm->r_start) && 3914 SEQ_LT(start, rsm->r_end)) { 3915 goto do_rest_ofb; 3916 } 3917 } 3918 } 3919 if (rsm == NULL) { 3920 start_at_beginning: 3921 rsm = NULL; 3922 used_ref = 0; 3923 } 3924 /* First lets locate the block where this guy is */ 3925 TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { 3926 if (SEQ_GEQ(start, rsm->r_start) && 3927 SEQ_LT(start, rsm->r_end)) { 3928 break; 3929 } 3930 } 3931 do_rest_ofb: 3932 if (rsm == NULL) { 3933 /* 3934 * This happens when we get duplicate sack blocks with the 3935 * same end. For example SACK 4: 100 SACK 3: 100 The sort 3936 * will not change there location so we would just start at 3937 * the end of the first one and get lost. 3938 */ 3939 if (tp->t_flags & TF_SENTFIN) { 3940 /* 3941 * Check to see if we have not logged the FIN that 3942 * went out. 3943 */ 3944 nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 3945 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 3946 /* 3947 * Ok we did not get the FIN logged. 3948 */ 3949 nrsm->r_end++; 3950 rsm = nrsm; 3951 goto do_rest_ofb; 3952 } 3953 } 3954 if (times == 1) { 3955 #ifdef INVARIANTS 3956 panic("tp:%p rack:%p sack:%p to:%p prsm:%p", 3957 tp, rack, sack, to, prsm); 3958 #else 3959 goto out; 3960 #endif 3961 } 3962 times++; 3963 counter_u64_add(rack_sack_proc_restart, 1); 3964 goto start_at_beginning; 3965 } 3966 /* Ok we have an ACK for some piece of rsm */ 3967 if (rsm->r_start != start) { 3968 /* 3969 * Need to split this in two pieces the before and after. 3970 */ 3971 nrsm = rack_alloc(rack); 3972 if (nrsm == NULL) { 3973 /* 3974 * failed XXXrrs what can we do but loose the sack 3975 * info? 3976 */ 3977 goto out; 3978 } 3979 nrsm->r_start = start; 3980 nrsm->r_rtr_bytes = 0; 3981 nrsm->r_end = rsm->r_end; 3982 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3983 nrsm->r_flags = rsm->r_flags; 3984 nrsm->r_sndcnt = rsm->r_sndcnt; 3985 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3986 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3987 } 3988 rsm->r_end = nrsm->r_start; 3989 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3990 if (rsm->r_in_tmap) { 3991 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3992 nrsm->r_in_tmap = 1; 3993 } 3994 rsm->r_flags &= (~RACK_HAS_FIN); 3995 rsm = nrsm; 3996 } 3997 if (SEQ_GEQ(end, rsm->r_end)) { 3998 /* 3999 * The end of this block is either beyond this guy or right 4000 * at this guy. 4001 */ 4002 4003 if ((rsm->r_flags & RACK_ACKED) == 0) { 4004 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4005 changed += (rsm->r_end - rsm->r_start); 4006 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4007 rack_log_sack_passed(tp, rack, rsm); 4008 /* Is Reordering occuring? 
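 * RACK_SACK_PASSED on this entry means data we sent later was already
 * SACKed while this block was still outstanding, e.g. bytes 2000-3000
 * reported before 1000-2000; that is reordering, so we bump the
 * counter and remember when it was seen.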
*/ 4009 if (rsm->r_flags & RACK_SACK_PASSED) { 4010 counter_u64_add(rack_reorder_seen, 1); 4011 rack->r_ctl.rc_reorder_ts = cts; 4012 } 4013 rsm->r_flags |= RACK_ACKED; 4014 rsm->r_flags &= ~RACK_TLP; 4015 if (rsm->r_in_tmap) { 4016 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4017 rsm->r_in_tmap = 0; 4018 } 4019 } 4020 if (end == rsm->r_end) { 4021 /* This block only - done */ 4022 goto out; 4023 } 4024 /* There is more not coverend by this rsm move on */ 4025 start = rsm->r_end; 4026 nrsm = TAILQ_NEXT(rsm, r_next); 4027 rsm = nrsm; 4028 times = 0; 4029 goto do_rest_ofb; 4030 } 4031 /* Ok we need to split off this one at the tail */ 4032 nrsm = rack_alloc(rack); 4033 if (nrsm == NULL) { 4034 /* failed rrs what can we do but loose the sack info? */ 4035 goto out; 4036 } 4037 /* Clone it */ 4038 nrsm->r_start = end; 4039 nrsm->r_end = rsm->r_end; 4040 nrsm->r_rtr_bytes = 0; 4041 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4042 nrsm->r_flags = rsm->r_flags; 4043 nrsm->r_sndcnt = rsm->r_sndcnt; 4044 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4045 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4046 } 4047 /* The sack block does not cover this guy fully */ 4048 rsm->r_flags &= (~RACK_HAS_FIN); 4049 rsm->r_end = end; 4050 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4051 if (rsm->r_in_tmap) { 4052 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4053 nrsm->r_in_tmap = 1; 4054 } 4055 if (rsm->r_flags & RACK_ACKED) { 4056 /* Been here done that */ 4057 goto out; 4058 } 4059 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4060 changed += (rsm->r_end - rsm->r_start); 4061 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4062 rack_log_sack_passed(tp, rack, rsm); 4063 /* Is Reordering occuring? */ 4064 if (rsm->r_flags & RACK_SACK_PASSED) { 4065 counter_u64_add(rack_reorder_seen, 1); 4066 rack->r_ctl.rc_reorder_ts = cts; 4067 } 4068 rsm->r_flags |= RACK_ACKED; 4069 rsm->r_flags &= ~RACK_TLP; 4070 if (rsm->r_in_tmap) { 4071 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4072 rsm->r_in_tmap = 0; 4073 } 4074 out: 4075 if (used_ref == 0) { 4076 counter_u64_add(rack_sack_proc_all, 1); 4077 } else { 4078 counter_u64_add(rack_sack_proc_short, 1); 4079 } 4080 /* Save off where we last were */ 4081 if (rsm) 4082 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 4083 else 4084 rack->r_ctl.rc_sacklast = NULL; 4085 *prsm = rsm; 4086 return (changed); 4087 } 4088 4089 static void inline 4090 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4091 { 4092 struct rack_sendmap *tmap; 4093 4094 tmap = NULL; 4095 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4096 /* Its no longer sacked, mark it so */ 4097 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4098 #ifdef INVARIANTS 4099 if (rsm->r_in_tmap) { 4100 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4101 rack, rsm, rsm->r_flags); 4102 } 4103 #endif 4104 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4105 /* Rebuild it into our tmap */ 4106 if (tmap == NULL) { 4107 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4108 tmap = rsm; 4109 } else { 4110 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4111 tmap = rsm; 4112 } 4113 tmap->r_in_tmap = 1; 4114 rsm = TAILQ_NEXT(rsm, r_next); 4115 } 4116 /* 4117 * Now lets possibly clear the sack filter so we start 4118 * recognizing sacks that cover this area. 
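 * For example, if the peer reneged on bytes 1000-2000 the filter may
 * still remember a SACK covering that range and would suppress an
 * identical report later; clearing it at th_ack lets those blocks be
 * processed again so the scoreboard can be rebuilt.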
4119 */ 4120 if (rack_use_sack_filter) 4121 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4122 4123 } 4124 4125 static void 4126 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4127 { 4128 uint32_t changed, last_seq, entered_recovery = 0; 4129 struct tcp_rack *rack; 4130 struct rack_sendmap *rsm; 4131 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4132 register uint32_t th_ack; 4133 int32_t i, j, k, num_sack_blks = 0; 4134 uint32_t cts, acked, ack_point, sack_changed = 0; 4135 4136 INP_WLOCK_ASSERT(tp->t_inpcb); 4137 if (th->th_flags & TH_RST) { 4138 /* We don't log resets */ 4139 return; 4140 } 4141 rack = (struct tcp_rack *)tp->t_fb_ptr; 4142 cts = tcp_ts_getticks(); 4143 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4144 changed = 0; 4145 th_ack = th->th_ack; 4146 4147 if (SEQ_GT(th_ack, tp->snd_una)) { 4148 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4149 tp->t_acktime = ticks; 4150 } 4151 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4152 changed = th_ack - rsm->r_start; 4153 if (changed) { 4154 /* 4155 * The ACK point is advancing to th_ack, we must drop off 4156 * the packets in the rack log and calculate any eligble 4157 * RTT's. 4158 */ 4159 rack->r_wanted_output++; 4160 more: 4161 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4162 if (rsm == NULL) { 4163 if ((th_ack - 1) == tp->iss) { 4164 /* 4165 * For the SYN incoming case we will not 4166 * have called tcp_output for the sending of 4167 * the SYN, so there will be no map. All 4168 * other cases should probably be a panic. 4169 */ 4170 goto proc_sack; 4171 } 4172 if (tp->t_flags & TF_SENTFIN) { 4173 /* if we send a FIN we will not hav a map */ 4174 goto proc_sack; 4175 } 4176 #ifdef INVARIANTS 4177 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4178 tp, 4179 th, tp->t_state, rack, 4180 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4181 #endif 4182 goto proc_sack; 4183 } 4184 if (SEQ_LT(th_ack, rsm->r_start)) { 4185 /* Huh map is missing this */ 4186 #ifdef INVARIANTS 4187 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 4188 rsm->r_start, 4189 th_ack, tp->t_state, rack->r_state); 4190 #endif 4191 goto proc_sack; 4192 } 4193 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4194 /* Now do we consume the whole thing? */ 4195 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4196 /* Its all consumed. */ 4197 uint32_t left; 4198 4199 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4200 rsm->r_rtr_bytes = 0; 4201 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 4202 if (rsm->r_in_tmap) { 4203 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4204 rsm->r_in_tmap = 0; 4205 } 4206 if (rack->r_ctl.rc_next == rsm) { 4207 /* scoot along the marker */ 4208 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); 4209 } 4210 if (rsm->r_flags & RACK_ACKED) { 4211 /* 4212 * It was acked on the scoreboard -- remove 4213 * it from total 4214 */ 4215 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4216 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4217 /* 4218 * There are acked segments ACKED on the 4219 * scoreboard further up. We are seeing 4220 * reordering. 4221 */ 4222 counter_u64_add(rack_reorder_seen, 1); 4223 rsm->r_flags |= RACK_ACKED; 4224 rack->r_ctl.rc_reorder_ts = cts; 4225 } 4226 left = th_ack - rsm->r_end; 4227 if (rsm->r_rtr_cnt > 1) { 4228 /* 4229 * Technically we should make r_rtr_cnt be 4230 * monotonicly increasing and just mod it to 4231 * the timestamp it is replacing.. that way 4232 * we would have the last 3 retransmits. 
Now 4233 * rc_loss_count will be wrong if we 4234 * retransmit something more than 2 times in 4235 * recovery :( 4236 */ 4237 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4238 } 4239 /* Free back to zone */ 4240 rack_free(rack, rsm); 4241 if (left) { 4242 goto more; 4243 } 4244 goto proc_sack; 4245 } 4246 if (rsm->r_flags & RACK_ACKED) { 4247 /* 4248 * It was acked on the scoreboard -- remove it from 4249 * total for the part being cum-acked. 4250 */ 4251 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4252 } 4253 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4254 rsm->r_rtr_bytes = 0; 4255 rsm->r_start = th_ack; 4256 } 4257 proc_sack: 4258 /* Check for reneging */ 4259 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4260 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4261 /* 4262 * The peer has moved snd_una up to 4263 * the edge of this send, i.e. one 4264 * that it had previously acked. The only 4265 * way that can be true if the peer threw 4266 * away data (space issues) that it had 4267 * previously sacked (else it would have 4268 * given us snd_una up to (rsm->r_end). 4269 * We need to undo the acked markings here. 4270 * 4271 * Note we have to look to make sure th_ack is 4272 * our rsm->r_start in case we get an old ack 4273 * where th_ack is behind snd_una. 4274 */ 4275 rack_peer_reneges(rack, rsm, th->th_ack); 4276 } 4277 if ((to->to_flags & TOF_SACK) == 0) { 4278 /* We are done nothing left to log */ 4279 goto out; 4280 } 4281 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4282 if (rsm) { 4283 last_seq = rsm->r_end; 4284 } else { 4285 last_seq = tp->snd_max; 4286 } 4287 /* Sack block processing */ 4288 if (SEQ_GT(th_ack, tp->snd_una)) 4289 ack_point = th_ack; 4290 else 4291 ack_point = tp->snd_una; 4292 for (i = 0; i < to->to_nsacks; i++) { 4293 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4294 &sack, sizeof(sack)); 4295 sack.start = ntohl(sack.start); 4296 sack.end = ntohl(sack.end); 4297 if (SEQ_GT(sack.end, sack.start) && 4298 SEQ_GT(sack.start, ack_point) && 4299 SEQ_LT(sack.start, tp->snd_max) && 4300 SEQ_GT(sack.end, ack_point) && 4301 SEQ_LEQ(sack.end, tp->snd_max)) { 4302 if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && 4303 (SEQ_LT(sack.end, last_seq)) && 4304 ((sack.end - sack.start) < (tp->t_maxseg / 8))) { 4305 /* 4306 * Not the last piece and its smaller than 4307 * 1/8th of a MSS. We ignore this. 4308 */ 4309 counter_u64_add(rack_runt_sacks, 1); 4310 continue; 4311 } 4312 sack_blocks[num_sack_blks] = sack; 4313 num_sack_blks++; 4314 #ifdef NETFLIX_STATS 4315 } else if (SEQ_LEQ(sack.start, th_ack) && 4316 SEQ_LEQ(sack.end, th_ack)) { 4317 /* 4318 * Its a D-SACK block. 4319 */ 4320 tcp_record_dsack(sack.start, sack.end); 4321 #endif 4322 } 4323 4324 } 4325 if (num_sack_blks == 0) 4326 goto out; 4327 /* 4328 * Sort the SACK blocks so we can update the rack scoreboard with 4329 * just one pass. 4330 */ 4331 if (rack_use_sack_filter) { 4332 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); 4333 } 4334 if (num_sack_blks < 2) { 4335 goto do_sack_work; 4336 } 4337 /* Sort the sacks */ 4338 for (i = 0; i < num_sack_blks; i++) { 4339 for (j = i + 1; j < num_sack_blks; j++) { 4340 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4341 sack = sack_blocks[i]; 4342 sack_blocks[i] = sack_blocks[j]; 4343 sack_blocks[j] = sack; 4344 } 4345 } 4346 } 4347 /* 4348 * Now are any of the sack block ends the same (yes some 4349 * implememtations send these)? 
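 * A small worked example of the collapse below (made-up blocks): with
 * sorted blocks [400,600) and [500,600) the ends match, so we keep the
 * earlier start (400) in place, slide any remaining blocks down one
 * slot, reduce num_sack_blks and restart the scan.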
4350 */ 4351 again: 4352 if (num_sack_blks > 1) { 4353 for (i = 0; i < num_sack_blks; i++) { 4354 for (j = i + 1; j < num_sack_blks; j++) { 4355 if (sack_blocks[i].end == sack_blocks[j].end) { 4356 /* 4357 * Ok these two have the same end we 4358 * want the smallest end and then 4359 * throw away the larger and start 4360 * again. 4361 */ 4362 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4363 /* 4364 * The second block covers 4365 * more area use that 4366 */ 4367 sack_blocks[i].start = sack_blocks[j].start; 4368 } 4369 /* 4370 * Now collapse out the dup-sack and 4371 * lower the count 4372 */ 4373 for (k = (j + 1); k < num_sack_blks; k++) { 4374 sack_blocks[j].start = sack_blocks[k].start; 4375 sack_blocks[j].end = sack_blocks[k].end; 4376 j++; 4377 } 4378 num_sack_blks--; 4379 goto again; 4380 } 4381 } 4382 } 4383 } 4384 do_sack_work: 4385 rsm = rack->r_ctl.rc_sacklast; 4386 for (i = 0; i < num_sack_blks; i++) { 4387 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); 4388 if (acked) { 4389 rack->r_wanted_output++; 4390 changed += acked; 4391 sack_changed += acked; 4392 } 4393 } 4394 out: 4395 if (changed) { 4396 /* Something changed cancel the rack timer */ 4397 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4398 } 4399 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 4400 /* 4401 * Ok we have a high probability that we need to go in to 4402 * recovery since we have data sack'd 4403 */ 4404 struct rack_sendmap *rsm; 4405 uint32_t tsused; 4406 4407 tsused = tcp_ts_getticks(); 4408 rsm = tcp_rack_output(tp, rack, tsused); 4409 if (rsm) { 4410 /* Enter recovery */ 4411 rack->r_ctl.rc_rsm_start = rsm->r_start; 4412 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4413 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4414 entered_recovery = 1; 4415 rack_cong_signal(tp, NULL, CC_NDUPACK); 4416 /* 4417 * When we enter recovery we need to assure we send 4418 * one packet. 
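 * That is done by seeding rc_prr_sndcnt with one t_maxseg just below;
 * from then on the PRR bookkeeping later in this function (delivered
 * bytes scaled by ssthresh over the recovery flight size) decides how
 * much more may be sent while we remain in recovery.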
4419 */ 4420 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 4421 rack->r_timer_override = 1; 4422 } 4423 } 4424 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 4425 /* Deal with changed an PRR here (in recovery only) */ 4426 uint32_t pipe, snd_una; 4427 4428 rack->r_ctl.rc_prr_delivered += changed; 4429 /* Compute prr_sndcnt */ 4430 if (SEQ_GT(tp->snd_una, th_ack)) { 4431 snd_una = tp->snd_una; 4432 } else { 4433 snd_una = th_ack; 4434 } 4435 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 4436 if (pipe > tp->snd_ssthresh) { 4437 long sndcnt; 4438 4439 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 4440 if (rack->r_ctl.rc_prr_recovery_fs > 0) 4441 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 4442 else { 4443 rack->r_ctl.rc_prr_sndcnt = 0; 4444 sndcnt = 0; 4445 } 4446 sndcnt++; 4447 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 4448 sndcnt -= rack->r_ctl.rc_prr_out; 4449 else 4450 sndcnt = 0; 4451 rack->r_ctl.rc_prr_sndcnt = sndcnt; 4452 } else { 4453 uint32_t limit; 4454 4455 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 4456 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 4457 else 4458 limit = 0; 4459 if (changed > limit) 4460 limit = changed; 4461 limit += tp->t_maxseg; 4462 if (tp->snd_ssthresh > pipe) { 4463 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 4464 } else { 4465 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 4466 } 4467 } 4468 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { 4469 rack->r_timer_override = 1; 4470 } 4471 } 4472 } 4473 4474 /* 4475 * Return value of 1, we do not need to call rack_process_data(). 4476 * return value of 0, rack_process_data can be called. 4477 * For ret_val if its 0 the TCP is locked, if its non-zero 4478 * its unlocked and probably unsafe to touch the TCB. 4479 */ 4480 static int 4481 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 4482 struct tcpcb *tp, struct tcpopt *to, 4483 int32_t * ti_locked, uint32_t tiwin, int32_t tlen, 4484 int32_t * ofia, int32_t thflags, int32_t * ret_val) 4485 { 4486 int32_t ourfinisacked = 0; 4487 int32_t nsegs, acked_amount; 4488 int32_t acked; 4489 struct mbuf *mfree; 4490 struct tcp_rack *rack; 4491 int32_t recovery = 0; 4492 4493 rack = (struct tcp_rack *)tp->t_fb_ptr; 4494 if (SEQ_GT(th->th_ack, tp->snd_max)) { 4495 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 4496 return (1); 4497 } 4498 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 4499 rack_log_ack(tp, to, th); 4500 } 4501 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 4502 /* 4503 * Old ack, behind (or duplicate to) the last one rcv'd 4504 * Note: Should mark reordering is occuring! We should also 4505 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 4506 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 4507 * retran and> ack 3 4508 */ 4509 return (0); 4510 } 4511 /* 4512 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 4513 * something we sent. 4514 */ 4515 if (tp->t_flags & TF_NEEDSYN) { 4516 /* 4517 * T/TCP: Connection was half-synchronized, and our SYN has 4518 * been ACK'd (so connection is now fully synchronized). Go 4519 * to non-starred state, increment snd_una for ACK of SYN, 4520 * and check if we can do window scaling. 4521 */ 4522 tp->t_flags &= ~TF_NEEDSYN; 4523 tp->snd_una++; 4524 /* Do window scaling? 
*/ 4525 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 4526 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 4527 tp->rcv_scale = tp->request_r_scale; 4528 /* Send window already scaled. */ 4529 } 4530 } 4531 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4532 INP_WLOCK_ASSERT(tp->t_inpcb); 4533 4534 acked = BYTES_THIS_ACK(tp, th); 4535 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 4536 TCPSTAT_ADD(tcps_rcvackbyte, acked); 4537 4538 /* 4539 * If we just performed our first retransmit, and the ACK arrives 4540 * within our recovery window, then it was a mistake to do the 4541 * retransmit in the first place. Recover our original cwnd and 4542 * ssthresh, and proceed to transmit where we left off. 4543 */ 4544 if (tp->t_flags & TF_PREVVALID) { 4545 tp->t_flags &= ~TF_PREVVALID; 4546 if (tp->t_rxtshift == 1 && 4547 (int)(ticks - tp->t_badrxtwin) < 0) 4548 rack_cong_signal(tp, th, CC_RTO_ERR); 4549 } 4550 /* 4551 * If we have a timestamp reply, update smoothed round trip time. If 4552 * no timestamp is present but transmit timer is running and timed 4553 * sequence number was acked, update smoothed round trip time. Since 4554 * we now have an rtt measurement, cancel the timer backoff (cf., 4555 * Phil Karn's retransmit alg.). Recompute the initial retransmit 4556 * timer. 4557 * 4558 * Some boxes send broken timestamp replies during the SYN+ACK 4559 * phase, ignore timestamps of 0 or we could calculate a huge RTT 4560 * and blow up the retransmit timer. 4561 */ 4562 /* 4563 * If all outstanding data is acked, stop retransmit timer and 4564 * remember to restart (more output or persist). If there is more 4565 * data to be acked, restart retransmit timer, using current 4566 * (possibly backed-off) value. 4567 */ 4568 if (th->th_ack == tp->snd_max) { 4569 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4570 rack->r_wanted_output++; 4571 } 4572 /* 4573 * If no data (only SYN) was ACK'd, skip rest of ACK processing. 4574 */ 4575 if (acked == 0) { 4576 if (ofia) 4577 *ofia = ourfinisacked; 4578 return (0); 4579 } 4580 if (rack->r_ctl.rc_early_recovery) { 4581 if (IN_FASTRECOVERY(tp->t_flags)) { 4582 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4583 tcp_rack_partialack(tp, th); 4584 } else { 4585 rack_post_recovery(tp, th); 4586 recovery = 1; 4587 } 4588 } 4589 } 4590 /* 4591 * Let the congestion control algorithm update congestion control 4592 * related information. This typically means increasing the 4593 * congestion window. 4594 */ 4595 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 4596 SOCKBUF_LOCK(&so->so_snd); 4597 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 4598 tp->snd_wnd -= acked_amount; 4599 mfree = sbcut_locked(&so->so_snd, acked_amount); 4600 if ((sbused(&so->so_snd) == 0) && 4601 (acked > acked_amount) && 4602 (tp->t_state >= TCPS_FIN_WAIT_1)) { 4603 ourfinisacked = 1; 4604 } 4605 /* NB: sowwakeup_locked() does an implicit unlock. 
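 * That is, the sockbuf lock taken with SOCKBUF_LOCK() above is
 * released inside sowwakeup_locked(), which is why no explicit
 * SOCKBUF_UNLOCK() appears here.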
*/ 4606 sowwakeup_locked(so); 4607 m_freem(mfree); 4608 if (rack->r_ctl.rc_early_recovery == 0) { 4609 if (IN_FASTRECOVERY(tp->t_flags)) { 4610 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4611 tcp_rack_partialack(tp, th); 4612 } else { 4613 rack_post_recovery(tp, th); 4614 } 4615 } 4616 } 4617 tp->snd_una = th->th_ack; 4618 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 4619 tp->snd_recover = tp->snd_una; 4620 4621 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 4622 tp->snd_nxt = tp->snd_una; 4623 } 4624 if (tp->snd_una == tp->snd_max) { 4625 /* Nothing left outstanding */ 4626 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 4627 tp->t_acktime = 0; 4628 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4629 /* Set need output so persist might get set */ 4630 rack->r_wanted_output++; 4631 if (rack_use_sack_filter) 4632 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 4633 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 4634 (sbavail(&so->so_snd) == 0) && 4635 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 4636 /* 4637 * The socket was gone and the 4638 * peer sent data, time to 4639 * reset him. 4640 */ 4641 *ret_val = 1; 4642 tp = tcp_close(tp); 4643 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen); 4644 return (1); 4645 } 4646 } 4647 if (ofia) 4648 *ofia = ourfinisacked; 4649 return (0); 4650 } 4651 4652 4653 /* 4654 * Return value of 1, the TCB is unlocked and most 4655 * likely gone, return value of 0, the TCP is still 4656 * locked. 4657 */ 4658 static int 4659 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 4660 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 4661 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 4662 { 4663 /* 4664 * Update window information. Don't look at window if no ACK: TAC's 4665 * send garbage on first SYN. 4666 */ 4667 int32_t nsegs; 4668 int32_t tfo_syn; 4669 struct tcp_rack *rack; 4670 4671 rack = (struct tcp_rack *)tp->t_fb_ptr; 4672 INP_WLOCK_ASSERT(tp->t_inpcb); 4673 4674 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4675 if ((thflags & TH_ACK) && 4676 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4677 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4678 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4679 /* keep track of pure window updates */ 4680 if (tlen == 0 && 4681 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4682 TCPSTAT_INC(tcps_rcvwinupd); 4683 tp->snd_wnd = tiwin; 4684 tp->snd_wl1 = th->th_seq; 4685 tp->snd_wl2 = th->th_ack; 4686 if (tp->snd_wnd > tp->max_sndwnd) 4687 tp->max_sndwnd = tp->snd_wnd; 4688 rack->r_wanted_output++; 4689 } else if (thflags & TH_ACK) { 4690 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 4691 tp->snd_wnd = tiwin; 4692 tp->snd_wl1 = th->th_seq; 4693 tp->snd_wl2 = th->th_ack; 4694 } 4695 } 4696 /* Was persist timer active and now we have window space? */ 4697 if ((rack->rc_in_persist != 0) && tp->snd_wnd) { 4698 rack_exit_persist(tp, rack); 4699 tp->snd_nxt = tp->snd_max; 4700 /* Make sure we output to start the timer */ 4701 rack->r_wanted_output++; 4702 } 4703 /* 4704 * Process segments with URG. 4705 */ 4706 if ((thflags & TH_URG) && th->th_urp && 4707 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4708 /* 4709 * This is a kludge, but if we receive and accept random 4710 * urgent pointers, we'll crash in soreceive. It's hard to 4711 * imagine someone actually wanting to send this much urgent 4712 * data. 
4713 */ 4714 SOCKBUF_LOCK(&so->so_rcv); 4715 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 4716 th->th_urp = 0; /* XXX */ 4717 thflags &= ~TH_URG; /* XXX */ 4718 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 4719 goto dodata; /* XXX */ 4720 } 4721 /* 4722 * If this segment advances the known urgent pointer, then 4723 * mark the data stream. This should not happen in 4724 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 4725 * FIN has been received from the remote side. In these 4726 * states we ignore the URG. 4727 * 4728 * According to RFC961 (Assigned Protocols), the urgent 4729 * pointer points to the last octet of urgent data. We 4730 * continue, however, to consider it to indicate the first 4731 * octet of data past the urgent section as the original 4732 * spec states (in one of two places). 4733 */ 4734 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 4735 tp->rcv_up = th->th_seq + th->th_urp; 4736 so->so_oobmark = sbavail(&so->so_rcv) + 4737 (tp->rcv_up - tp->rcv_nxt) - 1; 4738 if (so->so_oobmark == 0) 4739 so->so_rcv.sb_state |= SBS_RCVATMARK; 4740 sohasoutofband(so); 4741 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4742 } 4743 SOCKBUF_UNLOCK(&so->so_rcv); 4744 /* 4745 * Remove out of band data so doesn't get presented to user. 4746 * This can happen independent of advancing the URG pointer, 4747 * but if two URG's are pending at once, some out-of-band 4748 * data may creep in... ick. 4749 */ 4750 if (th->th_urp <= (uint32_t) tlen && 4751 !(so->so_options & SO_OOBINLINE)) { 4752 /* hdr drop is delayed */ 4753 tcp_pulloutofband(so, th, m, drop_hdrlen); 4754 } 4755 } else { 4756 /* 4757 * If no out of band data is expected, pull receive urgent 4758 * pointer along with the receive window. 4759 */ 4760 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4761 tp->rcv_up = tp->rcv_nxt; 4762 } 4763 dodata: /* XXX */ 4764 INP_WLOCK_ASSERT(tp->t_inpcb); 4765 4766 /* 4767 * Process the segment text, merging it into the TCP sequencing 4768 * queue, and arranging for acknowledgment of receipt if necessary. 4769 * This process logically involves adjusting tp->rcv_wnd as data is 4770 * presented to the user (this happens in tcp_usrreq.c, case 4771 * PRU_RCVD). If a FIN has already been received on this connection 4772 * then we just ignore the text. 4773 */ 4774 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 4775 IS_FASTOPEN(tp->t_flags)); 4776 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 4777 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4778 tcp_seq save_start = th->th_seq; 4779 4780 m_adj(m, drop_hdrlen); /* delayed header drop */ 4781 /* 4782 * Insert segment which includes th into TCP reassembly 4783 * queue with control block tp. Set thflags to whether 4784 * reassembly now includes a segment with FIN. This handles 4785 * the common case inline (segment is the next to be 4786 * received on an established connection, and the queue is 4787 * empty), avoiding linkage into and removal from the queue 4788 * and repetition of various conversions. Set DELACK for 4789 * segments received in order, but ack immediately when 4790 * segments are out of order (so fast retransmit can work). 
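 * Concretely: an in-order segment with an empty reassembly queue is
 * appended straight to the receive buffer and may only set TF_DELACK,
 * while anything out of order goes through tcp_reass() and forces
 * TF_ACKNOW so the sender sees duplicate ACKs promptly.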
4791 */ 4792 if (th->th_seq == tp->rcv_nxt && 4793 LIST_EMPTY(&tp->t_segq) && 4794 (TCPS_HAVEESTABLISHED(tp->t_state) || 4795 tfo_syn)) { 4796 if (DELAY_ACK(tp, tlen) || tfo_syn) { 4797 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4798 tp->t_flags |= TF_DELACK; 4799 } else { 4800 rack->r_wanted_output++; 4801 tp->t_flags |= TF_ACKNOW; 4802 } 4803 tp->rcv_nxt += tlen; 4804 thflags = th->th_flags & TH_FIN; 4805 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4806 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4807 SOCKBUF_LOCK(&so->so_rcv); 4808 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4809 m_freem(m); 4810 else 4811 sbappendstream_locked(&so->so_rcv, m, 0); 4812 /* NB: sorwakeup_locked() does an implicit unlock. */ 4813 sorwakeup_locked(so); 4814 } else { 4815 /* 4816 * XXX: Due to the header drop above "th" is 4817 * theoretically invalid by now. Fortunately 4818 * m_adj() doesn't actually frees any mbufs when 4819 * trimming from the head. 4820 */ 4821 thflags = tcp_reass(tp, th, &tlen, m); 4822 tp->t_flags |= TF_ACKNOW; 4823 } 4824 if (tlen > 0) 4825 tcp_update_sack_list(tp, save_start, save_start + tlen); 4826 } else { 4827 m_freem(m); 4828 thflags &= ~TH_FIN; 4829 } 4830 4831 /* 4832 * If FIN is received ACK the FIN and let the user know that the 4833 * connection is closing. 4834 */ 4835 if (thflags & TH_FIN) { 4836 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4837 socantrcvmore(so); 4838 /* 4839 * If connection is half-synchronized (ie NEEDSYN 4840 * flag on) then delay ACK, so it may be piggybacked 4841 * when SYN is sent. Otherwise, since we received a 4842 * FIN then no more input can be expected, send ACK 4843 * now. 4844 */ 4845 if (tp->t_flags & TF_NEEDSYN) { 4846 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4847 tp->t_flags |= TF_DELACK; 4848 } else { 4849 tp->t_flags |= TF_ACKNOW; 4850 } 4851 tp->rcv_nxt++; 4852 } 4853 switch (tp->t_state) { 4854 4855 /* 4856 * In SYN_RECEIVED and ESTABLISHED STATES enter the 4857 * CLOSE_WAIT state. 4858 */ 4859 case TCPS_SYN_RECEIVED: 4860 tp->t_starttime = ticks; 4861 /* FALLTHROUGH */ 4862 case TCPS_ESTABLISHED: 4863 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4864 tcp_state_change(tp, TCPS_CLOSE_WAIT); 4865 break; 4866 4867 /* 4868 * If still in FIN_WAIT_1 STATE FIN has not been 4869 * acked so enter the CLOSING state. 4870 */ 4871 case TCPS_FIN_WAIT_1: 4872 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4873 tcp_state_change(tp, TCPS_CLOSING); 4874 break; 4875 4876 /* 4877 * In FIN_WAIT_2 state enter the TIME_WAIT state, 4878 * starting the time-wait timer, turning off the 4879 * other standard timers. 4880 */ 4881 case TCPS_FIN_WAIT_2: 4882 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4883 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 4884 KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata " 4885 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 4886 *ti_locked)); 4887 tcp_twstart(tp); 4888 *ti_locked = TI_UNLOCKED; 4889 INP_INFO_RUNLOCK(&V_tcbinfo); 4890 return (1); 4891 } 4892 } 4893 if (*ti_locked == TI_RLOCKED) { 4894 INP_INFO_RUNLOCK(&V_tcbinfo); 4895 *ti_locked = TI_UNLOCKED; 4896 } 4897 /* 4898 * Return any desired output. 
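 * "Desired" here means either an ACK is owed (TF_ACKNOW) or there is
 * data sitting in the send buffer beyond what is already in flight;
 * either way we only flag r_wanted_output and let the caller drive
 * the actual transmit.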
4899 */ 4900 if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 4901 rack->r_wanted_output++; 4902 } 4903 KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 4904 __func__, *ti_locked)); 4905 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 4906 INP_WLOCK_ASSERT(tp->t_inpcb); 4907 return (0); 4908 } 4909 4910 /* 4911 * Here nothing is really faster, its just that we 4912 * have broken out the fast-data path also just like 4913 * the fast-ack. 4914 */ 4915 static int 4916 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 4917 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 4918 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt) 4919 { 4920 int32_t nsegs; 4921 int32_t newsize = 0; /* automatic sockbuf scaling */ 4922 struct tcp_rack *rack; 4923 #ifdef TCPDEBUG 4924 /* 4925 * The size of tcp_saveipgen must be the size of the max ip header, 4926 * now IPv6. 4927 */ 4928 u_char tcp_saveipgen[IP6_HDR_LEN]; 4929 struct tcphdr tcp_savetcp; 4930 short ostate = 0; 4931 4932 #endif 4933 /* 4934 * If last ACK falls within this segment's sequence numbers, record 4935 * the timestamp. NOTE that the test is modified according to the 4936 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 4937 */ 4938 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 4939 return (0); 4940 } 4941 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 4942 return (0); 4943 } 4944 if (tiwin && tiwin != tp->snd_wnd) { 4945 return (0); 4946 } 4947 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 4948 return (0); 4949 } 4950 if (__predict_false((to->to_flags & TOF_TS) && 4951 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 4952 return (0); 4953 } 4954 if (__predict_false((th->th_ack != tp->snd_una))) { 4955 return (0); 4956 } 4957 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 4958 return (0); 4959 } 4960 if ((to->to_flags & TOF_TS) != 0 && 4961 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 4962 tp->ts_recent_age = tcp_ts_getticks(); 4963 tp->ts_recent = to->to_tsval; 4964 } 4965 rack = (struct tcp_rack *)tp->t_fb_ptr; 4966 /* 4967 * This is a pure, in-sequence data packet with nothing on the 4968 * reassembly queue and we have enough buffer space to take it. 4969 */ 4970 if (*ti_locked == TI_RLOCKED) { 4971 INP_INFO_RUNLOCK(&V_tcbinfo); 4972 *ti_locked = TI_UNLOCKED; 4973 } 4974 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4975 4976 4977 /* Clean receiver SACK report if present */ 4978 if (tp->rcv_numsacks) 4979 tcp_clean_sackreport(tp); 4980 TCPSTAT_INC(tcps_preddat); 4981 tp->rcv_nxt += tlen; 4982 /* 4983 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 4984 */ 4985 tp->snd_wl1 = th->th_seq; 4986 /* 4987 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 4988 */ 4989 tp->rcv_up = tp->rcv_nxt; 4990 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4991 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4992 #ifdef TCPDEBUG 4993 if (so->so_options & SO_DEBUG) 4994 tcp_trace(TA_INPUT, ostate, tp, 4995 (void *)tcp_saveipgen, &tcp_savetcp, 0); 4996 #endif 4997 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 4998 4999 /* Add data to socket buffer. */ 5000 SOCKBUF_LOCK(&so->so_rcv); 5001 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5002 m_freem(m); 5003 } else { 5004 /* 5005 * Set new socket buffer size. Give up when limit is 5006 * reached. 
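 * newsize comes from the auto-sizing heuristic above; if
 * sbreserve_locked() refuses the larger size we clear SB_AUTOSIZE so
 * no further growth is attempted on this receive buffer.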
5007 */ 5008 if (newsize) 5009 if (!sbreserve_locked(&so->so_rcv, 5010 newsize, so, NULL)) 5011 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 5012 m_adj(m, drop_hdrlen); /* delayed header drop */ 5013 sbappendstream_locked(&so->so_rcv, m, 0); 5014 rack_calc_rwin(so, tp); 5015 } 5016 /* NB: sorwakeup_locked() does an implicit unlock. */ 5017 sorwakeup_locked(so); 5018 if (DELAY_ACK(tp, tlen)) { 5019 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5020 tp->t_flags |= TF_DELACK; 5021 } else { 5022 tp->t_flags |= TF_ACKNOW; 5023 rack->r_wanted_output++; 5024 } 5025 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 5026 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5027 return (1); 5028 } 5029 5030 /* 5031 * This subfunction is used to try to highly optimize the 5032 * fast path. We again allow window updates that are 5033 * in sequence to remain in the fast-path. We also add 5034 * in the __predict's to attempt to help the compiler. 5035 * Note that if we return a 0, then we can *not* process 5036 * it and the caller should push the packet into the 5037 * slow-path. 5038 */ 5039 static int 5040 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5041 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5042 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 5043 { 5044 int32_t acked; 5045 int32_t nsegs; 5046 5047 #ifdef TCPDEBUG 5048 /* 5049 * The size of tcp_saveipgen must be the size of the max ip header, 5050 * now IPv6. 5051 */ 5052 u_char tcp_saveipgen[IP6_HDR_LEN]; 5053 struct tcphdr tcp_savetcp; 5054 short ostate = 0; 5055 5056 #endif 5057 struct tcp_rack *rack; 5058 5059 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5060 /* Old ack, behind (or duplicate to) the last one rcv'd */ 5061 return (0); 5062 } 5063 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 5064 /* Above what we have sent? */ 5065 return (0); 5066 } 5067 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5068 /* We are retransmitting */ 5069 return (0); 5070 } 5071 if (__predict_false(tiwin == 0)) { 5072 /* zero window */ 5073 return (0); 5074 } 5075 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 5076 /* We need a SYN or a FIN, unlikely.. */ 5077 return (0); 5078 } 5079 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 5080 /* Timestamp is behind .. old ack with seq wrap? */ 5081 return (0); 5082 } 5083 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 5084 /* Still recovering */ 5085 return (0); 5086 } 5087 rack = (struct tcp_rack *)tp->t_fb_ptr; 5088 if (rack->r_ctl.rc_sacked) { 5089 /* We have sack holes on our scoreboard */ 5090 return (0); 5091 } 5092 /* Ok if we reach here, we can process a fast-ack */ 5093 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5094 rack_log_ack(tp, to, th); 5095 /* Did the window get updated? */ 5096 if (tiwin != tp->snd_wnd) { 5097 tp->snd_wnd = tiwin; 5098 tp->snd_wl1 = th->th_seq; 5099 if (tp->snd_wnd > tp->max_sndwnd) 5100 tp->max_sndwnd = tp->snd_wnd; 5101 } 5102 if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { 5103 rack_exit_persist(tp, rack); 5104 } 5105 /* 5106 * If last ACK falls within this segment's sequence numbers, record 5107 * the timestamp. NOTE that the test is modified according to the 5108 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
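 * In practice ts_recent is only taken from segments whose th_seq is at
 * or before last_ack_sent; e.g. with last_ack_sent at 1000 a segment
 * starting at 1000 updates ts_recent, while one starting at 1200
 * (arriving ahead of its turn) does not.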
5109 */ 5110 if ((to->to_flags & TOF_TS) != 0 && 5111 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5112 tp->ts_recent_age = tcp_ts_getticks(); 5113 tp->ts_recent = to->to_tsval; 5114 } 5115 /* 5116 * This is a pure ack for outstanding data. 5117 */ 5118 if (*ti_locked == TI_RLOCKED) { 5119 INP_INFO_RUNLOCK(&V_tcbinfo); 5120 *ti_locked = TI_UNLOCKED; 5121 } 5122 TCPSTAT_INC(tcps_predack); 5123 5124 /* 5125 * "bad retransmit" recovery. 5126 */ 5127 if (tp->t_flags & TF_PREVVALID) { 5128 tp->t_flags &= ~TF_PREVVALID; 5129 if (tp->t_rxtshift == 1 && 5130 (int)(ticks - tp->t_badrxtwin) < 0) 5131 rack_cong_signal(tp, th, CC_RTO_ERR); 5132 } 5133 /* 5134 * Recalculate the transmit timer / rtt. 5135 * 5136 * Some boxes send broken timestamp replies during the SYN+ACK 5137 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5138 * and blow up the retransmit timer. 5139 */ 5140 acked = BYTES_THIS_ACK(tp, th); 5141 5142 #ifdef TCP_HHOOK 5143 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 5144 hhook_run_tcp_est_in(tp, th, to); 5145 #endif 5146 5147 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5148 TCPSTAT_ADD(tcps_rcvackbyte, acked); 5149 sbdrop(&so->so_snd, acked); 5150 /* 5151 * Let the congestion control algorithm update congestion control 5152 * related information. This typically means increasing the 5153 * congestion window. 5154 */ 5155 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 5156 5157 tp->snd_una = th->th_ack; 5158 /* 5159 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 5160 */ 5161 tp->snd_wl2 = th->th_ack; 5162 tp->t_dupacks = 0; 5163 m_freem(m); 5164 /* ND6_HINT(tp); *//* Some progress has been made. */ 5165 5166 /* 5167 * If all outstanding data are acked, stop retransmit timer, 5168 * otherwise restart timer using current (possibly backed-off) 5169 * value. If process is waiting for space, wakeup/selwakeup/signal. 5170 * If data are ready to send, let tcp_output decide between more 5171 * output or persist. 5172 */ 5173 #ifdef TCPDEBUG 5174 if (so->so_options & SO_DEBUG) 5175 tcp_trace(TA_INPUT, ostate, tp, 5176 (void *)tcp_saveipgen, 5177 &tcp_savetcp, 0); 5178 #endif 5179 if (tp->snd_una == tp->snd_max) { 5180 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5181 tp->t_acktime = 0; 5182 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5183 } 5184 /* Wake up the socket if we have room to write more */ 5185 sowwakeup(so); 5186 if (sbavail(&so->so_snd)) { 5187 rack->r_wanted_output++; 5188 } 5189 return (1); 5190 } 5191 5192 /* 5193 * Return value of 1, the TCB is unlocked and most 5194 * likely gone, return value of 0, the TCP is still 5195 * locked. 5196 */ 5197 static int 5198 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 5199 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5200 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5201 { 5202 int32_t ret_val = 0; 5203 int32_t todrop; 5204 int32_t ourfinisacked = 0; 5205 5206 rack_calc_rwin(so, tp); 5207 /* 5208 * If the state is SYN_SENT: if seg contains an ACK, but not for our 5209 * SYN, drop the input. if seg contains a RST, then drop the 5210 * connection. if seg does not contain SYN, then drop it. Otherwise 5211 * this is an acceptable SYN segment initialize tp->rcv_nxt and 5212 * tp->irs if seg contains ack then advance tp->snd_una if seg 5213 * contains an ECE and ECN support is enabled, the stream is ECN 5214 * capable. 
if SYN has been acked change to ESTABLISHED else 5215 * SYN_RCVD state arrange for segment to be acked (eventually) 5216 * continue processing rest of data/controls, beginning with URG 5217 */ 5218 if ((thflags & TH_ACK) && 5219 (SEQ_LEQ(th->th_ack, tp->iss) || 5220 SEQ_GT(th->th_ack, tp->snd_max))) { 5221 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5222 return (1); 5223 } 5224 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 5225 TCP_PROBE5(connect__refused, NULL, tp, 5226 mtod(m, const char *), tp, th); 5227 tp = tcp_drop(tp, ECONNREFUSED); 5228 rack_do_drop(m, tp, ti_locked); 5229 return (1); 5230 } 5231 if (thflags & TH_RST) { 5232 rack_do_drop(m, tp, ti_locked); 5233 return (1); 5234 } 5235 if (!(thflags & TH_SYN)) { 5236 rack_do_drop(m, tp, ti_locked); 5237 return (1); 5238 } 5239 tp->irs = th->th_seq; 5240 tcp_rcvseqinit(tp); 5241 if (thflags & TH_ACK) { 5242 int tfo_partial = 0; 5243 5244 TCPSTAT_INC(tcps_connects); 5245 soisconnected(so); 5246 #ifdef MAC 5247 mac_socketpeer_set_from_mbuf(m, so); 5248 #endif 5249 /* Do window scaling on this connection? */ 5250 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5251 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5252 tp->rcv_scale = tp->request_r_scale; 5253 } 5254 tp->rcv_adv += min(tp->rcv_wnd, 5255 TCP_MAXWIN << tp->rcv_scale); 5256 /* 5257 * If not all the data that was sent in the TFO SYN 5258 * has been acked, resend the remainder right away. 5259 */ 5260 if (IS_FASTOPEN(tp->t_flags) && 5261 (tp->snd_una != tp->snd_max)) { 5262 tp->snd_nxt = th->th_ack; 5263 tfo_partial = 1; 5264 } 5265 /* 5266 * If there's data, delay ACK; if there's also a FIN ACKNOW 5267 * will be turned on later. 5268 */ 5269 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { 5270 rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, 5271 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); 5272 tp->t_flags |= TF_DELACK; 5273 } else { 5274 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 5275 tp->t_flags |= TF_ACKNOW; 5276 } 5277 5278 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 5279 tp->t_flags |= TF_ECN_PERMIT; 5280 TCPSTAT_INC(tcps_ecn_shs); 5281 } 5282 if (SEQ_GT(th->th_ack, tp->snd_una)) { 5283 /* 5284 * We advance snd_una for the 5285 * fast open case. If th_ack is 5286 * acknowledging data beyond 5287 * snd_una we can't just call 5288 * ack-processing since the 5289 * data stream in our send-map 5290 * will start at snd_una + 1 (one 5291 * beyond the SYN). If its just 5292 * equal we don't need to do that 5293 * and there is no send_map. 5294 */ 5295 tp->snd_una++; 5296 } 5297 /* 5298 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 5299 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 5300 */ 5301 tp->t_starttime = ticks; 5302 if (tp->t_flags & TF_NEEDFIN) { 5303 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5304 tp->t_flags &= ~TF_NEEDFIN; 5305 thflags &= ~TH_SYN; 5306 } else { 5307 tcp_state_change(tp, TCPS_ESTABLISHED); 5308 TCP_PROBE5(connect__established, NULL, tp, 5309 mtod(m, const char *), tp, th); 5310 cc_conn_init(tp); 5311 } 5312 } else { 5313 /* 5314 * Received initial SYN in SYN-SENT[*] state => simultaneous 5315 * open. If segment contains CC option and there is a 5316 * cached CC, apply TAO test. If it succeeds, connection is * 5317 * half-synchronized. Otherwise, do 3-way handshake: 5318 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 5319 * there was no CC option, clear cached CC value. 
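 * (The CC/TAO text above is historical T/TCP wording; what actually
 * happens below is the plain simultaneous-open path: set TF_ACKNOW and
 * TF_NEEDSYN and move the connection to SYN_RECEIVED.)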
5320 */ 5321 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 5322 tcp_state_change(tp, TCPS_SYN_RECEIVED); 5323 } 5324 KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 5325 "ti_locked %d", __func__, *ti_locked)); 5326 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5327 INP_WLOCK_ASSERT(tp->t_inpcb); 5328 /* 5329 * Advance th->th_seq to correspond to first data byte. If data, 5330 * trim to stay within window, dropping FIN if necessary. 5331 */ 5332 th->th_seq++; 5333 if (tlen > tp->rcv_wnd) { 5334 todrop = tlen - tp->rcv_wnd; 5335 m_adj(m, -todrop); 5336 tlen = tp->rcv_wnd; 5337 thflags &= ~TH_FIN; 5338 TCPSTAT_INC(tcps_rcvpackafterwin); 5339 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 5340 } 5341 tp->snd_wl1 = th->th_seq - 1; 5342 tp->rcv_up = th->th_seq; 5343 /* 5344 * Client side of transaction: already sent SYN and data. If the 5345 * remote host used T/TCP to validate the SYN, our data will be 5346 * ACK'd; if so, enter normal data segment processing in the middle 5347 * of step 5, ack processing. Otherwise, goto step 6. 5348 */ 5349 if (thflags & TH_ACK) { 5350 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 5351 return (ret_val); 5352 /* We may have changed to FIN_WAIT_1 above */ 5353 if (tp->t_state == TCPS_FIN_WAIT_1) { 5354 /* 5355 * In FIN_WAIT_1 STATE in addition to the processing 5356 * for the ESTABLISHED state if our FIN is now 5357 * acknowledged then enter FIN_WAIT_2. 5358 */ 5359 if (ourfinisacked) { 5360 /* 5361 * If we can't receive any more data, then 5362 * closing user can proceed. Starting the 5363 * timer is contrary to the specification, 5364 * but if we don't get a FIN we'll hang 5365 * forever. 5366 * 5367 * XXXjl: we should release the tp also, and 5368 * use a compressed state. 5369 */ 5370 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5371 soisdisconnected(so); 5372 tcp_timer_activate(tp, TT_2MSL, 5373 (tcp_fast_finwait2_recycle ? 5374 tcp_finwait2_timeout : 5375 TP_MAXIDLE(tp))); 5376 } 5377 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5378 } 5379 } 5380 } 5381 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5382 ti_locked, tiwin, thflags, nxt_pkt)); 5383 } 5384 5385 /* 5386 * Return value of 1, the TCB is unlocked and most 5387 * likely gone, return value of 0, the TCP is still 5388 * locked. 5389 */ 5390 static int 5391 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 5392 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5393 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5394 { 5395 int32_t ret_val = 0; 5396 int32_t ourfinisacked = 0; 5397 5398 rack_calc_rwin(so, tp); 5399 5400 if ((thflags & TH_ACK) && 5401 (SEQ_LEQ(th->th_ack, tp->snd_una) || 5402 SEQ_GT(th->th_ack, tp->snd_max))) { 5403 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5404 return (1); 5405 } 5406 if (IS_FASTOPEN(tp->t_flags)) { 5407 /* 5408 * When a TFO connection is in SYN_RECEIVED, the 5409 * only valid packets are the initial SYN, a 5410 * retransmit/copy of the initial SYN (possibly with 5411 * a subset of the original data), a valid ACK, a 5412 * FIN, or a RST. 
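 * As the checks below show, a SYN|ACK arriving here draws a reset, and a
 * bare retransmitted SYN is silently ignored while a retransmit, TLP, or
 * RACK timer is already pending; otherwise it falls through to the
 * RFC 5961 challenge-ACK path.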
5413 */ 5414 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 5415 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5416 return (1); 5417 } else if (thflags & TH_SYN) { 5418 /* non-initial SYN is ignored */ 5419 struct tcp_rack *rack; 5420 5421 rack = (struct tcp_rack *)tp->t_fb_ptr; 5422 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 5423 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 5424 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 5425 rack_do_drop(m, NULL, ti_locked); 5426 return (0); 5427 } 5428 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 5429 rack_do_drop(m, NULL, ti_locked); 5430 return (0); 5431 } 5432 } 5433 if (thflags & TH_RST) 5434 return (rack_process_rst(m, th, so, tp, ti_locked)); 5435 /* 5436 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5437 * synchronized state. 5438 */ 5439 if (thflags & TH_SYN) { 5440 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5441 return (ret_val); 5442 } 5443 /* 5444 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5445 * it's less than ts_recent, drop it. 5446 */ 5447 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5448 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5449 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5450 return (ret_val); 5451 } 5452 /* 5453 * In the SYN-RECEIVED state, validate that the packet belongs to 5454 * this connection before trimming the data to fit the receive 5455 * window. Check the sequence number versus IRS since we know the 5456 * sequence numbers haven't wrapped. This is a partial fix for the 5457 * "LAND" DoS attack. 5458 */ 5459 if (SEQ_LT(th->th_seq, tp->irs)) { 5460 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5461 return (1); 5462 } 5463 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5464 return (ret_val); 5465 } 5466 /* 5467 * If last ACK falls within this segment's sequence numbers, record 5468 * its timestamp. NOTE: 1) That the test incorporates suggestions 5469 * from the latest proposal of the tcplw@cray.com list (Braden 5470 * 1993/04/26). 2) That updating only on newer timestamps interferes 5471 * with our earlier PAWS tests, so this check should be solely 5472 * predicated on the sequence space of this segment. 3) That we 5473 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5474 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5475 * SEG.Len, This modified check allows us to overcome RFC1323's 5476 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5477 * p.869. In such cases, we can still calculate the RTT correctly 5478 * when RCV.NXT == Last.ACK.Sent. 5479 */ 5480 if ((to->to_flags & TOF_TS) != 0 && 5481 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5482 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5483 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5484 tp->ts_recent_age = tcp_ts_getticks(); 5485 tp->ts_recent = to->to_tsval; 5486 } 5487 /* 5488 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5489 * is on (half-synchronized state), then queue data for later 5490 * processing; else drop segment and return. 5491 */ 5492 if ((thflags & TH_ACK) == 0) { 5493 if (IS_FASTOPEN(tp->t_flags)) { 5494 tp->snd_wnd = tiwin; 5495 cc_conn_init(tp); 5496 } 5497 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5498 ti_locked, tiwin, thflags, nxt_pkt)); 5499 } 5500 TCPSTAT_INC(tcps_connects); 5501 soisconnected(so); 5502 /* Do window scaling? 
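 * Only if both sides negotiated it (we requested the option and the peer
 * sent it back); the send window is also seeded from tiwin inside the
 * same check.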
*/ 5503 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5504 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5505 tp->rcv_scale = tp->request_r_scale; 5506 tp->snd_wnd = tiwin; 5507 } 5508 /* 5509 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 5510 * FIN-WAIT-1 5511 */ 5512 tp->t_starttime = ticks; 5513 if (tp->t_flags & TF_NEEDFIN) { 5514 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5515 tp->t_flags &= ~TF_NEEDFIN; 5516 } else { 5517 tcp_state_change(tp, TCPS_ESTABLISHED); 5518 TCP_PROBE5(accept__established, NULL, tp, 5519 mtod(m, const char *), tp, th); 5520 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 5521 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 5522 tp->t_tfo_pending = NULL; 5523 5524 /* 5525 * Account for the ACK of our SYN prior to regular 5526 * ACK processing below. 5527 */ 5528 tp->snd_una++; 5529 } 5530 /* 5531 * TFO connections call cc_conn_init() during SYN 5532 * processing. Calling it again here for such connections 5533 * is not harmless as it would undo the snd_cwnd reduction 5534 * that occurs when a TFO SYN|ACK is retransmitted. 5535 */ 5536 if (!IS_FASTOPEN(tp->t_flags)) 5537 cc_conn_init(tp); 5538 } 5539 /* 5540 * If segment contains data or ACK, will call tcp_reass() later; if 5541 * not, do so now to pass queued data to user. 5542 */ 5543 if (tlen == 0 && (thflags & TH_FIN) == 0) 5544 (void)tcp_reass(tp, (struct tcphdr *)0, 0, 5545 (struct mbuf *)0); 5546 tp->snd_wl1 = th->th_seq - 1; 5547 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5548 return (ret_val); 5549 } 5550 if (tp->t_state == TCPS_FIN_WAIT_1) { 5551 /* We could have went to FIN_WAIT_1 (or EST) above */ 5552 /* 5553 * In FIN_WAIT_1 STATE in addition to the processing for the 5554 * ESTABLISHED state if our FIN is now acknowledged then 5555 * enter FIN_WAIT_2. 5556 */ 5557 if (ourfinisacked) { 5558 /* 5559 * If we can't receive any more data, then closing 5560 * user can proceed. Starting the timer is contrary 5561 * to the specification, but if we don't get a FIN 5562 * we'll hang forever. 5563 * 5564 * XXXjl: we should release the tp also, and use a 5565 * compressed state. 5566 */ 5567 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5568 soisdisconnected(so); 5569 tcp_timer_activate(tp, TT_2MSL, 5570 (tcp_fast_finwait2_recycle ? 5571 tcp_finwait2_timeout : 5572 TP_MAXIDLE(tp))); 5573 } 5574 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5575 } 5576 } 5577 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5578 ti_locked, tiwin, thflags, nxt_pkt)); 5579 } 5580 5581 /* 5582 * Return value of 1, the TCB is unlocked and most 5583 * likely gone, return value of 0, the TCP is still 5584 * locked. 5585 */ 5586 static int 5587 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 5588 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5589 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5590 { 5591 int32_t ret_val = 0; 5592 5593 /* 5594 * Header prediction: check for the two common cases of a 5595 * uni-directional data xfer. If the packet has no control flags, 5596 * is in-sequence, the window didn't change and we're not 5597 * retransmitting, it's a candidate. If the length is zero and the 5598 * ack moved forward, we're the sender side of the xfer. Just free 5599 * the data acked & wake any higher level process that was blocked 5600 * waiting for space. If the length is non-zero and the ack didn't 5601 * move, we're the receiver side. 
If we're getting packets in-order 5602 * (the reassembly queue is empty), add the data to the socket 5603 * buffer and note that we need a delayed ack. Make sure that the 5604 * hidden state-flags are also off. Since we check for 5605 * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN. 5606 */ 5607 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 5608 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 5609 __predict_true(LIST_EMPTY(&tp->t_segq)) && 5610 __predict_true(th->th_seq == tp->rcv_nxt)) { 5611 struct tcp_rack *rack; 5612 5613 rack = (struct tcp_rack *)tp->t_fb_ptr; 5614 if (tlen == 0) { 5615 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 5616 ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 5617 return (0); 5618 } 5619 } else { 5620 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 5621 ti_locked, tiwin, nxt_pkt)) { 5622 return (0); 5623 } 5624 } 5625 } 5626 rack_calc_rwin(so, tp); 5627 5628 if (thflags & TH_RST) 5629 return (rack_process_rst(m, th, so, tp, ti_locked)); 5630 5631 /* 5632 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5633 * synchronized state. 5634 */ 5635 if (thflags & TH_SYN) { 5636 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5637 return (ret_val); 5638 } 5639 /* 5640 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5641 * it's less than ts_recent, drop it. 5642 */ 5643 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5644 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5645 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5646 return (ret_val); 5647 } 5648 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5649 return (ret_val); 5650 } 5651 /* 5652 * If last ACK falls within this segment's sequence numbers, record 5653 * its timestamp. NOTE: 1) That the test incorporates suggestions 5654 * from the latest proposal of the tcplw@cray.com list (Braden 5655 * 1993/04/26). 2) That updating only on newer timestamps interferes 5656 * with our earlier PAWS tests, so this check should be solely 5657 * predicated on the sequence space of this segment. 3) That we 5658 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5659 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5660 * SEG.Len, This modified check allows us to overcome RFC1323's 5661 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5662 * p.869. In such cases, we can still calculate the RTT correctly 5663 * when RCV.NXT == Last.ACK.Sent. 5664 */ 5665 if ((to->to_flags & TOF_TS) != 0 && 5666 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5667 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5668 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5669 tp->ts_recent_age = tcp_ts_getticks(); 5670 tp->ts_recent = to->to_tsval; 5671 } 5672 /* 5673 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5674 * is on (half-synchronized state), then queue data for later 5675 * processing; else drop segment and return. 5676 */ 5677 if ((thflags & TH_ACK) == 0) { 5678 if (tp->t_flags & TF_NEEDSYN) { 5679 5680 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5681 ti_locked, tiwin, thflags, nxt_pkt)); 5682 5683 } else if (tp->t_flags & TF_ACKNOW) { 5684 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5685 return (ret_val); 5686 } else { 5687 rack_do_drop(m, NULL, ti_locked); 5688 return (0); 5689 } 5690 } 5691 /* 5692 * Ack processing.
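 * A nonzero return from rack_process_ack() means the segment has been
 * consumed (and the connection may already be gone); ret_val then tells
 * this caller what to return.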
5693 */ 5694 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5695 return (ret_val); 5696 } 5697 if (sbavail(&so->so_snd)) { 5698 if (rack_progress_timeout_check(tp)) { 5699 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5700 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5701 return (1); 5702 } 5703 } 5704 /* State changes only happen in rack_process_data() */ 5705 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5706 ti_locked, tiwin, thflags, nxt_pkt)); 5707 } 5708 5709 /* 5710 * Return value of 1, the TCB is unlocked and most 5711 * likely gone, return value of 0, the TCP is still 5712 * locked. 5713 */ 5714 static int 5715 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 5716 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5717 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5718 { 5719 int32_t ret_val = 0; 5720 5721 rack_calc_rwin(so, tp); 5722 if (thflags & TH_RST) 5723 return (rack_process_rst(m, th, so, tp, ti_locked)); 5724 /* 5725 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5726 * synchronized state. 5727 */ 5728 if (thflags & TH_SYN) { 5729 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5730 return (ret_val); 5731 } 5732 /* 5733 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5734 * it's less than ts_recent, drop it. 5735 */ 5736 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5737 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5738 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5739 return (ret_val); 5740 } 5741 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5742 return (ret_val); 5743 } 5744 /* 5745 * If last ACK falls within this segment's sequence numbers, record 5746 * its timestamp. NOTE: 1) That the test incorporates suggestions 5747 * from the latest proposal of the tcplw@cray.com list (Braden 5748 * 1993/04/26). 2) That updating only on newer timestamps interferes 5749 * with our earlier PAWS tests, so this check should be solely 5750 * predicated on the sequence space of this segment. 3) That we 5751 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5752 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5753 * SEG.Len, This modified check allows us to overcome RFC1323's 5754 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5755 * p.869. In such cases, we can still calculate the RTT correctly 5756 * when RCV.NXT == Last.ACK.Sent. 5757 */ 5758 if ((to->to_flags & TOF_TS) != 0 && 5759 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5760 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5761 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5762 tp->ts_recent_age = tcp_ts_getticks(); 5763 tp->ts_recent = to->to_tsval; 5764 } 5765 /* 5766 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5767 * is on (half-synchronized state), then queue data for later 5768 * processing; else drop segment and return. 5769 */ 5770 if ((thflags & TH_ACK) == 0) { 5771 if (tp->t_flags & TF_NEEDSYN) { 5772 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5773 ti_locked, tiwin, thflags, nxt_pkt)); 5774 5775 } else if (tp->t_flags & TF_ACKNOW) { 5776 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5777 return (ret_val); 5778 } else { 5779 rack_do_drop(m, NULL, ti_locked); 5780 return (0); 5781 } 5782 } 5783 /* 5784 * Ack processing. 
5785 */ 5786 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5787 return (ret_val); 5788 } 5789 if (sbavail(&so->so_snd)) { 5790 if (rack_progress_timeout_check(tp)) { 5791 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5792 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5793 return (1); 5794 } 5795 } 5796 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5797 ti_locked, tiwin, thflags, nxt_pkt)); 5798 } 5799 5800 static int 5801 rack_check_data_after_close(struct mbuf *m, 5802 struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) 5803 { 5804 struct tcp_rack *rack; 5805 5806 KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 5807 "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); 5808 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5809 rack = (struct tcp_rack *)tp->t_fb_ptr; 5810 if (rack->rc_allow_data_af_clo == 0) { 5811 close_now: 5812 tp = tcp_close(tp); 5813 TCPSTAT_INC(tcps_rcvafterclose); 5814 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); 5815 return (1); 5816 } 5817 if (sbavail(&so->so_snd) == 0) 5818 goto close_now; 5819 /* Ok we allow data that is ignored and a followup reset */ 5820 tp->rcv_nxt = th->th_seq + *tlen; 5821 tp->t_flags2 |= TF2_DROP_AF_DATA; 5822 rack->r_wanted_output = 1; 5823 *tlen = 0; 5824 return (0); 5825 } 5826 5827 /* 5828 * Return value of 1, the TCB is unlocked and most 5829 * likely gone, return value of 0, the TCP is still 5830 * locked. 5831 */ 5832 static int 5833 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 5834 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5835 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5836 { 5837 int32_t ret_val = 0; 5838 int32_t ourfinisacked = 0; 5839 5840 rack_calc_rwin(so, tp); 5841 5842 if (thflags & TH_RST) 5843 return (rack_process_rst(m, th, so, tp, ti_locked)); 5844 /* 5845 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5846 * synchronized state. 5847 */ 5848 if (thflags & TH_SYN) { 5849 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5850 return (ret_val); 5851 } 5852 /* 5853 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5854 * it's less than ts_recent, drop it. 5855 */ 5856 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5857 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5858 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5859 return (ret_val); 5860 } 5861 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5862 return (ret_val); 5863 } 5864 /* 5865 * If new data are received on a connection after the user processes 5866 * are gone, then RST the other end. 5867 */ 5868 if ((so->so_state & SS_NOFDREF) && tlen) { 5869 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5870 return (1); 5871 } 5872 /* 5873 * If last ACK falls within this segment's sequence numbers, record 5874 * its timestamp. NOTE: 1) That the test incorporates suggestions 5875 * from the latest proposal of the tcplw@cray.com list (Braden 5876 * 1993/04/26). 2) That updating only on newer timestamps interferes 5877 * with our earlier PAWS tests, so this check should be solely 5878 * predicated on the sequence space of this segment. 
3) That we 5879 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5880 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5881 * SEG.Len, This modified check allows us to overcome RFC1323's 5882 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5883 * p.869. In such cases, we can still calculate the RTT correctly 5884 * when RCV.NXT == Last.ACK.Sent. 5885 */ 5886 if ((to->to_flags & TOF_TS) != 0 && 5887 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5888 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5889 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5890 tp->ts_recent_age = tcp_ts_getticks(); 5891 tp->ts_recent = to->to_tsval; 5892 } 5893 /* 5894 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5895 * is on (half-synchronized state), then queue data for later 5896 * processing; else drop segment and return. 5897 */ 5898 if ((thflags & TH_ACK) == 0) { 5899 if (tp->t_flags & TF_NEEDSYN) { 5900 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5901 ti_locked, tiwin, thflags, nxt_pkt)); 5902 } else if (tp->t_flags & TF_ACKNOW) { 5903 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5904 return (ret_val); 5905 } else { 5906 rack_do_drop(m, NULL, ti_locked); 5907 return (0); 5908 } 5909 } 5910 /* 5911 * Ack processing. 5912 */ 5913 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5914 return (ret_val); 5915 } 5916 if (ourfinisacked) { 5917 /* 5918 * If we can't receive any more data, then closing user can 5919 * proceed. Starting the timer is contrary to the 5920 * specification, but if we don't get a FIN we'll hang 5921 * forever. 5922 * 5923 * XXXjl: we should release the tp also, and use a 5924 * compressed state. 5925 */ 5926 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5927 soisdisconnected(so); 5928 tcp_timer_activate(tp, TT_2MSL, 5929 (tcp_fast_finwait2_recycle ? 5930 tcp_finwait2_timeout : 5931 TP_MAXIDLE(tp))); 5932 } 5933 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5934 } 5935 if (sbavail(&so->so_snd)) { 5936 if (rack_progress_timeout_check(tp)) { 5937 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5938 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5939 return (1); 5940 } 5941 } 5942 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5943 ti_locked, tiwin, thflags, nxt_pkt)); 5944 } 5945 5946 /* 5947 * Return value of 1, the TCB is unlocked and most 5948 * likely gone, return value of 0, the TCP is still 5949 * locked. 5950 */ 5951 static int 5952 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 5953 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5954 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5955 { 5956 int32_t ret_val = 0; 5957 int32_t ourfinisacked = 0; 5958 5959 rack_calc_rwin(so, tp); 5960 5961 if (thflags & TH_RST) 5962 return (rack_process_rst(m, th, so, tp, ti_locked)); 5963 /* 5964 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5965 * synchronized state. 5966 */ 5967 if (thflags & TH_SYN) { 5968 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5969 return (ret_val); 5970 } 5971 /* 5972 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5973 * it's less than ts_recent, drop it. 
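 * (TSTMP_LT() is a wrap-safe signed comparison, roughly
 * ((int)((a) - (b)) < 0), so timestamp wrap-around does not falsely
 * trigger PAWS; rack_ts_check() then decides whether the segment is
 * actually dropped.)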
5974 */ 5975 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5976 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5977 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5978 return (ret_val); 5979 } 5980 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5981 return (ret_val); 5982 } 5983 /* 5984 * If new data are received on a connection after the user processes 5985 * are gone, then RST the other end. 5986 */ 5987 if ((so->so_state & SS_NOFDREF) && tlen) { 5988 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5989 return (1); 5990 } 5991 /* 5992 * If last ACK falls within this segment's sequence numbers, record 5993 * its timestamp. NOTE: 1) That the test incorporates suggestions 5994 * from the latest proposal of the tcplw@cray.com list (Braden 5995 * 1993/04/26). 2) That updating only on newer timestamps interferes 5996 * with our earlier PAWS tests, so this check should be solely 5997 * predicated on the sequence space of this segment. 3) That we 5998 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5999 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6000 * SEG.Len, This modified check allows us to overcome RFC1323's 6001 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6002 * p.869. In such cases, we can still calculate the RTT correctly 6003 * when RCV.NXT == Last.ACK.Sent. 6004 */ 6005 if ((to->to_flags & TOF_TS) != 0 && 6006 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6007 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6008 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6009 tp->ts_recent_age = tcp_ts_getticks(); 6010 tp->ts_recent = to->to_tsval; 6011 } 6012 /* 6013 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6014 * is on (half-synchronized state), then queue data for later 6015 * processing; else drop segment and return. 6016 */ 6017 if ((thflags & TH_ACK) == 0) { 6018 if (tp->t_flags & TF_NEEDSYN) { 6019 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6020 ti_locked, tiwin, thflags, nxt_pkt)); 6021 } else if (tp->t_flags & TF_ACKNOW) { 6022 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6023 return (ret_val); 6024 } else { 6025 rack_do_drop(m, NULL, ti_locked); 6026 return (0); 6027 } 6028 } 6029 /* 6030 * Ack processing. 6031 */ 6032 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6033 return (ret_val); 6034 } 6035 if (ourfinisacked) { 6036 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6037 tcp_twstart(tp); 6038 INP_INFO_RUNLOCK(&V_tcbinfo); 6039 *ti_locked = TI_UNLOCKED; 6040 m_freem(m); 6041 return (1); 6042 } 6043 if (sbavail(&so->so_snd)) { 6044 if (rack_progress_timeout_check(tp)) { 6045 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6046 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6047 return (1); 6048 } 6049 } 6050 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6051 ti_locked, tiwin, thflags, nxt_pkt)); 6052 } 6053 6054 /* 6055 * Return value of 1, the TCB is unlocked and most 6056 * likely gone, return value of 0, the TCP is still 6057 * locked. 
6058 */ 6059 static int 6060 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6061 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6062 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6063 { 6064 int32_t ret_val = 0; 6065 int32_t ourfinisacked = 0; 6066 6067 rack_calc_rwin(so, tp); 6068 6069 if (thflags & TH_RST) 6070 return (rack_process_rst(m, th, so, tp, ti_locked)); 6071 /* 6072 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6073 * synchronized state. 6074 */ 6075 if (thflags & TH_SYN) { 6076 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6077 return (ret_val); 6078 } 6079 /* 6080 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6081 * it's less than ts_recent, drop it. 6082 */ 6083 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6084 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6085 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6086 return (ret_val); 6087 } 6088 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6089 return (ret_val); 6090 } 6091 /* 6092 * If new data are received on a connection after the user processes 6093 * are gone, then RST the other end. 6094 */ 6095 if ((so->so_state & SS_NOFDREF) && tlen) { 6096 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6097 return (1); 6098 } 6099 /* 6100 * If last ACK falls within this segment's sequence numbers, record 6101 * its timestamp. NOTE: 1) That the test incorporates suggestions 6102 * from the latest proposal of the tcplw@cray.com list (Braden 6103 * 1993/04/26). 2) That updating only on newer timestamps interferes 6104 * with our earlier PAWS tests, so this check should be solely 6105 * predicated on the sequence space of this segment. 3) That we 6106 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6107 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6108 * SEG.Len, This modified check allows us to overcome RFC1323's 6109 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6110 * p.869. In such cases, we can still calculate the RTT correctly 6111 * when RCV.NXT == Last.ACK.Sent. 6112 */ 6113 if ((to->to_flags & TOF_TS) != 0 && 6114 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6115 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6116 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6117 tp->ts_recent_age = tcp_ts_getticks(); 6118 tp->ts_recent = to->to_tsval; 6119 } 6120 /* 6121 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6122 * is on (half-synchronized state), then queue data for later 6123 * processing; else drop segment and return. 6124 */ 6125 if ((thflags & TH_ACK) == 0) { 6126 if (tp->t_flags & TF_NEEDSYN) { 6127 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6128 ti_locked, tiwin, thflags, nxt_pkt)); 6129 } else if (tp->t_flags & TF_ACKNOW) { 6130 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6131 return (ret_val); 6132 } else { 6133 rack_do_drop(m, NULL, ti_locked); 6134 return (0); 6135 } 6136 } 6137 /* 6138 * case TCPS_LAST_ACK: Ack processing. 
6139 */ 6140 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6141 return (ret_val); 6142 } 6143 if (ourfinisacked) { 6144 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6145 tp = tcp_close(tp); 6146 rack_do_drop(m, tp, ti_locked); 6147 return (1); 6148 } 6149 if (sbavail(&so->so_snd)) { 6150 if (rack_progress_timeout_check(tp)) { 6151 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6152 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6153 return (1); 6154 } 6155 } 6156 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6157 ti_locked, tiwin, thflags, nxt_pkt)); 6158 } 6159 6160 6161 /* 6162 * Return value of 1, the TCB is unlocked and most 6163 * likely gone, return value of 0, the TCP is still 6164 * locked. 6165 */ 6166 static int 6167 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 6168 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6169 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6170 { 6171 int32_t ret_val = 0; 6172 int32_t ourfinisacked = 0; 6173 6174 rack_calc_rwin(so, tp); 6175 6176 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 6177 if (thflags & TH_RST) 6178 return (rack_process_rst(m, th, so, tp, ti_locked)); 6179 /* 6180 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6181 * synchronized state. 6182 */ 6183 if (thflags & TH_SYN) { 6184 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6185 return (ret_val); 6186 } 6187 /* 6188 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6189 * it's less than ts_recent, drop it. 6190 */ 6191 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6192 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6193 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6194 return (ret_val); 6195 } 6196 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6197 return (ret_val); 6198 } 6199 /* 6200 * If new data are received on a connection after the user processes 6201 * are gone, then RST the other end. 6202 */ 6203 if ((so->so_state & SS_NOFDREF) && 6204 tlen) { 6205 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6206 return (1); 6207 } 6208 /* 6209 * If last ACK falls within this segment's sequence numbers, record 6210 * its timestamp. NOTE: 1) That the test incorporates suggestions 6211 * from the latest proposal of the tcplw@cray.com list (Braden 6212 * 1993/04/26). 2) That updating only on newer timestamps interferes 6213 * with our earlier PAWS tests, so this check should be solely 6214 * predicated on the sequence space of this segment. 3) That we 6215 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6216 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6217 * SEG.Len, This modified check allows us to overcome RFC1323's 6218 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6219 * p.869. In such cases, we can still calculate the RTT correctly 6220 * when RCV.NXT == Last.ACK.Sent. 6221 */ 6222 if ((to->to_flags & TOF_TS) != 0 && 6223 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6224 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6225 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6226 tp->ts_recent_age = tcp_ts_getticks(); 6227 tp->ts_recent = to->to_tsval; 6228 } 6229 /* 6230 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6231 * is on (half-synchronized state), then queue data for later 6232 * processing; else drop segment and return. 
6233 */ 6234 if ((thflags & TH_ACK) == 0) { 6235 if (tp->t_flags & TF_NEEDSYN) { 6236 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6237 ti_locked, tiwin, thflags, nxt_pkt)); 6238 } else if (tp->t_flags & TF_ACKNOW) { 6239 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6240 return (ret_val); 6241 } else { 6242 rack_do_drop(m, NULL, ti_locked); 6243 return (0); 6244 } 6245 } 6246 /* 6247 * Ack processing. 6248 */ 6249 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6250 return (ret_val); 6251 } 6252 if (sbavail(&so->so_snd)) { 6253 if (rack_progress_timeout_check(tp)) { 6254 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6255 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6256 return (1); 6257 } 6258 } 6259 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6260 ti_locked, tiwin, thflags, nxt_pkt)); 6261 } 6262 6263 6264 static void inline 6265 rack_clear_rate_sample(struct tcp_rack *rack) 6266 { 6267 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 6268 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 6269 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 6270 } 6271 6272 static int 6273 rack_init(struct tcpcb *tp) 6274 { 6275 struct tcp_rack *rack = NULL; 6276 6277 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 6278 if (tp->t_fb_ptr == NULL) { 6279 /* 6280 * We need to allocate memory but can't: the INP and INP_INFO 6281 * locks are held and they are recursive (this happens during 6282 * setup), so a scheme that drops the locks to wait for the 6283 * allocation fails. 6284 */ 6285 return (ENOMEM); 6286 } 6287 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 6288 6289 rack = (struct tcp_rack *)tp->t_fb_ptr; 6290 TAILQ_INIT(&rack->r_ctl.rc_map); 6291 TAILQ_INIT(&rack->r_ctl.rc_free); 6292 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6293 rack->rc_tp = tp; 6294 if (tp->t_inpcb) { 6295 rack->rc_inp = tp->t_inpcb; 6296 } 6297 /* Probably not needed but let's be sure */ 6298 rack_clear_rate_sample(rack); 6299 rack->r_cpu = 0; 6300 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 6301 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 6302 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 6303 rack->rc_pace_reduce = rack_slot_reduction; 6304 if (V_tcp_delack_enabled) 6305 tp->t_delayed_ack = 1; 6306 else 6307 tp->t_delayed_ack = 0; 6308 rack->rc_pace_max_segs = rack_hptsi_segments; 6309 rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; 6310 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 6311 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 6312 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 6313 rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; 6314 rack->r_enforce_min_pace = rack_min_pace_time; 6315 rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; 6316 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 6317 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 6318 rack->r_ctl.rc_early_recovery = rack_early_recovery; 6319 rack->rc_always_pace = rack_pace_every_seg; 6320 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 6321 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 6322 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 6323 rack->r_ctl.rc_min_to = rack_min_to; 6324 rack->r_ctl.rc_prr_inc_var = rack_inc_var; 6325 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6326 if (tp->snd_una != tp->snd_max) { 6327 /* Create a send map for the current outstanding data */ 6328 struct rack_sendmap *rsm; 6329 6330 rsm = rack_alloc(rack); 6331 if (rsm == NULL) {
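			/*
			 * No memory for the initial send-map entry; undo the
			 * pcb-zone allocation above and fail the handoff
			 * with ENOMEM.
			 */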
6332 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6333 tp->t_fb_ptr = NULL; 6334 return (ENOMEM); 6335 } 6336 rsm->r_flags = RACK_OVERMAX; 6337 rsm->r_tim_lastsent[0] = tcp_ts_getticks(); 6338 rsm->r_rtr_cnt = 1; 6339 rsm->r_rtr_bytes = 0; 6340 rsm->r_start = tp->snd_una; 6341 rsm->r_end = tp->snd_max; 6342 rsm->r_sndcnt = 0; 6343 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 6344 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6345 rsm->r_in_tmap = 1; 6346 } 6347 return (0); 6348 } 6349 6350 static int 6351 rack_handoff_ok(struct tcpcb *tp) 6352 { 6353 if ((tp->t_state == TCPS_CLOSED) || 6354 (tp->t_state == TCPS_LISTEN)) { 6355 /* Sure no problem though it may not stick */ 6356 return (0); 6357 } 6358 if ((tp->t_state == TCPS_SYN_SENT) || 6359 (tp->t_state == TCPS_SYN_RECEIVED)) { 6360 /* 6361 * We really don't know you have to get to ESTAB or beyond 6362 * to tell. 6363 */ 6364 return (EAGAIN); 6365 } 6366 if (tp->t_flags & TF_SACK_PERMIT) { 6367 return (0); 6368 } 6369 /* 6370 * If we reach here we don't do SACK on this connection so we can 6371 * never do rack. 6372 */ 6373 return (EINVAL); 6374 } 6375 6376 static void 6377 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 6378 { 6379 if (tp->t_fb_ptr) { 6380 struct tcp_rack *rack; 6381 struct rack_sendmap *rsm; 6382 6383 rack = (struct tcp_rack *)tp->t_fb_ptr; 6384 #ifdef TCP_BLACKBOX 6385 tcp_log_flowend(tp); 6386 #endif 6387 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6388 while (rsm) { 6389 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 6390 uma_zfree(rack_zone, rsm); 6391 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6392 } 6393 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6394 while (rsm) { 6395 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 6396 uma_zfree(rack_zone, rsm); 6397 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6398 } 6399 rack->rc_free_cnt = 0; 6400 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6401 tp->t_fb_ptr = NULL; 6402 } 6403 } 6404 6405 static void 6406 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 6407 { 6408 switch (tp->t_state) { 6409 case TCPS_SYN_SENT: 6410 rack->r_state = TCPS_SYN_SENT; 6411 rack->r_substate = rack_do_syn_sent; 6412 break; 6413 case TCPS_SYN_RECEIVED: 6414 rack->r_state = TCPS_SYN_RECEIVED; 6415 rack->r_substate = rack_do_syn_recv; 6416 break; 6417 case TCPS_ESTABLISHED: 6418 rack->r_state = TCPS_ESTABLISHED; 6419 rack->r_substate = rack_do_established; 6420 break; 6421 case TCPS_CLOSE_WAIT: 6422 rack->r_state = TCPS_CLOSE_WAIT; 6423 rack->r_substate = rack_do_close_wait; 6424 break; 6425 case TCPS_FIN_WAIT_1: 6426 rack->r_state = TCPS_FIN_WAIT_1; 6427 rack->r_substate = rack_do_fin_wait_1; 6428 break; 6429 case TCPS_CLOSING: 6430 rack->r_state = TCPS_CLOSING; 6431 rack->r_substate = rack_do_closing; 6432 break; 6433 case TCPS_LAST_ACK: 6434 rack->r_state = TCPS_LAST_ACK; 6435 rack->r_substate = rack_do_lastack; 6436 break; 6437 case TCPS_FIN_WAIT_2: 6438 rack->r_state = TCPS_FIN_WAIT_2; 6439 rack->r_substate = rack_do_fin_wait_2; 6440 break; 6441 case TCPS_LISTEN: 6442 case TCPS_CLOSED: 6443 case TCPS_TIME_WAIT: 6444 default: 6445 #ifdef INVARIANTS 6446 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); 6447 #endif 6448 break; 6449 }; 6450 } 6451 6452 6453 static void 6454 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 6455 { 6456 /* 6457 * We received an ack, and then did not 6458 * call send or were bounced out due to the 6459 * hpts was running. Now a timer is up as well, is 6460 * it the right timer? 
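 * Roughly, the expected timer is: the persist timer while in persist,
 * keepalive when nothing is outstanding and keepalives are enabled,
 * RACK (or RXT for a lone FIN) when a sack-passed block exists, TLP or
 * RXT when data is outstanding without one, and delayed-ack when only
 * an ACK is owed.  Anything else falls through and the timer is torn
 * down and rebuilt below.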
6461 */ 6462 struct rack_sendmap *rsm; 6463 int tmr_up; 6464 6465 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 6466 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 6467 return; 6468 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6469 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 6470 (tmr_up == PACE_TMR_RXT)) { 6471 /* Should be an RXT */ 6472 return; 6473 } 6474 if (rsm == NULL) { 6475 /* Nothing outstanding? */ 6476 if (tp->t_flags & TF_DELACK) { 6477 if (tmr_up == PACE_TMR_DELACK) 6478 /* We are supposed to have delayed ack up and we do */ 6479 return; 6480 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 6481 /* 6482 * if we hit enobufs then we would expect the possiblity 6483 * of nothing outstanding and the RXT up (and the hptsi timer). 6484 */ 6485 return; 6486 } else if (((tcp_always_keepalive || 6487 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6488 (tp->t_state <= TCPS_CLOSING)) && 6489 (tmr_up == PACE_TMR_KEEP) && 6490 (tp->snd_max == tp->snd_una)) { 6491 /* We should have keep alive up and we do */ 6492 return; 6493 } 6494 } 6495 if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { 6496 if ((tp->t_flags & TF_SENTFIN) && 6497 ((tp->snd_max - tp->snd_una) == 1) && 6498 (rsm->r_flags & RACK_HAS_FIN)) { 6499 /* needs to be a RXT */ 6500 if (tmr_up == PACE_TMR_RXT) 6501 return; 6502 } else if (tmr_up == PACE_TMR_RACK) 6503 return; 6504 } else if (SEQ_GT(tp->snd_max,tp->snd_una) && 6505 ((tmr_up == PACE_TMR_TLP) || 6506 (tmr_up == PACE_TMR_RXT))) { 6507 /* 6508 * Either a TLP or RXT is fine if no sack-passed 6509 * is in place and data is outstanding. 6510 */ 6511 return; 6512 } else if (tmr_up == PACE_TMR_DELACK) { 6513 /* 6514 * If the delayed ack was going to go off 6515 * before the rtx/tlp/rack timer were going to 6516 * expire, then that would be the timer in control. 6517 * Note we don't check the time here trusting the 6518 * code is correct. 6519 */ 6520 return; 6521 } 6522 /* 6523 * Ok the timer originally started is not what we want now. 6524 * We will force the hpts to be stopped if any, and restart 6525 * with the slot set to what was in the saved slot. 6526 */ 6527 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6528 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6529 } 6530 6531 static void 6532 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6533 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6534 int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv) 6535 { 6536 int32_t thflags, retval, did_out = 0; 6537 int32_t way_out = 0; 6538 uint32_t cts; 6539 uint32_t tiwin; 6540 struct tcpopt to; 6541 struct tcp_rack *rack; 6542 struct rack_sendmap *rsm; 6543 int32_t prev_state = 0; 6544 6545 cts = tcp_tv_to_mssectick(tv); 6546 rack = (struct tcp_rack *)tp->t_fb_ptr; 6547 6548 kern_prefetch(rack, &prev_state); 6549 prev_state = 0; 6550 thflags = th->th_flags; 6551 /* 6552 * If this is either a state-changing packet or current state isn't 6553 * established, we require a read lock on tcbinfo. Otherwise, we 6554 * allow the tcbinfo to be in either locked or unlocked, as the 6555 * caller may have unnecessarily acquired a lock due to a race. 
6556 */ 6557 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 6558 tp->t_state != TCPS_ESTABLISHED) { 6559 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 6560 "SYN/FIN/RST/!EST", __func__, ti_locked)); 6561 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6562 } else { 6563 #ifdef INVARIANTS 6564 if (ti_locked == TI_RLOCKED) { 6565 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6566 } else { 6567 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 6568 "ti_locked: %d", __func__, ti_locked)); 6569 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 6570 } 6571 #endif 6572 } 6573 INP_WLOCK_ASSERT(tp->t_inpcb); 6574 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 6575 __func__)); 6576 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 6577 __func__)); 6578 { 6579 union tcp_log_stackspecific log; 6580 6581 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6582 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 6583 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 6584 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 6585 tlen, &log, true); 6586 } 6587 /* 6588 * Segment received on connection. Reset idle time and keep-alive 6589 * timer. XXX: This should be done after segment validation to 6590 * ignore broken/spoofed segs. 6591 */ 6592 if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { 6593 #ifdef NETFLIX_CWV 6594 if ((tp->cwv_enabled) && 6595 ((tp->cwv_cwnd_valid == 0) && 6596 TCPS_HAVEESTABLISHED(tp->t_state) && 6597 (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { 6598 tcp_newcwv_nvp_closedown(tp); 6599 } else 6600 #endif 6601 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 6602 counter_u64_add(rack_input_idle_reduces, 1); 6603 rack_cc_after_idle(tp, 6604 (rack->r_idle_reduce_largest ? 1 :0)); 6605 } 6606 } 6607 rack->r_ctl.rc_rcvtime = cts; 6608 tp->t_rcvtime = ticks; 6609 6610 #ifdef NETFLIX_CWV 6611 if (tp->cwv_enabled) { 6612 if ((tp->cwv_cwnd_valid == 0) && 6613 TCPS_HAVEESTABLISHED(tp->t_state) && 6614 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 6615 tcp_newcwv_nvp_closedown(tp); 6616 } 6617 #endif 6618 /* 6619 * Unscale the window into a 32-bit value. For the SYN_SENT state 6620 * the scale is zero. 6621 */ 6622 tiwin = th->th_win << tp->snd_scale; 6623 #ifdef NETFLIX_STATS 6624 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 6625 #endif 6626 /* 6627 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 6628 * this to occur after we've validated the segment. 6629 */ 6630 if (tp->t_flags & TF_ECN_PERMIT) { 6631 if (thflags & TH_CWR) 6632 tp->t_flags &= ~TF_ECN_SND_ECE; 6633 switch (iptos & IPTOS_ECN_MASK) { 6634 case IPTOS_ECN_CE: 6635 tp->t_flags |= TF_ECN_SND_ECE; 6636 TCPSTAT_INC(tcps_ecn_ce); 6637 break; 6638 case IPTOS_ECN_ECT0: 6639 TCPSTAT_INC(tcps_ecn_ect0); 6640 break; 6641 case IPTOS_ECN_ECT1: 6642 TCPSTAT_INC(tcps_ecn_ect1); 6643 break; 6644 } 6645 /* Congestion experienced. */ 6646 if (thflags & TH_ECE) { 6647 rack_cong_signal(tp, th, CC_ECN); 6648 } 6649 } 6650 /* 6651 * Parse options on any incoming segment. 6652 */ 6653 tcp_dooptions(&to, (u_char *)(th + 1), 6654 (th->th_off << 2) - sizeof(struct tcphdr), 6655 (thflags & TH_SYN) ? TO_SYN : 0); 6656 6657 /* 6658 * If echoed timestamp is later than the current time, fall back to 6659 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 6660 * were used when this connection was established. 
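 * (The echoed value is shifted back by ts_offset; if it still appears to
 * be in the future relative to cts it is zeroed so it will not be used
 * for an RTT sample.)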
6661 */ 6662 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 6663 to.to_tsecr -= tp->ts_offset; 6664 if (TSTMP_GT(to.to_tsecr, cts)) 6665 to.to_tsecr = 0; 6666 } 6667 /* 6668 * If its the first time in we need to take care of options and 6669 * verify we can do SACK for rack! 6670 */ 6671 if (rack->r_state == 0) { 6672 /* Should be init'd by rack_init() */ 6673 KASSERT(rack->rc_inp != NULL, 6674 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 6675 if (rack->rc_inp == NULL) { 6676 rack->rc_inp = tp->t_inpcb; 6677 } 6678 6679 /* 6680 * Process options only when we get SYN/ACK back. The SYN 6681 * case for incoming connections is handled in tcp_syncache. 6682 * According to RFC1323 the window field in a SYN (i.e., a 6683 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 6684 * this is traditional behavior, may need to be cleaned up. 6685 */ 6686 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 6687 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 6688 if ((to.to_flags & TOF_SCALE) && 6689 (tp->t_flags & TF_REQ_SCALE)) { 6690 tp->t_flags |= TF_RCVD_SCALE; 6691 tp->snd_scale = to.to_wscale; 6692 } 6693 /* 6694 * Initial send window. It will be updated with the 6695 * next incoming segment to the scaled value. 6696 */ 6697 tp->snd_wnd = th->th_win; 6698 if (to.to_flags & TOF_TS) { 6699 tp->t_flags |= TF_RCVD_TSTMP; 6700 tp->ts_recent = to.to_tsval; 6701 tp->ts_recent_age = cts; 6702 } 6703 if (to.to_flags & TOF_MSS) 6704 tcp_mss(tp, to.to_mss); 6705 if ((tp->t_flags & TF_SACK_PERMIT) && 6706 (to.to_flags & TOF_SACKPERM) == 0) 6707 tp->t_flags &= ~TF_SACK_PERMIT; 6708 } 6709 /* 6710 * At this point we are at the initial call. Here we decide 6711 * if we are doing RACK or not. We do this by seeing if 6712 * TF_SACK_PERMIT is set, if not rack is *not* possible and 6713 * we switch to the default code. 6714 */ 6715 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 6716 tcp_switch_back_to_default(tp); 6717 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 6718 tlen, iptos, ti_locked); 6719 return; 6720 } 6721 /* Set the flag */ 6722 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 6723 tcp_set_hpts(tp->t_inpcb); 6724 rack_stop_all_timers(tp); 6725 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 6726 } 6727 /* 6728 * This is the one exception case where we set the rack state 6729 * always. All other times (timers etc) we must have a rack-state 6730 * set (so we assure we have done the checks above for SACK). 6731 */ 6732 if (rack->r_state != tp->t_state) 6733 rack_set_state(tp, rack); 6734 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) 6735 kern_prefetch(rsm, &prev_state); 6736 prev_state = rack->r_state; 6737 rack->r_ctl.rc_tlp_send_cnt = 0; 6738 rack_clear_rate_sample(rack); 6739 retval = (*rack->r_substate) (m, th, so, 6740 tp, &to, drop_hdrlen, 6741 tlen, &ti_locked, tiwin, thflags, nxt_pkt); 6742 #ifdef INVARIANTS 6743 if ((retval == 0) && 6744 (tp->t_inpcb == NULL)) { 6745 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 6746 retval, tp, prev_state); 6747 } 6748 #endif 6749 if (ti_locked != TI_UNLOCKED) { 6750 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6751 INP_INFO_RUNLOCK(&V_tcbinfo); 6752 ti_locked = TI_UNLOCKED; 6753 } 6754 if (retval == 0) { 6755 /* 6756 * If retval is 1 the tcb is unlocked and most likely the tp 6757 * is gone. 
6758 */ 6759 INP_WLOCK_ASSERT(tp->t_inpcb); 6760 tcp_rack_xmit_timer_commit(rack, tp); 6761 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && 6762 (rack->rc_in_persist == 0)){ 6763 /* 6764 * The peer shrunk its window on us to the point 6765 * where we have sent too much. The only thing 6766 * we can do here is stop any timers and 6767 * enter persist. We most likely lost the last 6768 * bytes we sent but oh well, we will have to 6769 * retransmit them after the peer is caught up. 6770 */ 6771 if (rack->rc_inp->inp_in_hpts) 6772 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6773 rack_timer_cancel(tp, rack, cts, __LINE__); 6774 rack_enter_persist(tp, rack, cts); 6775 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6776 way_out = 3; 6777 goto done_with_input; 6778 } 6779 if (nxt_pkt == 0) { 6780 if (rack->r_wanted_output != 0) { 6781 did_out = 1; 6782 (void)tp->t_fb->tfb_tcp_output(tp); 6783 } 6784 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 6785 } 6786 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 6787 (SEQ_GT(tp->snd_max, tp->snd_una) || 6788 (tp->t_flags & TF_DELACK) || 6789 ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6790 (tp->t_state <= TCPS_CLOSING)))) { 6791 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 6792 if ((tp->snd_max == tp->snd_una) && 6793 ((tp->t_flags & TF_DELACK) == 0) && 6794 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 6795 /* keep alive not needed if we are hptsi output yet */ 6796 ; 6797 } else { 6798 if (rack->rc_inp->inp_in_hpts) 6799 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6800 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6801 } 6802 way_out = 1; 6803 } else { 6804 /* Do we have the correct timer running? */ 6805 rack_timer_audit(tp, rack, &so->so_snd); 6806 way_out = 2; 6807 } 6808 done_with_input: 6809 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 6810 if (did_out) 6811 rack->r_wanted_output = 0; 6812 #ifdef INVARIANTS 6813 if (tp->t_inpcb == NULL) { 6814 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 6815 did_out, 6816 retval, tp, prev_state); 6817 } 6818 #endif 6819 INP_WUNLOCK(tp->t_inpcb); 6820 } 6821 } 6822 6823 void 6824 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6825 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6826 int32_t ti_locked) 6827 { 6828 struct timeval tv; 6829 #ifdef RSS 6830 struct tcp_function_block *tfb; 6831 struct tcp_rack *rack; 6832 struct inpcb *inp; 6833 6834 rack = (struct tcp_rack *)tp->t_fb_ptr; 6835 if (rack->r_state == 0) { 6836 /* 6837 * Initial input (ACK to SYN-ACK etc)lets go ahead and get 6838 * it processed 6839 */ 6840 if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo)) 6841 ti_locked = TI_RLOCKED; 6842 if (ti_locked != TI_RLOCKED) { 6843 inp = tp->t_inpcb; 6844 tfb = tp->t_fb; 6845 in_pcbref(inp); 6846 INP_WUNLOCK(inp); 6847 INP_INFO_RLOCK(&V_tcbinfo); 6848 ti_locked = TI_RLOCKED; 6849 INP_WLOCK(inp); 6850 if (in_pcbrele_wlocked(inp)) 6851 inp = NULL; 6852 if (inp == NULL || (inp->inp_flags2 & INP_FREED) || 6853 (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { 6854 /* The TCPCB went away. Free the packet. */ 6855 INP_INFO_RUNLOCK(&V_tcbinfo); 6856 if (inp) 6857 INP_WUNLOCK(inp); 6858 m_freem(m); 6859 return; 6860 } 6861 /* If the stack changed, call the correct stack. 
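 * (tfb was sampled before the inpcb lock was dropped above, so an
 * inequality here means another thread switched TCP stacks while we
 * were reacquiring the locks.)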
*/ 6862 if (tp->t_fb != tfb) { 6863 tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, 6864 drop_hdrlen, tlen, iptos, ti_locked); 6865 return; 6866 } 6867 } 6868 tcp_get_usecs(&tv); 6869 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6870 tlen, iptos, ti_locked, 0, &tv); 6871 return; 6872 } 6873 if (ti_locked == TI_RLOCKED) 6874 INP_INFO_RUNLOCK(&V_tcbinfo); 6875 tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked); 6876 INP_WUNLOCK(tp->t_inpcb); 6877 #else 6878 tcp_get_usecs(&tv); 6879 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6880 tlen, iptos, ti_locked, 0, &tv); 6881 #endif 6882 } 6883 6884 struct rack_sendmap * 6885 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 6886 { 6887 struct rack_sendmap *rsm = NULL; 6888 int32_t idx; 6889 uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; 6890 6891 /* Return the next guy to be re-transmitted */ 6892 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 6893 return (NULL); 6894 } 6895 if (tp->t_flags & TF_SENTFIN) { 6896 /* retran the end FIN? */ 6897 return (NULL); 6898 } 6899 /* ok lets look at this one */ 6900 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6901 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 6902 goto check_it; 6903 } 6904 rsm = rack_find_lowest_rsm(rack); 6905 if (rsm == NULL) { 6906 return (NULL); 6907 } 6908 check_it: 6909 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 6910 srtt = TICKS_2_MSEC(srtt_cur); 6911 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 6912 srtt = rack->rc_rack_rtt; 6913 if (rsm->r_flags & RACK_ACKED) { 6914 return (NULL); 6915 } 6916 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 6917 /* Its not yet ready */ 6918 return (NULL); 6919 } 6920 idx = rsm->r_rtr_cnt - 1; 6921 ts_low = rsm->r_tim_lastsent[idx]; 6922 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6923 if (tsused <= ts_low) { 6924 return (NULL); 6925 } 6926 if ((tsused - ts_low) >= thresh) { 6927 return (rsm); 6928 } 6929 return (NULL); 6930 } 6931 6932 static int 6933 rack_output(struct tcpcb *tp) 6934 { 6935 struct socket *so; 6936 uint32_t recwin, sendwin; 6937 uint32_t sb_offset; 6938 int32_t len, flags, error = 0; 6939 struct mbuf *m; 6940 struct mbuf *mb; 6941 uint32_t if_hw_tsomaxsegcount = 0; 6942 uint32_t if_hw_tsomaxsegsize; 6943 long tot_len_this_send = 0; 6944 struct ip *ip = NULL; 6945 #ifdef TCPDEBUG 6946 struct ipovly *ipov = NULL; 6947 #endif 6948 struct udphdr *udp = NULL; 6949 struct tcp_rack *rack; 6950 struct tcphdr *th; 6951 uint8_t pass = 0; 6952 uint8_t wanted_cookie = 0; 6953 u_char opt[TCP_MAXOLEN]; 6954 unsigned ipoptlen, optlen, hdrlen, ulen=0; 6955 uint32_t rack_seq; 6956 6957 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 6958 unsigned ipsec_optlen = 0; 6959 6960 #endif 6961 int32_t idle, sendalot; 6962 int32_t sub_from_prr = 0; 6963 volatile int32_t sack_rxmit; 6964 struct rack_sendmap *rsm = NULL; 6965 int32_t tso, mtu, would_have_fin = 0; 6966 struct tcpopt to; 6967 int32_t slot = 0; 6968 uint32_t cts; 6969 uint8_t hpts_calling, doing_tlp = 0; 6970 int32_t do_a_prefetch; 6971 int32_t prefetch_rsm = 0; 6972 int32_t prefetch_so_done = 0; 6973 struct tcp_log_buffer *lgb = NULL; 6974 struct inpcb *inp; 6975 struct sockbuf *sb; 6976 #ifdef INET6 6977 struct ip6_hdr *ip6 = NULL; 6978 int32_t isipv6; 6979 #endif 6980 /* setup and take the cache hits here */ 6981 rack = (struct tcp_rack *)tp->t_fb_ptr; 6982 inp = rack->rc_inp; 6983 so = inp->inp_socket; 6984 sb = &so->so_snd; 6985 kern_prefetch(sb, &do_a_prefetch); 6986 do_a_prefetch = 1; 6987 6988 INP_WLOCK_ASSERT(inp); 6989 #ifdef 
TCP_OFFLOAD 6990 if (tp->t_flags & TF_TOE) 6991 return (tcp_offload_output(tp)); 6992 #endif 6993 6994 /* 6995 * For TFO connections in SYN_RECEIVED, only allow the initial 6996 * SYN|ACK and those sent by the retransmit timer. 6997 */ 6998 if (IS_FASTOPEN(tp->t_flags) && 6999 (tp->t_state == TCPS_SYN_RECEIVED) && 7000 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 7001 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 7002 return (0); 7003 #ifdef INET6 7004 if (rack->r_state) { 7005 /* Use the cache line loaded if possible */ 7006 isipv6 = rack->r_is_v6; 7007 } else { 7008 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 7009 } 7010 #endif 7011 cts = tcp_ts_getticks(); 7012 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 7013 inp->inp_in_hpts) { 7014 /* 7015 * We are on the hpts for some timer but not hptsi output. 7016 * Remove from the hpts unconditionally. 7017 */ 7018 rack_timer_cancel(tp, rack, cts, __LINE__); 7019 } 7020 /* Mark that we have called rack_output(). */ 7021 if ((rack->r_timer_override) || 7022 (tp->t_flags & TF_FORCEDATA) || 7023 (tp->t_state < TCPS_ESTABLISHED)) { 7024 if (tp->t_inpcb->inp_in_hpts) 7025 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 7026 } else if (tp->t_inpcb->inp_in_hpts) { 7027 /* 7028 * On the hpts you can't pass even if ACKNOW is on, we will 7029 * when the hpts fires. 7030 */ 7031 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 7032 return (0); 7033 } 7034 hpts_calling = inp->inp_hpts_calls; 7035 inp->inp_hpts_calls = 0; 7036 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7037 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 7038 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 7039 return (0); 7040 } 7041 } 7042 rack->r_wanted_output = 0; 7043 rack->r_timer_override = 0; 7044 /* 7045 * Determine length of data that should be transmitted, and flags 7046 * that will be used. If there is some data or critical controls 7047 * (SYN, RST) to send, then transmit; otherwise, investigate 7048 * further. 7049 */ 7050 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 7051 #ifdef NETFLIX_CWV 7052 if (tp->cwv_enabled) { 7053 if ((tp->cwv_cwnd_valid == 0) && 7054 TCPS_HAVEESTABLISHED(tp->t_state) && 7055 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 7056 tcp_newcwv_nvp_closedown(tp); 7057 } else 7058 #endif 7059 if (tp->t_idle_reduce) { 7060 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 7061 rack_cc_after_idle(tp, 7062 (rack->r_idle_reduce_largest ? 1 :0)); 7063 } 7064 tp->t_flags &= ~TF_LASTIDLE; 7065 if (idle) { 7066 if (tp->t_flags & TF_MORETOCOME) { 7067 tp->t_flags |= TF_LASTIDLE; 7068 idle = 0; 7069 } 7070 } 7071 again: 7072 /* 7073 * If we've recently taken a timeout, snd_max will be greater than 7074 * snd_nxt. There may be SACK information that allows us to avoid 7075 * resending already delivered data. Adjust snd_nxt accordingly. 7076 */ 7077 sendalot = 0; 7078 cts = tcp_ts_getticks(); 7079 tso = 0; 7080 mtu = 0; 7081 sb_offset = tp->snd_max - tp->snd_una; 7082 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 7083 7084 flags = tcp_outflags[tp->t_state]; 7085 /* 7086 * Send any SACK-generated retransmissions. If we're explicitly 7087 * trying to send out new data (when sendalot is 1), bypass this 7088 * function. If we retransmit in fast recovery mode, decrement 7089 * snd_cwnd, since we're replacing a (future) new transmission with 7090 * a retransmission now, and we previously incremented snd_cwnd in 7091 * tcp_input(). 
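 * In this stack the retransmission choice below comes from the RACK
 * scoreboard (rc_tlpsend, rc_resend, or tcp_rack_output()) rather
 * than from walking the peer's SACK blocks directly.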
7092 */ 7093 /* 7094 * Still in sack recovery , reset rxmit flag to zero. 7095 */ 7096 while (rack->rc_free_cnt < rack_free_cache) { 7097 rsm = rack_alloc(rack); 7098 if (rsm == NULL) { 7099 if (inp->inp_hpts_calls) 7100 /* Retry in a ms */ 7101 slot = 1; 7102 goto just_return_nolock; 7103 } 7104 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 7105 rack->rc_free_cnt++; 7106 rsm = NULL; 7107 } 7108 if (inp->inp_hpts_calls) 7109 inp->inp_hpts_calls = 0; 7110 sack_rxmit = 0; 7111 len = 0; 7112 rsm = NULL; 7113 if (flags & TH_RST) { 7114 SOCKBUF_LOCK(sb); 7115 goto send; 7116 } 7117 if (rack->r_ctl.rc_tlpsend) { 7118 /* Tail loss probe */ 7119 long cwin; 7120 long tlen; 7121 7122 doing_tlp = 1; 7123 rsm = rack->r_ctl.rc_tlpsend; 7124 rack->r_ctl.rc_tlpsend = NULL; 7125 sack_rxmit = 1; 7126 tlen = rsm->r_end - rsm->r_start; 7127 if (tlen > tp->t_maxseg) 7128 tlen = tp->t_maxseg; 7129 #ifdef INVARIANTS 7130 if (SEQ_GT(tp->snd_una, rsm->r_start)) { 7131 panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", 7132 tp, rack, tp->snd_una, rsm, rsm->r_start); 7133 } 7134 #endif 7135 sb_offset = rsm->r_start - tp->snd_una; 7136 cwin = min(tp->snd_wnd, tlen); 7137 len = cwin; 7138 } else if (rack->r_ctl.rc_resend) { 7139 /* Retransmit timer */ 7140 rsm = rack->r_ctl.rc_resend; 7141 rack->r_ctl.rc_resend = NULL; 7142 len = rsm->r_end - rsm->r_start; 7143 sack_rxmit = 1; 7144 sendalot = 0; 7145 sb_offset = rsm->r_start - tp->snd_una; 7146 if (len >= tp->t_maxseg) { 7147 len = tp->t_maxseg; 7148 } 7149 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7150 __func__, sb_offset)); 7151 } else if ((rack->rc_in_persist == 0) && 7152 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 7153 long tlen; 7154 7155 if ((!IN_RECOVERY(tp->t_flags)) && 7156 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 7157 /* Enter recovery if not induced by a time-out */ 7158 rack->r_ctl.rc_rsm_start = rsm->r_start; 7159 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7160 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7161 rack_cong_signal(tp, NULL, CC_NDUPACK); 7162 /* 7163 * When we enter recovery we need to assure we send 7164 * one packet. 7165 */ 7166 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 7167 } 7168 #ifdef INVARIANTS 7169 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 7170 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 7171 tp, rack, rsm, rsm->r_start, tp->snd_una); 7172 } 7173 #endif 7174 tlen = rsm->r_end - rsm->r_start; 7175 sb_offset = rsm->r_start - tp->snd_una; 7176 if (tlen > rack->r_ctl.rc_prr_sndcnt) { 7177 len = rack->r_ctl.rc_prr_sndcnt; 7178 } else { 7179 len = tlen; 7180 } 7181 if (len >= tp->t_maxseg) { 7182 sendalot = 1; 7183 len = tp->t_maxseg; 7184 } else { 7185 sendalot = 0; 7186 if ((rack->rc_timer_up == 0) && 7187 (len < tlen)) { 7188 /* 7189 * If its not a timer don't send a partial 7190 * segment. 7191 */ 7192 len = 0; 7193 goto just_return_nolock; 7194 } 7195 } 7196 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7197 __func__, sb_offset)); 7198 if (len > 0) { 7199 sub_from_prr = 1; 7200 sack_rxmit = 1; 7201 TCPSTAT_INC(tcps_sack_rexmits); 7202 TCPSTAT_ADD(tcps_sack_rexmit_bytes, 7203 min(len, tp->t_maxseg)); 7204 counter_u64_add(rack_rtm_prr_retran, 1); 7205 } 7206 } 7207 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 7208 /* we are retransmitting the fin */ 7209 len--; 7210 if (len) { 7211 /* 7212 * When retransmitting data do *not* include the 7213 * FIN. This could happen from a TLP probe. 
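 * The rsm covers the FIN's sequence slot, which is why len was
 * trimmed by one above; when data remains we send only the data and
 * leave the FIN for a later segment.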
7214 */ 7215 flags &= ~TH_FIN; 7216 } 7217 } 7218 #ifdef INVARIANTS 7219 /* For debugging */ 7220 rack->r_ctl.rc_rsm_at_retran = rsm; 7221 #endif 7222 /* 7223 * Get standard flags, and add SYN or FIN if requested by 'hidden' 7224 * state flags. 7225 */ 7226 if (tp->t_flags & TF_NEEDFIN) 7227 flags |= TH_FIN; 7228 if (tp->t_flags & TF_NEEDSYN) 7229 flags |= TH_SYN; 7230 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 7231 void *end_rsm; 7232 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 7233 if (end_rsm) 7234 kern_prefetch(end_rsm, &prefetch_rsm); 7235 prefetch_rsm = 1; 7236 } 7237 SOCKBUF_LOCK(sb); 7238 /* 7239 * If in persist timeout with window of 0, send 1 byte. Otherwise, 7240 * if window is small but nonzero and time TF_SENTFIN expired, we 7241 * will send what we can and go to transmit state. 7242 */ 7243 if (tp->t_flags & TF_FORCEDATA) { 7244 if (sendwin == 0) { 7245 /* 7246 * If we still have some data to send, then clear 7247 * the FIN bit. Usually this would happen below 7248 * when it realizes that we aren't sending all the 7249 * data. However, if we have exactly 1 byte of 7250 * unsent data, then it won't clear the FIN bit 7251 * below, and if we are in persist state, we wind up 7252 * sending the packet without recording that we sent 7253 * the FIN bit. 7254 * 7255 * We can't just blindly clear the FIN bit, because 7256 * if we don't have any more data to send then the 7257 * probe will be the FIN itself. 7258 */ 7259 if (sb_offset < sbused(sb)) 7260 flags &= ~TH_FIN; 7261 sendwin = 1; 7262 } else { 7263 if (rack->rc_in_persist) 7264 rack_exit_persist(tp, rack); 7265 /* 7266 * If we are dropping persist mode then we need to 7267 * correct snd_nxt/snd_max and off. 7268 */ 7269 tp->snd_nxt = tp->snd_max; 7270 sb_offset = tp->snd_nxt - tp->snd_una; 7271 } 7272 } 7273 /* 7274 * If snd_nxt == snd_max and we have transmitted a FIN, the 7275 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 7276 * negative length. This can also occur when TCP opens up its 7277 * congestion window while receiving additional duplicate acks after 7278 * fast-retransmit because TCP will reset snd_nxt to snd_max after 7279 * the fast-retransmit. 7280 * 7281 * In the normal retransmit-FIN-only case, however, snd_nxt will be 7282 * set to snd_una, the sb_offset will be 0, and the length may wind 7283 * up 0. 7284 * 7285 * If sack_rxmit is true we are retransmitting from the scoreboard 7286 * in which case len is already set. 
7287 */ 7288 if (sack_rxmit == 0) { 7289 uint32_t avail; 7290 7291 avail = sbavail(sb); 7292 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 7293 sb_offset = tp->snd_nxt - tp->snd_una; 7294 else 7295 sb_offset = 0; 7296 if (IN_RECOVERY(tp->t_flags) == 0) { 7297 if (rack->r_ctl.rc_tlp_new_data) { 7298 /* TLP is forcing out new data */ 7299 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 7300 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 7301 } 7302 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 7303 len = tp->snd_wnd; 7304 else 7305 len = rack->r_ctl.rc_tlp_new_data; 7306 rack->r_ctl.rc_tlp_new_data = 0; 7307 doing_tlp = 1; 7308 } else { 7309 if (sendwin > avail) { 7310 /* use the available */ 7311 if (avail > sb_offset) { 7312 len = (int32_t)(avail - sb_offset); 7313 } else { 7314 len = 0; 7315 } 7316 } else { 7317 if (sendwin > sb_offset) { 7318 len = (int32_t)(sendwin - sb_offset); 7319 } else { 7320 len = 0; 7321 } 7322 } 7323 } 7324 } else { 7325 uint32_t outstanding; 7326 7327 /* 7328 * We are inside of a SACK recovery episode and are 7329 * sending new data, having retransmitted all the 7330 * data possible so far in the scoreboard. 7331 */ 7332 outstanding = tp->snd_max - tp->snd_una; 7333 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) 7334 len = 0; 7335 else if (avail > sb_offset) 7336 len = avail - sb_offset; 7337 else 7338 len = 0; 7339 if (len > 0) { 7340 if (len > rack->r_ctl.rc_prr_sndcnt) 7341 len = rack->r_ctl.rc_prr_sndcnt; 7342 7343 if (len > 0) { 7344 sub_from_prr = 1; 7345 counter_u64_add(rack_rtm_prr_newdata, 1); 7346 } 7347 } 7348 if (len > tp->t_maxseg) { 7349 /* 7350 * We should never send more than a MSS when 7351 * retransmitting or sending new data in prr 7352 * mode unless the override flag is on. Most 7353 * likely the PRR algorithm is not going to 7354 * let us send a lot as well :-) 7355 */ 7356 if (rack->r_ctl.rc_prr_sendalot == 0) 7357 len = tp->t_maxseg; 7358 } else if (len < tp->t_maxseg) { 7359 /* 7360 * Do we send any? The idea here is if the 7361 * send empty's the socket buffer we want to 7362 * do it. However if not then lets just wait 7363 * for our prr_sndcnt to get bigger. 7364 */ 7365 long leftinsb; 7366 7367 leftinsb = sbavail(sb) - sb_offset; 7368 if (leftinsb > len) { 7369 /* This send does not empty the sb */ 7370 len = 0; 7371 } 7372 } 7373 } 7374 } 7375 if (prefetch_so_done == 0) { 7376 kern_prefetch(so, &prefetch_so_done); 7377 prefetch_so_done = 1; 7378 } 7379 /* 7380 * Lop off SYN bit if it has already been sent. However, if this is 7381 * SYN-SENT state and if segment contains data and if we don't know 7382 * that foreign host supports TAO, suppress sending segment. 7383 */ 7384 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 7385 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 7386 if (tp->t_state != TCPS_SYN_RECEIVED) 7387 flags &= ~TH_SYN; 7388 /* 7389 * When sending additional segments following a TFO SYN|ACK, 7390 * do not include the SYN bit. 7391 */ 7392 if (IS_FASTOPEN(tp->t_flags) && 7393 (tp->t_state == TCPS_SYN_RECEIVED)) 7394 flags &= ~TH_SYN; 7395 sb_offset--, len++; 7396 } 7397 /* 7398 * Be careful not to send data and/or FIN on SYN segments. This 7399 * measure is needed to prevent interoperability problems with not 7400 * fully conformant TCP implementations. 
7401 */ 7402 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 7403 len = 0; 7404 flags &= ~TH_FIN; 7405 } 7406 /* 7407 * On TFO sockets, ensure no data is sent in the following cases: 7408 * 7409 * - When retransmitting SYN|ACK on a passively-created socket 7410 * 7411 * - When retransmitting SYN on an actively created socket 7412 * 7413 * - When sending a zero-length cookie (cookie request) on an 7414 * actively created socket 7415 * 7416 * - When the socket is in the CLOSED state (RST is being sent) 7417 */ 7418 if (IS_FASTOPEN(tp->t_flags) && 7419 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 7420 ((tp->t_state == TCPS_SYN_SENT) && 7421 (tp->t_tfo_client_cookie_len == 0)) || 7422 (flags & TH_RST))) 7423 len = 0; 7424 /* Without fast-open there should never be data sent on a SYN */ 7425 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) 7426 len = 0; 7427 if (len <= 0) { 7428 /* 7429 * If FIN has been sent but not acked, but we haven't been 7430 * called to retransmit, len will be < 0. Otherwise, window 7431 * shrank after we sent into it. If window shrank to 0, 7432 * cancel pending retransmit, pull snd_nxt back to (closed) 7433 * window, and set the persist timer if it isn't already 7434 * going. If the window didn't close completely, just wait 7435 * for an ACK. 7436 * 7437 * We also do a general check here to ensure that we will 7438 * set the persist timer when we have data to send, but a 7439 * 0-byte window. This makes sure the persist timer is set 7440 * even if the packet hits one of the "goto send" lines 7441 * below. 7442 */ 7443 len = 0; 7444 if ((tp->snd_wnd == 0) && 7445 (TCPS_HAVEESTABLISHED(tp->t_state)) && 7446 (sb_offset < (int)sbavail(sb))) { 7447 tp->snd_nxt = tp->snd_una; 7448 rack_enter_persist(tp, rack, cts); 7449 } 7450 } 7451 /* len will be >= 0 after this point. */ 7452 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7453 tcp_sndbuf_autoscale(tp, so, sendwin); 7454 /* 7455 * Decide if we can use TCP Segmentation Offloading (if supported by 7456 * hardware). 7457 * 7458 * TSO may only be used if we are in a pure bulk sending state. The 7459 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 7460 * options prevent using TSO. With TSO the TCP header is the same 7461 * (except for the sequence number) for all generated packets. This 7462 * makes it impossible to transmit any options which vary per 7463 * generated segment or packet. 7464 * 7465 * IPv4 handling has a clear separation of ip options and ip header 7466 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 7467 * the right thing below to provide length of just ip options and thus 7468 * checking for ipoptlen is enough to decide if ip options are present. 7469 */ 7470 7471 #ifdef INET6 7472 if (isipv6) 7473 ipoptlen = ip6_optlen(tp->t_inpcb); 7474 else 7475 #endif 7476 if (tp->t_inpcb->inp_options) 7477 ipoptlen = tp->t_inpcb->inp_options->m_len - 7478 offsetof(struct ipoption, ipopt_list); 7479 else 7480 ipoptlen = 0; 7481 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7482 /* 7483 * Pre-calculate here as we save another lookup into the darknesses 7484 * of IPsec that way and can actually decide if TSO is ok. 
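 * The IPsec overhead computed here is folded into ipoptlen below,
 * and a nonzero ipoptlen keeps the TSO check that follows from
 * enabling TSO for this send.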
7485 */ 7486 #ifdef INET6 7487 if (isipv6 && IPSEC_ENABLED(ipv6)) 7488 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 7489 #ifdef INET 7490 else 7491 #endif 7492 #endif /* INET6 */ 7493 #ifdef INET 7494 if (IPSEC_ENABLED(ipv4)) 7495 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 7496 #endif /* INET */ 7497 #endif 7498 7499 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7500 ipoptlen += ipsec_optlen; 7501 #endif 7502 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 7503 (tp->t_port == 0) && 7504 ((tp->t_flags & TF_SIGNATURE) == 0) && 7505 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 7506 ipoptlen == 0) 7507 tso = 1; 7508 { 7509 uint32_t outstanding; 7510 7511 outstanding = tp->snd_max - tp->snd_una; 7512 if (tp->t_flags & TF_SENTFIN) { 7513 /* 7514 * If we sent a fin, snd_max is 1 higher than 7515 * snd_una 7516 */ 7517 outstanding--; 7518 } 7519 if (outstanding > 0) { 7520 /* 7521 * This is sub-optimal. We only send a stand alone 7522 * FIN on its own segment. 7523 */ 7524 if (flags & TH_FIN) { 7525 flags &= ~TH_FIN; 7526 would_have_fin = 1; 7527 } 7528 } else if (sack_rxmit) { 7529 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 7530 flags &= ~TH_FIN; 7531 } else { 7532 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 7533 sbused(sb))) 7534 flags &= ~TH_FIN; 7535 } 7536 } 7537 recwin = sbspace(&so->so_rcv); 7538 7539 /* 7540 * Sender silly window avoidance. We transmit under the following 7541 * conditions when len is non-zero: 7542 * 7543 * - We have a full segment (or more with TSO) - This is the last 7544 * buffer in a write()/send() and we are either idle or running 7545 * NODELAY - we've timed out (e.g. persist timer) - we have more 7546 * then 1/2 the maximum send window's worth of data (receiver may be 7547 * limited the window size) - we need to retransmit 7548 */ 7549 if (len) { 7550 if (len >= tp->t_maxseg) { 7551 pass = 1; 7552 goto send; 7553 } 7554 /* 7555 * NOTE! on localhost connections an 'ack' from the remote 7556 * end may occur synchronously with the output and cause us 7557 * to flush a buffer queued with moretocome. XXX 7558 * 7559 */ 7560 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 7561 (idle || (tp->t_flags & TF_NODELAY)) && 7562 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 7563 (tp->t_flags & TF_NOPUSH) == 0) { 7564 pass = 2; 7565 goto send; 7566 } 7567 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ 7568 pass = 3; 7569 goto send; 7570 } 7571 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 7572 goto send; 7573 } 7574 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 7575 pass = 4; 7576 goto send; 7577 } 7578 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 7579 pass = 5; 7580 goto send; 7581 } 7582 if (sack_rxmit) { 7583 pass = 6; 7584 goto send; 7585 } 7586 } 7587 /* 7588 * Sending of standalone window updates. 7589 * 7590 * Window updates are important when we close our window due to a 7591 * full socket buffer and are opening it again after the application 7592 * reads data from it. Once the window has opened again and the 7593 * remote end starts to send again the ACK clock takes over and 7594 * provides the most current window information. 7595 * 7596 * We must avoid the silly window syndrome whereas every read from 7597 * the receive buffer, no matter how small, causes a window update 7598 * to be sent. We also should avoid sending a flurry of window 7599 * updates when the socket buffer had queued a lot of data and the 7600 * application is doing small reads. 
7601 * 7602 * Prevent a flurry of pointless window updates by only sending an 7603 * update when we can increase the advertized window by more than 7604 * 1/4th of the socket buffer capacity. When the buffer is getting 7605 * full or is very small be more aggressive and send an update 7606 * whenever we can increase by two mss sized segments. In all other 7607 * situations the ACK's to new incoming data will carry further 7608 * window increases. 7609 * 7610 * Don't send an independent window update if a delayed ACK is 7611 * pending (it will get piggy-backed on it) or the remote side 7612 * already has done a half-close and won't send more data. Skip 7613 * this if the connection is in T/TCP half-open state. 7614 */ 7615 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 7616 !(tp->t_flags & TF_DELACK) && 7617 !TCPS_HAVERCVDFIN(tp->t_state)) { 7618 /* 7619 * "adv" is the amount we could increase the window, taking 7620 * into account that we are limited by TCP_MAXWIN << 7621 * tp->rcv_scale. 7622 */ 7623 int32_t adv; 7624 int oldwin; 7625 7626 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 7627 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 7628 oldwin = (tp->rcv_adv - tp->rcv_nxt); 7629 adv -= oldwin; 7630 } else 7631 oldwin = 0; 7632 7633 /* 7634 * If the new window size ends up being the same as the old 7635 * size when it is scaled, then don't force a window update. 7636 */ 7637 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 7638 goto dontupdate; 7639 7640 if (adv >= (int32_t)(2 * tp->t_maxseg) && 7641 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 7642 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 7643 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { 7644 pass = 7; 7645 goto send; 7646 } 7647 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 7648 goto send; 7649 } 7650 dontupdate: 7651 7652 /* 7653 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 7654 * is also a catch-all for the retransmit timer timeout case. 7655 */ 7656 if (tp->t_flags & TF_ACKNOW) { 7657 pass = 8; 7658 goto send; 7659 } 7660 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 7661 pass = 9; 7662 goto send; 7663 } 7664 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 7665 pass = 10; 7666 goto send; 7667 } 7668 /* 7669 * If our state indicates that FIN should be sent and we have not 7670 * yet done so, then we need to send. 7671 */ 7672 if (flags & TH_FIN) { 7673 if ((tp->t_flags & TF_SENTFIN) || 7674 (((tp->t_flags & TF_SENTFIN) == 0) && 7675 (tp->snd_nxt == tp->snd_una))) { 7676 pass = 11; 7677 goto send; 7678 } 7679 } 7680 /* 7681 * No reason to send a segment, just return. 7682 */ 7683 just_return: 7684 SOCKBUF_UNLOCK(sb); 7685 just_return_nolock: 7686 if (tot_len_this_send == 0) 7687 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 7688 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 7689 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 7690 tp->t_flags &= ~TF_FORCEDATA; 7691 return (0); 7692 7693 send: 7694 if (doing_tlp == 0) { 7695 /* 7696 * Data not a TLP, and its not the rxt firing. If it is the 7697 * rxt firing, we want to leave the tlp_in_progress flag on 7698 * so we don't send another TLP. It has to be a rack timer 7699 * or normal send (response to acked data) to clear the tlp 7700 * in progress flag. 
7701 */ 7702 rack->rc_tlp_in_progress = 0; 7703 } 7704 SOCKBUF_LOCK_ASSERT(sb); 7705 if (len > 0) { 7706 if (len >= tp->t_maxseg) 7707 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 7708 else 7709 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 7710 } 7711 /* 7712 * Before ESTABLISHED, force sending of initial options unless TCP 7713 * set not to do any options. NOTE: we assume that the IP/TCP header 7714 * plus TCP options always fit in a single mbuf, leaving room for a 7715 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 7716 * + optlen <= MCLBYTES 7717 */ 7718 optlen = 0; 7719 #ifdef INET6 7720 if (isipv6) 7721 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 7722 else 7723 #endif 7724 hdrlen = sizeof(struct tcpiphdr); 7725 7726 /* 7727 * Compute options for segment. We only have to care about SYN and 7728 * established connection segments. Options for SYN-ACK segments 7729 * are handled in TCP syncache. 7730 */ 7731 to.to_flags = 0; 7732 if ((tp->t_flags & TF_NOOPT) == 0) { 7733 /* Maximum segment size. */ 7734 if (flags & TH_SYN) { 7735 tp->snd_nxt = tp->iss; 7736 to.to_mss = tcp_mssopt(&inp->inp_inc); 7737 #ifdef NETFLIX_TCPOUDP 7738 if (tp->t_port) 7739 to.to_mss -= V_tcp_udp_tunneling_overhead; 7740 #endif 7741 to.to_flags |= TOF_MSS; 7742 7743 /* 7744 * On SYN or SYN|ACK transmits on TFO connections, 7745 * only include the TFO option if it is not a 7746 * retransmit, as the presence of the TFO option may 7747 * have caused the original SYN or SYN|ACK to have 7748 * been dropped by a middlebox. 7749 */ 7750 if (IS_FASTOPEN(tp->t_flags) && 7751 (tp->t_rxtshift == 0)) { 7752 if (tp->t_state == TCPS_SYN_RECEIVED) { 7753 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 7754 to.to_tfo_cookie = 7755 (u_int8_t *)&tp->t_tfo_cookie.server; 7756 to.to_flags |= TOF_FASTOPEN; 7757 wanted_cookie = 1; 7758 } else if (tp->t_state == TCPS_SYN_SENT) { 7759 to.to_tfo_len = 7760 tp->t_tfo_client_cookie_len; 7761 to.to_tfo_cookie = 7762 tp->t_tfo_cookie.client; 7763 to.to_flags |= TOF_FASTOPEN; 7764 wanted_cookie = 1; 7765 /* 7766 * If we wind up having more data to 7767 * send with the SYN than can fit in 7768 * one segment, don't send any more 7769 * until the SYN|ACK comes back from 7770 * the other end. 7771 */ 7772 sendalot = 0; 7773 } 7774 } 7775 } 7776 /* Window scaling. */ 7777 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 7778 to.to_wscale = tp->request_r_scale; 7779 to.to_flags |= TOF_SCALE; 7780 } 7781 /* Timestamps. */ 7782 if ((tp->t_flags & TF_RCVD_TSTMP) || 7783 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 7784 to.to_tsval = cts + tp->ts_offset; 7785 to.to_tsecr = tp->ts_recent; 7786 to.to_flags |= TOF_TS; 7787 } 7788 /* Set receive buffer autosizing timestamp. */ 7789 if (tp->rfbuf_ts == 0 && 7790 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 7791 tp->rfbuf_ts = tcp_ts_getticks(); 7792 /* Selective ACK's. */ 7793 if (flags & TH_SYN) 7794 to.to_flags |= TOF_SACKPERM; 7795 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 7796 tp->rcv_numsacks > 0) { 7797 to.to_flags |= TOF_SACK; 7798 to.to_nsacks = tp->rcv_numsacks; 7799 to.to_sacks = (u_char *)tp->sackblks; 7800 } 7801 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 7802 /* TCP-MD5 (RFC2385). */ 7803 if (tp->t_flags & TF_SIGNATURE) 7804 to.to_flags |= TOF_SIGNATURE; 7805 #endif /* TCP_SIGNATURE */ 7806 7807 /* Processing the options. */ 7808 hdrlen += optlen = tcp_addoptions(&to, opt); 7809 /* 7810 * If we wanted a TFO option to be added, but it was unable 7811 * to fit, ensure no data is sent. 
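 * (wanted_cookie was set above when a TFO option was requested;
 * sending data without the cookie would presumably just be dropped
 * or delayed by the peer, so hold it until the option fits.)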
7812 */ 7813 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 7814 !(to.to_flags & TOF_FASTOPEN)) 7815 len = 0; 7816 } 7817 #ifdef NETFLIX_TCPOUDP 7818 if (tp->t_port) { 7819 if (V_tcp_udp_tunneling_port == 0) { 7820 /* The port was removed?? */ 7821 SOCKBUF_UNLOCK(&so->so_snd); 7822 return (EHOSTUNREACH); 7823 } 7824 hdrlen += sizeof(struct udphdr); 7825 } 7826 #endif 7827 ipoptlen = 0; 7828 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7829 ipoptlen += ipsec_optlen; 7830 #endif 7831 7832 /* 7833 * Adjust data length if insertion of options will bump the packet 7834 * length beyond the t_maxseg length. Clear the FIN bit because we 7835 * cut off the tail of the segment. 7836 */ 7837 if (len + optlen + ipoptlen > tp->t_maxseg) { 7838 if (flags & TH_FIN) { 7839 would_have_fin = 1; 7840 flags &= ~TH_FIN; 7841 } 7842 if (tso) { 7843 uint32_t if_hw_tsomax; 7844 uint32_t moff; 7845 int32_t max_len; 7846 7847 /* extract TSO information */ 7848 if_hw_tsomax = tp->t_tsomax; 7849 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 7850 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 7851 KASSERT(ipoptlen == 0, 7852 ("%s: TSO can't do IP options", __func__)); 7853 7854 /* 7855 * Check if we should limit by maximum payload 7856 * length: 7857 */ 7858 if (if_hw_tsomax != 0) { 7859 /* compute maximum TSO length */ 7860 max_len = (if_hw_tsomax - hdrlen - 7861 max_linkhdr); 7862 if (max_len <= 0) { 7863 len = 0; 7864 } else if (len > max_len) { 7865 sendalot = 1; 7866 len = max_len; 7867 } 7868 } 7869 /* 7870 * Prevent the last segment from being fractional 7871 * unless the send sockbuf can be emptied: 7872 */ 7873 max_len = (tp->t_maxseg - optlen); 7874 if ((sb_offset + len) < sbavail(sb)) { 7875 moff = len % (u_int)max_len; 7876 if (moff != 0) { 7877 len -= moff; 7878 sendalot = 1; 7879 } 7880 } 7881 /* 7882 * In case there are too many small fragments don't 7883 * use TSO: 7884 */ 7885 if (len <= max_len) { 7886 len = max_len; 7887 sendalot = 1; 7888 tso = 0; 7889 } 7890 /* 7891 * Send the FIN in a separate segment after the bulk 7892 * sending is done. We don't trust the TSO 7893 * implementations to clear the FIN flag on all but 7894 * the last segment. 7895 */ 7896 if (tp->t_flags & TF_NEEDFIN) 7897 sendalot = 1; 7898 7899 } else { 7900 len = tp->t_maxseg - optlen - ipoptlen; 7901 sendalot = 1; 7902 } 7903 } else 7904 tso = 0; 7905 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 7906 ("%s: len > IP_MAXPACKET", __func__)); 7907 #ifdef DIAGNOSTIC 7908 #ifdef INET6 7909 if (max_linkhdr + hdrlen > MCLBYTES) 7910 #else 7911 if (max_linkhdr + hdrlen > MHLEN) 7912 #endif 7913 panic("tcphdr too big"); 7914 #endif 7915 7916 /* 7917 * This KASSERT is here to catch edge cases at a well defined place. 7918 * Before, those had triggered (random) panic conditions further 7919 * down. 7920 */ 7921 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7922 if ((len == 0) && 7923 (flags & TH_FIN) && 7924 (sbused(sb))) { 7925 /* 7926 * We have outstanding data, don't send a fin by itself!. 7927 */ 7928 goto just_return; 7929 } 7930 /* 7931 * Grab a header mbuf, attaching a copy of data to be transmitted, 7932 * and initialize the header from the template for sends on this 7933 * connection. 7934 */ 7935 if (len) { 7936 uint32_t max_val; 7937 uint32_t moff; 7938 7939 if (rack->rc_pace_max_segs) 7940 max_val = rack->rc_pace_max_segs * tp->t_maxseg; 7941 else 7942 max_val = len; 7943 /* 7944 * We allow a limit on sending with hptsi. 
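 * max_val was set above to rc_pace_max_segs * t_maxseg when a
 * pacing maximum is configured (otherwise it is just len), so with
 * e.g. rc_pace_max_segs of 6 and a 1448 byte MSS at most 8688 bytes
 * are copied out in this pass (illustrative numbers).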
7945 */ 7946 if (len > max_val) { 7947 len = max_val; 7948 } 7949 #ifdef INET6 7950 if (MHLEN < hdrlen + max_linkhdr) 7951 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 7952 else 7953 #endif 7954 m = m_gethdr(M_NOWAIT, MT_DATA); 7955 7956 if (m == NULL) { 7957 SOCKBUF_UNLOCK(sb); 7958 error = ENOBUFS; 7959 sack_rxmit = 0; 7960 goto out; 7961 } 7962 m->m_data += max_linkhdr; 7963 m->m_len = hdrlen; 7964 7965 /* 7966 * Start the m_copy functions from the closest mbuf to the 7967 * sb_offset in the socket buffer chain. 7968 */ 7969 mb = sbsndptr_noadv(sb, sb_offset, &moff); 7970 if (len <= MHLEN - hdrlen - max_linkhdr) { 7971 m_copydata(mb, moff, (int)len, 7972 mtod(m, caddr_t)+hdrlen); 7973 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7974 sbsndptr_adv(sb, mb, len); 7975 m->m_len += len; 7976 } else { 7977 struct sockbuf *msb; 7978 7979 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7980 msb = NULL; 7981 else 7982 msb = sb; 7983 m->m_next = tcp_m_copym(mb, moff, &len, 7984 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); 7985 if (len <= (tp->t_maxseg - optlen)) { 7986 /* 7987 * Must have ran out of mbufs for the copy 7988 * shorten it to no longer need tso. Lets 7989 * not put on sendalot since we are low on 7990 * mbufs. 7991 */ 7992 tso = 0; 7993 } 7994 if (m->m_next == NULL) { 7995 SOCKBUF_UNLOCK(sb); 7996 (void)m_free(m); 7997 error = ENOBUFS; 7998 sack_rxmit = 0; 7999 goto out; 8000 } 8001 } 8002 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 8003 TCPSTAT_INC(tcps_sndprobe); 8004 #ifdef NETFLIX_STATS 8005 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 8006 stats_voi_update_abs_u32(tp->t_stats, 8007 VOI_TCP_RETXPB, len); 8008 else 8009 stats_voi_update_abs_u64(tp->t_stats, 8010 VOI_TCP_TXPB, len); 8011 #endif 8012 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 8013 if (rsm && (rsm->r_flags & RACK_TLP)) { 8014 /* 8015 * TLP should not count in retran count, but 8016 * in its own bin 8017 */ 8018 counter_u64_add(rack_tlp_retran, 1); 8019 counter_u64_add(rack_tlp_retran_bytes, len); 8020 } else { 8021 tp->t_sndrexmitpack++; 8022 TCPSTAT_INC(tcps_sndrexmitpack); 8023 TCPSTAT_ADD(tcps_sndrexmitbyte, len); 8024 } 8025 #ifdef NETFLIX_STATS 8026 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 8027 len); 8028 #endif 8029 } else { 8030 TCPSTAT_INC(tcps_sndpack); 8031 TCPSTAT_ADD(tcps_sndbyte, len); 8032 #ifdef NETFLIX_STATS 8033 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 8034 len); 8035 #endif 8036 } 8037 /* 8038 * If we're sending everything we've got, set PUSH. (This 8039 * will keep happy those implementations which only give 8040 * data to the user when a buffer fills or a PUSH comes in.) 8041 */ 8042 if (sb_offset + len == sbused(sb) && 8043 sbused(sb) && 8044 !(flags & TH_SYN)) 8045 flags |= TH_PUSH; 8046 8047 /* 8048 * Are we doing hptsi, if so we must calculate the slot. We 8049 * only do hptsi in ESTABLISHED and with no RESET being 8050 * sent where we have data to send. 8051 */ 8052 if (((tp->t_state == TCPS_ESTABLISHED) || 8053 (tp->t_state == TCPS_CLOSE_WAIT) || 8054 ((tp->t_state == TCPS_FIN_WAIT_1) && 8055 ((tp->t_flags & TF_SENTFIN) == 0) && 8056 ((flags & TH_FIN) == 0))) && 8057 ((flags & TH_RST) == 0) && 8058 (rack->rc_always_pace)) { 8059 /* 8060 * We use the most optimistic possible cwnd/srtt for 8061 * sending calculations. This will make our 8062 * calculation anticipate getting more through 8063 * quicker then possible. But thats ok we don't want 8064 * the peer to have a gap in data sending. 
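 * As an illustration (made up numbers): with a 10 ms rc_rack_min_rtt
 * and a 100000 byte cwnd, tr_perms is 10000 bytes per ms; sending
 * 30000 bytes this pass gives a 3 ms slot, and an rc_pace_reduce of
 * 3 then trims 1 ms off of that, leaving a 2 ms slot.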
8065 */ 8066 uint32_t srtt, cwnd, tr_perms = 0; 8067 8068 if (rack->r_ctl.rc_rack_min_rtt) 8069 srtt = rack->r_ctl.rc_rack_min_rtt; 8070 else 8071 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8072 if (rack->r_ctl.rc_rack_largest_cwnd) 8073 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8074 else 8075 cwnd = tp->snd_cwnd; 8076 tr_perms = cwnd / srtt; 8077 if (tr_perms == 0) { 8078 tr_perms = tp->t_maxseg; 8079 } 8080 tot_len_this_send += len; 8081 /* 8082 * Calculate how long this will take to drain, if 8083 * the calculation comes out to zero, thats ok we 8084 * will use send_a_lot to possibly spin around for 8085 * more increasing tot_len_this_send to the point 8086 * that its going to require a pace, or we hit the 8087 * cwnd. Which in that case we are just waiting for 8088 * a ACK. 8089 */ 8090 slot = tot_len_this_send / tr_perms; 8091 /* Now do we reduce the time so we don't run dry? */ 8092 if (slot && rack->rc_pace_reduce) { 8093 int32_t reduce; 8094 8095 reduce = (slot / rack->rc_pace_reduce); 8096 if (reduce < slot) { 8097 slot -= reduce; 8098 } else 8099 slot = 0; 8100 } 8101 if (rack->r_enforce_min_pace && 8102 (slot == 0) && 8103 (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { 8104 /* We are enforcing a minimum pace time of 1ms */ 8105 slot = rack->r_enforce_min_pace; 8106 } 8107 } 8108 SOCKBUF_UNLOCK(sb); 8109 } else { 8110 SOCKBUF_UNLOCK(sb); 8111 if (tp->t_flags & TF_ACKNOW) 8112 TCPSTAT_INC(tcps_sndacks); 8113 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 8114 TCPSTAT_INC(tcps_sndctrl); 8115 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 8116 TCPSTAT_INC(tcps_sndurg); 8117 else 8118 TCPSTAT_INC(tcps_sndwinup); 8119 8120 m = m_gethdr(M_NOWAIT, MT_DATA); 8121 if (m == NULL) { 8122 error = ENOBUFS; 8123 sack_rxmit = 0; 8124 goto out; 8125 } 8126 #ifdef INET6 8127 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 8128 MHLEN >= hdrlen) { 8129 M_ALIGN(m, hdrlen); 8130 } else 8131 #endif 8132 m->m_data += max_linkhdr; 8133 m->m_len = hdrlen; 8134 } 8135 SOCKBUF_UNLOCK_ASSERT(sb); 8136 m->m_pkthdr.rcvif = (struct ifnet *)0; 8137 #ifdef MAC 8138 mac_inpcb_create_mbuf(inp, m); 8139 #endif 8140 #ifdef INET6 8141 if (isipv6) { 8142 ip6 = mtod(m, struct ip6_hdr *); 8143 #ifdef NETFLIX_TCPOUDP 8144 if (tp->t_port) { 8145 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 8146 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8147 udp->uh_dport = tp->t_port; 8148 ulen = hdrlen + len - sizeof(struct ip6_hdr); 8149 udp->uh_ulen = htons(ulen); 8150 th = (struct tcphdr *)(udp + 1); 8151 } else 8152 #endif 8153 th = (struct tcphdr *)(ip6 + 1); 8154 tcpip_fillheaders(inp, ip6, th); 8155 } else 8156 #endif /* INET6 */ 8157 { 8158 ip = mtod(m, struct ip *); 8159 #ifdef TCPDEBUG 8160 ipov = (struct ipovly *)ip; 8161 #endif 8162 #ifdef NETFLIX_TCPOUDP 8163 if (tp->t_port) { 8164 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 8165 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8166 udp->uh_dport = tp->t_port; 8167 ulen = hdrlen + len - sizeof(struct ip); 8168 udp->uh_ulen = htons(ulen); 8169 th = (struct tcphdr *)(udp + 1); 8170 } else 8171 #endif 8172 th = (struct tcphdr *)(ip + 1); 8173 tcpip_fillheaders(inp, ip, th); 8174 } 8175 /* 8176 * Fill in fields, remembering maximum advertised window for use in 8177 * delaying messages about window sizes. If resending a FIN, be sure 8178 * not to use a new sequence number. 
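 * (The FIN consumed a sequence number the first time it was sent,
 * so snd_nxt is backed up by one below to reuse that number.)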
8179 */ 8180 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 8181 tp->snd_nxt == tp->snd_max) 8182 tp->snd_nxt--; 8183 /* 8184 * If we are starting a connection, send ECN setup SYN packet. If we 8185 * are on a retransmit, we may resend those bits a number of times 8186 * as per RFC 3168. 8187 */ 8188 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 8189 if (tp->t_rxtshift >= 1) { 8190 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 8191 flags |= TH_ECE | TH_CWR; 8192 } else 8193 flags |= TH_ECE | TH_CWR; 8194 } 8195 if (tp->t_state == TCPS_ESTABLISHED && 8196 (tp->t_flags & TF_ECN_PERMIT)) { 8197 /* 8198 * If the peer has ECN, mark data packets with ECN capable 8199 * transmission (ECT). Ignore pure ack packets, 8200 * retransmissions and window probes. 8201 */ 8202 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 8203 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 8204 #ifdef INET6 8205 if (isipv6) 8206 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 8207 else 8208 #endif 8209 ip->ip_tos |= IPTOS_ECN_ECT0; 8210 TCPSTAT_INC(tcps_ecn_ect0); 8211 } 8212 /* 8213 * Reply with proper ECN notifications. 8214 */ 8215 if (tp->t_flags & TF_ECN_SND_CWR) { 8216 flags |= TH_CWR; 8217 tp->t_flags &= ~TF_ECN_SND_CWR; 8218 } 8219 if (tp->t_flags & TF_ECN_SND_ECE) 8220 flags |= TH_ECE; 8221 } 8222 /* 8223 * If we are doing retransmissions, then snd_nxt will not reflect 8224 * the first unsent octet. For ACK only packets, we do not want the 8225 * sequence number of the retransmitted packet, we want the sequence 8226 * number of the next unsent octet. So, if there is no data (and no 8227 * SYN or FIN), use snd_max instead of snd_nxt when filling in 8228 * ti_seq. But if we are in persist state, snd_max might reflect 8229 * one byte beyond the right edge of the window, so use snd_nxt in 8230 * that case, since we know we aren't doing a retransmission. 8231 * (retransmit and persist are mutually exclusive...) 8232 */ 8233 if (sack_rxmit == 0) { 8234 if (len || (flags & (TH_SYN | TH_FIN)) || 8235 rack->rc_in_persist) { 8236 th->th_seq = htonl(tp->snd_nxt); 8237 rack_seq = tp->snd_nxt; 8238 } else if (flags & TH_RST) { 8239 /* 8240 * For a Reset send the last cum ack in sequence 8241 * (this like any other choice may still generate a 8242 * challenge ack, if a ack-update packet is in 8243 * flight). 8244 */ 8245 th->th_seq = htonl(tp->snd_una); 8246 rack_seq = tp->snd_una; 8247 } else { 8248 th->th_seq = htonl(tp->snd_max); 8249 rack_seq = tp->snd_max; 8250 } 8251 } else { 8252 th->th_seq = htonl(rsm->r_start); 8253 rack_seq = rsm->r_start; 8254 } 8255 th->th_ack = htonl(tp->rcv_nxt); 8256 if (optlen) { 8257 bcopy(opt, th + 1, optlen); 8258 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 8259 } 8260 th->th_flags = flags; 8261 /* 8262 * Calculate receive window. Don't shrink window, but avoid silly 8263 * window syndrome. 8264 */ 8265 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 8266 recwin < (long)tp->t_maxseg) 8267 recwin = 0; 8268 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 8269 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 8270 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 8271 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 8272 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 8273 8274 /* 8275 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 8276 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 8277 * handled in syncache. 
8278 */ 8279 if (flags & TH_SYN) 8280 th->th_win = htons((u_short) 8281 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 8282 else 8283 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 8284 /* 8285 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 8286 * window. This may cause the remote transmitter to stall. This 8287 * flag tells soreceive() to disable delayed acknowledgements when 8288 * draining the buffer. This can occur if the receiver is 8289 * attempting to read more data than can be buffered prior to 8290 * transmitting on the connection. 8291 */ 8292 if (th->th_win == 0) { 8293 tp->t_sndzerowin++; 8294 tp->t_flags |= TF_RXWIN0SENT; 8295 } else 8296 tp->t_flags &= ~TF_RXWIN0SENT; 8297 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 8298 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 8299 th->th_flags |= TH_URG; 8300 } else 8301 /* 8302 * If no urgent pointer to send, then we pull the urgent 8303 * pointer to the left edge of the send window so that it 8304 * doesn't drift into the send window on sequence number 8305 * wraparound. 8306 */ 8307 tp->snd_up = tp->snd_una; /* drag it along */ 8308 8309 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 8310 if (to.to_flags & TOF_SIGNATURE) { 8311 /* 8312 * Calculate MD5 signature and put it into the place 8313 * determined before. 8314 * NOTE: since TCP options buffer doesn't point into 8315 * mbuf's data, calculate offset and use it. 8316 */ 8317 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 8318 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 8319 /* 8320 * Do not send segment if the calculation of MD5 8321 * digest has failed. 8322 */ 8323 goto out; 8324 } 8325 } 8326 #endif 8327 8328 /* 8329 * Put TCP length in extended header, and then checksum extended 8330 * header and data. 8331 */ 8332 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 8333 #ifdef INET6 8334 if (isipv6) { 8335 /* 8336 * ip6_plen is not need to be filled now, and will be filled 8337 * in ip6_output. 8338 */ 8339 if (tp->t_port) { 8340 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 8341 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8342 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 8343 th->th_sum = htons(0); 8344 } else { 8345 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 8346 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8347 th->th_sum = in6_cksum_pseudo(ip6, 8348 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 8349 0); 8350 } 8351 } 8352 #endif 8353 #if defined(INET6) && defined(INET) 8354 else 8355 #endif 8356 #ifdef INET 8357 { 8358 if (tp->t_port) { 8359 m->m_pkthdr.csum_flags = CSUM_UDP; 8360 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8361 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 8362 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 8363 th->th_sum = htons(0); 8364 } else { 8365 m->m_pkthdr.csum_flags = CSUM_TCP; 8366 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8367 th->th_sum = in_pseudo(ip->ip_src.s_addr, 8368 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 8369 IPPROTO_TCP + len + optlen)); 8370 } 8371 /* IP version must be set here for ipv4/ipv6 checking later */ 8372 KASSERT(ip->ip_v == IPVERSION, 8373 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 8374 } 8375 #endif 8376 8377 /* 8378 * Enable TSO and specify the size of the segments. The TCP pseudo 8379 * header checksum is always provided. XXX: Fixme: This is currently 8380 * not the case for IPv6. 
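 * tso_segsz is the per-segment payload, t_maxseg less the TCP
 * options, so each segment the hardware generates still fits within
 * one MSS once the replicated options are added back.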
8381 */ 8382 if (tso) { 8383 KASSERT(len > tp->t_maxseg - optlen, 8384 ("%s: len <= tso_segsz", __func__)); 8385 m->m_pkthdr.csum_flags |= CSUM_TSO; 8386 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 8387 } 8388 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8389 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 8390 ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", 8391 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 8392 #else 8393 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 8394 ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", 8395 __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 8396 #endif 8397 8398 #ifdef TCP_HHOOK 8399 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 8400 hhook_run_tcp_est_out(tp, th, &to, len, tso); 8401 #endif 8402 8403 #ifdef TCPDEBUG 8404 /* 8405 * Trace. 8406 */ 8407 if (so->so_options & SO_DEBUG) { 8408 u_short save = 0; 8409 8410 #ifdef INET6 8411 if (!isipv6) 8412 #endif 8413 { 8414 save = ipov->ih_len; 8415 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 8416 * (th->th_off << 2) */ ); 8417 } 8418 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 8419 #ifdef INET6 8420 if (!isipv6) 8421 #endif 8422 ipov->ih_len = save; 8423 } 8424 #endif /* TCPDEBUG */ 8425 8426 /* We're getting ready to send; log now. */ 8427 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 8428 union tcp_log_stackspecific log; 8429 8430 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 8431 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 8432 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 8433 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 8434 if (rsm || sack_rxmit) { 8435 log.u_bbr.flex8 = 1; 8436 } else { 8437 log.u_bbr.flex8 = 0; 8438 } 8439 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 8440 len, &log, false, NULL, NULL, 0, NULL); 8441 } else 8442 lgb = NULL; 8443 8444 /* 8445 * Fill in IP length and desired time to live and send to IP level. 8446 * There should be a better way to handle ttl and tos; we could keep 8447 * them in the template, but need a way to checksum without them. 8448 */ 8449 /* 8450 * m->m_pkthdr.len should have been set before cksum calcuration, 8451 * because in6_cksum() need it. 8452 */ 8453 #ifdef INET6 8454 if (isipv6) { 8455 /* 8456 * we separately set hoplimit for every segment, since the 8457 * user might want to change the value via setsockopt. Also, 8458 * desired default hop limit might be changed via Neighbor 8459 * Discovery. 8460 */ 8461 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 8462 8463 /* 8464 * Set the packet size here for the benefit of DTrace 8465 * probes. ip6_output() will set it properly; it's supposed 8466 * to include the option header lengths as well. 8467 */ 8468 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 8469 8470 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 8471 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8472 else 8473 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8474 8475 if (tp->t_state == TCPS_SYN_SENT) 8476 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 8477 8478 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 8479 /* TODO: IPv6 IP6TOS_ECT bit on */ 8480 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 8481 &inp->inp_route6, 8482 ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 8483 NULL, NULL, inp); 8484 8485 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) 8486 mtu = inp->inp_route6.ro_rt->rt_mtu; 8487 } 8488 #endif /* INET6 */ 8489 #if defined(INET) && defined(INET6) 8490 else 8491 #endif 8492 #ifdef INET 8493 { 8494 ip->ip_len = htons(m->m_pkthdr.len); 8495 #ifdef INET6 8496 if (inp->inp_vflag & INP_IPV6PROTO) 8497 ip->ip_ttl = in6_selecthlim(inp, NULL); 8498 #endif /* INET6 */ 8499 /* 8500 * If we do path MTU discovery, then we set DF on every 8501 * packet. This might not be the best thing to do according 8502 * to RFC3390 Section 2. However the tcp hostcache migitates 8503 * the problem so it affects only the first tcp connection 8504 * with a host. 8505 * 8506 * NB: Don't set DF on small MTU/MSS to have a safe 8507 * fallback. 8508 */ 8509 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 8510 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8511 if (tp->t_port == 0 || len < V_tcp_minmss) { 8512 ip->ip_off |= htons(IP_DF); 8513 } 8514 } else { 8515 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8516 } 8517 8518 if (tp->t_state == TCPS_SYN_SENT) 8519 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 8520 8521 TCP_PROBE5(send, NULL, tp, ip, tp, th); 8522 8523 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 8524 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 8525 inp); 8526 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) 8527 mtu = inp->inp_route.ro_rt->rt_mtu; 8528 } 8529 #endif /* INET */ 8530 8531 out: 8532 if (lgb) { 8533 lgb->tlb_errno = error; 8534 lgb = NULL; 8535 } 8536 /* 8537 * In transmit state, time the transmission and arrange for the 8538 * retransmit. In persist state, just set snd_max. 8539 */ 8540 if (error == 0) { 8541 if (len == 0) 8542 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 8543 else if (len == 1) { 8544 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8545 } else if (len > 1) { 8546 int idx; 8547 8548 idx = (len / tp->t_maxseg) + 3; 8549 if (idx >= TCP_MSS_ACCT_ATIMER) 8550 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 8551 else 8552 counter_u64_add(rack_out_size[idx], 1); 8553 } 8554 } 8555 if (sub_from_prr && (error == 0)) { 8556 rack->r_ctl.rc_prr_sndcnt -= len; 8557 } 8558 sub_from_prr = 0; 8559 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 8560 pass, rsm); 8561 if ((tp->t_flags & TF_FORCEDATA) == 0 || 8562 (rack->rc_in_persist == 0)) { 8563 #ifdef NETFLIX_STATS 8564 tcp_seq startseq = tp->snd_nxt; 8565 #endif 8566 8567 /* 8568 * Advance snd_nxt over sequence space of this segment. 8569 */ 8570 if (error) 8571 /* We don't log or do anything with errors */ 8572 goto timer; 8573 8574 if (flags & (TH_SYN | TH_FIN)) { 8575 if (flags & TH_SYN) 8576 tp->snd_nxt++; 8577 if (flags & TH_FIN) { 8578 tp->snd_nxt++; 8579 tp->t_flags |= TF_SENTFIN; 8580 } 8581 } 8582 /* In the ENOBUFS case we do *not* update snd_max */ 8583 if (sack_rxmit) 8584 goto timer; 8585 8586 tp->snd_nxt += len; 8587 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 8588 if (tp->snd_una == tp->snd_max) { 8589 /* 8590 * Update the time we just added data since 8591 * none was outstanding. 
8592 */ 8593 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8594 tp->t_acktime = ticks; 8595 } 8596 tp->snd_max = tp->snd_nxt; 8597 #ifdef NETFLIX_STATS 8598 if (!(tp->t_flags & TF_GPUTINPROG) && len) { 8599 tp->t_flags |= TF_GPUTINPROG; 8600 tp->gput_seq = startseq; 8601 tp->gput_ack = startseq + 8602 ulmin(sbavail(sb) - sb_offset, sendwin); 8603 tp->gput_ts = tcp_ts_getticks(); 8604 } 8605 #endif 8606 } 8607 /* 8608 * Set retransmit timer if not currently set, and not doing 8609 * a pure ack or a keep-alive probe. Initial value for 8610 * retransmit timer is smoothed round-trip time + 2 * 8611 * round-trip time variance. Initialize shift counter which 8612 * is used for backoff of retransmit time. 8613 */ 8614 timer: 8615 if ((tp->snd_wnd == 0) && 8616 TCPS_HAVEESTABLISHED(tp->t_state)) { 8617 /* 8618 * If the persist timer was set above (right before 8619 * the goto send), it still needs to be on. Let's 8620 * make sure all is canceled. If the persist timer 8621 * is not running, we want to get it started. 8622 */ 8623 if (rack->rc_in_persist == 0) { 8624 rack_enter_persist(tp, rack, cts); 8625 } 8626 } 8627 } else { 8628 /* 8629 * Persist case, update snd_max but since we are in persist 8630 * mode (no window) we do not update snd_nxt. 8631 */ 8632 int32_t xlen = len; 8633 8634 if (error) 8635 goto nomore; 8636 8637 if (flags & TH_SYN) 8638 ++xlen; 8639 if (flags & TH_FIN) { 8640 ++xlen; 8641 tp->t_flags |= TF_SENTFIN; 8642 } 8643 /* In the ENOBUFS case we do *not* update snd_max */ 8644 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 8645 if (tp->snd_una == tp->snd_max) { 8646 /* 8647 * Update the time we just added data since 8648 * none was outstanding. 8649 */ 8650 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8651 tp->t_acktime = ticks; 8652 } 8653 tp->snd_max = tp->snd_nxt + len; 8654 } 8655 } 8656 nomore: 8657 if (error) { 8658 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 8659 /* 8660 * Failures do not advance the seq counter above. For the 8661 * case of ENOBUFS we will fall out and retry in 1ms with 8662 * the hpts. Everything else will just have to retransmit 8663 * with the timer. 8664 * 8665 * In any case, we do not want to loop around for another 8666 * send without a good reason. 8667 */ 8668 sendalot = 0; 8669 switch (error) { 8670 case EPERM: 8671 tp->t_flags &= ~TF_FORCEDATA; 8672 tp->t_softerror = error; 8673 return (error); 8674 case ENOBUFS: 8675 if (slot == 0) { 8676 /* 8677 * Pace us right away to retry in a short 8678 * time 8679 */ 8680 slot = 1 + rack->rc_enobuf; 8681 if (rack->rc_enobuf < 255) 8682 rack->rc_enobuf++; 8683 if (slot > (rack->rc_rack_rtt / 2)) { 8684 slot = rack->rc_rack_rtt / 2; 8685 } 8686 if (slot < 10) 8687 slot = 10; 8688 } 8689 counter_u64_add(rack_saw_enobuf, 1); 8690 error = 0; 8691 goto enobufs; 8692 case EMSGSIZE: 8693 /* 8694 * For some reason the interface we used initially 8695 * to send segments changed to another or lowered 8696 * its MTU. If TSO was active we either got an 8697 * interface without TSO capabilities or TSO was 8698 * turned off. If we obtained mtu from ip_output() 8699 * then update it and try again.
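 * tcp_mss_update() installs the smaller MSS and the goto again
 * re-runs the send loop so the pending data is re-segmented to fit.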
8700 */ 8701 if (tso) 8702 tp->t_flags &= ~TF_TSO; 8703 if (mtu != 0) { 8704 tcp_mss_update(tp, -1, mtu, NULL, NULL); 8705 goto again; 8706 } 8707 slot = 10; 8708 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8709 tp->t_flags &= ~TF_FORCEDATA; 8710 return (error); 8711 case ENETUNREACH: 8712 counter_u64_add(rack_saw_enetunreach, 1); 8713 case EHOSTDOWN: 8714 case EHOSTUNREACH: 8715 case ENETDOWN: 8716 if (TCPS_HAVERCVDSYN(tp->t_state)) { 8717 tp->t_softerror = error; 8718 } 8719 /* FALLTHROUGH */ 8720 default: 8721 slot = 10; 8722 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8723 tp->t_flags &= ~TF_FORCEDATA; 8724 return (error); 8725 } 8726 } else { 8727 rack->rc_enobuf = 0; 8728 } 8729 TCPSTAT_INC(tcps_sndtotal); 8730 8731 /* 8732 * Data sent (as far as we can tell). If this advertises a larger 8733 * window than any other segment, then remember the size of the 8734 * advertised window. Any pending ACK has now been sent. 8735 */ 8736 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 8737 tp->rcv_adv = tp->rcv_nxt + recwin; 8738 tp->last_ack_sent = tp->rcv_nxt; 8739 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 8740 enobufs: 8741 rack->r_tlp_running = 0; 8742 if ((flags & TH_RST) || (would_have_fin == 1)) { 8743 /* 8744 * We don't send again after a RST. We also do *not* send 8745 * again if we would have had a FIN, but now have 8746 * outstanding data. 8747 */ 8748 slot = 0; 8749 sendalot = 0; 8750 } 8751 if (slot) { 8752 /* set the rack tcb into the slot N */ 8753 counter_u64_add(rack_paced_segments, 1); 8754 } else if (sendalot) { 8755 if (len) 8756 counter_u64_add(rack_unpaced_segments, 1); 8757 sack_rxmit = 0; 8758 tp->t_flags &= ~TF_FORCEDATA; 8759 goto again; 8760 } else if (len) { 8761 counter_u64_add(rack_unpaced_segments, 1); 8762 } 8763 tp->t_flags &= ~TF_FORCEDATA; 8764 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 8765 return (error); 8766 } 8767 8768 /* 8769 * rack_ctloutput() must drop the inpcb lock before performing copyin on 8770 * socket option arguments. When it re-acquires the lock after the copy, it 8771 * has to revalidate that the connection is still valid for the socket 8772 * option.
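 * The set/get routines below follow that pattern: the first switch
 * only filters the option names handled here, sooptcopyin() runs
 * with the lock dropped, and after re-locking we bail out with
 * ECONNRESET if the inpcb has entered TIMEWAIT or been dropped.
 *
 * Illustrative userland usage (assumes a socket that has already
 * been switched to the rack stack):
 *
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *	    &one, sizeof(one));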
8773 */ 8774 static int 8775 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 8776 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8777 { 8778 int32_t error = 0, optval; 8779 8780 switch (sopt->sopt_name) { 8781 case TCP_RACK_PROP_RATE: 8782 case TCP_RACK_PROP: 8783 case TCP_RACK_TLP_REDUCE: 8784 case TCP_RACK_EARLY_RECOV: 8785 case TCP_RACK_PACE_ALWAYS: 8786 case TCP_DELACK: 8787 case TCP_RACK_PACE_REDUCE: 8788 case TCP_RACK_PACE_MAX_SEG: 8789 case TCP_RACK_PRR_SENDALOT: 8790 case TCP_RACK_MIN_TO: 8791 case TCP_RACK_EARLY_SEG: 8792 case TCP_RACK_REORD_THRESH: 8793 case TCP_RACK_REORD_FADE: 8794 case TCP_RACK_TLP_THRESH: 8795 case TCP_RACK_PKT_DELAY: 8796 case TCP_RACK_TLP_USE: 8797 case TCP_RACK_TLP_INC_VAR: 8798 case TCP_RACK_IDLE_REDUCE_HIGH: 8799 case TCP_RACK_MIN_PACE: 8800 case TCP_RACK_MIN_PACE_SEG: 8801 case TCP_BBR_RACK_RTT_USE: 8802 case TCP_DATA_AFTER_CLOSE: 8803 break; 8804 default: 8805 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8806 break; 8807 } 8808 INP_WUNLOCK(inp); 8809 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 8810 if (error) 8811 return (error); 8812 INP_WLOCK(inp); 8813 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 8814 INP_WUNLOCK(inp); 8815 return (ECONNRESET); 8816 } 8817 tp = intotcpcb(inp); 8818 rack = (struct tcp_rack *)tp->t_fb_ptr; 8819 switch (sopt->sopt_name) { 8820 case TCP_RACK_PROP_RATE: 8821 if ((optval <= 0) || (optval >= 100)) { 8822 error = EINVAL; 8823 break; 8824 } 8825 RACK_OPTS_INC(tcp_rack_prop_rate); 8826 rack->r_ctl.rc_prop_rate = optval; 8827 break; 8828 case TCP_RACK_TLP_USE: 8829 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 8830 error = EINVAL; 8831 break; 8832 } 8833 RACK_OPTS_INC(tcp_tlp_use); 8834 rack->rack_tlp_threshold_use = optval; 8835 break; 8836 case TCP_RACK_PROP: 8837 /* RACK proportional rate reduction (bool) */ 8838 RACK_OPTS_INC(tcp_rack_prop); 8839 rack->r_ctl.rc_prop_reduce = optval; 8840 break; 8841 case TCP_RACK_TLP_REDUCE: 8842 /* RACK TLP cwnd reduction (bool) */ 8843 RACK_OPTS_INC(tcp_rack_tlp_reduce); 8844 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 8845 break; 8846 case TCP_RACK_EARLY_RECOV: 8847 /* Should recovery happen early (bool) */ 8848 RACK_OPTS_INC(tcp_rack_early_recov); 8849 rack->r_ctl.rc_early_recovery = optval; 8850 break; 8851 case TCP_RACK_PACE_ALWAYS: 8852 /* Use the always pace method (bool) */ 8853 RACK_OPTS_INC(tcp_rack_pace_always); 8854 if (optval > 0) 8855 rack->rc_always_pace = 1; 8856 else 8857 rack->rc_always_pace = 0; 8858 break; 8859 case TCP_RACK_PACE_REDUCE: 8860 /* RACK Hptsi reduction factor (divisor) */ 8861 RACK_OPTS_INC(tcp_rack_pace_reduce); 8862 if (optval) 8863 /* Must be non-zero */ 8864 rack->rc_pace_reduce = optval; 8865 else 8866 error = EINVAL; 8867 break; 8868 case TCP_RACK_PACE_MAX_SEG: 8869 /* Max segments in a pace */ 8870 RACK_OPTS_INC(tcp_rack_max_seg); 8871 rack->rc_pace_max_segs = optval; 8872 break; 8873 case TCP_RACK_PRR_SENDALOT: 8874 /* Allow PRR to send more than one seg */ 8875 RACK_OPTS_INC(tcp_rack_prr_sendalot); 8876 rack->r_ctl.rc_prr_sendalot = optval; 8877 break; 8878 case TCP_RACK_MIN_TO: 8879 /* Minimum time between rack t-o's in ms */ 8880 RACK_OPTS_INC(tcp_rack_min_to); 8881 rack->r_ctl.rc_min_to = optval; 8882 break; 8883 case TCP_RACK_EARLY_SEG: 8884 /* If early recovery max segments */ 8885 RACK_OPTS_INC(tcp_rack_early_seg); 8886 rack->r_ctl.rc_early_recovery_segs = optval; 8887 break; 8888 case TCP_RACK_REORD_THRESH: 8889 /* RACK reorder threshold (shift amount) */ 8890 
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	int32_t error, optval;

	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	switch (sopt->sopt_name) {
	case TCP_RACK_PROP_RATE:
		optval = rack->r_ctl.rc_prop_rate;
		break;
	case TCP_RACK_PROP:
		/* RACK proportional rate reduction (bool) */
		optval = rack->r_ctl.rc_prop_reduce;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_RACK_EARLY_RECOV:
		/* Should recovery happen early (bool) */
		optval = rack->r_ctl.rc_early_recovery;
		break;
	case TCP_RACK_PACE_REDUCE:
		/* RACK Hptsi reduction factor (divisor) */
		optval = rack->rc_pace_reduce;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_pace_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_EARLY_SEG:
		/* Max segments to send during early recovery */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_TLP_INC_VAR:
		/* Does TLP include rtt variance in t-o */
		optval = rack->r_ctl.rc_prr_inc_var;
		break;
	case TCP_RACK_IDLE_REDUCE_HIGH:
		optval = rack->r_idle_reduce_largest;
		break;
	case TCP_RACK_MIN_PACE:
		optval = rack->r_enforce_min_pace;
		break;
	case TCP_RACK_MIN_PACE_SEG:
		optval = rack->r_min_pace_seg_thresh;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
	INP_WUNLOCK(inp);
	error = sooptcopyout(sopt, &optval, sizeof optval);
	return (error);
}

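/*
 * Illustrative sketch (never compiled here): reading one of the integer
 * options back through rack_get_sockopt() above. Assumes the socket is
 * already using this stack and that <netinet/tcp.h> exposes TCP_RACK_MIN_TO;
 * the helper name is made up for the example.
 */
#if 0
#include <sys/socket.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <stdio.h>
#include <err.h>

static void
example_show_rack_min_to(int fd)
{
	int val;
	socklen_t len = sizeof(val);

	if (getsockopt(fd, IPPROTO_TCP, TCP_RACK_MIN_TO, &val, &len) == -1)
		err(1, "getsockopt(TCP_RACK_MIN_TO)");
	printf("minimum rack timeout: %d ms\n", val);
}
#endif
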
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		/* Huh? */
		goto out;
	}
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(so, sopt, inp, tp, rack));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(so, sopt, inp, tp, rack));
	}
out:
	INP_WUNLOCK(inp);
	return (error);
}

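/*
 * Illustrative sketch (never compiled here): the handlers above only run
 * for connections that have been handed to this function block. A userland
 * program would typically select the block by name before connecting,
 * along these lines; it assumes STACKNAME expands to "rack" and that
 * <netinet/tcp.h> provides TCP_FUNCTION_BLK and struct tcp_function_set.
 */
#if 0
#include <sys/socket.h>

#include <netinet/in.h>
#include <netinet/tcp.h>

#include <string.h>
#include <err.h>

static void
example_select_rack_stack(int fd)
{
	struct tcp_function_set tfs;

	memset(&tfs, 0, sizeof(tfs));
	strncpy(tfs.function_set_name, "rack",
	    sizeof(tfs.function_set_name) - 1);
	if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK,
	    &tfs, sizeof(tfs)) == -1)
		err(1, "setsockopt(TCP_FUNCTION_BLK)");
}
#endif
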
struct tcp_function_block __tcp_rack = {
	.tfb_tcp_block_name = __XSTRING(STACKNAME),
	.tfb_tcp_output = rack_output,
	.tfb_tcp_do_segment = rack_do_segment,
	.tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
	.tfb_tcp_ctloutput = rack_ctloutput,
	.tfb_tcp_fb_init = rack_init,
	.tfb_tcp_fb_fini = rack_fini,
	.tfb_tcp_timer_stop_all = rack_stopall,
	.tfb_tcp_timer_activate = rack_timer_activate,
	.tfb_tcp_timer_active = rack_timer_active,
	.tfb_tcp_timer_stop = rack_timer_stop,
	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
	.tfb_tcp_handoff_ok = rack_handoff_ok
};

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
		    __XSTRING(STACKNAME),
		    CTLFLAG_RW, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

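/*
 * Illustrative sketch (never compiled here): once the module has been
 * loaded (e.g. with "kldload tcp_rack"), the names registered by
 * tcp_addrack() above should appear in the functions list exported by the
 * TCP function-block framework. The sysctl name below is an assumption
 * about that framework, not something defined in this file.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>
#include <err.h>

int
main(void)
{
	char buf[1024];
	size_t len = sizeof(buf);

	if (sysctlbyname("net.inet.tcp.functions_available", buf, &len,
	    NULL, 0) == -1)
		err(1, "sysctlbyname(net.inet.tcp.functions_available)");
	printf("%.*s\n", (int)len, buf);
	return (0);
}
#endif
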
static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);