/*-
 * Copyright (c) 2016-2018
 * Netflix Inc.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef NETFLIX_STATS
#include <sys/stats.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#ifdef NETFLIX_CWV
#include <netinet/tcp_newcwv.h>
#endif
#include <netinet/tcp_fastopen.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#ifndef TCPHPTS
fatal error missing option TCPHPTS in the build;
#endif

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function.
 * Each state is simplified due to the fact that the original
 * do_segment has been decomposed and we *know* what state we are in
 * (no switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_precache = 1;
static int32_t rack_tlp_thresh = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
static int32_t rack_pkt_delay = 1;
static int32_t rack_inc_var = 0;	/* For TLP */
static int32_t rack_reduce_largest_on_idle = 0;
static int32_t rack_min_pace_time = 0;
static int32_t rack_min_pace_time_seg_req = 6;
static int32_t rack_early_recovery = 1;
static int32_t rack_early_recovery_max_seg = 6;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
/*
 * Currently regular tcp has a rto_min of 30ms
 * and the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
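 * (That is, 30 ms * (2^12 - 1) = 122,850 ms of cumulative
 * backoff across the 12 doublings.)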
191 */ 192 static int32_t rack_tlp_min = 10; 193 static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ 194 static int32_t rack_rto_max = 30000; /* 30 seconds */ 195 static const int32_t rack_free_cache = 2; 196 static int32_t rack_hptsi_segments = 40; 197 static int32_t rack_rate_sample_method = USE_RTT_LOW; 198 static int32_t rack_pace_every_seg = 1; 199 static int32_t rack_delayed_ack_time = 200; /* 200ms */ 200 static int32_t rack_slot_reduction = 4; 201 static int32_t rack_lower_cwnd_at_tlp = 0; 202 static int32_t rack_use_proportional_reduce = 0; 203 static int32_t rack_proportional_rate = 10; 204 static int32_t rack_tlp_max_resend = 2; 205 static int32_t rack_limited_retran = 0; 206 static int32_t rack_always_send_oldest = 0; 207 static int32_t rack_sack_block_limit = 128; 208 static int32_t rack_use_sack_filter = 1; 209 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; 210 211 /* Rack specific counters */ 212 counter_u64_t rack_badfr; 213 counter_u64_t rack_badfr_bytes; 214 counter_u64_t rack_rtm_prr_retran; 215 counter_u64_t rack_rtm_prr_newdata; 216 counter_u64_t rack_timestamp_mismatch; 217 counter_u64_t rack_reorder_seen; 218 counter_u64_t rack_paced_segments; 219 counter_u64_t rack_unpaced_segments; 220 counter_u64_t rack_saw_enobuf; 221 counter_u64_t rack_saw_enetunreach; 222 223 /* Tail loss probe counters */ 224 counter_u64_t rack_tlp_tot; 225 counter_u64_t rack_tlp_newdata; 226 counter_u64_t rack_tlp_retran; 227 counter_u64_t rack_tlp_retran_bytes; 228 counter_u64_t rack_tlp_retran_fail; 229 counter_u64_t rack_to_tot; 230 counter_u64_t rack_to_arm_rack; 231 counter_u64_t rack_to_arm_tlp; 232 counter_u64_t rack_to_alloc; 233 counter_u64_t rack_to_alloc_hard; 234 counter_u64_t rack_to_alloc_emerg; 235 236 counter_u64_t rack_sack_proc_all; 237 counter_u64_t rack_sack_proc_short; 238 counter_u64_t rack_sack_proc_restart; 239 counter_u64_t rack_runt_sacks; 240 counter_u64_t rack_used_tlpmethod; 241 counter_u64_t rack_used_tlpmethod2; 242 counter_u64_t rack_enter_tlp_calc; 243 counter_u64_t rack_input_idle_reduces; 244 counter_u64_t rack_tlp_does_nada; 245 246 /* Temp CPU counters */ 247 counter_u64_t rack_find_high; 248 249 counter_u64_t rack_progress_drops; 250 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 251 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 252 253 static void 254 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 255 256 static int 257 rack_process_ack(struct mbuf *m, struct tcphdr *th, 258 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked, 259 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 260 static int 261 rack_process_data(struct mbuf *m, struct tcphdr *th, 262 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 263 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 264 static void 265 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 266 struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); 267 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 268 static struct rack_sendmap * 269 rack_check_recovery_mode(struct tcpcb *tp, 270 uint32_t tsused); 271 static void 272 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, 273 uint32_t type); 274 static void rack_counter_destroy(void); 275 static int 276 rack_ctloutput(struct socket *so, struct sockopt *sopt, 277 struct inpcb *inp, struct tcpcb *tp); 278 static int32_t rack_ctor(void *mem, int32_t size, 
void *arg, int32_t how); 279 static void 280 rack_do_segment(struct mbuf *m, struct tcphdr *th, 281 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 282 uint8_t iptos, int32_t ti_locked); 283 static void rack_dtor(void *mem, int32_t size, void *arg); 284 static void 285 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 286 uint32_t t, uint32_t cts); 287 static struct rack_sendmap * 288 rack_find_high_nonack(struct tcp_rack *rack, 289 struct rack_sendmap *rsm); 290 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 291 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 292 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 293 static int 294 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 295 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 296 static int32_t rack_handoff_ok(struct tcpcb *tp); 297 static int32_t rack_init(struct tcpcb *tp); 298 static void rack_init_sysctls(void); 299 static void 300 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 301 struct tcphdr *th); 302 static void 303 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 304 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 305 uint8_t pass, struct rack_sendmap *hintrsm); 306 static void 307 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 308 struct rack_sendmap *rsm); 309 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); 310 static int32_t rack_output(struct tcpcb *tp); 311 static void 312 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, 313 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 314 uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv); 315 316 static uint32_t 317 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 318 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 319 uint32_t cts); 320 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); 321 static void rack_remxt_tmr(struct tcpcb *tp); 322 static int 323 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 324 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 325 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 326 static int32_t rack_stopall(struct tcpcb *tp); 327 static void 328 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, 329 uint32_t delta); 330 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); 331 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 332 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); 333 static uint32_t 334 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 335 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); 336 static void 337 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 338 struct rack_sendmap *rsm, uint32_t ts); 339 static int 340 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 341 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); 342 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 343 static void 344 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, 345 struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val); 346 static int 347 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 348 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 349 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t 
nxt_pkt); 350 static int 351 rack_do_closing(struct mbuf *m, struct tcphdr *th, 352 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 353 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 354 static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked); 355 static void 356 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, 357 struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val); 358 static void 359 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, 360 struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen); 361 static int 362 rack_do_established(struct mbuf *m, struct tcphdr *th, 363 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 364 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 365 static int 366 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 367 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 368 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt); 369 static int 370 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 371 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 372 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 373 static int 374 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 375 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 376 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 377 static int 378 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 379 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 380 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 381 static int 382 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 383 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 384 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 385 static int 386 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 387 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 388 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 389 static int 390 rack_drop_checks(struct tcpopt *to, struct mbuf *m, 391 struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, 392 int32_t * drop_hdrlen, int32_t * ret_val); 393 static int 394 rack_process_rst(struct mbuf *m, struct tcphdr *th, 395 struct socket *so, struct tcpcb *tp, int32_t * ti_locked); 396 struct rack_sendmap * 397 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 398 uint32_t tsused); 399 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); 400 static void 401 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); 402 403 static int 404 rack_ts_check(struct mbuf *m, struct tcphdr *th, 405 struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val); 406 407 int32_t rack_clear_counter=0; 408 409 410 static int 411 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 412 { 413 uint32_t stat; 414 int32_t error; 415 416 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 417 if (error || req->newptr == NULL) 418 return error; 419 420 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 421 if (error) 422 return (error); 423 if (stat == 1) { 424 #ifdef INVARIANTS 425 printf("Clearing RACK 
counters\n"); 426 #endif 427 counter_u64_zero(rack_badfr); 428 counter_u64_zero(rack_badfr_bytes); 429 counter_u64_zero(rack_rtm_prr_retran); 430 counter_u64_zero(rack_rtm_prr_newdata); 431 counter_u64_zero(rack_timestamp_mismatch); 432 counter_u64_zero(rack_reorder_seen); 433 counter_u64_zero(rack_tlp_tot); 434 counter_u64_zero(rack_tlp_newdata); 435 counter_u64_zero(rack_tlp_retran); 436 counter_u64_zero(rack_tlp_retran_bytes); 437 counter_u64_zero(rack_tlp_retran_fail); 438 counter_u64_zero(rack_to_tot); 439 counter_u64_zero(rack_to_arm_rack); 440 counter_u64_zero(rack_to_arm_tlp); 441 counter_u64_zero(rack_paced_segments); 442 counter_u64_zero(rack_unpaced_segments); 443 counter_u64_zero(rack_saw_enobuf); 444 counter_u64_zero(rack_saw_enetunreach); 445 counter_u64_zero(rack_to_alloc_hard); 446 counter_u64_zero(rack_to_alloc_emerg); 447 counter_u64_zero(rack_sack_proc_all); 448 counter_u64_zero(rack_sack_proc_short); 449 counter_u64_zero(rack_sack_proc_restart); 450 counter_u64_zero(rack_to_alloc); 451 counter_u64_zero(rack_find_high); 452 counter_u64_zero(rack_runt_sacks); 453 counter_u64_zero(rack_used_tlpmethod); 454 counter_u64_zero(rack_used_tlpmethod2); 455 counter_u64_zero(rack_enter_tlp_calc); 456 counter_u64_zero(rack_progress_drops); 457 counter_u64_zero(rack_tlp_does_nada); 458 } 459 rack_clear_counter = 0; 460 return (0); 461 } 462 463 464 465 static void 466 rack_init_sysctls() 467 { 468 SYSCTL_ADD_S32(&rack_sysctl_ctx, 469 SYSCTL_CHILDREN(rack_sysctl_root), 470 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 471 &rack_rate_sample_method , USE_RTT_LOW, 472 "What method should we use for rate sampling 0=high, 1=low "); 473 SYSCTL_ADD_S32(&rack_sysctl_ctx, 474 SYSCTL_CHILDREN(rack_sysctl_root), 475 OID_AUTO, "data_after_close", CTLFLAG_RW, 476 &rack_ignore_data_after_close, 0, 477 "Do we hold off sending a RST until all pending data is ack'd"); 478 SYSCTL_ADD_S32(&rack_sysctl_ctx, 479 SYSCTL_CHILDREN(rack_sysctl_root), 480 OID_AUTO, "tlpmethod", CTLFLAG_RW, 481 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 482 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 483 SYSCTL_ADD_S32(&rack_sysctl_ctx, 484 SYSCTL_CHILDREN(rack_sysctl_root), 485 OID_AUTO, "min_pace_time", CTLFLAG_RW, 486 &rack_min_pace_time, 0, 487 "Should we enforce a minimum pace time of 1ms"); 488 SYSCTL_ADD_S32(&rack_sysctl_ctx, 489 SYSCTL_CHILDREN(rack_sysctl_root), 490 OID_AUTO, "min_pace_segs", CTLFLAG_RW, 491 &rack_min_pace_time_seg_req, 6, 492 "How many segments have to be in the len to enforce min-pace-time"); 493 SYSCTL_ADD_S32(&rack_sysctl_ctx, 494 SYSCTL_CHILDREN(rack_sysctl_root), 495 OID_AUTO, "idle_reduce_high", CTLFLAG_RW, 496 &rack_reduce_largest_on_idle, 0, 497 "Should we reduce the largest cwnd seen to IW on idle reduction"); 498 SYSCTL_ADD_S32(&rack_sysctl_ctx, 499 SYSCTL_CHILDREN(rack_sysctl_root), 500 OID_AUTO, "bb_verbose", CTLFLAG_RW, 501 &rack_verbose_logging, 0, 502 "Should RACK black box logging be verbose"); 503 SYSCTL_ADD_S32(&rack_sysctl_ctx, 504 SYSCTL_CHILDREN(rack_sysctl_root), 505 OID_AUTO, "sackfiltering", CTLFLAG_RW, 506 &rack_use_sack_filter, 1, 507 "Do we use sack filtering?"); 508 SYSCTL_ADD_S32(&rack_sysctl_ctx, 509 SYSCTL_CHILDREN(rack_sysctl_root), 510 OID_AUTO, "delayed_ack", CTLFLAG_RW, 511 &rack_delayed_ack_time, 200, 512 "Delayed ack time (200ms)"); 513 SYSCTL_ADD_S32(&rack_sysctl_ctx, 514 SYSCTL_CHILDREN(rack_sysctl_root), 515 OID_AUTO, "tlpminto", CTLFLAG_RW, 516 &rack_tlp_min, 10, 517 "TLP minimum timeout per the specification (10ms)"); 518 
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "precache", CTLFLAG_RW,
	    &rack_precache, 0,
	    "Where should we precache the mcopy (0 is not at all)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sblklimit", CTLFLAG_RW,
	    &rack_sack_block_limit, 128,
	    "When do we start paying attention to small sack blocks");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "send_oldest", CTLFLAG_RW,
	    &rack_always_send_oldest, 1,
	    "Should we always send the oldest TLP and RACK-TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
	    &rack_tlp_in_recovery, 1,
	    "Can we do a TLP during recovery?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
	    &rack_limited_retran, 0,
	    "How many times can a rack timeout drive out sends");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "minrto", CTLFLAG_RW,
	    &rack_rto_min, 0,
	    "Minimum RTO in ms -- set with caution below 1000 due to TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "maxrto", CTLFLAG_RW,
	    &rack_rto_max, 0,
	    "Maximum RTO in ms -- should be at least as large as min_rto");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retry", CTLFLAG_RW,
	    &rack_tlp_max_resend, 2,
	    "How many times does TLP retry a single segment or multiple with no ACK");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
	    &rack_use_proportional_reduce, 0,
	    "Should we proportionally reduce cwnd based on the number of losses");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "recovery_prop", CTLFLAG_RW,
	    &rack_proportional_rate, 10,
	    "What percent reduction per loss");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
	    &rack_lower_cwnd_at_tlp, 0,
	    "When a TLP completes a retran should we enter recovery?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
	    &rack_slot_reduction, 4,
	    "When setting a slot should we reduce by divisor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
	    &rack_pace_every_seg, 1,
	    "Should we pace out every segment with hptsi");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
	    &rack_hptsi_segments, 6,
	    "Should we pace out only a limited size of segments");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prr_sendalot", CTLFLAG_RW,
	    &rack_send_a_lot_in_prr, 1,
	    "Send a lot in prr");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "minto", CTLFLAG_RW,
	    &rack_min_to, 1,
	    "Minimum rack timeout in milliseconds");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
	    &rack_early_recovery_max_seg, 6,
	    "Max segments in early recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "earlyrecovery", CTLFLAG_RW,
	    &rack_early_recovery, 1,
	    "Do we do early recovery with rack");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
	    &rack_reorder_thresh, 2,
	    "What factor for rack will be added when seeing reordering (shift right)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
	    &rack_tlp_thresh, 1,
	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
	    &rack_reorder_fade, 0,
	    "Does reorder detection fade, if so how many ms (0 means never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "pktdelay", CTLFLAG_RW,
	    &rack_pkt_delay, 1,
	    "Extra RACK time (in ms) besides reordering thresh");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "inc_var", CTLFLAG_RW,
	    &rack_inc_var, 0,
	    "Should rack add to the TLP timer the variance in rtt calculation");
	rack_badfr = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "badfr", CTLFLAG_RD,
	    &rack_badfr, "Total number of bad FRs");
	rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "badfr_bytes", CTLFLAG_RD,
	    &rack_badfr_bytes, "Total bytes of bad FRs");
	rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prrsndret", CTLFLAG_RD,
	    &rack_rtm_prr_retran,
	    "Total number of prr based retransmits");
	rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prrsndnew", CTLFLAG_RD,
	    &rack_rtm_prr_newdata,
	    "Total number of prr based new transmits");
	rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tsnf", CTLFLAG_RD,
	    &rack_timestamp_mismatch,
	    "Total number of timestamps that we could not find the reported ts");
	rack_find_high = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "findhigh", CTLFLAG_RD,
	    &rack_find_high,
	    "Total number of FINs causing find-high");
	rack_reorder_seen = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "reordering", CTLFLAG_RD,
	    &rack_reorder_seen,
	    "Total number of times we added delay due to reordering");
	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
	    &rack_tlp_tot,
	    "Total number of tail loss probe expirations");
	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_new", CTLFLAG_RD,
	    &rack_tlp_newdata,
	    "Total number of tail loss probe sending new data");

	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
	    &rack_tlp_retran,
	    "Total number of tail loss probe sending retransmitted data");
	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
	    &rack_tlp_retran_bytes,
	    "Total bytes of tail loss probe sending retransmitted data");
	rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
	    &rack_tlp_retran_fail,
	    "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
	rack_to_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
	    &rack_to_tot,
	    "Total number of times the rack timeout expired");
	rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "arm_rack", CTLFLAG_RD,
	    &rack_to_arm_rack,
	    "Total number of times the rack timer was armed");
	rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "arm_tlp", CTLFLAG_RD,
	    &rack_to_arm_tlp,
	    "Total number of times the tlp timer was armed");
	rack_paced_segments = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "paced", CTLFLAG_RD,
	    &rack_paced_segments,
	    "Total number of times a segment send caused hptsi");
	rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "unpaced", CTLFLAG_RD,
	    &rack_unpaced_segments,
	    "Total number of times a segment did not cause hptsi");
	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
	    &rack_saw_enobuf,
	    "Total number of times we saw ENOBUFS");
	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
	    &rack_saw_enetunreach,
	    "Total number of times we saw ENETUNREACH");
	rack_to_alloc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allocs", CTLFLAG_RD,
	    &rack_to_alloc,
	    "Total allocations of tracking structures");
	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allochard", CTLFLAG_RD,
	    &rack_to_alloc_hard,
	    "Total allocations done with sleeping the hard way");
	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allocemerg", CTLFLAG_RD,
	    &rack_to_alloc_emerg,
	    "Total allocations done from emergency cache");
	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_long", CTLFLAG_RD,
	    &rack_sack_proc_all,
	    "Total times we had to walk whole list for sack processing");

	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_restart", CTLFLAG_RD,
	    &rack_sack_proc_restart,
	    "Total times we had to walk whole list due to a restart");
	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_short", CTLFLAG_RD,
	    &rack_sack_proc_short,
	    "Total times we took shortcut for sack processing");
	rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
	    &rack_enter_tlp_calc,
	    "Total times we called calc-tlp");
	rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
	    &rack_used_tlpmethod,
	    "Total number of times we used TLP method 1");
	rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
	    &rack_used_tlpmethod2,
	    "Total number of times we used TLP method 2");
	rack_runt_sacks = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "runtsacks", CTLFLAG_RD,
	    &rack_runt_sacks,
	    "Total number of runt sacks");
	rack_progress_drops = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prog_drops", CTLFLAG_RD,
	    &rack_progress_drops,
	    "Total number of progress drops");
	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
	    &rack_input_idle_reduces,
	    "Total number of idle reductions on input");
	rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_nada", CTLFLAG_RD,
	    &rack_tlp_does_nada,
	    "Total number of nada tlp calls");
	COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "outsize", CTLFLAG_RD,
	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "opts", CTLFLAG_RD,
	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
	SYSCTL_ADD_PROC(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}

static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
			/*
			 * There is an assumption that the caller
			 * will drop the connection so we will
			 * increment the counters here.
848 */ 849 struct tcp_rack *rack; 850 rack = (struct tcp_rack *)tp->t_fb_ptr; 851 counter_u64_add(rack_progress_drops, 1); 852 #ifdef NETFLIX_STATS 853 TCPSTAT_INC(tcps_progdrops); 854 #endif 855 rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); 856 return (1); 857 } 858 } 859 return (0); 860 } 861 862 863 static void 864 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 865 { 866 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 867 union tcp_log_stackspecific log; 868 869 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 870 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 871 log.u_bbr.flex2 = to; 872 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 873 log.u_bbr.flex4 = slot; 874 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 875 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 876 log.u_bbr.flex8 = which; 877 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 878 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 879 TCP_LOG_EVENT(rack->rc_tp, NULL, 880 &rack->rc_inp->inp_socket->so_rcv, 881 &rack->rc_inp->inp_socket->so_snd, 882 BBR_LOG_TIMERSTAR, 0, 883 0, &log, false); 884 } 885 } 886 887 static void 888 rack_log_to_event(struct tcp_rack *rack, int32_t to_num) 889 { 890 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 891 union tcp_log_stackspecific log; 892 893 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 894 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 895 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 896 log.u_bbr.flex8 = to_num; 897 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 898 log.u_bbr.flex2 = rack->rc_rack_rtt; 899 TCP_LOG_EVENT(rack->rc_tp, NULL, 900 &rack->rc_inp->inp_socket->so_rcv, 901 &rack->rc_inp->inp_socket->so_snd, 902 BBR_LOG_RTO, 0, 903 0, &log, false); 904 } 905 } 906 907 static void 908 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, 909 uint32_t o_srtt, uint32_t o_var) 910 { 911 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 912 union tcp_log_stackspecific log; 913 914 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 915 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 916 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 917 log.u_bbr.flex1 = t; 918 log.u_bbr.flex2 = o_srtt; 919 log.u_bbr.flex3 = o_var; 920 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 921 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 922 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 923 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; 924 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 925 TCP_LOG_EVENT(tp, NULL, 926 &rack->rc_inp->inp_socket->so_rcv, 927 &rack->rc_inp->inp_socket->so_snd, 928 BBR_LOG_BBRRTT, 0, 929 0, &log, false); 930 } 931 } 932 933 static void 934 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 935 { 936 /* 937 * Log the rtt sample we are 938 * applying to the srtt algorithm in 939 * useconds. 
940 */ 941 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 942 union tcp_log_stackspecific log; 943 struct timeval tv; 944 945 /* Convert our ms to a microsecond */ 946 log.u_bbr.flex1 = rtt * 1000; 947 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 948 TCP_LOG_EVENTP(rack->rc_tp, NULL, 949 &rack->rc_inp->inp_socket->so_rcv, 950 &rack->rc_inp->inp_socket->so_snd, 951 TCP_LOG_RTT, 0, 952 0, &log, false, &tv); 953 } 954 } 955 956 957 static inline void 958 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 959 { 960 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 961 union tcp_log_stackspecific log; 962 963 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 964 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 965 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 966 log.u_bbr.flex1 = line; 967 log.u_bbr.flex2 = tick; 968 log.u_bbr.flex3 = tp->t_maxunacktime; 969 log.u_bbr.flex4 = tp->t_acktime; 970 log.u_bbr.flex8 = event; 971 TCP_LOG_EVENT(tp, NULL, 972 &rack->rc_inp->inp_socket->so_rcv, 973 &rack->rc_inp->inp_socket->so_snd, 974 BBR_LOG_PROGRESS, 0, 975 0, &log, false); 976 } 977 } 978 979 static void 980 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) 981 { 982 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 983 union tcp_log_stackspecific log; 984 985 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 986 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 987 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 988 log.u_bbr.flex1 = slot; 989 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 990 log.u_bbr.flex8 = rack->rc_in_persist; 991 TCP_LOG_EVENT(rack->rc_tp, NULL, 992 &rack->rc_inp->inp_socket->so_rcv, 993 &rack->rc_inp->inp_socket->so_snd, 994 BBR_LOG_BBRSND, 0, 995 0, &log, false); 996 } 997 } 998 999 static void 1000 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1001 { 1002 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1003 union tcp_log_stackspecific log; 1004 log.u_bbr.flex1 = did_out; 1005 log.u_bbr.flex2 = nxt_pkt; 1006 log.u_bbr.flex3 = way_out; 1007 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1008 log.u_bbr.flex7 = rack->r_wanted_output; 1009 log.u_bbr.flex8 = rack->rc_in_persist; 1010 TCP_LOG_EVENT(rack->rc_tp, NULL, 1011 &rack->rc_inp->inp_socket->so_rcv, 1012 &rack->rc_inp->inp_socket->so_snd, 1013 BBR_LOG_DOSEG_DONE, 0, 1014 0, &log, false); 1015 } 1016 } 1017 1018 1019 static void 1020 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) 1021 { 1022 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1023 union tcp_log_stackspecific log; 1024 1025 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1026 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1027 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1028 log.u_bbr.flex1 = slot; 1029 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1030 log.u_bbr.flex7 = hpts_calling; 1031 log.u_bbr.flex8 = rack->rc_in_persist; 1032 TCP_LOG_EVENT(rack->rc_tp, NULL, 1033 &rack->rc_inp->inp_socket->so_rcv, 1034 &rack->rc_inp->inp_socket->so_snd, 1035 BBR_LOG_JUSTRET, 0, 1036 tlen, &log, false); 1037 } 1038 } 1039 1040 static void 1041 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) 1042 { 1043 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1044 union tcp_log_stackspecific log; 1045 1046 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1047 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1048 log.u_bbr.ininput = 
rack->rc_inp->inp_in_input; 1049 log.u_bbr.flex1 = line; 1050 log.u_bbr.flex2 = 0; 1051 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1052 log.u_bbr.flex4 = 0; 1053 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1054 log.u_bbr.flex8 = hpts_removed; 1055 TCP_LOG_EVENT(rack->rc_tp, NULL, 1056 &rack->rc_inp->inp_socket->so_rcv, 1057 &rack->rc_inp->inp_socket->so_snd, 1058 BBR_LOG_TIMERCANC, 0, 1059 0, &log, false); 1060 } 1061 } 1062 1063 static void 1064 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 1065 { 1066 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1067 union tcp_log_stackspecific log; 1068 1069 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1070 log.u_bbr.flex1 = timers; 1071 log.u_bbr.flex2 = ret; 1072 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 1073 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1074 log.u_bbr.flex5 = cts; 1075 TCP_LOG_EVENT(rack->rc_tp, NULL, 1076 &rack->rc_inp->inp_socket->so_rcv, 1077 &rack->rc_inp->inp_socket->so_snd, 1078 BBR_LOG_TO_PROCESS, 0, 1079 0, &log, false); 1080 } 1081 } 1082 1083 static void 1084 rack_counter_destroy() 1085 { 1086 counter_u64_free(rack_badfr); 1087 counter_u64_free(rack_badfr_bytes); 1088 counter_u64_free(rack_rtm_prr_retran); 1089 counter_u64_free(rack_rtm_prr_newdata); 1090 counter_u64_free(rack_timestamp_mismatch); 1091 counter_u64_free(rack_reorder_seen); 1092 counter_u64_free(rack_tlp_tot); 1093 counter_u64_free(rack_tlp_newdata); 1094 counter_u64_free(rack_tlp_retran); 1095 counter_u64_free(rack_tlp_retran_bytes); 1096 counter_u64_free(rack_tlp_retran_fail); 1097 counter_u64_free(rack_to_tot); 1098 counter_u64_free(rack_to_arm_rack); 1099 counter_u64_free(rack_to_arm_tlp); 1100 counter_u64_free(rack_paced_segments); 1101 counter_u64_free(rack_unpaced_segments); 1102 counter_u64_free(rack_saw_enobuf); 1103 counter_u64_free(rack_saw_enetunreach); 1104 counter_u64_free(rack_to_alloc_hard); 1105 counter_u64_free(rack_to_alloc_emerg); 1106 counter_u64_free(rack_sack_proc_all); 1107 counter_u64_free(rack_sack_proc_short); 1108 counter_u64_free(rack_sack_proc_restart); 1109 counter_u64_free(rack_to_alloc); 1110 counter_u64_free(rack_find_high); 1111 counter_u64_free(rack_runt_sacks); 1112 counter_u64_free(rack_enter_tlp_calc); 1113 counter_u64_free(rack_used_tlpmethod); 1114 counter_u64_free(rack_used_tlpmethod2); 1115 counter_u64_free(rack_progress_drops); 1116 counter_u64_free(rack_input_idle_reduces); 1117 counter_u64_free(rack_tlp_does_nada); 1118 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 1119 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 1120 } 1121 1122 static struct rack_sendmap * 1123 rack_alloc(struct tcp_rack *rack) 1124 { 1125 struct rack_sendmap *rsm; 1126 1127 counter_u64_add(rack_to_alloc, 1); 1128 rack->r_ctl.rc_num_maps_alloced++; 1129 rsm = uma_zalloc(rack_zone, M_NOWAIT); 1130 if (rsm) { 1131 return (rsm); 1132 } 1133 if (rack->rc_free_cnt) { 1134 counter_u64_add(rack_to_alloc_emerg, 1); 1135 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 1136 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 1137 rack->rc_free_cnt--; 1138 return (rsm); 1139 } 1140 return (NULL); 1141 } 1142 1143 static void 1144 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 1145 { 1146 rack->r_ctl.rc_num_maps_alloced--; 1147 if (rack->r_ctl.rc_tlpsend == rsm) 1148 rack->r_ctl.rc_tlpsend = NULL; 1149 if (rack->r_ctl.rc_next == rsm) 1150 rack->r_ctl.rc_next = NULL; 1151 if (rack->r_ctl.rc_sacklast == rsm) 1152 rack->r_ctl.rc_sacklast = NULL; 1153 if (rack->rc_free_cnt < rack_free_cache) { 
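		/*
		 * Recycle this entry onto the connection's small free list
		 * (up to rack_free_cache entries); rack_alloc() falls back
		 * to this emergency cache when uma_zalloc(M_NOWAIT) fails.
		 */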
1154 memset(rsm, 0, sizeof(struct rack_sendmap)); 1155 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 1156 rack->rc_free_cnt++; 1157 return; 1158 } 1159 uma_zfree(rack_zone, rsm); 1160 } 1161 1162 /* 1163 * CC wrapper hook functions 1164 */ 1165 static void 1166 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 1167 uint16_t type, int32_t recovery) 1168 { 1169 #ifdef NETFLIX_STATS 1170 int32_t gput; 1171 #endif 1172 #ifdef NETFLIX_CWV 1173 u_long old_cwnd = tp->snd_cwnd; 1174 #endif 1175 1176 INP_WLOCK_ASSERT(tp->t_inpcb); 1177 tp->ccv->nsegs = nsegs; 1178 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 1179 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 1180 uint32_t max; 1181 1182 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; 1183 if (tp->ccv->bytes_this_ack > max) { 1184 tp->ccv->bytes_this_ack = max; 1185 } 1186 } 1187 if (tp->snd_cwnd <= tp->snd_wnd) 1188 tp->ccv->flags |= CCF_CWND_LIMITED; 1189 else 1190 tp->ccv->flags &= ~CCF_CWND_LIMITED; 1191 1192 if (type == CC_ACK) { 1193 #ifdef NETFLIX_STATS 1194 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 1195 ((int32_t) tp->snd_cwnd) - tp->snd_wnd); 1196 if ((tp->t_flags & TF_GPUTINPROG) && 1197 SEQ_GEQ(th->th_ack, tp->gput_ack)) { 1198 gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / 1199 max(1, tcp_ts_getticks() - tp->gput_ts); 1200 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 1201 gput); 1202 /* 1203 * XXXLAS: This is a temporary hack, and should be 1204 * chained off VOI_TCP_GPUT when stats(9) grows an 1205 * API to deal with chained VOIs. 1206 */ 1207 if (tp->t_stats_gput_prev > 0) 1208 stats_voi_update_abs_s32(tp->t_stats, 1209 VOI_TCP_GPUT_ND, 1210 ((gput - tp->t_stats_gput_prev) * 100) / 1211 tp->t_stats_gput_prev); 1212 tp->t_flags &= ~TF_GPUTINPROG; 1213 tp->t_stats_gput_prev = gput; 1214 1215 if (tp->t_maxpeakrate) { 1216 /* 1217 * We update t_peakrate_thr. This gives us roughly 1218 * one update per round trip time. 1219 */ 1220 tcp_update_peakrate_thr(tp); 1221 } 1222 } 1223 #endif 1224 if (tp->snd_cwnd > tp->snd_ssthresh) { 1225 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 1226 nsegs * V_tcp_abc_l_var * tp->t_maxseg); 1227 if (tp->t_bytes_acked >= tp->snd_cwnd) { 1228 tp->t_bytes_acked -= tp->snd_cwnd; 1229 tp->ccv->flags |= CCF_ABC_SENTAWND; 1230 } 1231 } else { 1232 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 1233 tp->t_bytes_acked = 0; 1234 } 1235 } 1236 if (CC_ALGO(tp)->ack_received != NULL) { 1237 /* XXXLAS: Find a way to live without this */ 1238 tp->ccv->curack = th->th_ack; 1239 CC_ALGO(tp)->ack_received(tp->ccv, type); 1240 } 1241 #ifdef NETFLIX_STATS 1242 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); 1243 #endif 1244 if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { 1245 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; 1246 } 1247 #ifdef NETFLIX_CWV 1248 if (tp->cwv_enabled) { 1249 /* 1250 * Per RFC 7661: The behaviour in the non-validated phase is 1251 * specified as: o A sender determines whether to increase 1252 * the cwnd based upon whether it is cwnd-limited (see 1253 * Section 4.5.3): * A sender that is cwnd-limited MAY use 1254 * the standard TCP method to increase cwnd (i.e., the 1255 * standard method permits a TCP sender that fully utilises 1256 * the cwnd to increase the cwnd each time it receives an 1257 * ACK). 
* A sender that is not cwnd-limited MUST NOT 1258 * increase the cwnd when ACK packets are received in this 1259 * phase (i.e., needs to avoid growing the cwnd when it has 1260 * not recently sent using the current size of cwnd). 1261 */ 1262 if ((tp->snd_cwnd > old_cwnd) && 1263 (tp->cwv_cwnd_valid == 0) && 1264 (!(tp->ccv->flags & CCF_CWND_LIMITED))) { 1265 tp->snd_cwnd = old_cwnd; 1266 } 1267 /* Try to update pipeAck and NCWV state */ 1268 if (TCPS_HAVEESTABLISHED(tp->t_state) && 1269 !IN_RECOVERY(tp->t_flags)) { 1270 uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); 1271 1272 tcp_newcwv_update_pipeack(tp, data); 1273 } 1274 } 1275 #endif 1276 /* we enforce max peak rate if it is set. */ 1277 if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { 1278 tp->snd_cwnd = tp->t_peakrate_thr; 1279 } 1280 } 1281 1282 static void 1283 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 1284 { 1285 struct tcp_rack *rack; 1286 1287 rack = (struct tcp_rack *)tp->t_fb_ptr; 1288 INP_WLOCK_ASSERT(tp->t_inpcb); 1289 if (rack->r_ctl.rc_prr_sndcnt > 0) 1290 rack->r_wanted_output++; 1291 } 1292 1293 static void 1294 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 1295 { 1296 struct tcp_rack *rack; 1297 1298 INP_WLOCK_ASSERT(tp->t_inpcb); 1299 rack = (struct tcp_rack *)tp->t_fb_ptr; 1300 if (CC_ALGO(tp)->post_recovery != NULL) { 1301 tp->ccv->curack = th->th_ack; 1302 CC_ALGO(tp)->post_recovery(tp->ccv); 1303 } 1304 /* 1305 * Here we can in theory adjust cwnd to be based on the number of 1306 * losses in the window (rack->r_ctl.rc_loss_count). This is done 1307 * based on the rack_use_proportional flag. 1308 */ 1309 if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { 1310 int32_t reduce; 1311 1312 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); 1313 if (reduce > 50) { 1314 reduce = 50; 1315 } 1316 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); 1317 } else { 1318 if (tp->snd_cwnd > tp->snd_ssthresh) { 1319 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 1320 tp->snd_cwnd = tp->snd_ssthresh; 1321 } 1322 } 1323 if (rack->r_ctl.rc_prr_sndcnt > 0) { 1324 /* Suck the next prr cnt back into cwnd */ 1325 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 1326 rack->r_ctl.rc_prr_sndcnt = 0; 1327 } 1328 EXIT_RECOVERY(tp->t_flags); 1329 1330 1331 #ifdef NETFLIX_CWV 1332 if (tp->cwv_enabled) { 1333 if ((tp->cwv_cwnd_valid == 0) && 1334 (tp->snd_cwv.in_recovery)) 1335 tcp_newcwv_end_recovery(tp); 1336 } 1337 #endif 1338 } 1339 1340 static void 1341 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 1342 { 1343 struct tcp_rack *rack; 1344 1345 INP_WLOCK_ASSERT(tp->t_inpcb); 1346 1347 rack = (struct tcp_rack *)tp->t_fb_ptr; 1348 switch (type) { 1349 case CC_NDUPACK: 1350 /* rack->r_ctl.rc_ssthresh_set = 1;*/ 1351 if (!IN_FASTRECOVERY(tp->t_flags)) { 1352 rack->r_ctl.rc_tlp_rtx_out = 0; 1353 rack->r_ctl.rc_prr_delivered = 0; 1354 rack->r_ctl.rc_prr_out = 0; 1355 rack->r_ctl.rc_loss_count = 0; 1356 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 1357 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 1358 tp->snd_recover = tp->snd_max; 1359 if (tp->t_flags & TF_ECN_PERMIT) 1360 tp->t_flags |= TF_ECN_SND_CWR; 1361 } 1362 break; 1363 case CC_ECN: 1364 if (!IN_CONGRECOVERY(tp->t_flags)) { 1365 TCPSTAT_INC(tcps_ecn_rcwnd); 1366 tp->snd_recover = tp->snd_max; 1367 if (tp->t_flags & TF_ECN_PERMIT) 1368 tp->t_flags |= TF_ECN_SND_CWR; 1369 } 1370 break; 1371 case CC_RTO: 1372 tp->t_dupacks = 0; 1373 tp->t_bytes_acked = 0; 1374 
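		/*
		 * On RTO: leave any recovery state, set ssthresh to half
		 * the effective window (at least two segments) and collapse
		 * cwnd to one segment.
		 */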
		EXIT_RECOVERY(tp->t_flags);
		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
		    tp->t_maxseg) * tp->t_maxseg;
		tp->snd_cwnd = tp->t_maxseg;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_badrxtwin = 0;
		break;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(tp->ccv, type);
	}
#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
			tcp_newcwv_enter_recovery(tp);
		}
		if (type == CC_RTO) {
			tcp_newcwv_reset(tp);
		}
	}
#endif
}



static inline void
rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
{
	uint32_t i_cwnd;

	INP_WLOCK_ASSERT(tp->t_inpcb);

#ifdef NETFLIX_STATS
	TCPSTAT_INC(tcps_idle_restarts);
	if (tp->t_state == TCPS_ESTABLISHED)
		TCPSTAT_INC(tcps_idle_estrestarts);
#endif
	if (CC_ALGO(tp)->after_idle != NULL)
		CC_ALGO(tp)->after_idle(tp->ccv);

	if (tp->snd_cwnd == 1)
		i_cwnd = tp->t_maxseg;	/* SYN(-ACK) lost */
	else if (V_tcp_initcwnd_segments)
		i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
		    max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460));
	else if (V_tcp_do_rfc3390)
		i_cwnd = min(4 * tp->t_maxseg,
		    max(2 * tp->t_maxseg, 4380));
	else {
		/* Per RFC5681 Section 3.1 */
		if (tp->t_maxseg > 2190)
			i_cwnd = 2 * tp->t_maxseg;
		else if (tp->t_maxseg > 1095)
			i_cwnd = 3 * tp->t_maxseg;
		else
			i_cwnd = 4 * tp->t_maxseg;
	}
	if (reduce_largest) {
		/*
		 * Do we reduce the largest cwnd to make
		 * rack play nice on restart hptsi wise?
		 */
		if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd)
			((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
	}
	/*
	 * Being idle is no different than the initial window. If the cc
	 * clamps it down below the initial window, raise it to the initial
	 * window.
	 */
	if (tp->snd_cwnd < i_cwnd) {
		tp->snd_cwnd = i_cwnd;
	}
}


/*
 * Indicate whether this ack should be delayed. We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window. We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment. We make sure by checking that the
 *	  segment size is not larger than the MSS.
 *	- Delayed acks are enabled or this is a half-synchronized T/TCP
 *	  connection.
 */
#define DELAY_ACK(tp, tlen)			 \
	(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
	((tp->t_flags & TF_DELACK) == 0) &&	 \
	(tlen <= tp->t_maxseg) &&		 \
	(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))

static inline void
rack_calc_rwin(struct socket *so, struct tcpcb *tp)
{
	int32_t win;

	/*
	 * Calculate amount of space in receive window, and then do TCP
	 * input processing.
Receive window is amount of space in rcv queue, 1489 * but not less than advertised window. 1490 */ 1491 win = sbspace(&so->so_rcv); 1492 if (win < 0) 1493 win = 0; 1494 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1495 } 1496 1497 static void 1498 rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked) 1499 { 1500 if (*ti_locked == TI_RLOCKED) { 1501 INP_INFO_RUNLOCK(&V_tcbinfo); 1502 *ti_locked = TI_UNLOCKED; 1503 } 1504 /* 1505 * Drop space held by incoming segment and return. 1506 */ 1507 if (tp != NULL) 1508 INP_WUNLOCK(tp->t_inpcb); 1509 if (m) 1510 m_freem(m); 1511 } 1512 1513 static void 1514 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen) 1515 { 1516 if (*ti_locked == TI_RLOCKED) { 1517 INP_INFO_RUNLOCK(&V_tcbinfo); 1518 *ti_locked = TI_UNLOCKED; 1519 } 1520 if (tp != NULL) { 1521 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1522 INP_WUNLOCK(tp->t_inpcb); 1523 } else 1524 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1525 } 1526 1527 /* 1528 * The value in ret_val informs the caller 1529 * if we dropped the tcb (and lock) or not. 1530 * 1 = we dropped it, 0 = the TCB is still locked 1531 * and valid. 1532 */ 1533 static void 1534 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val) 1535 { 1536 /* 1537 * Generate an ACK dropping incoming segment if it occupies sequence 1538 * space, where the ACK reflects our state. 1539 * 1540 * We can now skip the test for the RST flag since all paths to this 1541 * code happen after packets containing RST have been dropped. 1542 * 1543 * In the SYN-RECEIVED state, don't send an ACK unless the segment 1544 * we received passes the SYN-RECEIVED ACK test. If it fails send a 1545 * RST. This breaks the loop in the "LAND" DoS attack, and also 1546 * prevents an ACK storm between two listening ports that have been 1547 * sent forged SYN segments, each with the source address of the 1548 * other. 1549 */ 1550 struct tcp_rack *rack; 1551 1552 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1553 (SEQ_GT(tp->snd_una, th->th_ack) || 1554 SEQ_GT(th->th_ack, tp->snd_max))) { 1555 *ret_val = 1; 1556 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 1557 return; 1558 } else 1559 *ret_val = 0; 1560 if (*ti_locked == TI_RLOCKED) { 1561 INP_INFO_RUNLOCK(&V_tcbinfo); 1562 *ti_locked = TI_UNLOCKED; 1563 } 1564 rack = (struct tcp_rack *)tp->t_fb_ptr; 1565 rack->r_wanted_output++; 1566 tp->t_flags |= TF_ACKNOW; 1567 if (m) 1568 m_freem(m); 1569 } 1570 1571 1572 static int 1573 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked) 1574 { 1575 /* 1576 * RFC5961 Section 3.2 1577 * 1578 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in 1579 * window, we send challenge ACK. 1580 * 1581 * Note: to take into account delayed ACKs, we should test against 1582 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case 1583 * of closed window, not covered by the RFC. 
1584 */ 1585 int dropped = 0; 1586 1587 if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && 1588 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 1589 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 1590 1591 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1592 KASSERT(*ti_locked == TI_RLOCKED, 1593 ("%s: TH_RST ti_locked %d, th %p tp %p", 1594 __func__, *ti_locked, th, tp)); 1595 KASSERT(tp->t_state != TCPS_SYN_SENT, 1596 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 1597 __func__, th, tp)); 1598 1599 if (V_tcp_insecure_rst || 1600 (tp->last_ack_sent == th->th_seq) || 1601 (tp->rcv_nxt == th->th_seq) || 1602 ((tp->last_ack_sent - 1) == th->th_seq)) { 1603 TCPSTAT_INC(tcps_drops); 1604 /* Drop the connection. */ 1605 switch (tp->t_state) { 1606 case TCPS_SYN_RECEIVED: 1607 so->so_error = ECONNREFUSED; 1608 goto close; 1609 case TCPS_ESTABLISHED: 1610 case TCPS_FIN_WAIT_1: 1611 case TCPS_FIN_WAIT_2: 1612 case TCPS_CLOSE_WAIT: 1613 case TCPS_CLOSING: 1614 case TCPS_LAST_ACK: 1615 so->so_error = ECONNRESET; 1616 close: 1617 tcp_state_change(tp, TCPS_CLOSED); 1618 /* FALLTHROUGH */ 1619 default: 1620 tp = tcp_close(tp); 1621 } 1622 dropped = 1; 1623 rack_do_drop(m, tp, ti_locked); 1624 } else { 1625 TCPSTAT_INC(tcps_badrst); 1626 /* Send challenge ACK. */ 1627 tcp_respond(tp, mtod(m, void *), th, m, 1628 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 1629 tp->last_ack_sent = tp->rcv_nxt; 1630 } 1631 } else { 1632 m_freem(m); 1633 } 1634 return (dropped); 1635 } 1636 1637 /* 1638 * The value in ret_val informs the caller 1639 * if we dropped the tcb (and lock) or not. 1640 * 1 = we dropped it, 0 = the TCB is still locked 1641 * and valid. 1642 */ 1643 static void 1644 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) 1645 { 1646 KASSERT(*ti_locked == TI_RLOCKED, 1647 ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); 1648 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1649 1650 TCPSTAT_INC(tcps_badsyn); 1651 if (V_tcp_insecure_syn && 1652 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1653 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1654 tp = tcp_drop(tp, ECONNRESET); 1655 *ret_val = 1; 1656 rack_do_drop(m, tp, ti_locked); 1657 } else { 1658 /* Send challenge ACK. */ 1659 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 1660 tp->snd_nxt, TH_ACK); 1661 tp->last_ack_sent = tp->rcv_nxt; 1662 m = NULL; 1663 *ret_val = 0; 1664 rack_do_drop(m, NULL, ti_locked); 1665 } 1666 } 1667 1668 /* 1669 * rack_ts_check returns 1 for you should not proceed. It places 1670 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1671 * that the TCB is unlocked and probably dropped. The 0 indicates the 1672 * TCB is still valid and locked. 1673 */ 1674 static int 1675 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val) 1676 { 1677 1678 /* Check to see if ts_recent is over 24 days old. */ 1679 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 1680 /* 1681 * Invalidate ts_recent. If this segment updates ts_recent, 1682 * the age will be reset later and ts_recent will get a 1683 * valid value. If it does not, setting ts_recent to zero 1684 * will at least satisfy the requirement that zero be placed 1685 * in the timestamp echo reply when ts_recent isn't valid. 1686 * The age isn't reset until we get a valid ts_recent 1687 * because we don't want out-of-order segments to be dropped 1688 * when ts_recent is old. 
1689 */ 1690 tp->ts_recent = 0; 1691 } else { 1692 TCPSTAT_INC(tcps_rcvduppack); 1693 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 1694 TCPSTAT_INC(tcps_pawsdrop); 1695 *ret_val = 0; 1696 if (tlen) { 1697 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1698 } else { 1699 rack_do_drop(m, NULL, ti_locked); 1700 } 1701 return (1); 1702 } 1703 return (0); 1704 } 1705 1706 /* 1707 * rack_drop_checks returns 1 for you should not proceed. It places 1708 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1709 * that the TCB is unlocked and probably dropped. The 0 indicates the 1710 * TCB is still valid and locked. 1711 */ 1712 static int 1713 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 1714 { 1715 int32_t todrop; 1716 int32_t thflags; 1717 int32_t tlen; 1718 1719 thflags = *thf; 1720 tlen = *tlenp; 1721 todrop = tp->rcv_nxt - th->th_seq; 1722 if (todrop > 0) { 1723 if (thflags & TH_SYN) { 1724 thflags &= ~TH_SYN; 1725 th->th_seq++; 1726 if (th->th_urp > 1) 1727 th->th_urp--; 1728 else 1729 thflags &= ~TH_URG; 1730 todrop--; 1731 } 1732 /* 1733 * Following if statement from Stevens, vol. 2, p. 960. 1734 */ 1735 if (todrop > tlen 1736 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1737 /* 1738 * Any valid FIN must be to the left of the window. 1739 * At this point the FIN must be a duplicate or out 1740 * of sequence; drop it. 1741 */ 1742 thflags &= ~TH_FIN; 1743 /* 1744 * Send an ACK to resynchronize and drop any data. 1745 * But keep on processing for RST or ACK. 1746 */ 1747 tp->t_flags |= TF_ACKNOW; 1748 todrop = tlen; 1749 TCPSTAT_INC(tcps_rcvduppack); 1750 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 1751 } else { 1752 TCPSTAT_INC(tcps_rcvpartduppack); 1753 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 1754 } 1755 *drop_hdrlen += todrop; /* drop from the top afterwards */ 1756 th->th_seq += todrop; 1757 tlen -= todrop; 1758 if (th->th_urp > todrop) 1759 th->th_urp -= todrop; 1760 else { 1761 thflags &= ~TH_URG; 1762 th->th_urp = 0; 1763 } 1764 } 1765 /* 1766 * If segment ends after window, drop trailing data (and PUSH and 1767 * FIN); if nothing left, just ACK. 1768 */ 1769 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1770 if (todrop > 0) { 1771 TCPSTAT_INC(tcps_rcvpackafterwin); 1772 if (todrop >= tlen) { 1773 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 1774 /* 1775 * If window is closed can only take segments at 1776 * window edge, and have to drop data and PUSH from 1777 * incoming segments. Continue processing, but 1778 * remember to ack. Otherwise, drop segment and 1779 * ack. 1780 */ 1781 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1782 tp->t_flags |= TF_ACKNOW; 1783 TCPSTAT_INC(tcps_rcvwinprobe); 1784 } else { 1785 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1786 return (1); 1787 } 1788 } else 1789 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1790 m_adj(m, -todrop); 1791 tlen -= todrop; 1792 thflags &= ~(TH_PUSH | TH_FIN); 1793 } 1794 *thf = thflags; 1795 *tlenp = tlen; 1796 return (0); 1797 } 1798 1799 static struct rack_sendmap * 1800 rack_find_lowest_rsm(struct tcp_rack *rack) 1801 { 1802 struct rack_sendmap *rsm; 1803 1804 /* 1805 * Walk the time-order transmitted list looking for an rsm that is 1806 * not acked. This will be the one that was sent the longest time 1807 * ago that is still outstanding. 
1808 */ 1809 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 1810 if (rsm->r_flags & RACK_ACKED) { 1811 continue; 1812 } 1813 goto finish; 1814 } 1815 finish: 1816 return (rsm); 1817 } 1818 1819 static struct rack_sendmap * 1820 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 1821 { 1822 struct rack_sendmap *prsm; 1823 1824 /* 1825 * Walk the sequence order list backward until we hit and arrive at 1826 * the highest seq not acked. In theory when this is called it 1827 * should be the last segment (which it was not). 1828 */ 1829 counter_u64_add(rack_find_high, 1); 1830 prsm = rsm; 1831 TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { 1832 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 1833 continue; 1834 } 1835 return (prsm); 1836 } 1837 return (NULL); 1838 } 1839 1840 1841 static uint32_t 1842 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 1843 { 1844 int32_t lro; 1845 uint32_t thresh; 1846 1847 /* 1848 * lro is the flag we use to determine if we have seen reordering. 1849 * If it gets set we have seen reordering. The reorder logic either 1850 * works in one of two ways: 1851 * 1852 * If reorder-fade is configured, then we track the last time we saw 1853 * re-ordering occur. If we reach the point where enough time as 1854 * passed we no longer consider reordering has occuring. 1855 * 1856 * Or if reorder-face is 0, then once we see reordering we consider 1857 * the connection to alway be subject to reordering and just set lro 1858 * to 1. 1859 * 1860 * In the end if lro is non-zero we add the extra time for 1861 * reordering in. 1862 */ 1863 if (srtt == 0) 1864 srtt = 1; 1865 if (rack->r_ctl.rc_reorder_ts) { 1866 if (rack->r_ctl.rc_reorder_fade) { 1867 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 1868 lro = cts - rack->r_ctl.rc_reorder_ts; 1869 if (lro == 0) { 1870 /* 1871 * No time as passed since the last 1872 * reorder, mark it as reordering. 1873 */ 1874 lro = 1; 1875 } 1876 } else { 1877 /* Negative time? 
*/ 1878 lro = 0; 1879 } 1880 if (lro > rack->r_ctl.rc_reorder_fade) { 1881 /* Turn off reordering seen too */ 1882 rack->r_ctl.rc_reorder_ts = 0; 1883 lro = 0; 1884 } 1885 } else { 1886 /* Reodering does not fade */ 1887 lro = 1; 1888 } 1889 } else { 1890 lro = 0; 1891 } 1892 thresh = srtt + rack->r_ctl.rc_pkt_delay; 1893 if (lro) { 1894 /* It must be set, if not you get 1/4 rtt */ 1895 if (rack->r_ctl.rc_reorder_shift) 1896 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 1897 else 1898 thresh += (srtt >> 2); 1899 } else { 1900 thresh += 1; 1901 } 1902 /* We don't let the rack timeout be above a RTO */ 1903 1904 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 1905 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 1906 } 1907 /* And we don't want it above the RTO max either */ 1908 if (thresh > rack_rto_max) { 1909 thresh = rack_rto_max; 1910 } 1911 return (thresh); 1912 } 1913 1914 static uint32_t 1915 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 1916 struct rack_sendmap *rsm, uint32_t srtt) 1917 { 1918 struct rack_sendmap *prsm; 1919 uint32_t thresh, len; 1920 int maxseg; 1921 1922 if (srtt == 0) 1923 srtt = 1; 1924 if (rack->r_ctl.rc_tlp_threshold) 1925 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 1926 else 1927 thresh = (srtt * 2); 1928 1929 /* Get the previous sent packet, if any */ 1930 maxseg = tcp_maxseg(tp); 1931 counter_u64_add(rack_enter_tlp_calc, 1); 1932 len = rsm->r_end - rsm->r_start; 1933 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 1934 /* Exactly like the ID */ 1935 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { 1936 uint32_t alt_thresh; 1937 /* 1938 * Compensate for delayed-ack with the d-ack time. 1939 */ 1940 counter_u64_add(rack_used_tlpmethod, 1); 1941 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1942 if (alt_thresh > thresh) 1943 thresh = alt_thresh; 1944 } 1945 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 1946 /* 2.1 behavior */ 1947 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 1948 if (prsm && (len <= maxseg)) { 1949 /* 1950 * Two packets outstanding, thresh should be (2*srtt) + 1951 * possible inter-packet delay (if any). 1952 */ 1953 uint32_t inter_gap = 0; 1954 int idx, nidx; 1955 1956 counter_u64_add(rack_used_tlpmethod, 1); 1957 idx = rsm->r_rtr_cnt - 1; 1958 nidx = prsm->r_rtr_cnt - 1; 1959 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 1960 /* Yes it was sent later (or at the same time) */ 1961 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 1962 } 1963 thresh += inter_gap; 1964 } else if (len <= maxseg) { 1965 /* 1966 * Possibly compensate for delayed-ack. 1967 */ 1968 uint32_t alt_thresh; 1969 1970 counter_u64_add(rack_used_tlpmethod2, 1); 1971 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1972 if (alt_thresh > thresh) 1973 thresh = alt_thresh; 1974 } 1975 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 1976 /* 2.2 behavior */ 1977 if (len <= maxseg) { 1978 uint32_t alt_thresh; 1979 /* 1980 * Compensate for delayed-ack with the d-ack time. 
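			 * i.e. alt_thresh = srtt + srtt/2 + rack_delayed_ack_time;
			 * for example, srtt = 40ms with a 200ms delayed-ack
			 * allowance gives 40 + 20 + 200 = 260ms, and it only
			 * replaces the threshold computed above when larger.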
1981 */ 1982 counter_u64_add(rack_used_tlpmethod, 1); 1983 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1984 if (alt_thresh > thresh) 1985 thresh = alt_thresh; 1986 } 1987 } 1988 /* Not above an RTO */ 1989 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 1990 thresh = TICKS_2_MSEC(tp->t_rxtcur); 1991 } 1992 /* Not above a RTO max */ 1993 if (thresh > rack_rto_max) { 1994 thresh = rack_rto_max; 1995 } 1996 /* Apply user supplied min TLP */ 1997 if (thresh < rack_tlp_min) { 1998 thresh = rack_tlp_min; 1999 } 2000 return (thresh); 2001 } 2002 2003 static struct rack_sendmap * 2004 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 2005 { 2006 /* 2007 * Check to see that we don't need to fall into recovery. We will 2008 * need to do so if our oldest transmit is past the time we should 2009 * have had an ack. 2010 */ 2011 struct tcp_rack *rack; 2012 struct rack_sendmap *rsm; 2013 int32_t idx; 2014 uint32_t srtt_cur, srtt, thresh; 2015 2016 rack = (struct tcp_rack *)tp->t_fb_ptr; 2017 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 2018 return (NULL); 2019 } 2020 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 2021 srtt = TICKS_2_MSEC(srtt_cur); 2022 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 2023 srtt = rack->rc_rack_rtt; 2024 2025 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2026 if (rsm == NULL) 2027 return (NULL); 2028 2029 if (rsm->r_flags & RACK_ACKED) { 2030 rsm = rack_find_lowest_rsm(rack); 2031 if (rsm == NULL) 2032 return (NULL); 2033 } 2034 idx = rsm->r_rtr_cnt - 1; 2035 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2036 if (tsused < rsm->r_tim_lastsent[idx]) { 2037 return (NULL); 2038 } 2039 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2040 return (NULL); 2041 } 2042 /* Ok if we reach here we are over-due */ 2043 rack->r_ctl.rc_rsm_start = rsm->r_start; 2044 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2045 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2046 rack_cong_signal(tp, NULL, CC_NDUPACK); 2047 return (rsm); 2048 } 2049 2050 static uint32_t 2051 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2052 { 2053 int32_t t; 2054 int32_t tt; 2055 uint32_t ret_val; 2056 2057 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2058 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2059 tcp_persmin, tcp_persmax); 2060 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2061 tp->t_rxtshift++; 2062 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2063 ret_val = (uint32_t)tt; 2064 return (ret_val); 2065 } 2066 2067 static uint32_t 2068 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2069 { 2070 /* 2071 * Start the FR timer, we do this based on getting the first one in 2072 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2073 * events we need to stop the running timer (if its running) before 2074 * starting the new one. 
2075 */ 2076 uint32_t thresh, exp, to, srtt, time_since_sent; 2077 uint32_t srtt_cur; 2078 int32_t idx; 2079 int32_t is_tlp_timer = 0; 2080 struct rack_sendmap *rsm; 2081 2082 if (rack->t_timers_stopped) { 2083 /* All timers have been stopped none are to run */ 2084 return (0); 2085 } 2086 if (rack->rc_in_persist) { 2087 /* We can't start any timer in persists */ 2088 return (rack_get_persists_timer_val(tp, rack)); 2089 } 2090 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2091 if (rsm == NULL) { 2092 /* Nothing on the send map */ 2093 activate_rxt: 2094 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2095 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2096 to = TICKS_2_MSEC(tp->t_rxtcur); 2097 if (to == 0) 2098 to = 1; 2099 return (to); 2100 } 2101 return (0); 2102 } 2103 if (rsm->r_flags & RACK_ACKED) { 2104 rsm = rack_find_lowest_rsm(rack); 2105 if (rsm == NULL) { 2106 /* No lowest? */ 2107 goto activate_rxt; 2108 } 2109 } 2110 /* Convert from ms to usecs */ 2111 if (rsm->r_flags & RACK_SACK_PASSED) { 2112 if ((tp->t_flags & TF_SENTFIN) && 2113 ((tp->snd_max - tp->snd_una) == 1) && 2114 (rsm->r_flags & RACK_HAS_FIN)) { 2115 /* 2116 * We don't start a rack timer if all we have is a 2117 * FIN outstanding. 2118 */ 2119 goto activate_rxt; 2120 } 2121 if (tp->t_srtt) { 2122 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2123 srtt = TICKS_2_MSEC(srtt_cur); 2124 } else 2125 srtt = RACK_INITIAL_RTO; 2126 2127 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2128 idx = rsm->r_rtr_cnt - 1; 2129 exp = rsm->r_tim_lastsent[idx] + thresh; 2130 if (SEQ_GEQ(exp, cts)) { 2131 to = exp - cts; 2132 if (to < rack->r_ctl.rc_min_to) { 2133 to = rack->r_ctl.rc_min_to; 2134 } 2135 } else { 2136 to = rack->r_ctl.rc_min_to; 2137 } 2138 } else { 2139 /* Ok we need to do a TLP not RACK */ 2140 if ((rack->rc_tlp_in_progress != 0) || 2141 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2142 /* 2143 * The previous send was a TLP or a tlp_rtx is in 2144 * process. 2145 */ 2146 goto activate_rxt; 2147 } 2148 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2149 if (rsm == NULL) { 2150 /* We found no rsm to TLP with. */ 2151 goto activate_rxt; 2152 } 2153 if (rsm->r_flags & RACK_HAS_FIN) { 2154 /* If its a FIN we dont do TLP */ 2155 rsm = NULL; 2156 goto activate_rxt; 2157 } 2158 idx = rsm->r_rtr_cnt - 1; 2159 if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) 2160 time_since_sent = cts - rsm->r_tim_lastsent[idx]; 2161 else 2162 time_since_sent = 0; 2163 is_tlp_timer = 1; 2164 if (tp->t_srtt) { 2165 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2166 srtt = TICKS_2_MSEC(srtt_cur); 2167 } else 2168 srtt = RACK_INITIAL_RTO; 2169 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2170 if (thresh > time_since_sent) 2171 to = thresh - time_since_sent; 2172 else 2173 to = rack->r_ctl.rc_min_to; 2174 if (to > TCPTV_REXMTMAX) { 2175 /* 2176 * If the TLP time works out to larger than the max 2177 * RTO lets not do TLP.. just RTO. 
2178 */ 2179 goto activate_rxt; 2180 } 2181 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2182 /* 2183 * The tail is no longer the last one I did a probe 2184 * on 2185 */ 2186 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2187 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2188 } 2189 } 2190 if (is_tlp_timer == 0) { 2191 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2192 } else { 2193 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2194 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2195 /* 2196 * We have exceeded how many times we can retran the 2197 * current TLP timer, switch to the RTO timer. 2198 */ 2199 goto activate_rxt; 2200 } else { 2201 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2202 } 2203 } 2204 if (to == 0) 2205 to = 1; 2206 return (to); 2207 } 2208 2209 static void 2210 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2211 { 2212 if (rack->rc_in_persist == 0) { 2213 if (((tp->t_flags & TF_SENTFIN) == 0) && 2214 (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) 2215 /* Must need to send more data to enter persist */ 2216 return; 2217 rack->r_ctl.rc_went_idle_time = cts; 2218 rack_timer_cancel(tp, rack, cts, __LINE__); 2219 tp->t_rxtshift = 0; 2220 rack->rc_in_persist = 1; 2221 } 2222 } 2223 2224 static void 2225 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2226 { 2227 if (rack->rc_inp->inp_in_hpts) { 2228 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2229 rack->r_ctl.rc_hpts_flags = 0; 2230 } 2231 rack->rc_in_persist = 0; 2232 rack->r_ctl.rc_went_idle_time = 0; 2233 tp->t_flags &= ~TF_FORCEDATA; 2234 tp->t_rxtshift = 0; 2235 } 2236 2237 static void 2238 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, 2239 int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) 2240 { 2241 struct inpcb *inp; 2242 uint32_t delayed_ack = 0; 2243 uint32_t hpts_timeout; 2244 uint8_t stopped; 2245 uint32_t left = 0; 2246 2247 inp = tp->t_inpcb; 2248 if (inp->inp_in_hpts) { 2249 /* A previous call is already set up */ 2250 return; 2251 } 2252 if (tp->t_state == TCPS_CLOSED) { 2253 return; 2254 } 2255 stopped = rack->rc_tmr_stopped; 2256 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2257 left = rack->r_ctl.rc_timer_exp - cts; 2258 } 2259 rack->r_ctl.rc_timer_exp = 0; 2260 if (rack->rc_inp->inp_in_hpts == 0) { 2261 rack->r_ctl.rc_hpts_flags = 0; 2262 } 2263 if (slot) { 2264 /* We are hptsi too */ 2265 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2266 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2267 /* 2268 * We are still left on the hpts when the to goes 2269 * it will be for output. 2270 */ 2271 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) 2272 slot = cts - rack->r_ctl.rc_last_output_to; 2273 else 2274 slot = 1; 2275 } 2276 if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2277 /* No send window.. we must enter persist */ 2278 rack_enter_persist(tp, rack, cts); 2279 } else if ((frm_out_sbavail && 2280 (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && 2281 (tp->snd_wnd < tp->t_maxseg)) && 2282 TCPS_HAVEESTABLISHED(tp->t_state)) { 2283 /* 2284 * If we have no window or we can't send a segment (and have 2285 * data to send.. we cheat here and frm_out_sbavail is 2286 * passed in with the sbavail(sb) only from bbr_output) and 2287 * we are established, then we must enter persits (if not 2288 * already in persits). 
2289 */ 2290 rack_enter_persist(tp, rack, cts); 2291 } 2292 hpts_timeout = rack_timer_start(tp, rack, cts); 2293 if (tp->t_flags & TF_DELACK) { 2294 delayed_ack = tcp_delacktime; 2295 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2296 } 2297 if (delayed_ack && ((hpts_timeout == 0) || 2298 (delayed_ack < hpts_timeout))) 2299 hpts_timeout = delayed_ack; 2300 else 2301 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2302 /* 2303 * If no timers are going to run and we will fall off the hptsi 2304 * wheel, we resort to a keep-alive timer if its configured. 2305 */ 2306 if ((hpts_timeout == 0) && 2307 (slot == 0)) { 2308 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2309 (tp->t_state <= TCPS_CLOSING)) { 2310 /* 2311 * Ok we have no timer (persists, rack, tlp, rxt or 2312 * del-ack), we don't have segments being paced. So 2313 * all that is left is the keepalive timer. 2314 */ 2315 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2316 /* Get the established keep-alive time */ 2317 hpts_timeout = TP_KEEPIDLE(tp); 2318 } else { 2319 /* Get the initial setup keep-alive time */ 2320 hpts_timeout = TP_KEEPINIT(tp); 2321 } 2322 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2323 } 2324 } 2325 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2326 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2327 /* 2328 * RACK, TLP, persists and RXT timers all are restartable 2329 * based on actions input .. i.e we received a packet (ack 2330 * or sack) and that changes things (rw, or snd_una etc). 2331 * Thus we can restart them with a new value. For 2332 * keep-alive, delayed_ack we keep track of what was left 2333 * and restart the timer with a smaller value. 2334 */ 2335 if (left < hpts_timeout) 2336 hpts_timeout = left; 2337 } 2338 if (hpts_timeout) { 2339 /* 2340 * Hack alert for now we can't time-out over 2,147,483 2341 * seconds (a bit more than 596 hours), which is probably ok 2342 * :). 2343 */ 2344 if (hpts_timeout > 0x7ffffffe) 2345 hpts_timeout = 0x7ffffffe; 2346 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2347 } 2348 if (slot) { 2349 rack->r_ctl.rc_last_output_to = cts + slot; 2350 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2351 if (rack->rc_inp->inp_in_hpts == 0) 2352 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2353 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2354 } else { 2355 /* 2356 * Arrange for the hpts to kick back in after the 2357 * t-o if the t-o does not cause a send. 2358 */ 2359 if (rack->rc_inp->inp_in_hpts == 0) 2360 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2361 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2362 } 2363 } else if (hpts_timeout) { 2364 if (rack->rc_inp->inp_in_hpts == 0) 2365 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2366 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2367 } else { 2368 /* No timer starting */ 2369 #ifdef INVARIANTS 2370 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2371 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2372 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2373 } 2374 #endif 2375 } 2376 rack->rc_tmr_stopped = 0; 2377 if (slot) 2378 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2379 } 2380 2381 /* 2382 * RACK Timer, here we simply do logging and house keeping. 2383 * the normal rack_output() function will call the 2384 * appropriate thing to check if we need to do a RACK retransmit. 
2385 * We return 1, saying don't proceed with rack_output only 2386 * when all timers have been stopped (destroyed PCB?). 2387 */ 2388 static int 2389 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2390 { 2391 /* 2392 * This timer simply provides an internal trigger to send out data. 2393 * The check_recovery_mode call will see if there are needed 2394 * retransmissions, if so we will enter fast-recovery. The output 2395 * call may or may not do the same thing depending on sysctl 2396 * settings. 2397 */ 2398 struct rack_sendmap *rsm; 2399 int32_t recovery; 2400 2401 if (tp->t_timers->tt_flags & TT_STOPPED) { 2402 return (1); 2403 } 2404 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2405 /* Its not time yet */ 2406 return (0); 2407 } 2408 rack_log_to_event(rack, RACK_TO_FRM_RACK); 2409 recovery = IN_RECOVERY(tp->t_flags); 2410 counter_u64_add(rack_to_tot, 1); 2411 if (rack->r_state && (rack->r_state != tp->t_state)) 2412 rack_set_state(tp, rack); 2413 rsm = rack_check_recovery_mode(tp, cts); 2414 if (rsm) { 2415 uint32_t rtt; 2416 2417 rtt = rack->rc_rack_rtt; 2418 if (rtt == 0) 2419 rtt = 1; 2420 if ((recovery == 0) && 2421 (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { 2422 /* 2423 * The rack-timeout that enter's us into recovery 2424 * will force out one MSS and set us up so that we 2425 * can do one more send in 2*rtt (transitioning the 2426 * rack timeout into a rack-tlp). 2427 */ 2428 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2429 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && 2430 ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { 2431 /* 2432 * When a rack timer goes, we have to send at 2433 * least one segment. They will be paced a min of 1ms 2434 * apart via the next rack timer (or further 2435 * if the rack timer dictates it). 2436 */ 2437 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2438 } 2439 } else { 2440 /* This is a case that should happen rarely if ever */ 2441 counter_u64_add(rack_tlp_does_nada, 1); 2442 #ifdef TCP_BLACKBOX 2443 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2444 #endif 2445 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2446 } 2447 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2448 return (0); 2449 } 2450 2451 /* 2452 * TLP Timer, here we simply setup what segment we want to 2453 * have the TLP expire on, the normal rack_output() will then 2454 * send it out. 2455 * 2456 * We return 1, saying don't proceed with rack_output only 2457 * when all timers have been stopped (destroyed PCB?). 2458 */ 2459 static int 2460 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2461 { 2462 /* 2463 * Tail Loss Probe. 2464 */ 2465 struct rack_sendmap *rsm = NULL; 2466 struct socket *so; 2467 uint32_t amm, old_prr_snd = 0; 2468 uint32_t out, avail; 2469 2470 if (tp->t_timers->tt_flags & TT_STOPPED) { 2471 return (1); 2472 } 2473 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2474 /* Its not time yet */ 2475 return (0); 2476 } 2477 if (rack_progress_timeout_check(tp)) { 2478 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2479 return (1); 2480 } 2481 /* 2482 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2483 * need to figure out how to force a full MSS segment out. 
2484 */ 2485 rack_log_to_event(rack, RACK_TO_FRM_TLP); 2486 counter_u64_add(rack_tlp_tot, 1); 2487 if (rack->r_state && (rack->r_state != tp->t_state)) 2488 rack_set_state(tp, rack); 2489 so = tp->t_inpcb->inp_socket; 2490 avail = sbavail(&so->so_snd); 2491 out = tp->snd_max - tp->snd_una; 2492 rack->rc_timer_up = 1; 2493 /* 2494 * If we are in recovery we can jazz out a segment if new data is 2495 * present simply by setting rc_prr_sndcnt to a segment. 2496 */ 2497 if ((avail > out) && 2498 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2499 /* New data is available */ 2500 amm = avail - out; 2501 if (amm > tp->t_maxseg) { 2502 amm = tp->t_maxseg; 2503 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 2504 /* not enough to fill a MTU and no-delay is off */ 2505 goto need_retran; 2506 } 2507 if (IN_RECOVERY(tp->t_flags)) { 2508 /* Unlikely */ 2509 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2510 if (out + amm <= tp->snd_wnd) 2511 rack->r_ctl.rc_prr_sndcnt = amm; 2512 else 2513 goto need_retran; 2514 } else { 2515 /* Set the send-new override */ 2516 if (out + amm <= tp->snd_wnd) 2517 rack->r_ctl.rc_tlp_new_data = amm; 2518 else 2519 goto need_retran; 2520 } 2521 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2522 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2523 rack->r_ctl.rc_tlpsend = NULL; 2524 counter_u64_add(rack_tlp_newdata, 1); 2525 goto send; 2526 } 2527 need_retran: 2528 /* 2529 * Ok we need to arrange the last un-acked segment to be re-sent, or 2530 * optionally the first un-acked segment. 2531 */ 2532 if (rack_always_send_oldest) 2533 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2534 else { 2535 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 2536 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2537 rsm = rack_find_high_nonack(rack, rsm); 2538 } 2539 } 2540 if (rsm == NULL) { 2541 counter_u64_add(rack_tlp_does_nada, 1); 2542 #ifdef TCP_BLACKBOX 2543 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2544 #endif 2545 goto out; 2546 } 2547 if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { 2548 /* 2549 * We need to split this the last segment in two. 2550 */ 2551 int32_t idx; 2552 struct rack_sendmap *nrsm; 2553 2554 nrsm = rack_alloc(rack); 2555 if (nrsm == NULL) { 2556 /* 2557 * No memory to split, we will just exit and punt 2558 * off to the RXT timer. 
2559 */ 2560 counter_u64_add(rack_tlp_does_nada, 1); 2561 goto out; 2562 } 2563 nrsm->r_start = (rsm->r_end - tp->t_maxseg); 2564 nrsm->r_end = rsm->r_end; 2565 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2566 nrsm->r_flags = rsm->r_flags; 2567 nrsm->r_sndcnt = rsm->r_sndcnt; 2568 nrsm->r_rtr_bytes = 0; 2569 rsm->r_end = nrsm->r_start; 2570 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2571 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2572 } 2573 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 2574 if (rsm->r_in_tmap) { 2575 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2576 nrsm->r_in_tmap = 1; 2577 } 2578 rsm->r_flags &= (~RACK_HAS_FIN); 2579 rsm = nrsm; 2580 } 2581 rack->r_ctl.rc_tlpsend = rsm; 2582 rack->r_ctl.rc_tlp_rtx_out = 1; 2583 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2584 rack->r_ctl.rc_tlp_seg_send_cnt++; 2585 tp->t_rxtshift++; 2586 } else { 2587 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2588 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2589 } 2590 send: 2591 rack->r_ctl.rc_tlp_send_cnt++; 2592 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2593 /* 2594 * Can't [re]/transmit a segment we have not heard from the 2595 * peer in max times. We need the retransmit timer to take 2596 * over. 2597 */ 2598 restore: 2599 rack->r_ctl.rc_tlpsend = NULL; 2600 if (rsm) 2601 rsm->r_flags &= ~RACK_TLP; 2602 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2603 counter_u64_add(rack_tlp_retran_fail, 1); 2604 goto out; 2605 } else if (rsm) { 2606 rsm->r_flags |= RACK_TLP; 2607 } 2608 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2609 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2610 /* 2611 * We don't want to send a single segment more than the max 2612 * either. 2613 */ 2614 goto restore; 2615 } 2616 rack->r_timer_override = 1; 2617 rack->r_tlp_running = 1; 2618 rack->rc_tlp_in_progress = 1; 2619 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2620 return (0); 2621 out: 2622 rack->rc_timer_up = 0; 2623 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2624 return (0); 2625 } 2626 2627 /* 2628 * Delayed ack Timer, here we simply need to setup the 2629 * ACK_NOW flag and remove the DELACK flag. From there 2630 * the output routine will send the ack out. 2631 * 2632 * We only return 1, saying don't proceed, if all timers 2633 * are stopped (destroyed PCB?). 2634 */ 2635 static int 2636 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2637 { 2638 if (tp->t_timers->tt_flags & TT_STOPPED) { 2639 return (1); 2640 } 2641 rack_log_to_event(rack, RACK_TO_FRM_DELACK); 2642 tp->t_flags &= ~TF_DELACK; 2643 tp->t_flags |= TF_ACKNOW; 2644 TCPSTAT_INC(tcps_delack); 2645 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2646 return (0); 2647 } 2648 2649 /* 2650 * Persists timer, here we simply need to setup the 2651 * FORCE-DATA flag the output routine will send 2652 * the one byte send. 2653 * 2654 * We only return 1, saying don't proceed, if all timers 2655 * are stopped (destroyed PCB?). 2656 */ 2657 static int 2658 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2659 { 2660 struct inpcb *inp; 2661 int32_t retval = 0; 2662 2663 inp = tp->t_inpcb; 2664 2665 if (tp->t_timers->tt_flags & TT_STOPPED) { 2666 return (1); 2667 } 2668 if (rack->rc_in_persist == 0) 2669 return (0); 2670 if (rack_progress_timeout_check(tp)) { 2671 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2672 return (1); 2673 } 2674 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2675 /* 2676 * Persistence timer into zero window. 
Force a byte to be output, if 2677 * possible. 2678 */ 2679 TCPSTAT_INC(tcps_persisttimeo); 2680 /* 2681 * Hack: if the peer is dead/unreachable, we do not time out if the 2682 * window is closed. After a full backoff, drop the connection if 2683 * the idle time (no responses to probes) reaches the maximum 2684 * backoff that we would use if retransmitting. 2685 */ 2686 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2687 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2688 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2689 TCPSTAT_INC(tcps_persistdrop); 2690 retval = 1; 2691 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2692 goto out; 2693 } 2694 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2695 tp->snd_una == tp->snd_max) 2696 rack_exit_persist(tp, rack); 2697 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2698 /* 2699 * If the user has closed the socket then drop a persisting 2700 * connection after a much reduced timeout. 2701 */ 2702 if (tp->t_state > TCPS_CLOSE_WAIT && 2703 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2704 retval = 1; 2705 TCPSTAT_INC(tcps_persistdrop); 2706 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2707 goto out; 2708 } 2709 tp->t_flags |= TF_FORCEDATA; 2710 out: 2711 rack_log_to_event(rack, RACK_TO_FRM_PERSIST); 2712 return (retval); 2713 } 2714 2715 /* 2716 * If a keepalive goes off, we had no other timers 2717 * happening. We always return 1 here since this 2718 * routine either drops the connection or sends 2719 * out a segment with respond. 2720 */ 2721 static int 2722 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2723 { 2724 struct tcptemp *t_template; 2725 struct inpcb *inp; 2726 2727 if (tp->t_timers->tt_flags & TT_STOPPED) { 2728 return (1); 2729 } 2730 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 2731 inp = tp->t_inpcb; 2732 rack_log_to_event(rack, RACK_TO_FRM_KEEP); 2733 /* 2734 * Keep-alive timer went off; send something or drop connection if 2735 * idle for too long. 2736 */ 2737 TCPSTAT_INC(tcps_keeptimeo); 2738 if (tp->t_state < TCPS_ESTABLISHED) 2739 goto dropit; 2740 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2741 tp->t_state <= TCPS_CLOSING) { 2742 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 2743 goto dropit; 2744 /* 2745 * Send a packet designed to force a response if the peer is 2746 * up and reachable: either an ACK if the connection is 2747 * still alive, or an RST if the peer has closed the 2748 * connection due to timeout or reboot. Using sequence 2749 * number tp->snd_una-1 causes the transmitted zero-length 2750 * segment to lie outside the receive window; by the 2751 * protocol spec, this requires the correspondent TCP to 2752 * respond. 2753 */ 2754 TCPSTAT_INC(tcps_keepprobe); 2755 t_template = tcpip_maketemplate(inp); 2756 if (t_template) { 2757 tcp_respond(tp, t_template->tt_ipgen, 2758 &t_template->tt_t, (struct mbuf *)NULL, 2759 tp->rcv_nxt, tp->snd_una - 1, 0); 2760 free(t_template, M_TEMP); 2761 } 2762 } 2763 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 2764 return (1); 2765 dropit: 2766 TCPSTAT_INC(tcps_keepdrops); 2767 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2768 return (1); 2769 } 2770 2771 /* 2772 * Retransmit helper function, clear up all the ack 2773 * flags and take care of important book keeping. 2774 */ 2775 static void 2776 rack_remxt_tmr(struct tcpcb *tp) 2777 { 2778 /* 2779 * The retransmit timer went off, all sack'd blocks must be 2780 * un-acked. 
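	 * Clear the RACK_ACKED marking on every map entry and put anything
	 * that fell off the transmit-order list back on it so it can be
	 * resent.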
2781 */ 2782 struct rack_sendmap *rsm, *trsm = NULL; 2783 struct tcp_rack *rack; 2784 int32_t cnt = 0; 2785 2786 rack = (struct tcp_rack *)tp->t_fb_ptr; 2787 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 2788 rack_log_to_event(rack, RACK_TO_FRM_TMR); 2789 if (rack->r_state && (rack->r_state != tp->t_state)) 2790 rack_set_state(tp, rack); 2791 /* 2792 * Ideally we would like to be able to 2793 * mark SACK-PASS on anything not acked here. 2794 * However, if we do that we would burst out 2795 * all that data 1ms apart. This would be unwise, 2796 * so for now we will just let the normal rxt timer 2797 * and tlp timer take care of it. 2798 */ 2799 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 2800 if (rsm->r_flags & RACK_ACKED) { 2801 cnt++; 2802 rsm->r_sndcnt = 0; 2803 if (rsm->r_in_tmap == 0) { 2804 /* We must re-add it back to the tlist */ 2805 if (trsm == NULL) { 2806 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 2807 } else { 2808 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 2809 } 2810 rsm->r_in_tmap = 1; 2811 trsm = rsm; 2812 } 2813 } 2814 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 2815 } 2816 /* Clear the count (we just un-acked them) */ 2817 rack->r_ctl.rc_sacked = 0; 2818 /* Clear the tlp rtx mark */ 2819 rack->r_ctl.rc_tlp_rtx_out = 0; 2820 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2821 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); 2822 /* Setup so we send one segment */ 2823 if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) 2824 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2825 rack->r_timer_override = 1; 2826 } 2827 2828 /* 2829 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 2830 * we will setup to retransmit the lowest seq number outstanding. 2831 */ 2832 static int 2833 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2834 { 2835 int32_t rexmt; 2836 struct inpcb *inp; 2837 int32_t retval = 0; 2838 2839 inp = tp->t_inpcb; 2840 if (tp->t_timers->tt_flags & TT_STOPPED) { 2841 return (1); 2842 } 2843 if (rack_progress_timeout_check(tp)) { 2844 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2845 return (1); 2846 } 2847 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 2848 if (TCPS_HAVEESTABLISHED(tp->t_state) && 2849 (tp->snd_una == tp->snd_max)) { 2850 /* Nothing outstanding .. nothing to do */ 2851 return (0); 2852 } 2853 /* 2854 * Retransmission timer went off. Message has not been acked within 2855 * retransmit interval. Back off to a longer retransmit interval 2856 * and retransmit one segment. 2857 */ 2858 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 2859 tp->t_rxtshift = TCP_MAXRXTSHIFT; 2860 TCPSTAT_INC(tcps_timeoutdrop); 2861 retval = 1; 2862 tcp_set_inp_to_drop(rack->rc_inp, 2863 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 2864 goto out; 2865 } 2866 rack_remxt_tmr(tp); 2867 if (tp->t_state == TCPS_SYN_SENT) { 2868 /* 2869 * If the SYN was retransmitted, indicate CWND to be limited 2870 * to 1 segment in cc_conn_init(). 2871 */ 2872 tp->snd_cwnd = 1; 2873 } else if (tp->t_rxtshift == 1) { 2874 /* 2875 * first retransmit; record ssthresh and cwnd so they can be 2876 * recovered if this turns out to be a "bad" retransmit. A 2877 * retransmit is considered "bad" if an ACK for this segment 2878 * is received within RTT/2 interval; the assumption here is 2879 * that the ACK was already in flight. See "On Estimating 2880 * End-to-End Network Path Properties" by Allman and Paxson 2881 * for more details. 
2882 */ 2883 tp->snd_cwnd_prev = tp->snd_cwnd; 2884 tp->snd_ssthresh_prev = tp->snd_ssthresh; 2885 tp->snd_recover_prev = tp->snd_recover; 2886 if (IN_FASTRECOVERY(tp->t_flags)) 2887 tp->t_flags |= TF_WASFRECOVERY; 2888 else 2889 tp->t_flags &= ~TF_WASFRECOVERY; 2890 if (IN_CONGRECOVERY(tp->t_flags)) 2891 tp->t_flags |= TF_WASCRECOVERY; 2892 else 2893 tp->t_flags &= ~TF_WASCRECOVERY; 2894 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 2895 tp->t_flags |= TF_PREVVALID; 2896 } else 2897 tp->t_flags &= ~TF_PREVVALID; 2898 TCPSTAT_INC(tcps_rexmttimeo); 2899 if ((tp->t_state == TCPS_SYN_SENT) || 2900 (tp->t_state == TCPS_SYN_RECEIVED)) 2901 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); 2902 else 2903 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 2904 TCPT_RANGESET(tp->t_rxtcur, rexmt, 2905 max(MSEC_2_TICKS(rack_rto_min), rexmt), 2906 MSEC_2_TICKS(rack_rto_max)); 2907 /* 2908 * We enter the path for PLMTUD if connection is established or, if 2909 * connection is FIN_WAIT_1 status, reason for the last is that if 2910 * amount of data we send is very small, we could send it in couple 2911 * of packets and process straight to FIN. In that case we won't 2912 * catch ESTABLISHED state. 2913 */ 2914 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 2915 || (tp->t_state == TCPS_FIN_WAIT_1))) { 2916 #ifdef INET6 2917 int32_t isipv6; 2918 #endif 2919 2920 /* 2921 * Idea here is that at each stage of mtu probe (usually, 2922 * 1448 -> 1188 -> 524) should be given 2 chances to recover 2923 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 2924 * should take care of that. 2925 */ 2926 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 2927 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 2928 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 2929 tp->t_rxtshift % 2 == 0)) { 2930 /* 2931 * Enter Path MTU Black-hole Detection mechanism: - 2932 * Disable Path MTU Discovery (IP "DF" bit). - 2933 * Reduce MTU to lower value than what we negotiated 2934 * with peer. 2935 */ 2936 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 2937 /* Record that we may have found a black hole. */ 2938 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 2939 /* Keep track of previous MSS. */ 2940 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 2941 } 2942 2943 /* 2944 * Reduce the MSS to blackhole value or to the 2945 * default in an attempt to retransmit. 2946 */ 2947 #ifdef INET6 2948 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 2949 if (isipv6 && 2950 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 2951 /* Use the sysctl tuneable blackhole MSS. */ 2952 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 2953 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2954 } else if (isipv6) { 2955 /* Use the default MSS. */ 2956 tp->t_maxseg = V_tcp_v6mssdflt; 2957 /* 2958 * Disable Path MTU Discovery when we switch 2959 * to minmss. 2960 */ 2961 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2962 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2963 } 2964 #endif 2965 #if defined(INET6) && defined(INET) 2966 else 2967 #endif 2968 #ifdef INET 2969 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 2970 /* Use the sysctl tuneable blackhole MSS. */ 2971 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 2972 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2973 } else { 2974 /* Use the default MSS. */ 2975 tp->t_maxseg = V_tcp_mssdflt; 2976 /* 2977 * Disable Path MTU Discovery when we switch 2978 * to minmss. 
2979 */ 2980 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2981 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2982 } 2983 #endif 2984 } else { 2985 /* 2986 * If further retransmissions are still unsuccessful 2987 * with a lowered MTU, maybe this isn't a blackhole 2988 * and we restore the previous MSS and blackhole 2989 * detection flags. The limit '6' is determined by 2990 * giving each probe stage (1448, 1188, 524) 2 2991 * chances to recover. 2992 */ 2993 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 2994 (tp->t_rxtshift >= 6)) { 2995 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 2996 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 2997 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 2998 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 2999 } 3000 } 3001 } 3002 /* 3003 * Disable RFC1323 and SACK if we haven't got any response to our 3004 * third SYN to work-around some broken terminal servers (most of 3005 * which have hopefully been retired) that have bad VJ header 3006 * compression code which trashes TCP segments containing 3007 * unknown-to-them TCP options. 3008 */ 3009 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 3010 (tp->t_rxtshift == 3)) 3011 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 3012 /* 3013 * If we backed off this far, our srtt estimate is probably bogus. 3014 * Clobber it so we'll take the next rtt measurement as our srtt; 3015 * move the current srtt into rttvar to keep the current retransmit 3016 * times until then. 3017 */ 3018 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3019 #ifdef INET6 3020 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3021 in6_losing(tp->t_inpcb); 3022 else 3023 #endif 3024 in_losing(tp->t_inpcb); 3025 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3026 tp->t_srtt = 0; 3027 } 3028 if (rack_use_sack_filter) 3029 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3030 tp->snd_recover = tp->snd_max; 3031 tp->t_flags |= TF_ACKNOW; 3032 tp->t_rtttime = 0; 3033 rack_cong_signal(tp, NULL, CC_RTO); 3034 out: 3035 return (retval); 3036 } 3037 3038 static int 3039 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3040 { 3041 int32_t ret = 0; 3042 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3043 3044 if (timers == 0) { 3045 return (0); 3046 } 3047 if (tp->t_state == TCPS_LISTEN) { 3048 /* no timers on listen sockets */ 3049 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3050 return (0); 3051 return (1); 3052 } 3053 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3054 uint32_t left; 3055 3056 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3057 ret = -1; 3058 rack_log_to_processing(rack, cts, ret, 0); 3059 return (0); 3060 } 3061 if (hpts_calling == 0) { 3062 ret = -2; 3063 rack_log_to_processing(rack, cts, ret, 0); 3064 return (0); 3065 } 3066 /* 3067 * Ok our timer went off early and we are not paced false 3068 * alarm, go back to sleep. 
3069 */ 3070 ret = -3; 3071 left = rack->r_ctl.rc_timer_exp - cts; 3072 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3073 rack_log_to_processing(rack, cts, ret, left); 3074 rack->rc_last_pto_set = 0; 3075 return (1); 3076 } 3077 rack->rc_tmr_stopped = 0; 3078 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3079 if (timers & PACE_TMR_DELACK) { 3080 ret = rack_timeout_delack(tp, rack, cts); 3081 } else if (timers & PACE_TMR_RACK) { 3082 ret = rack_timeout_rack(tp, rack, cts); 3083 } else if (timers & PACE_TMR_TLP) { 3084 ret = rack_timeout_tlp(tp, rack, cts); 3085 } else if (timers & PACE_TMR_RXT) { 3086 ret = rack_timeout_rxt(tp, rack, cts); 3087 } else if (timers & PACE_TMR_PERSIT) { 3088 ret = rack_timeout_persist(tp, rack, cts); 3089 } else if (timers & PACE_TMR_KEEP) { 3090 ret = rack_timeout_keepalive(tp, rack, cts); 3091 } 3092 rack_log_to_processing(rack, cts, ret, timers); 3093 return (ret); 3094 } 3095 3096 static void 3097 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3098 { 3099 uint8_t hpts_removed = 0; 3100 3101 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3102 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3103 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3104 hpts_removed = 1; 3105 } 3106 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3107 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3108 if (rack->rc_inp->inp_in_hpts && 3109 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3110 /* 3111 * Canceling timer's when we have no output being 3112 * paced. We also must remove ourselves from the 3113 * hpts. 3114 */ 3115 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3116 hpts_removed = 1; 3117 } 3118 rack_log_to_cancel(rack, hpts_removed, line); 3119 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3120 } 3121 } 3122 3123 static void 3124 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3125 { 3126 return; 3127 } 3128 3129 static int 3130 rack_stopall(struct tcpcb *tp) 3131 { 3132 struct tcp_rack *rack; 3133 rack = (struct tcp_rack *)tp->t_fb_ptr; 3134 rack->t_timers_stopped = 1; 3135 return (0); 3136 } 3137 3138 static void 3139 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3140 { 3141 return; 3142 } 3143 3144 static int 3145 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3146 { 3147 return (0); 3148 } 3149 3150 static void 3151 rack_stop_all_timers(struct tcpcb *tp) 3152 { 3153 struct tcp_rack *rack; 3154 3155 /* 3156 * Assure no timers are running. 
3157 */ 3158 if (tcp_timer_active(tp, TT_PERSIST)) { 3159 /* We enter in persists, set the flag appropriately */ 3160 rack = (struct tcp_rack *)tp->t_fb_ptr; 3161 rack->rc_in_persist = 1; 3162 } 3163 tcp_timer_suspend(tp, TT_PERSIST); 3164 tcp_timer_suspend(tp, TT_REXMT); 3165 tcp_timer_suspend(tp, TT_KEEP); 3166 tcp_timer_suspend(tp, TT_DELACK); 3167 } 3168 3169 static void 3170 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3171 struct rack_sendmap *rsm, uint32_t ts) 3172 { 3173 int32_t idx; 3174 3175 rsm->r_rtr_cnt++; 3176 rsm->r_sndcnt++; 3177 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3178 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3179 rsm->r_flags |= RACK_OVERMAX; 3180 } 3181 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3182 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3183 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3184 } 3185 idx = rsm->r_rtr_cnt - 1; 3186 rsm->r_tim_lastsent[idx] = ts; 3187 if (rsm->r_flags & RACK_ACKED) { 3188 /* Problably MTU discovery messing with us */ 3189 rsm->r_flags &= ~RACK_ACKED; 3190 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3191 } 3192 if (rsm->r_in_tmap) { 3193 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3194 } 3195 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3196 rsm->r_in_tmap = 1; 3197 if (rsm->r_flags & RACK_SACK_PASSED) { 3198 /* We have retransmitted due to the SACK pass */ 3199 rsm->r_flags &= ~RACK_SACK_PASSED; 3200 rsm->r_flags |= RACK_WAS_SACKPASS; 3201 } 3202 /* Update memory for next rtr */ 3203 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3204 } 3205 3206 3207 static uint32_t 3208 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3209 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) 3210 { 3211 /* 3212 * We (re-)transmitted starting at rsm->r_start for some length 3213 * (possibly less than r_end. 3214 */ 3215 struct rack_sendmap *nrsm; 3216 uint32_t c_end; 3217 int32_t len; 3218 int32_t idx; 3219 3220 len = *lenp; 3221 c_end = rsm->r_start + len; 3222 if (SEQ_GEQ(c_end, rsm->r_end)) { 3223 /* 3224 * We retransmitted the whole piece or more than the whole 3225 * slopping into the next rsm. 3226 */ 3227 rack_update_rsm(tp, rack, rsm, ts); 3228 if (c_end == rsm->r_end) { 3229 *lenp = 0; 3230 return (0); 3231 } else { 3232 int32_t act_len; 3233 3234 /* Hangs over the end return whats left */ 3235 act_len = rsm->r_end - rsm->r_start; 3236 *lenp = (len - act_len); 3237 return (rsm->r_end); 3238 } 3239 /* We don't get out of this block. */ 3240 } 3241 /* 3242 * Here we retransmitted less than the whole thing which means we 3243 * have to split this into what was transmitted and what was not. 3244 */ 3245 nrsm = rack_alloc(rack); 3246 if (nrsm == NULL) { 3247 /* 3248 * We can't get memory, so lets not proceed. 3249 */ 3250 *lenp = 0; 3251 return (0); 3252 } 3253 /* 3254 * So here we are going to take the original rsm and make it what we 3255 * retransmitted. nrsm will be the tail portion we did not 3256 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3257 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3258 * 1, 6 and the new piece will be 6, 11. 
3259 */ 3260 nrsm->r_start = c_end; 3261 nrsm->r_end = rsm->r_end; 3262 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3263 nrsm->r_flags = rsm->r_flags; 3264 nrsm->r_sndcnt = rsm->r_sndcnt; 3265 nrsm->r_rtr_bytes = 0; 3266 rsm->r_end = c_end; 3267 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3268 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3269 } 3270 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3271 if (rsm->r_in_tmap) { 3272 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3273 nrsm->r_in_tmap = 1; 3274 } 3275 rsm->r_flags &= (~RACK_HAS_FIN); 3276 rack_update_rsm(tp, rack, rsm, ts); 3277 *lenp = 0; 3278 return (0); 3279 } 3280 3281 3282 static void 3283 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3284 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3285 uint8_t pass, struct rack_sendmap *hintrsm) 3286 { 3287 struct tcp_rack *rack; 3288 struct rack_sendmap *rsm, *nrsm; 3289 register uint32_t snd_max, snd_una; 3290 int32_t idx; 3291 3292 /* 3293 * Add to the RACK log of packets in flight or retransmitted. If 3294 * there is a TS option we will use the TS echoed, if not we will 3295 * grab a TS. 3296 * 3297 * Retransmissions will increment the count and move the ts to its 3298 * proper place. Note that if options do not include TS's then we 3299 * won't be able to effectively use the ACK for an RTT on a retran. 3300 * 3301 * Notes about r_start and r_end. Lets consider a send starting at 3302 * sequence 1 for 10 bytes. In such an example the r_start would be 3303 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3304 * This means that r_end is actually the first sequence for the next 3305 * slot (11). 3306 * 3307 */ 3308 /* 3309 * If err is set what do we do XXXrrs? should we not add the thing? 3310 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3311 * i.e. proceed with add ** do this for now. 3312 */ 3313 INP_WLOCK_ASSERT(tp->t_inpcb); 3314 if (err) 3315 /* 3316 * We don't log errors -- we could but snd_max does not 3317 * advance in this case either. 3318 */ 3319 return; 3320 3321 if (th_flags & TH_RST) { 3322 /* 3323 * We don't log resets and we return immediately from 3324 * sending 3325 */ 3326 return; 3327 } 3328 rack = (struct tcp_rack *)tp->t_fb_ptr; 3329 snd_una = tp->snd_una; 3330 if (SEQ_LEQ((seq_out + len), snd_una)) { 3331 /* Are sending an old segment to induce an ack (keep-alive)? */ 3332 return; 3333 } 3334 if (SEQ_LT(seq_out, snd_una)) { 3335 /* huh? should we panic? */ 3336 uint32_t end; 3337 3338 end = seq_out + len; 3339 seq_out = snd_una; 3340 len = end - seq_out; 3341 } 3342 snd_max = tp->snd_max; 3343 if (th_flags & (TH_SYN | TH_FIN)) { 3344 /* 3345 * The call to rack_log_output is made before bumping 3346 * snd_max. This means we can record one extra byte on a SYN 3347 * or FIN if seq_out is adding more on and a FIN is present 3348 * (and we are not resending). 3349 */ 3350 if (th_flags & TH_SYN) 3351 len++; 3352 if (th_flags & TH_FIN) 3353 len++; 3354 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3355 /* 3356 * The add/update as not been done for the FIN/SYN 3357 * yet. 3358 */ 3359 snd_max = tp->snd_nxt; 3360 } 3361 } 3362 if (len == 0) { 3363 /* We don't log zero window probes */ 3364 return; 3365 } 3366 rack->r_ctl.rc_time_last_sent = ts; 3367 if (IN_RECOVERY(tp->t_flags)) { 3368 rack->r_ctl.rc_prr_out += len; 3369 } 3370 /* First question is it a retransmission? 
*/ 3371 if (seq_out == snd_max) { 3372 again: 3373 rsm = rack_alloc(rack); 3374 if (rsm == NULL) { 3375 /* 3376 * Hmm out of memory and the tcb got destroyed while 3377 * we tried to wait. 3378 */ 3379 #ifdef INVARIANTS 3380 panic("Out of memory when we should not be rack:%p", rack); 3381 #endif 3382 return; 3383 } 3384 if (th_flags & TH_FIN) { 3385 rsm->r_flags = RACK_HAS_FIN; 3386 } else { 3387 rsm->r_flags = 0; 3388 } 3389 rsm->r_tim_lastsent[0] = ts; 3390 rsm->r_rtr_cnt = 1; 3391 rsm->r_rtr_bytes = 0; 3392 rsm->r_start = seq_out; 3393 rsm->r_end = rsm->r_start + len; 3394 rsm->r_sndcnt = 0; 3395 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 3396 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3397 rsm->r_in_tmap = 1; 3398 return; 3399 } 3400 /* 3401 * If we reach here its a retransmission and we need to find it. 3402 */ 3403 more: 3404 if (hintrsm && (hintrsm->r_start == seq_out)) { 3405 rsm = hintrsm; 3406 hintrsm = NULL; 3407 } else if (rack->r_ctl.rc_next) { 3408 /* We have a hint from a previous run */ 3409 rsm = rack->r_ctl.rc_next; 3410 } else { 3411 /* No hints sorry */ 3412 rsm = NULL; 3413 } 3414 if ((rsm) && (rsm->r_start == seq_out)) { 3415 /* 3416 * We used rc_next or hintrsm to retransmit, hopefully the 3417 * likely case. 3418 */ 3419 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3420 if (len == 0) { 3421 return; 3422 } else { 3423 goto more; 3424 } 3425 } 3426 /* Ok it was not the last pointer go through it the hard way. */ 3427 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3428 if (rsm->r_start == seq_out) { 3429 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3430 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3431 if (len == 0) { 3432 return; 3433 } else { 3434 continue; 3435 } 3436 } 3437 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3438 /* Transmitted within this piece */ 3439 /* 3440 * Ok we must split off the front and then let the 3441 * update do the rest 3442 */ 3443 nrsm = rack_alloc(rack); 3444 if (nrsm == NULL) { 3445 #ifdef INVARIANTS 3446 panic("Ran out of memory that was preallocated? rack:%p", rack); 3447 #endif 3448 rack_update_rsm(tp, rack, rsm, ts); 3449 return; 3450 } 3451 /* 3452 * copy rsm to nrsm and then trim the front of rsm 3453 * to not include this part. 3454 */ 3455 nrsm->r_start = seq_out; 3456 nrsm->r_end = rsm->r_end; 3457 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3458 nrsm->r_flags = rsm->r_flags; 3459 nrsm->r_sndcnt = rsm->r_sndcnt; 3460 nrsm->r_rtr_bytes = 0; 3461 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3462 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3463 } 3464 rsm->r_end = nrsm->r_start; 3465 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3466 if (rsm->r_in_tmap) { 3467 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3468 nrsm->r_in_tmap = 1; 3469 } 3470 rsm->r_flags &= (~RACK_HAS_FIN); 3471 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3472 if (len == 0) { 3473 return; 3474 } 3475 } 3476 } 3477 /* 3478 * Hmm not found in map did they retransmit both old and on into the 3479 * new? 
3480 */ 3481 if (seq_out == tp->snd_max) { 3482 goto again; 3483 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3484 #ifdef INVARIANTS 3485 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3486 seq_out, len, tp->snd_una, tp->snd_max); 3487 printf("Starting Dump of all rack entries\n"); 3488 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3489 printf("rsm:%p start:%u end:%u\n", 3490 rsm, rsm->r_start, rsm->r_end); 3491 } 3492 printf("Dump complete\n"); 3493 panic("seq_out not found rack:%p tp:%p", 3494 rack, tp); 3495 #endif 3496 } else { 3497 #ifdef INVARIANTS 3498 /* 3499 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3500 * flag) 3501 */ 3502 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3503 seq_out, len, tp->snd_max, tp); 3504 #endif 3505 } 3506 } 3507 3508 /* 3509 * Record one of the RTT updates from an ack into 3510 * our sample structure. 3511 */ 3512 static void 3513 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3514 { 3515 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3516 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3517 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3518 } 3519 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3520 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3521 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3522 } 3523 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3524 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3525 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3526 } 3527 3528 /* 3529 * Collect new round-trip time estimate 3530 * and update averages and current timeout. 3531 */ 3532 static void 3533 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3534 { 3535 int32_t delta; 3536 uint32_t o_srtt, o_var; 3537 int32_t rtt; 3538 3539 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3540 /* No valid sample */ 3541 return; 3542 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3543 /* We are to use the lowest RTT seen in a single ack */ 3544 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3545 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3546 /* We are to use the highest RTT seen in a single ack */ 3547 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3548 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3549 /* We are to use the average RTT seen in a single ack */ 3550 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3551 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3552 } else { 3553 #ifdef INVARIANTS 3554 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3555 #endif 3556 return; 3557 } 3558 if (rtt == 0) 3559 rtt = 1; 3560 rack_log_rtt_sample(rack, rtt); 3561 o_srtt = tp->t_srtt; 3562 o_var = tp->t_rttvar; 3563 rack = (struct tcp_rack *)tp->t_fb_ptr; 3564 if (tp->t_srtt != 0) { 3565 /* 3566 * srtt is stored as fixed point with 5 bits after the 3567 * binary point (i.e., scaled by 32). The following magic is 3568 * equivalent to the smoothing algorithm in rfc793 with an 3569 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3570 * Adjust rtt to origin 0. 3571 */ 3572 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3573 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3574 3575 tp->t_srtt += delta; 3576 if (tp->t_srtt <= 0) 3577 tp->t_srtt = 1; 3578 3579 /* 3580 * We accumulate a smoothed rtt variance (actually, a 3581 * smoothed mean difference), then set the retransmit timer 3582 * to smoothed rtt + 4 times the smoothed variance. rttvar 3583 * is stored as fixed point with 4 bits after the binary 3584 * point (scaled by 16).
The following is equivalent to 3585 * rfc793 smoothing with an alpha of .75 (rttvar = 3586 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3587 * wired-in beta. 3588 */ 3589 if (delta < 0) 3590 delta = -delta; 3591 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3592 tp->t_rttvar += delta; 3593 if (tp->t_rttvar <= 0) 3594 tp->t_rttvar = 1; 3595 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3596 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3597 } else { 3598 /* 3599 * No rtt measurement yet - use the unsmoothed rtt. Set the 3600 * variance to half the rtt (so our first retransmit happens 3601 * at 3*rtt). 3602 */ 3603 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3604 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3605 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3606 } 3607 TCPSTAT_INC(tcps_rttupdated); 3608 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3609 tp->t_rttupdated++; 3610 #ifdef NETFLIX_STATS 3611 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3612 #endif 3613 tp->t_rxtshift = 0; 3614 3615 /* 3616 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3617 * way we do the smoothing, srtt and rttvar will each average +1/2 3618 * tick of bias. When we compute the retransmit timer, we want 1/2 3619 * tick of rounding and 1 extra tick because of +-1/2 tick 3620 * uncertainty in the firing of the timer. The bias will give us 3621 * exactly the 1.5 tick we need. But, because the bias is 3622 * statistical, we have to test that we don't drop below the minimum 3623 * feasible timer (which is 2 ticks). 3624 */ 3625 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3626 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3627 tp->t_softerror = 0; 3628 } 3629 3630 static void 3631 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3632 uint32_t t, uint32_t cts) 3633 { 3634 /* 3635 * For this RSM, we acknowledged the data from a previous 3636 * transmission, not the last one we made. This means we did a false 3637 * retransmit. 3638 */ 3639 struct tcp_rack *rack; 3640 3641 if (rsm->r_flags & RACK_HAS_FIN) { 3642 /* 3643 * The FIN is often sent multiple times when we 3644 * have everything outstanding ack'd. We ignore this case 3645 * since it's over now. 3646 */ 3647 return; 3648 } 3649 if (rsm->r_flags & RACK_TLP) { 3650 /* 3651 * We expect TLP's to have this occur. 3652 */ 3653 return; 3654 } 3655 rack = (struct tcp_rack *)tp->t_fb_ptr; 3656 /* should we undo cc changes and exit recovery? */ 3657 if (IN_RECOVERY(tp->t_flags)) { 3658 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3659 /* 3660 * Undo what we ratcheted down and exit recovery if 3661 * possible 3662 */ 3663 EXIT_RECOVERY(tp->t_flags); 3664 tp->snd_recover = tp->snd_una; 3665 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3666 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3667 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3668 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3669 } 3670 } 3671 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3672 /* 3673 * We retransmitted based on a sack and the earlier 3674 * retransmission ack'd it - re-ordering is occurring.
3675 */ 3676 counter_u64_add(rack_reorder_seen, 1); 3677 rack->r_ctl.rc_reorder_ts = cts; 3678 } 3679 counter_u64_add(rack_badfr, 1); 3680 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3681 } 3682 3683 3684 static int 3685 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3686 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3687 { 3688 int32_t i; 3689 uint32_t t; 3690 3691 if (rsm->r_flags & RACK_ACKED) 3692 /* Already done */ 3693 return (0); 3694 3695 3696 if ((rsm->r_rtr_cnt == 1) || 3697 ((ack_type == CUM_ACKED) && 3698 (to->to_flags & TOF_TS) && 3699 (to->to_tsecr) && 3700 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 3701 ) { 3702 /* 3703 * We will only find a matching timestamp if its cum-acked. 3704 * But if its only one retransmission its for-sure matching 3705 * :-) 3706 */ 3707 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3708 if ((int)t <= 0) 3709 t = 1; 3710 if (!tp->t_rttlow || tp->t_rttlow > t) 3711 tp->t_rttlow = t; 3712 if (!rack->r_ctl.rc_rack_min_rtt || 3713 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3714 rack->r_ctl.rc_rack_min_rtt = t; 3715 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3716 rack->r_ctl.rc_rack_min_rtt = 1; 3717 } 3718 } 3719 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); 3720 if ((rsm->r_flags & RACK_TLP) && 3721 (!IN_RECOVERY(tp->t_flags))) { 3722 /* Segment was a TLP and our retrans matched */ 3723 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 3724 rack->r_ctl.rc_rsm_start = tp->snd_max; 3725 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 3726 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 3727 rack_cong_signal(tp, NULL, CC_NDUPACK); 3728 /* 3729 * When we enter recovery we need to assure 3730 * we send one packet. 3731 */ 3732 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 3733 } else 3734 rack->r_ctl.rc_tlp_rtx_out = 0; 3735 } 3736 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3737 /* New more recent rack_tmit_time */ 3738 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3739 rack->rc_rack_rtt = t; 3740 } 3741 return (1); 3742 } 3743 /* 3744 * We clear the soft/rxtshift since we got an ack. 3745 * There is no assurance we will call the commit() function 3746 * so we need to clear these to avoid incorrect handling. 3747 */ 3748 tp->t_rxtshift = 0; 3749 tp->t_softerror = 0; 3750 if ((to->to_flags & TOF_TS) && 3751 (ack_type == CUM_ACKED) && 3752 (to->to_tsecr) && 3753 ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { 3754 /* 3755 * Now which timestamp does it match? In this block the ACK 3756 * must be coming from a previous transmission. 3757 */ 3758 for (i = 0; i < rsm->r_rtr_cnt; i++) { 3759 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 3760 t = cts - rsm->r_tim_lastsent[i]; 3761 if ((int)t <= 0) 3762 t = 1; 3763 if ((i + 1) < rsm->r_rtr_cnt) { 3764 /* Likely */ 3765 rack_earlier_retran(tp, rsm, t, cts); 3766 } 3767 if (!tp->t_rttlow || tp->t_rttlow > t) 3768 tp->t_rttlow = t; 3769 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3770 rack->r_ctl.rc_rack_min_rtt = t; 3771 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3772 rack->r_ctl.rc_rack_min_rtt = 1; 3773 } 3774 } 3775 /* 3776 * Note the following calls to 3777 * tcp_rack_xmit_timer() are being commented 3778 * out for now. They give us no more accuracy 3779 * and often lead to a wrong choice. We have 3780 * enough samples that have not been 3781 * retransmitted. 
I leave the commented out 3782 * code in here in case in the future we 3783 * decide to add it back (though I can't foresee 3784 * doing that). That way we will easily see 3785 * where they need to be placed. 3786 */ 3787 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 3788 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3789 /* New more recent rack_tmit_time */ 3790 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3791 rack->rc_rack_rtt = t; 3792 } 3793 return (1); 3794 } 3795 } 3796 goto ts_not_found; 3797 } else { 3798 /* 3799 * Ok it's a SACK block that we retransmitted, or a Windows 3800 * machine without timestamps. We can tell nothing from the 3801 * time-stamp since it's either not there, or it reflects the time 3802 * the peer last received a segment that moved its cum-ack point forward. 3803 */ 3804 ts_not_found: 3805 i = rsm->r_rtr_cnt - 1; 3806 t = cts - rsm->r_tim_lastsent[i]; 3807 if ((int)t <= 0) 3808 t = 1; 3809 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3810 /* 3811 * We retransmitted and the ack came back in less 3812 * than the smallest rtt we have observed. We most 3813 * likely did an improper retransmit as outlined in 3814 * 4.2 Step 3 point 2 in the rack-draft. 3815 */ 3816 i = rsm->r_rtr_cnt - 2; 3817 t = cts - rsm->r_tim_lastsent[i]; 3818 rack_earlier_retran(tp, rsm, t, cts); 3819 } else if (rack->r_ctl.rc_rack_min_rtt) { 3820 /* 3821 * We retransmitted it and the retransmit did the 3822 * job. 3823 */ 3824 if (!rack->r_ctl.rc_rack_min_rtt || 3825 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3826 rack->r_ctl.rc_rack_min_rtt = t; 3827 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3828 rack->r_ctl.rc_rack_min_rtt = 1; 3829 } 3830 } 3831 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 3832 /* New more recent rack_tmit_time */ 3833 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 3834 rack->rc_rack_rtt = t; 3835 } 3836 return (1); 3837 } 3838 } 3839 return (0); 3840 } 3841 3842 /* 3843 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 3844 */ 3845 static void 3846 rack_log_sack_passed(struct tcpcb *tp, 3847 struct tcp_rack *rack, struct rack_sendmap *rsm) 3848 { 3849 struct rack_sendmap *nrsm; 3850 uint32_t ts; 3851 int32_t idx; 3852 3853 idx = rsm->r_rtr_cnt - 1; 3854 ts = rsm->r_tim_lastsent[idx]; 3855 nrsm = rsm; 3856 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 3857 rack_head, r_tnext) { 3858 if (nrsm == rsm) { 3859 /* Skip the original segment; it is the one being acked */ 3860 continue; 3861 } 3862 if (nrsm->r_flags & RACK_ACKED) { 3863 /* Skip ack'd segments */ 3864 continue; 3865 } 3866 idx = nrsm->r_rtr_cnt - 1; 3867 if (ts == nrsm->r_tim_lastsent[idx]) { 3868 /* 3869 * For this case let's use the seq no; if we sent in a 3870 * big block (TSO) we would have a bunch of segments 3871 * sent at the same time. 3872 * 3873 * We would only get a report if its SEQ is earlier. 3874 * If we have done multiple retransmits the times 3875 * would not be equal. 3876 */ 3877 if (SEQ_LT(nrsm->r_start, rsm->r_start)) { 3878 nrsm->r_flags |= RACK_SACK_PASSED; 3879 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3880 } 3881 } else { 3882 /* 3883 * Here they were sent at different times, not a big 3884 * block.
Since we transmitted this one later and 3885 * see it sack'd then this must also be missing (or 3886 * we would have gotten a sack block for it) 3887 */ 3888 nrsm->r_flags |= RACK_SACK_PASSED; 3889 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3890 } 3891 } 3892 } 3893 3894 static uint32_t 3895 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 3896 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) 3897 { 3898 int32_t idx; 3899 int32_t times = 0; 3900 uint32_t start, end, changed = 0; 3901 struct rack_sendmap *rsm, *nrsm; 3902 int32_t used_ref = 1; 3903 3904 start = sack->start; 3905 end = sack->end; 3906 rsm = *prsm; 3907 if (rsm && SEQ_LT(start, rsm->r_start)) { 3908 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { 3909 if (SEQ_GEQ(start, rsm->r_start) && 3910 SEQ_LT(start, rsm->r_end)) { 3911 goto do_rest_ofb; 3912 } 3913 } 3914 } 3915 if (rsm == NULL) { 3916 start_at_beginning: 3917 rsm = NULL; 3918 used_ref = 0; 3919 } 3920 /* First lets locate the block where this guy is */ 3921 TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { 3922 if (SEQ_GEQ(start, rsm->r_start) && 3923 SEQ_LT(start, rsm->r_end)) { 3924 break; 3925 } 3926 } 3927 do_rest_ofb: 3928 if (rsm == NULL) { 3929 /* 3930 * This happens when we get duplicate sack blocks with the 3931 * same end. For example SACK 4: 100 SACK 3: 100 The sort 3932 * will not change there location so we would just start at 3933 * the end of the first one and get lost. 3934 */ 3935 if (tp->t_flags & TF_SENTFIN) { 3936 /* 3937 * Check to see if we have not logged the FIN that 3938 * went out. 3939 */ 3940 nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 3941 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 3942 /* 3943 * Ok we did not get the FIN logged. 3944 */ 3945 nrsm->r_end++; 3946 rsm = nrsm; 3947 goto do_rest_ofb; 3948 } 3949 } 3950 if (times == 1) { 3951 #ifdef INVARIANTS 3952 panic("tp:%p rack:%p sack:%p to:%p prsm:%p", 3953 tp, rack, sack, to, prsm); 3954 #else 3955 goto out; 3956 #endif 3957 } 3958 times++; 3959 counter_u64_add(rack_sack_proc_restart, 1); 3960 goto start_at_beginning; 3961 } 3962 /* Ok we have an ACK for some piece of rsm */ 3963 if (rsm->r_start != start) { 3964 /* 3965 * Need to split this in two pieces the before and after. 3966 */ 3967 nrsm = rack_alloc(rack); 3968 if (nrsm == NULL) { 3969 /* 3970 * failed XXXrrs what can we do but loose the sack 3971 * info? 3972 */ 3973 goto out; 3974 } 3975 nrsm->r_start = start; 3976 nrsm->r_rtr_bytes = 0; 3977 nrsm->r_end = rsm->r_end; 3978 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3979 nrsm->r_flags = rsm->r_flags; 3980 nrsm->r_sndcnt = rsm->r_sndcnt; 3981 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3982 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3983 } 3984 rsm->r_end = nrsm->r_start; 3985 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3986 if (rsm->r_in_tmap) { 3987 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3988 nrsm->r_in_tmap = 1; 3989 } 3990 rsm->r_flags &= (~RACK_HAS_FIN); 3991 rsm = nrsm; 3992 } 3993 if (SEQ_GEQ(end, rsm->r_end)) { 3994 /* 3995 * The end of this block is either beyond this guy or right 3996 * at this guy. 3997 */ 3998 3999 if ((rsm->r_flags & RACK_ACKED) == 0) { 4000 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4001 changed += (rsm->r_end - rsm->r_start); 4002 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4003 rack_log_sack_passed(tp, rack, rsm); 4004 /* Is Reordering occuring? 
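* Note: RACK_SACK_PASSED on this rsm means a block sent after it was
* SACKed first; now that this one is being SACKed too, that is treated
* as a reordering signal and the time is noted in rc_reorder_ts below.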
*/ 4005 if (rsm->r_flags & RACK_SACK_PASSED) { 4006 counter_u64_add(rack_reorder_seen, 1); 4007 rack->r_ctl.rc_reorder_ts = cts; 4008 } 4009 rsm->r_flags |= RACK_ACKED; 4010 rsm->r_flags &= ~RACK_TLP; 4011 if (rsm->r_in_tmap) { 4012 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4013 rsm->r_in_tmap = 0; 4014 } 4015 } 4016 if (end == rsm->r_end) { 4017 /* This block only - done */ 4018 goto out; 4019 } 4020 /* There is more not coverend by this rsm move on */ 4021 start = rsm->r_end; 4022 nrsm = TAILQ_NEXT(rsm, r_next); 4023 rsm = nrsm; 4024 times = 0; 4025 goto do_rest_ofb; 4026 } 4027 /* Ok we need to split off this one at the tail */ 4028 nrsm = rack_alloc(rack); 4029 if (nrsm == NULL) { 4030 /* failed rrs what can we do but loose the sack info? */ 4031 goto out; 4032 } 4033 /* Clone it */ 4034 nrsm->r_start = end; 4035 nrsm->r_end = rsm->r_end; 4036 nrsm->r_rtr_bytes = 0; 4037 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4038 nrsm->r_flags = rsm->r_flags; 4039 nrsm->r_sndcnt = rsm->r_sndcnt; 4040 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4041 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4042 } 4043 /* The sack block does not cover this guy fully */ 4044 rsm->r_flags &= (~RACK_HAS_FIN); 4045 rsm->r_end = end; 4046 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4047 if (rsm->r_in_tmap) { 4048 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4049 nrsm->r_in_tmap = 1; 4050 } 4051 if (rsm->r_flags & RACK_ACKED) { 4052 /* Been here done that */ 4053 goto out; 4054 } 4055 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4056 changed += (rsm->r_end - rsm->r_start); 4057 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4058 rack_log_sack_passed(tp, rack, rsm); 4059 /* Is Reordering occuring? */ 4060 if (rsm->r_flags & RACK_SACK_PASSED) { 4061 counter_u64_add(rack_reorder_seen, 1); 4062 rack->r_ctl.rc_reorder_ts = cts; 4063 } 4064 rsm->r_flags |= RACK_ACKED; 4065 rsm->r_flags &= ~RACK_TLP; 4066 if (rsm->r_in_tmap) { 4067 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4068 rsm->r_in_tmap = 0; 4069 } 4070 out: 4071 if (used_ref == 0) { 4072 counter_u64_add(rack_sack_proc_all, 1); 4073 } else { 4074 counter_u64_add(rack_sack_proc_short, 1); 4075 } 4076 /* Save off where we last were */ 4077 if (rsm) 4078 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 4079 else 4080 rack->r_ctl.rc_sacklast = NULL; 4081 *prsm = rsm; 4082 return (changed); 4083 } 4084 4085 static void inline 4086 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4087 { 4088 struct rack_sendmap *tmap; 4089 4090 tmap = NULL; 4091 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4092 /* Its no longer sacked, mark it so */ 4093 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4094 #ifdef INVARIANTS 4095 if (rsm->r_in_tmap) { 4096 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4097 rack, rsm, rsm->r_flags); 4098 } 4099 #endif 4100 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4101 /* Rebuild it into our tmap */ 4102 if (tmap == NULL) { 4103 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4104 tmap = rsm; 4105 } else { 4106 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4107 tmap = rsm; 4108 } 4109 tmap->r_in_tmap = 1; 4110 rsm = TAILQ_NEXT(rsm, r_next); 4111 } 4112 /* 4113 * Now lets possibly clear the sack filter so we start 4114 * recognizing sacks that cover this area. 
4115 */ 4116 if (rack_use_sack_filter) 4117 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4118 4119 } 4120 4121 static void 4122 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4123 { 4124 uint32_t changed, last_seq, entered_recovery = 0; 4125 struct tcp_rack *rack; 4126 struct rack_sendmap *rsm; 4127 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4128 register uint32_t th_ack; 4129 int32_t i, j, k, num_sack_blks = 0; 4130 uint32_t cts, acked, ack_point, sack_changed = 0; 4131 4132 INP_WLOCK_ASSERT(tp->t_inpcb); 4133 if (th->th_flags & TH_RST) { 4134 /* We don't log resets */ 4135 return; 4136 } 4137 rack = (struct tcp_rack *)tp->t_fb_ptr; 4138 cts = tcp_ts_getticks(); 4139 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4140 changed = 0; 4141 th_ack = th->th_ack; 4142 4143 if (SEQ_GT(th_ack, tp->snd_una)) { 4144 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4145 tp->t_acktime = ticks; 4146 } 4147 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4148 changed = th_ack - rsm->r_start; 4149 if (changed) { 4150 /* 4151 * The ACK point is advancing to th_ack, we must drop off 4152 * the packets in the rack log and calculate any eligble 4153 * RTT's. 4154 */ 4155 rack->r_wanted_output++; 4156 more: 4157 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4158 if (rsm == NULL) { 4159 if ((th_ack - 1) == tp->iss) { 4160 /* 4161 * For the SYN incoming case we will not 4162 * have called tcp_output for the sending of 4163 * the SYN, so there will be no map. All 4164 * other cases should probably be a panic. 4165 */ 4166 goto proc_sack; 4167 } 4168 if (tp->t_flags & TF_SENTFIN) { 4169 /* if we send a FIN we will not hav a map */ 4170 goto proc_sack; 4171 } 4172 #ifdef INVARIANTS 4173 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4174 tp, 4175 th, tp->t_state, rack, 4176 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4177 #endif 4178 goto proc_sack; 4179 } 4180 if (SEQ_LT(th_ack, rsm->r_start)) { 4181 /* Huh map is missing this */ 4182 #ifdef INVARIANTS 4183 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 4184 rsm->r_start, 4185 th_ack, tp->t_state, rack->r_state); 4186 #endif 4187 goto proc_sack; 4188 } 4189 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4190 /* Now do we consume the whole thing? */ 4191 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4192 /* Its all consumed. */ 4193 uint32_t left; 4194 4195 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4196 rsm->r_rtr_bytes = 0; 4197 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 4198 if (rsm->r_in_tmap) { 4199 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4200 rsm->r_in_tmap = 0; 4201 } 4202 if (rack->r_ctl.rc_next == rsm) { 4203 /* scoot along the marker */ 4204 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); 4205 } 4206 if (rsm->r_flags & RACK_ACKED) { 4207 /* 4208 * It was acked on the scoreboard -- remove 4209 * it from total 4210 */ 4211 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4212 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4213 /* 4214 * There are acked segments ACKED on the 4215 * scoreboard further up. We are seeing 4216 * reordering. 4217 */ 4218 counter_u64_add(rack_reorder_seen, 1); 4219 rsm->r_flags |= RACK_ACKED; 4220 rack->r_ctl.rc_reorder_ts = cts; 4221 } 4222 left = th_ack - rsm->r_end; 4223 if (rsm->r_rtr_cnt > 1) { 4224 /* 4225 * Technically we should make r_rtr_cnt be 4226 * monotonicly increasing and just mod it to 4227 * the timestamp it is replacing.. that way 4228 * we would have the last 3 retransmits. 
Now 4229 * rc_loss_count will be wrong if we 4230 * retransmit something more than 2 times in 4231 * recovery :( 4232 */ 4233 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4234 } 4235 /* Free back to zone */ 4236 rack_free(rack, rsm); 4237 if (left) { 4238 goto more; 4239 } 4240 goto proc_sack; 4241 } 4242 if (rsm->r_flags & RACK_ACKED) { 4243 /* 4244 * It was acked on the scoreboard -- remove it from 4245 * total for the part being cum-acked. 4246 */ 4247 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4248 } 4249 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4250 rsm->r_rtr_bytes = 0; 4251 rsm->r_start = th_ack; 4252 } 4253 proc_sack: 4254 /* Check for reneging */ 4255 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4256 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4257 /* 4258 * The peer has moved snd_una up to 4259 * the edge of this send, i.e. one 4260 * that it had previously acked. The only 4261 * way that can be true is if the peer threw 4262 * away data (space issues) that it had 4263 * previously sacked (else it would have 4264 * given us snd_una up to rsm->r_end). 4265 * We need to undo the acked markings here. 4266 * 4267 * Note we have to look to make sure th_ack is 4268 * our rsm->r_start in case we get an old ack 4269 * where th_ack is behind snd_una. 4270 */ 4271 rack_peer_reneges(rack, rsm, th->th_ack); 4272 } 4273 if ((to->to_flags & TOF_SACK) == 0) { 4274 /* We are done, nothing left to log */ 4275 goto out; 4276 } 4277 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4278 if (rsm) { 4279 last_seq = rsm->r_end; 4280 } else { 4281 last_seq = tp->snd_max; 4282 } 4283 /* Sack block processing */ 4284 if (SEQ_GT(th_ack, tp->snd_una)) 4285 ack_point = th_ack; 4286 else 4287 ack_point = tp->snd_una; 4288 for (i = 0; i < to->to_nsacks; i++) { 4289 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4290 &sack, sizeof(sack)); 4291 sack.start = ntohl(sack.start); 4292 sack.end = ntohl(sack.end); 4293 if (SEQ_GT(sack.end, sack.start) && 4294 SEQ_GT(sack.start, ack_point) && 4295 SEQ_LT(sack.start, tp->snd_max) && 4296 SEQ_GT(sack.end, ack_point) && 4297 SEQ_LEQ(sack.end, tp->snd_max)) { 4298 if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && 4299 (SEQ_LT(sack.end, last_seq)) && 4300 ((sack.end - sack.start) < (tp->t_maxseg / 8))) { 4301 /* 4302 * Not the last piece and it's smaller than 4303 * 1/8th of a MSS. We ignore this. 4304 */ 4305 counter_u64_add(rack_runt_sacks, 1); 4306 continue; 4307 } 4308 sack_blocks[num_sack_blks] = sack; 4309 num_sack_blks++; 4310 #ifdef NETFLIX_STATS 4311 } else if (SEQ_LEQ(sack.start, th_ack) && 4312 SEQ_LEQ(sack.end, th_ack)) { 4313 /* 4314 * It's a D-SACK block. 4315 */ 4316 tcp_record_dsack(sack.start, sack.end); 4317 #endif 4318 } 4319 4320 } 4321 if (num_sack_blks == 0) 4322 goto out; 4323 /* 4324 * Sort the SACK blocks so we can update the rack scoreboard with 4325 * just one pass. 4326 */ 4327 if (rack_use_sack_filter) { 4328 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); 4329 } 4330 if (num_sack_blks < 2) { 4331 goto do_sack_work; 4332 } 4333 /* Sort the sacks */ 4334 for (i = 0; i < num_sack_blks; i++) { 4335 for (j = i + 1; j < num_sack_blks; j++) { 4336 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4337 sack = sack_blocks[i]; 4338 sack_blocks[i] = sack_blocks[j]; 4339 sack_blocks[j] = sack; 4340 } 4341 } 4342 } 4343 /* 4344 * Now are any of the sack block ends the same (yes some 4345 * implementations send these)?
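* For example (hypothetical blocks): if the sorted array holds
* [150, 200) and [100, 200), both ending at 200, the loop below keeps
* the smaller start (100) in the surviving slot, slides any later
* blocks down one position and drops num_sack_blks by one, so only
* [100, 200) gets processed.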
4346 */ 4347 again: 4348 if (num_sack_blks > 1) { 4349 for (i = 0; i < num_sack_blks; i++) { 4350 for (j = i + 1; j < num_sack_blks; j++) { 4351 if (sack_blocks[i].end == sack_blocks[j].end) { 4352 /* 4353 * Ok these two have the same end we 4354 * want the smallest end and then 4355 * throw away the larger and start 4356 * again. 4357 */ 4358 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4359 /* 4360 * The second block covers 4361 * more area use that 4362 */ 4363 sack_blocks[i].start = sack_blocks[j].start; 4364 } 4365 /* 4366 * Now collapse out the dup-sack and 4367 * lower the count 4368 */ 4369 for (k = (j + 1); k < num_sack_blks; k++) { 4370 sack_blocks[j].start = sack_blocks[k].start; 4371 sack_blocks[j].end = sack_blocks[k].end; 4372 j++; 4373 } 4374 num_sack_blks--; 4375 goto again; 4376 } 4377 } 4378 } 4379 } 4380 do_sack_work: 4381 rsm = rack->r_ctl.rc_sacklast; 4382 for (i = 0; i < num_sack_blks; i++) { 4383 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); 4384 if (acked) { 4385 rack->r_wanted_output++; 4386 changed += acked; 4387 sack_changed += acked; 4388 } 4389 } 4390 out: 4391 if (changed) { 4392 /* Something changed cancel the rack timer */ 4393 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4394 } 4395 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 4396 /* 4397 * Ok we have a high probability that we need to go in to 4398 * recovery since we have data sack'd 4399 */ 4400 struct rack_sendmap *rsm; 4401 uint32_t tsused; 4402 4403 tsused = tcp_ts_getticks(); 4404 rsm = tcp_rack_output(tp, rack, tsused); 4405 if (rsm) { 4406 /* Enter recovery */ 4407 rack->r_ctl.rc_rsm_start = rsm->r_start; 4408 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4409 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4410 entered_recovery = 1; 4411 rack_cong_signal(tp, NULL, CC_NDUPACK); 4412 /* 4413 * When we enter recovery we need to assure we send 4414 * one packet. 
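* On later acks while still in recovery, the PRR math further below
* recomputes rc_prr_sndcnt. As a rough worked example (hypothetical
* values, expressed in segments): with rc_prr_delivered at 4,
* snd_ssthresh at 10 and rc_prr_recovery_fs at 20, it would allow
* about 4 * 10 / 20 = 2 segments out while pipe is still above
* ssthresh, less whatever rc_prr_out already accounts for.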
4415 */ 4416 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 4417 rack->r_timer_override = 1; 4418 } 4419 } 4420 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 4421 /* Deal with changed an PRR here (in recovery only) */ 4422 uint32_t pipe, snd_una; 4423 4424 rack->r_ctl.rc_prr_delivered += changed; 4425 /* Compute prr_sndcnt */ 4426 if (SEQ_GT(tp->snd_una, th_ack)) { 4427 snd_una = tp->snd_una; 4428 } else { 4429 snd_una = th_ack; 4430 } 4431 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 4432 if (pipe > tp->snd_ssthresh) { 4433 long sndcnt; 4434 4435 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 4436 if (rack->r_ctl.rc_prr_recovery_fs > 0) 4437 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 4438 else { 4439 rack->r_ctl.rc_prr_sndcnt = 0; 4440 sndcnt = 0; 4441 } 4442 sndcnt++; 4443 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 4444 sndcnt -= rack->r_ctl.rc_prr_out; 4445 else 4446 sndcnt = 0; 4447 rack->r_ctl.rc_prr_sndcnt = sndcnt; 4448 } else { 4449 uint32_t limit; 4450 4451 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 4452 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 4453 else 4454 limit = 0; 4455 if (changed > limit) 4456 limit = changed; 4457 limit += tp->t_maxseg; 4458 if (tp->snd_ssthresh > pipe) { 4459 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 4460 } else { 4461 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 4462 } 4463 } 4464 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { 4465 rack->r_timer_override = 1; 4466 } 4467 } 4468 } 4469 4470 /* 4471 * Return value of 1, we do not need to call rack_process_data(). 4472 * return value of 0, rack_process_data can be called. 4473 * For ret_val if its 0 the TCP is locked, if its non-zero 4474 * its unlocked and probably unsafe to touch the TCB. 4475 */ 4476 static int 4477 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 4478 struct tcpcb *tp, struct tcpopt *to, 4479 int32_t * ti_locked, uint32_t tiwin, int32_t tlen, 4480 int32_t * ofia, int32_t thflags, int32_t * ret_val) 4481 { 4482 int32_t ourfinisacked = 0; 4483 int32_t nsegs, acked_amount; 4484 int32_t acked; 4485 struct mbuf *mfree; 4486 struct tcp_rack *rack; 4487 int32_t recovery = 0; 4488 4489 rack = (struct tcp_rack *)tp->t_fb_ptr; 4490 if (SEQ_GT(th->th_ack, tp->snd_max)) { 4491 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 4492 return (1); 4493 } 4494 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 4495 rack_log_ack(tp, to, th); 4496 } 4497 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 4498 /* 4499 * Old ack, behind (or duplicate to) the last one rcv'd 4500 * Note: Should mark reordering is occuring! We should also 4501 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 4502 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 4503 * retran and> ack 3 4504 */ 4505 return (0); 4506 } 4507 /* 4508 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 4509 * something we sent. 4510 */ 4511 if (tp->t_flags & TF_NEEDSYN) { 4512 /* 4513 * T/TCP: Connection was half-synchronized, and our SYN has 4514 * been ACK'd (so connection is now fully synchronized). Go 4515 * to non-starred state, increment snd_una for ACK of SYN, 4516 * and check if we can do window scaling. 4517 */ 4518 tp->t_flags &= ~TF_NEEDSYN; 4519 tp->snd_una++; 4520 /* Do window scaling? 
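* (For illustration: with a negotiated shift of 7, a raw 16-bit
* window value of 1024 stands for 1024 << 7 = 131072 bytes once the
* scale is applied.)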
*/ 4521 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 4522 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 4523 tp->rcv_scale = tp->request_r_scale; 4524 /* Send window already scaled. */ 4525 } 4526 } 4527 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4528 INP_WLOCK_ASSERT(tp->t_inpcb); 4529 4530 acked = BYTES_THIS_ACK(tp, th); 4531 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 4532 TCPSTAT_ADD(tcps_rcvackbyte, acked); 4533 4534 /* 4535 * If we just performed our first retransmit, and the ACK arrives 4536 * within our recovery window, then it was a mistake to do the 4537 * retransmit in the first place. Recover our original cwnd and 4538 * ssthresh, and proceed to transmit where we left off. 4539 */ 4540 if (tp->t_flags & TF_PREVVALID) { 4541 tp->t_flags &= ~TF_PREVVALID; 4542 if (tp->t_rxtshift == 1 && 4543 (int)(ticks - tp->t_badrxtwin) < 0) 4544 rack_cong_signal(tp, th, CC_RTO_ERR); 4545 } 4546 /* 4547 * If we have a timestamp reply, update smoothed round trip time. If 4548 * no timestamp is present but transmit timer is running and timed 4549 * sequence number was acked, update smoothed round trip time. Since 4550 * we now have an rtt measurement, cancel the timer backoff (cf., 4551 * Phil Karn's retransmit alg.). Recompute the initial retransmit 4552 * timer. 4553 * 4554 * Some boxes send broken timestamp replies during the SYN+ACK 4555 * phase, ignore timestamps of 0 or we could calculate a huge RTT 4556 * and blow up the retransmit timer. 4557 */ 4558 /* 4559 * If all outstanding data is acked, stop retransmit timer and 4560 * remember to restart (more output or persist). If there is more 4561 * data to be acked, restart retransmit timer, using current 4562 * (possibly backed-off) value. 4563 */ 4564 if (th->th_ack == tp->snd_max) { 4565 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4566 rack->r_wanted_output++; 4567 } 4568 /* 4569 * If no data (only SYN) was ACK'd, skip rest of ACK processing. 4570 */ 4571 if (acked == 0) { 4572 if (ofia) 4573 *ofia = ourfinisacked; 4574 return (0); 4575 } 4576 if (rack->r_ctl.rc_early_recovery) { 4577 if (IN_FASTRECOVERY(tp->t_flags)) { 4578 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4579 tcp_rack_partialack(tp, th); 4580 } else { 4581 rack_post_recovery(tp, th); 4582 recovery = 1; 4583 } 4584 } 4585 } 4586 /* 4587 * Let the congestion control algorithm update congestion control 4588 * related information. This typically means increasing the 4589 * congestion window. 4590 */ 4591 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 4592 SOCKBUF_LOCK(&so->so_snd); 4593 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 4594 tp->snd_wnd -= acked_amount; 4595 mfree = sbcut_locked(&so->so_snd, acked_amount); 4596 if ((sbused(&so->so_snd) == 0) && 4597 (acked > acked_amount) && 4598 (tp->t_state >= TCPS_FIN_WAIT_1)) { 4599 ourfinisacked = 1; 4600 } 4601 /* NB: sowwakeup_locked() does an implicit unlock. 
*/ 4602 sowwakeup_locked(so); 4603 m_freem(mfree); 4604 if (rack->r_ctl.rc_early_recovery == 0) { 4605 if (IN_FASTRECOVERY(tp->t_flags)) { 4606 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4607 tcp_rack_partialack(tp, th); 4608 } else { 4609 rack_post_recovery(tp, th); 4610 } 4611 } 4612 } 4613 tp->snd_una = th->th_ack; 4614 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 4615 tp->snd_recover = tp->snd_una; 4616 4617 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 4618 tp->snd_nxt = tp->snd_una; 4619 } 4620 if (tp->snd_una == tp->snd_max) { 4621 /* Nothing left outstanding */ 4622 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 4623 tp->t_acktime = 0; 4624 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4625 /* Set need output so persist might get set */ 4626 rack->r_wanted_output++; 4627 if (rack_use_sack_filter) 4628 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 4629 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 4630 (sbavail(&so->so_snd) == 0) && 4631 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 4632 /* 4633 * The socket was gone and the 4634 * peer sent data, time to 4635 * reset him. 4636 */ 4637 *ret_val = 1; 4638 tp = tcp_close(tp); 4639 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen); 4640 return (1); 4641 } 4642 } 4643 if (ofia) 4644 *ofia = ourfinisacked; 4645 return (0); 4646 } 4647 4648 4649 /* 4650 * Return value of 1, the TCB is unlocked and most 4651 * likely gone, return value of 0, the TCP is still 4652 * locked. 4653 */ 4654 static int 4655 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 4656 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 4657 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 4658 { 4659 /* 4660 * Update window information. Don't look at window if no ACK: TAC's 4661 * send garbage on first SYN. 4662 */ 4663 int32_t nsegs; 4664 #ifdef TCP_RFC7413 4665 int32_t tfo_syn; 4666 #else 4667 #define tfo_syn (FALSE) 4668 #endif 4669 struct tcp_rack *rack; 4670 4671 rack = (struct tcp_rack *)tp->t_fb_ptr; 4672 INP_WLOCK_ASSERT(tp->t_inpcb); 4673 4674 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4675 if ((thflags & TH_ACK) && 4676 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4677 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4678 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4679 /* keep track of pure window updates */ 4680 if (tlen == 0 && 4681 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4682 TCPSTAT_INC(tcps_rcvwinupd); 4683 tp->snd_wnd = tiwin; 4684 tp->snd_wl1 = th->th_seq; 4685 tp->snd_wl2 = th->th_ack; 4686 if (tp->snd_wnd > tp->max_sndwnd) 4687 tp->max_sndwnd = tp->snd_wnd; 4688 rack->r_wanted_output++; 4689 } else if (thflags & TH_ACK) { 4690 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 4691 tp->snd_wnd = tiwin; 4692 tp->snd_wl1 = th->th_seq; 4693 tp->snd_wl2 = th->th_ack; 4694 } 4695 } 4696 /* Was persist timer active and now we have window space? */ 4697 if ((rack->rc_in_persist != 0) && tp->snd_wnd) { 4698 rack_exit_persist(tp, rack); 4699 tp->snd_nxt = tp->snd_max; 4700 /* Make sure we output to start the timer */ 4701 rack->r_wanted_output++; 4702 } 4703 /* 4704 * Process segments with URG. 4705 */ 4706 if ((thflags & TH_URG) && th->th_urp && 4707 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4708 /* 4709 * This is a kludge, but if we receive and accept random 4710 * urgent pointers, we'll crash in soreceive. It's hard to 4711 * imagine someone actually wanting to send this much urgent 4712 * data. 
4713 */ 4714 SOCKBUF_LOCK(&so->so_rcv); 4715 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 4716 th->th_urp = 0; /* XXX */ 4717 thflags &= ~TH_URG; /* XXX */ 4718 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 4719 goto dodata; /* XXX */ 4720 } 4721 /* 4722 * If this segment advances the known urgent pointer, then 4723 * mark the data stream. This should not happen in 4724 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 4725 * FIN has been received from the remote side. In these 4726 * states we ignore the URG. 4727 * 4728 * According to RFC961 (Assigned Protocols), the urgent 4729 * pointer points to the last octet of urgent data. We 4730 * continue, however, to consider it to indicate the first 4731 * octet of data past the urgent section as the original 4732 * spec states (in one of two places). 4733 */ 4734 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 4735 tp->rcv_up = th->th_seq + th->th_urp; 4736 so->so_oobmark = sbavail(&so->so_rcv) + 4737 (tp->rcv_up - tp->rcv_nxt) - 1; 4738 if (so->so_oobmark == 0) 4739 so->so_rcv.sb_state |= SBS_RCVATMARK; 4740 sohasoutofband(so); 4741 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4742 } 4743 SOCKBUF_UNLOCK(&so->so_rcv); 4744 /* 4745 * Remove out of band data so doesn't get presented to user. 4746 * This can happen independent of advancing the URG pointer, 4747 * but if two URG's are pending at once, some out-of-band 4748 * data may creep in... ick. 4749 */ 4750 if (th->th_urp <= (uint32_t) tlen && 4751 !(so->so_options & SO_OOBINLINE)) { 4752 /* hdr drop is delayed */ 4753 tcp_pulloutofband(so, th, m, drop_hdrlen); 4754 } 4755 } else { 4756 /* 4757 * If no out of band data is expected, pull receive urgent 4758 * pointer along with the receive window. 4759 */ 4760 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4761 tp->rcv_up = tp->rcv_nxt; 4762 } 4763 dodata: /* XXX */ 4764 INP_WLOCK_ASSERT(tp->t_inpcb); 4765 4766 /* 4767 * Process the segment text, merging it into the TCP sequencing 4768 * queue, and arranging for acknowledgment of receipt if necessary. 4769 * This process logically involves adjusting tp->rcv_wnd as data is 4770 * presented to the user (this happens in tcp_usrreq.c, case 4771 * PRU_RCVD). If a FIN has already been received on this connection 4772 * then we just ignore the text. 4773 */ 4774 #ifdef TCP_RFC7413 4775 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 4776 (tp->t_flags & TF_FASTOPEN)); 4777 #endif 4778 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 4779 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4780 tcp_seq save_start = th->th_seq; 4781 4782 m_adj(m, drop_hdrlen); /* delayed header drop */ 4783 /* 4784 * Insert segment which includes th into TCP reassembly 4785 * queue with control block tp. Set thflags to whether 4786 * reassembly now includes a segment with FIN. This handles 4787 * the common case inline (segment is the next to be 4788 * received on an established connection, and the queue is 4789 * empty), avoiding linkage into and removal from the queue 4790 * and repetition of various conversions. Set DELACK for 4791 * segments received in order, but ack immediately when 4792 * segments are out of order (so fast retransmit can work). 
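* For example (hypothetical): with rcv_nxt at 5000 and an empty
* reassembly queue, a 1448 byte segment with th_seq 5000 takes the
* inline path below and rcv_nxt advances to 6448, typically with a
* delayed ACK; a segment that skips ahead (say th_seq 7896 while
* rcv_nxt is still 6448) goes through tcp_reass() instead and forces
* TF_ACKNOW.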
4793 */ 4794 if (th->th_seq == tp->rcv_nxt && 4795 LIST_EMPTY(&tp->t_segq) && 4796 (TCPS_HAVEESTABLISHED(tp->t_state) || 4797 tfo_syn)) { 4798 if (DELAY_ACK(tp, tlen) || tfo_syn) { 4799 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4800 tp->t_flags |= TF_DELACK; 4801 } else { 4802 rack->r_wanted_output++; 4803 tp->t_flags |= TF_ACKNOW; 4804 } 4805 tp->rcv_nxt += tlen; 4806 thflags = th->th_flags & TH_FIN; 4807 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4808 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4809 SOCKBUF_LOCK(&so->so_rcv); 4810 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4811 m_freem(m); 4812 else 4813 sbappendstream_locked(&so->so_rcv, m, 0); 4814 /* NB: sorwakeup_locked() does an implicit unlock. */ 4815 sorwakeup_locked(so); 4816 } else { 4817 /* 4818 * XXX: Due to the header drop above "th" is 4819 * theoretically invalid by now. Fortunately 4820 * m_adj() doesn't actually frees any mbufs when 4821 * trimming from the head. 4822 */ 4823 thflags = tcp_reass(tp, th, &tlen, m); 4824 tp->t_flags |= TF_ACKNOW; 4825 } 4826 if (tlen > 0) 4827 tcp_update_sack_list(tp, save_start, save_start + tlen); 4828 } else { 4829 m_freem(m); 4830 thflags &= ~TH_FIN; 4831 } 4832 4833 /* 4834 * If FIN is received ACK the FIN and let the user know that the 4835 * connection is closing. 4836 */ 4837 if (thflags & TH_FIN) { 4838 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4839 socantrcvmore(so); 4840 /* 4841 * If connection is half-synchronized (ie NEEDSYN 4842 * flag on) then delay ACK, so it may be piggybacked 4843 * when SYN is sent. Otherwise, since we received a 4844 * FIN then no more input can be expected, send ACK 4845 * now. 4846 */ 4847 if (tp->t_flags & TF_NEEDSYN) { 4848 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4849 tp->t_flags |= TF_DELACK; 4850 } else { 4851 tp->t_flags |= TF_ACKNOW; 4852 } 4853 tp->rcv_nxt++; 4854 } 4855 switch (tp->t_state) { 4856 4857 /* 4858 * In SYN_RECEIVED and ESTABLISHED STATES enter the 4859 * CLOSE_WAIT state. 4860 */ 4861 case TCPS_SYN_RECEIVED: 4862 tp->t_starttime = ticks; 4863 /* FALLTHROUGH */ 4864 case TCPS_ESTABLISHED: 4865 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4866 tcp_state_change(tp, TCPS_CLOSE_WAIT); 4867 break; 4868 4869 /* 4870 * If still in FIN_WAIT_1 STATE FIN has not been 4871 * acked so enter the CLOSING state. 4872 */ 4873 case TCPS_FIN_WAIT_1: 4874 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4875 tcp_state_change(tp, TCPS_CLOSING); 4876 break; 4877 4878 /* 4879 * In FIN_WAIT_2 state enter the TIME_WAIT state, 4880 * starting the time-wait timer, turning off the 4881 * other standard timers. 4882 */ 4883 case TCPS_FIN_WAIT_2: 4884 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4885 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 4886 KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata " 4887 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 4888 *ti_locked)); 4889 tcp_twstart(tp); 4890 *ti_locked = TI_UNLOCKED; 4891 INP_INFO_RUNLOCK(&V_tcbinfo); 4892 return (1); 4893 } 4894 } 4895 if (*ti_locked == TI_RLOCKED) { 4896 INP_INFO_RUNLOCK(&V_tcbinfo); 4897 *ti_locked = TI_UNLOCKED; 4898 } 4899 /* 4900 * Return any desired output. 
4901 */ 4902 if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 4903 rack->r_wanted_output++; 4904 } 4905 KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 4906 __func__, *ti_locked)); 4907 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 4908 INP_WLOCK_ASSERT(tp->t_inpcb); 4909 return (0); 4910 } 4911 4912 /* 4913 * Here nothing is really faster, its just that we 4914 * have broken out the fast-data path also just like 4915 * the fast-ack. 4916 */ 4917 static int 4918 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 4919 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 4920 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt) 4921 { 4922 int32_t nsegs; 4923 int32_t newsize = 0; /* automatic sockbuf scaling */ 4924 struct tcp_rack *rack; 4925 #ifdef TCPDEBUG 4926 /* 4927 * The size of tcp_saveipgen must be the size of the max ip header, 4928 * now IPv6. 4929 */ 4930 u_char tcp_saveipgen[IP6_HDR_LEN]; 4931 struct tcphdr tcp_savetcp; 4932 short ostate = 0; 4933 4934 #endif 4935 /* 4936 * If last ACK falls within this segment's sequence numbers, record 4937 * the timestamp. NOTE that the test is modified according to the 4938 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 4939 */ 4940 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 4941 return (0); 4942 } 4943 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 4944 return (0); 4945 } 4946 if (tiwin && tiwin != tp->snd_wnd) { 4947 return (0); 4948 } 4949 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 4950 return (0); 4951 } 4952 if (__predict_false((to->to_flags & TOF_TS) && 4953 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 4954 return (0); 4955 } 4956 if (__predict_false((th->th_ack != tp->snd_una))) { 4957 return (0); 4958 } 4959 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 4960 return (0); 4961 } 4962 if ((to->to_flags & TOF_TS) != 0 && 4963 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 4964 tp->ts_recent_age = tcp_ts_getticks(); 4965 tp->ts_recent = to->to_tsval; 4966 } 4967 rack = (struct tcp_rack *)tp->t_fb_ptr; 4968 /* 4969 * This is a pure, in-sequence data packet with nothing on the 4970 * reassembly queue and we have enough buffer space to take it. 4971 */ 4972 if (*ti_locked == TI_RLOCKED) { 4973 INP_INFO_RUNLOCK(&V_tcbinfo); 4974 *ti_locked = TI_UNLOCKED; 4975 } 4976 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4977 4978 4979 /* Clean receiver SACK report if present */ 4980 if (tp->rcv_numsacks) 4981 tcp_clean_sackreport(tp); 4982 TCPSTAT_INC(tcps_preddat); 4983 tp->rcv_nxt += tlen; 4984 /* 4985 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 4986 */ 4987 tp->snd_wl1 = th->th_seq; 4988 /* 4989 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 4990 */ 4991 tp->rcv_up = tp->rcv_nxt; 4992 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4993 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4994 #ifdef TCPDEBUG 4995 if (so->so_options & SO_DEBUG) 4996 tcp_trace(TA_INPUT, ostate, tp, 4997 (void *)tcp_saveipgen, &tcp_savetcp, 0); 4998 #endif 4999 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 5000 5001 /* Add data to socket buffer. */ 5002 SOCKBUF_LOCK(&so->so_rcv); 5003 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5004 m_freem(m); 5005 } else { 5006 /* 5007 * Set new socket buffer size. Give up when limit is 5008 * reached. 
5009 */ 5010 if (newsize) 5011 if (!sbreserve_locked(&so->so_rcv, 5012 newsize, so, NULL)) 5013 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 5014 m_adj(m, drop_hdrlen); /* delayed header drop */ 5015 sbappendstream_locked(&so->so_rcv, m, 0); 5016 rack_calc_rwin(so, tp); 5017 } 5018 /* NB: sorwakeup_locked() does an implicit unlock. */ 5019 sorwakeup_locked(so); 5020 if (DELAY_ACK(tp, tlen)) { 5021 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5022 tp->t_flags |= TF_DELACK; 5023 } else { 5024 tp->t_flags |= TF_ACKNOW; 5025 rack->r_wanted_output++; 5026 } 5027 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 5028 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5029 return (1); 5030 } 5031 5032 /* 5033 * This subfunction is used to try to highly optimize the 5034 * fast path. We again allow window updates that are 5035 * in sequence to remain in the fast-path. We also add 5036 * in the __predict's to attempt to help the compiler. 5037 * Note that if we return a 0, then we can *not* process 5038 * it and the caller should push the packet into the 5039 * slow-path. 5040 */ 5041 static int 5042 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5043 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5044 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 5045 { 5046 int32_t acked; 5047 int32_t nsegs; 5048 5049 #ifdef TCPDEBUG 5050 /* 5051 * The size of tcp_saveipgen must be the size of the max ip header, 5052 * now IPv6. 5053 */ 5054 u_char tcp_saveipgen[IP6_HDR_LEN]; 5055 struct tcphdr tcp_savetcp; 5056 short ostate = 0; 5057 5058 #endif 5059 struct tcp_rack *rack; 5060 5061 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5062 /* Old ack, behind (or duplicate to) the last one rcv'd */ 5063 return (0); 5064 } 5065 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 5066 /* Above what we have sent? */ 5067 return (0); 5068 } 5069 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5070 /* We are retransmitting */ 5071 return (0); 5072 } 5073 if (__predict_false(tiwin == 0)) { 5074 /* zero window */ 5075 return (0); 5076 } 5077 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 5078 /* We need a SYN or a FIN, unlikely.. */ 5079 return (0); 5080 } 5081 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 5082 /* Timestamp is behind .. old ack with seq wrap? */ 5083 return (0); 5084 } 5085 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 5086 /* Still recovering */ 5087 return (0); 5088 } 5089 rack = (struct tcp_rack *)tp->t_fb_ptr; 5090 if (rack->r_ctl.rc_sacked) { 5091 /* We have sack holes on our scoreboard */ 5092 return (0); 5093 } 5094 /* Ok if we reach here, we can process a fast-ack */ 5095 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5096 rack_log_ack(tp, to, th); 5097 /* Did the window get updated? */ 5098 if (tiwin != tp->snd_wnd) { 5099 tp->snd_wnd = tiwin; 5100 tp->snd_wl1 = th->th_seq; 5101 if (tp->snd_wnd > tp->max_sndwnd) 5102 tp->max_sndwnd = tp->snd_wnd; 5103 } 5104 if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { 5105 rack_exit_persist(tp, rack); 5106 } 5107 /* 5108 * If last ACK falls within this segment's sequence numbers, record 5109 * the timestamp. NOTE that the test is modified according to the 5110 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
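* For example (hypothetical): if last_ack_sent is 9000 and this pure
* ack carries th_seq 9000 with a tsval of 12345, ts_recent becomes
* 12345 and ts_recent_age is refreshed for later PAWS checks.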
5111 */ 5112 if ((to->to_flags & TOF_TS) != 0 && 5113 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5114 tp->ts_recent_age = tcp_ts_getticks(); 5115 tp->ts_recent = to->to_tsval; 5116 } 5117 /* 5118 * This is a pure ack for outstanding data. 5119 */ 5120 if (*ti_locked == TI_RLOCKED) { 5121 INP_INFO_RUNLOCK(&V_tcbinfo); 5122 *ti_locked = TI_UNLOCKED; 5123 } 5124 TCPSTAT_INC(tcps_predack); 5125 5126 /* 5127 * "bad retransmit" recovery. 5128 */ 5129 if (tp->t_flags & TF_PREVVALID) { 5130 tp->t_flags &= ~TF_PREVVALID; 5131 if (tp->t_rxtshift == 1 && 5132 (int)(ticks - tp->t_badrxtwin) < 0) 5133 rack_cong_signal(tp, th, CC_RTO_ERR); 5134 } 5135 /* 5136 * Recalculate the transmit timer / rtt. 5137 * 5138 * Some boxes send broken timestamp replies during the SYN+ACK 5139 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5140 * and blow up the retransmit timer. 5141 */ 5142 acked = BYTES_THIS_ACK(tp, th); 5143 5144 #ifdef TCP_HHOOK 5145 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 5146 hhook_run_tcp_est_in(tp, th, to); 5147 #endif 5148 5149 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5150 TCPSTAT_ADD(tcps_rcvackbyte, acked); 5151 sbdrop(&so->so_snd, acked); 5152 /* 5153 * Let the congestion control algorithm update congestion control 5154 * related information. This typically means increasing the 5155 * congestion window. 5156 */ 5157 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 5158 5159 tp->snd_una = th->th_ack; 5160 /* 5161 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 5162 */ 5163 tp->snd_wl2 = th->th_ack; 5164 tp->t_dupacks = 0; 5165 m_freem(m); 5166 /* ND6_HINT(tp); *//* Some progress has been made. */ 5167 5168 /* 5169 * If all outstanding data are acked, stop retransmit timer, 5170 * otherwise restart timer using current (possibly backed-off) 5171 * value. If process is waiting for space, wakeup/selwakeup/signal. 5172 * If data are ready to send, let tcp_output decide between more 5173 * output or persist. 5174 */ 5175 #ifdef TCPDEBUG 5176 if (so->so_options & SO_DEBUG) 5177 tcp_trace(TA_INPUT, ostate, tp, 5178 (void *)tcp_saveipgen, 5179 &tcp_savetcp, 0); 5180 #endif 5181 if (tp->snd_una == tp->snd_max) { 5182 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5183 tp->t_acktime = 0; 5184 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5185 } 5186 /* Wake up the socket if we have room to write more */ 5187 sowwakeup(so); 5188 if (sbavail(&so->so_snd)) { 5189 rack->r_wanted_output++; 5190 } 5191 return (1); 5192 } 5193 5194 /* 5195 * Return value of 1, the TCB is unlocked and most 5196 * likely gone, return value of 0, the TCP is still 5197 * locked. 5198 */ 5199 static int 5200 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 5201 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5202 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5203 { 5204 int32_t ret_val = 0; 5205 int32_t todrop; 5206 int32_t ourfinisacked = 0; 5207 5208 rack_calc_rwin(so, tp); 5209 /* 5210 * If the state is SYN_SENT: if seg contains an ACK, but not for our 5211 * SYN, drop the input. if seg contains a RST, then drop the 5212 * connection. if seg does not contain SYN, then drop it. Otherwise 5213 * this is an acceptable SYN segment initialize tp->rcv_nxt and 5214 * tp->irs if seg contains ack then advance tp->snd_una if seg 5215 * contains an ECE and ECN support is enabled, the stream is ECN 5216 * capable. 
if SYN has been acked change to ESTABLISHED else 5217 * SYN_RCVD state arrange for segment to be acked (eventually) 5218 * continue processing rest of data/controls, beginning with URG 5219 */ 5220 if ((thflags & TH_ACK) && 5221 (SEQ_LEQ(th->th_ack, tp->iss) || 5222 SEQ_GT(th->th_ack, tp->snd_max))) { 5223 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5224 return (1); 5225 } 5226 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 5227 TCP_PROBE5(connect__refused, NULL, tp, 5228 mtod(m, const char *), tp, th); 5229 tp = tcp_drop(tp, ECONNREFUSED); 5230 rack_do_drop(m, tp, ti_locked); 5231 return (1); 5232 } 5233 if (thflags & TH_RST) { 5234 rack_do_drop(m, tp, ti_locked); 5235 return (1); 5236 } 5237 if (!(thflags & TH_SYN)) { 5238 rack_do_drop(m, tp, ti_locked); 5239 return (1); 5240 } 5241 tp->irs = th->th_seq; 5242 tcp_rcvseqinit(tp); 5243 if (thflags & TH_ACK) { 5244 TCPSTAT_INC(tcps_connects); 5245 soisconnected(so); 5246 #ifdef MAC 5247 mac_socketpeer_set_from_mbuf(m, so); 5248 #endif 5249 /* Do window scaling on this connection? */ 5250 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5251 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5252 tp->rcv_scale = tp->request_r_scale; 5253 } 5254 tp->rcv_adv += min(tp->rcv_wnd, 5255 TCP_MAXWIN << tp->rcv_scale); 5256 /* 5257 * If there's data, delay ACK; if there's also a FIN ACKNOW 5258 * will be turned on later. 5259 */ 5260 if (DELAY_ACK(tp, tlen) && tlen != 0) { 5261 rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, 5262 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); 5263 tp->t_flags |= TF_DELACK; 5264 } else { 5265 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 5266 tp->t_flags |= TF_ACKNOW; 5267 } 5268 5269 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 5270 tp->t_flags |= TF_ECN_PERMIT; 5271 TCPSTAT_INC(tcps_ecn_shs); 5272 } 5273 /* 5274 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 5275 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 5276 */ 5277 tp->t_starttime = ticks; 5278 if (tp->t_flags & TF_NEEDFIN) { 5279 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5280 tp->t_flags &= ~TF_NEEDFIN; 5281 thflags &= ~TH_SYN; 5282 } else { 5283 tcp_state_change(tp, TCPS_ESTABLISHED); 5284 TCP_PROBE5(connect__established, NULL, tp, 5285 mtod(m, const char *), tp, th); 5286 cc_conn_init(tp); 5287 } 5288 } else { 5289 /* 5290 * Received initial SYN in SYN-SENT[*] state => simultaneous 5291 * open. If segment contains CC option and there is a 5292 * cached CC, apply TAO test. If it succeeds, connection is * 5293 * half-synchronized. Otherwise, do 3-way handshake: 5294 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 5295 * there was no CC option, clear cached CC value. 5296 */ 5297 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 5298 tcp_state_change(tp, TCPS_SYN_RECEIVED); 5299 } 5300 KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 5301 "ti_locked %d", __func__, *ti_locked)); 5302 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5303 INP_WLOCK_ASSERT(tp->t_inpcb); 5304 /* 5305 * Advance th->th_seq to correspond to first data byte. If data, 5306 * trim to stay within window, dropping FIN if necessary. 5307 */ 5308 th->th_seq++; 5309 if (tlen > tp->rcv_wnd) { 5310 todrop = tlen - tp->rcv_wnd; 5311 m_adj(m, -todrop); 5312 tlen = tp->rcv_wnd; 5313 thflags &= ~TH_FIN; 5314 TCPSTAT_INC(tcps_rcvpackafterwin); 5315 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 5316 } 5317 tp->snd_wl1 = th->th_seq - 1; 5318 tp->rcv_up = th->th_seq; 5319 /* 5320 * Client side of transaction: already sent SYN and data. 
If the 5321 * remote host used T/TCP to validate the SYN, our data will be 5322 * ACK'd; if so, enter normal data segment processing in the middle 5323 * of step 5, ack processing. Otherwise, goto step 6. 5324 */ 5325 if (thflags & TH_ACK) { 5326 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 5327 return (ret_val); 5328 /* We may have changed to FIN_WAIT_1 above */ 5329 if (tp->t_state == TCPS_FIN_WAIT_1) { 5330 /* 5331 * In FIN_WAIT_1 STATE in addition to the processing 5332 * for the ESTABLISHED state if our FIN is now 5333 * acknowledged then enter FIN_WAIT_2. 5334 */ 5335 if (ourfinisacked) { 5336 /* 5337 * If we can't receive any more data, then 5338 * closing user can proceed. Starting the 5339 * timer is contrary to the specification, 5340 * but if we don't get a FIN we'll hang 5341 * forever. 5342 * 5343 * XXXjl: we should release the tp also, and 5344 * use a compressed state. 5345 */ 5346 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5347 soisdisconnected(so); 5348 tcp_timer_activate(tp, TT_2MSL, 5349 (tcp_fast_finwait2_recycle ? 5350 tcp_finwait2_timeout : 5351 TP_MAXIDLE(tp))); 5352 } 5353 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5354 } 5355 } 5356 } 5357 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5358 ti_locked, tiwin, thflags, nxt_pkt)); 5359 } 5360 5361 /* 5362 * Return value of 1, the TCB is unlocked and most 5363 * likely gone, return value of 0, the TCP is still 5364 * locked. 5365 */ 5366 static int 5367 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 5368 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5369 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5370 { 5371 int32_t ret_val = 0; 5372 int32_t ourfinisacked = 0; 5373 5374 rack_calc_rwin(so, tp); 5375 5376 if ((thflags & TH_ACK) && 5377 (SEQ_LEQ(th->th_ack, tp->snd_una) || 5378 SEQ_GT(th->th_ack, tp->snd_max))) { 5379 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5380 return (1); 5381 } 5382 #ifdef TCP_RFC7413 5383 if (tp->t_flags & TF_FASTOPEN) { 5384 /* 5385 * When a TFO connection is in SYN_RECEIVED, the only valid 5386 * packets are the initial SYN, a retransmit/copy of the 5387 * initial SYN (possibly with a subset of the original 5388 * data), a valid ACK, a FIN, or a RST. 5389 */ 5390 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 5391 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5392 return (1); 5393 } else if (thflags & TH_SYN) { 5394 /* non-initial SYN is ignored */ 5395 struct tcp_rack *rack; 5396 5397 rack = (struct tcp_rack *)tp->t_fb_ptr; 5398 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 5399 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 5400 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 5401 rack_do_drop(m, NULL, ti_locked); 5402 return (0); 5403 } 5404 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 5405 rack_do_drop(m, NULL, ti_locked); 5406 return (0); 5407 } 5408 } 5409 #endif 5410 if (thflags & TH_RST) 5411 return (rack_process_rst(m, th, so, tp, ti_locked)); 5412 /* 5413 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5414 * synchronized state. 5415 */ 5416 if (thflags & TH_SYN) { 5417 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5418 return (ret_val); 5419 } 5420 /* 5421 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5422 * it's less than ts_recent, drop it. 
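 *
 * In sketch form (illustrative only, mirroring the test just below),
 * the "less than" here is the wrap-safe TSTMP_LT() comparison rather
 * than a plain integer compare:
 *
 *	paws_reject = (to->to_flags & TOF_TS) && tp->ts_recent &&
 *	    ((int32_t)(to->to_tsval - tp->ts_recent) < 0);
 *
 * so a timestamp clock that has wrapped modulo 2^32 is still handled.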
5423 */ 5424 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5425 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5426 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5427 return (ret_val); 5428 } 5429 /* 5430 * In the SYN-RECEIVED state, validate that the packet belongs to 5431 * this connection before trimming the data to fit the receive 5432 * window. Check the sequence number versus IRS since we know the 5433 * sequence numbers haven't wrapped. This is a partial fix for the 5434 * "LAND" DoS attack. 5435 */ 5436 if (SEQ_LT(th->th_seq, tp->irs)) { 5437 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5438 return (1); 5439 } 5440 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5441 return (ret_val); 5442 } 5443 /* 5444 * If last ACK falls within this segment's sequence numbers, record 5445 * its timestamp. NOTE: 1) That the test incorporates suggestions 5446 * from the latest proposal of the tcplw@cray.com list (Braden 5447 * 1993/04/26). 2) That updating only on newer timestamps interferes 5448 * with our earlier PAWS tests, so this check should be solely 5449 * predicated on the sequence space of this segment. 3) That we 5450 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5451 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5452 * SEG.Len, This modified check allows us to overcome RFC1323's 5453 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5454 * p.869. In such cases, we can still calculate the RTT correctly 5455 * when RCV.NXT == Last.ACK.Sent. 5456 */ 5457 if ((to->to_flags & TOF_TS) != 0 && 5458 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5459 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5460 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5461 tp->ts_recent_age = tcp_ts_getticks(); 5462 tp->ts_recent = to->to_tsval; 5463 } 5464 /* 5465 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5466 * is on (half-synchronized state), then queue data for later 5467 * processing; else drop segment and return. 5468 */ 5469 if ((thflags & TH_ACK) == 0) { 5470 #ifdef TCP_RFC7413 5471 if (tp->t_flags & TF_FASTOPEN) { 5472 tp->snd_wnd = tiwin; 5473 cc_conn_init(tp); 5474 } 5475 #endif 5476 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5477 ti_locked, tiwin, thflags, nxt_pkt)); 5478 } 5479 TCPSTAT_INC(tcps_connects); 5480 soisconnected(so); 5481 /* Do window scaling? */ 5482 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5483 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5484 tp->rcv_scale = tp->request_r_scale; 5485 tp->snd_wnd = tiwin; 5486 } 5487 /* 5488 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 5489 * FIN-WAIT-1 5490 */ 5491 tp->t_starttime = ticks; 5492 if (tp->t_flags & TF_NEEDFIN) { 5493 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5494 tp->t_flags &= ~TF_NEEDFIN; 5495 } else { 5496 tcp_state_change(tp, TCPS_ESTABLISHED); 5497 TCP_PROBE5(accept__established, NULL, tp, 5498 mtod(m, const char *), tp, th); 5499 #ifdef TCP_RFC7413 5500 if (tp->t_tfo_pending) { 5501 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 5502 tp->t_tfo_pending = NULL; 5503 5504 /* 5505 * Account for the ACK of our SYN prior to regular 5506 * ACK processing below. 5507 */ 5508 tp->snd_una++; 5509 } 5510 /* 5511 * TFO connections call cc_conn_init() during SYN 5512 * processing. Calling it again here for such connections 5513 * is not harmless as it would undo the snd_cwnd reduction 5514 * that occurs when a TFO SYN|ACK is retransmitted. 
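 *
 * Hence the guard just below: cc_conn_init() is only called when
 * TF_FASTOPEN is not set, so the reduced snd_cwnd of a retransmitted
 * TFO SYN|ACK is left intact.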
5515 */
5516 if (!(tp->t_flags & TF_FASTOPEN))
5517 #endif
5518 cc_conn_init(tp);
5519 }
5520 /*
5521 * If segment contains data or ACK, will call tcp_reass() later; if
5522 * not, do so now to pass queued data to user.
5523 */
5524 if (tlen == 0 && (thflags & TH_FIN) == 0)
5525 (void)tcp_reass(tp, (struct tcphdr *)0, 0,
5526 (struct mbuf *)0);
5527 tp->snd_wl1 = th->th_seq - 1;
5528 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5529 return (ret_val);
5530 }
5531 if (tp->t_state == TCPS_FIN_WAIT_1) {
5532 /* We could have gone to FIN_WAIT_1 (or EST) above */
5533 /*
5534 * In FIN_WAIT_1 STATE in addition to the processing for the
5535 * ESTABLISHED state if our FIN is now acknowledged then
5536 * enter FIN_WAIT_2.
5537 */
5538 if (ourfinisacked) {
5539 /*
5540 * If we can't receive any more data, then closing
5541 * user can proceed. Starting the timer is contrary
5542 * to the specification, but if we don't get a FIN
5543 * we'll hang forever.
5544 *
5545 * XXXjl: we should release the tp also, and use a
5546 * compressed state.
5547 */
5548 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5549 soisdisconnected(so);
5550 tcp_timer_activate(tp, TT_2MSL,
5551 (tcp_fast_finwait2_recycle ?
5552 tcp_finwait2_timeout :
5553 TP_MAXIDLE(tp)));
5554 }
5555 tcp_state_change(tp, TCPS_FIN_WAIT_2);
5556 }
5557 }
5558 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5559 ti_locked, tiwin, thflags, nxt_pkt));
5560 }
5561
5562 /*
5563 * Return value of 1, the TCB is unlocked and most
5564 * likely gone, return value of 0, the TCP is still
5565 * locked.
5566 */
5567 static int
5568 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
5569 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5570 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5571 {
5572 int32_t ret_val = 0;
5573
5574 /*
5575 * Header prediction: check for the two common cases of a
5576 * uni-directional data xfer. If the packet has no control flags,
5577 * is in-sequence, the window didn't change and we're not
5578 * retransmitting, it's a candidate. If the length is zero and the
5579 * ack moved forward, we're the sender side of the xfer. Just free
5580 * the data acked & wake any higher level process that was blocked
5581 * waiting for space. If the length is non-zero and the ack didn't
5582 * move, we're the receiver side. If we're getting packets in-order
5583 * (the reassembly queue is empty), add the data to the socket
5584 * buffer and note that we need a delayed ack. Make sure that the
5585 * hidden state-flags are also off. Since we check for
5586 * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN.
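 *
 * In sketch form (illustrative only), the fast-path gate below is:
 *
 *	if ((to->to_flags & TOF_SACK) == 0 &&
 *	    (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK &&
 *	    LIST_EMPTY(&tp->t_segq) &&
 *	    th->th_seq == tp->rcv_nxt)
 *		take the fast path (pure ack, or in-order new data);
 *
 * everything else falls through to the full slow-path checks.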
5587 */ 5588 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 5589 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 5590 __predict_true(LIST_EMPTY(&tp->t_segq)) && 5591 __predict_true(th->th_seq == tp->rcv_nxt)) { 5592 struct tcp_rack *rack; 5593 5594 rack = (struct tcp_rack *)tp->t_fb_ptr; 5595 if (tlen == 0) { 5596 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 5597 ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 5598 return (0); 5599 } 5600 } else { 5601 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 5602 ti_locked, tiwin, nxt_pkt)) { 5603 return (0); 5604 } 5605 } 5606 } 5607 rack_calc_rwin(so, tp); 5608 5609 if (thflags & TH_RST) 5610 return (rack_process_rst(m, th, so, tp, ti_locked)); 5611 5612 /* 5613 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5614 * synchronized state. 5615 */ 5616 if (thflags & TH_SYN) { 5617 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5618 return (ret_val); 5619 } 5620 /* 5621 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5622 * it's less than ts_recent, drop it. 5623 */ 5624 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5625 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5626 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5627 return (ret_val); 5628 } 5629 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5630 return (ret_val); 5631 } 5632 /* 5633 * If last ACK falls within this segment's sequence numbers, record 5634 * its timestamp. NOTE: 1) That the test incorporates suggestions 5635 * from the latest proposal of the tcplw@cray.com list (Braden 5636 * 1993/04/26). 2) That updating only on newer timestamps interferes 5637 * with our earlier PAWS tests, so this check should be solely 5638 * predicated on the sequence space of this segment. 3) That we 5639 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5640 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5641 * SEG.Len, This modified check allows us to overcome RFC1323's 5642 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5643 * p.869. In such cases, we can still calculate the RTT correctly 5644 * when RCV.NXT == Last.ACK.Sent. 5645 */ 5646 if ((to->to_flags & TOF_TS) != 0 && 5647 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5648 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5649 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5650 tp->ts_recent_age = tcp_ts_getticks(); 5651 tp->ts_recent = to->to_tsval; 5652 } 5653 /* 5654 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5655 * is on (half-synchronized state), then queue data for later 5656 * processing; else drop segment and return. 5657 */ 5658 if ((thflags & TH_ACK) == 0) { 5659 if (tp->t_flags & TF_NEEDSYN) { 5660 5661 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5662 ti_locked, tiwin, thflags, nxt_pkt)); 5663 5664 } else if (tp->t_flags & TF_ACKNOW) { 5665 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5666 return (ret_val); 5667 } else { 5668 rack_do_drop(m, NULL, ti_locked); 5669 return (0); 5670 } 5671 } 5672 /* 5673 * Ack processing. 
5674 */ 5675 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5676 return (ret_val); 5677 } 5678 if (sbavail(&so->so_snd)) { 5679 if (rack_progress_timeout_check(tp)) { 5680 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5681 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5682 return (1); 5683 } 5684 } 5685 /* State changes only happen in rack_process_data() */ 5686 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5687 ti_locked, tiwin, thflags, nxt_pkt)); 5688 } 5689 5690 /* 5691 * Return value of 1, the TCB is unlocked and most 5692 * likely gone, return value of 0, the TCP is still 5693 * locked. 5694 */ 5695 static int 5696 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 5697 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5698 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5699 { 5700 int32_t ret_val = 0; 5701 5702 rack_calc_rwin(so, tp); 5703 if (thflags & TH_RST) 5704 return (rack_process_rst(m, th, so, tp, ti_locked)); 5705 /* 5706 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5707 * synchronized state. 5708 */ 5709 if (thflags & TH_SYN) { 5710 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5711 return (ret_val); 5712 } 5713 /* 5714 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5715 * it's less than ts_recent, drop it. 5716 */ 5717 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5718 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5719 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5720 return (ret_val); 5721 } 5722 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5723 return (ret_val); 5724 } 5725 /* 5726 * If last ACK falls within this segment's sequence numbers, record 5727 * its timestamp. NOTE: 1) That the test incorporates suggestions 5728 * from the latest proposal of the tcplw@cray.com list (Braden 5729 * 1993/04/26). 2) That updating only on newer timestamps interferes 5730 * with our earlier PAWS tests, so this check should be solely 5731 * predicated on the sequence space of this segment. 3) That we 5732 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5733 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5734 * SEG.Len, This modified check allows us to overcome RFC1323's 5735 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5736 * p.869. In such cases, we can still calculate the RTT correctly 5737 * when RCV.NXT == Last.ACK.Sent. 5738 */ 5739 if ((to->to_flags & TOF_TS) != 0 && 5740 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5741 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5742 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5743 tp->ts_recent_age = tcp_ts_getticks(); 5744 tp->ts_recent = to->to_tsval; 5745 } 5746 /* 5747 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5748 * is on (half-synchronized state), then queue data for later 5749 * processing; else drop segment and return. 5750 */ 5751 if ((thflags & TH_ACK) == 0) { 5752 if (tp->t_flags & TF_NEEDSYN) { 5753 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5754 ti_locked, tiwin, thflags, nxt_pkt)); 5755 5756 } else if (tp->t_flags & TF_ACKNOW) { 5757 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5758 return (ret_val); 5759 } else { 5760 rack_do_drop(m, NULL, ti_locked); 5761 return (0); 5762 } 5763 } 5764 /* 5765 * Ack processing. 
5766 */ 5767 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5768 return (ret_val); 5769 } 5770 if (sbavail(&so->so_snd)) { 5771 if (rack_progress_timeout_check(tp)) { 5772 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5773 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5774 return (1); 5775 } 5776 } 5777 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5778 ti_locked, tiwin, thflags, nxt_pkt)); 5779 } 5780 5781 static int 5782 rack_check_data_after_close(struct mbuf *m, 5783 struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) 5784 { 5785 struct tcp_rack *rack; 5786 5787 KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 5788 "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); 5789 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5790 rack = (struct tcp_rack *)tp->t_fb_ptr; 5791 if (rack->rc_allow_data_af_clo == 0) { 5792 close_now: 5793 tp = tcp_close(tp); 5794 TCPSTAT_INC(tcps_rcvafterclose); 5795 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); 5796 return (1); 5797 } 5798 if (sbavail(&so->so_snd) == 0) 5799 goto close_now; 5800 /* Ok we allow data that is ignored and a followup reset */ 5801 tp->rcv_nxt = th->th_seq + *tlen; 5802 tp->t_flags2 |= TF2_DROP_AF_DATA; 5803 rack->r_wanted_output = 1; 5804 *tlen = 0; 5805 return (0); 5806 } 5807 5808 /* 5809 * Return value of 1, the TCB is unlocked and most 5810 * likely gone, return value of 0, the TCP is still 5811 * locked. 5812 */ 5813 static int 5814 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 5815 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5816 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5817 { 5818 int32_t ret_val = 0; 5819 int32_t ourfinisacked = 0; 5820 5821 rack_calc_rwin(so, tp); 5822 5823 if (thflags & TH_RST) 5824 return (rack_process_rst(m, th, so, tp, ti_locked)); 5825 /* 5826 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5827 * synchronized state. 5828 */ 5829 if (thflags & TH_SYN) { 5830 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5831 return (ret_val); 5832 } 5833 /* 5834 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5835 * it's less than ts_recent, drop it. 5836 */ 5837 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5838 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5839 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5840 return (ret_val); 5841 } 5842 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5843 return (ret_val); 5844 } 5845 /* 5846 * If new data are received on a connection after the user processes 5847 * are gone, then RST the other end. 5848 */ 5849 if ((so->so_state & SS_NOFDREF) && tlen) { 5850 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5851 return (1); 5852 } 5853 /* 5854 * If last ACK falls within this segment's sequence numbers, record 5855 * its timestamp. NOTE: 1) That the test incorporates suggestions 5856 * from the latest proposal of the tcplw@cray.com list (Braden 5857 * 1993/04/26). 2) That updating only on newer timestamps interferes 5858 * with our earlier PAWS tests, so this check should be solely 5859 * predicated on the sequence space of this segment. 
3) That we 5860 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5861 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5862 * SEG.Len, This modified check allows us to overcome RFC1323's 5863 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5864 * p.869. In such cases, we can still calculate the RTT correctly 5865 * when RCV.NXT == Last.ACK.Sent. 5866 */ 5867 if ((to->to_flags & TOF_TS) != 0 && 5868 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5869 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5870 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5871 tp->ts_recent_age = tcp_ts_getticks(); 5872 tp->ts_recent = to->to_tsval; 5873 } 5874 /* 5875 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5876 * is on (half-synchronized state), then queue data for later 5877 * processing; else drop segment and return. 5878 */ 5879 if ((thflags & TH_ACK) == 0) { 5880 if (tp->t_flags & TF_NEEDSYN) { 5881 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5882 ti_locked, tiwin, thflags, nxt_pkt)); 5883 } else if (tp->t_flags & TF_ACKNOW) { 5884 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5885 return (ret_val); 5886 } else { 5887 rack_do_drop(m, NULL, ti_locked); 5888 return (0); 5889 } 5890 } 5891 /* 5892 * Ack processing. 5893 */ 5894 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5895 return (ret_val); 5896 } 5897 if (ourfinisacked) { 5898 /* 5899 * If we can't receive any more data, then closing user can 5900 * proceed. Starting the timer is contrary to the 5901 * specification, but if we don't get a FIN we'll hang 5902 * forever. 5903 * 5904 * XXXjl: we should release the tp also, and use a 5905 * compressed state. 5906 */ 5907 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5908 soisdisconnected(so); 5909 tcp_timer_activate(tp, TT_2MSL, 5910 (tcp_fast_finwait2_recycle ? 5911 tcp_finwait2_timeout : 5912 TP_MAXIDLE(tp))); 5913 } 5914 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5915 } 5916 if (sbavail(&so->so_snd)) { 5917 if (rack_progress_timeout_check(tp)) { 5918 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5919 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5920 return (1); 5921 } 5922 } 5923 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5924 ti_locked, tiwin, thflags, nxt_pkt)); 5925 } 5926 5927 /* 5928 * Return value of 1, the TCB is unlocked and most 5929 * likely gone, return value of 0, the TCP is still 5930 * locked. 5931 */ 5932 static int 5933 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 5934 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5935 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5936 { 5937 int32_t ret_val = 0; 5938 int32_t ourfinisacked = 0; 5939 5940 rack_calc_rwin(so, tp); 5941 5942 if (thflags & TH_RST) 5943 return (rack_process_rst(m, th, so, tp, ti_locked)); 5944 /* 5945 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5946 * synchronized state. 5947 */ 5948 if (thflags & TH_SYN) { 5949 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5950 return (ret_val); 5951 } 5952 /* 5953 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5954 * it's less than ts_recent, drop it. 
5955 */ 5956 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5957 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5958 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5959 return (ret_val); 5960 } 5961 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5962 return (ret_val); 5963 } 5964 /* 5965 * If new data are received on a connection after the user processes 5966 * are gone, then RST the other end. 5967 */ 5968 if ((so->so_state & SS_NOFDREF) && tlen) { 5969 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5970 return (1); 5971 } 5972 /* 5973 * If last ACK falls within this segment's sequence numbers, record 5974 * its timestamp. NOTE: 1) That the test incorporates suggestions 5975 * from the latest proposal of the tcplw@cray.com list (Braden 5976 * 1993/04/26). 2) That updating only on newer timestamps interferes 5977 * with our earlier PAWS tests, so this check should be solely 5978 * predicated on the sequence space of this segment. 3) That we 5979 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5980 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5981 * SEG.Len, This modified check allows us to overcome RFC1323's 5982 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5983 * p.869. In such cases, we can still calculate the RTT correctly 5984 * when RCV.NXT == Last.ACK.Sent. 5985 */ 5986 if ((to->to_flags & TOF_TS) != 0 && 5987 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5988 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5989 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5990 tp->ts_recent_age = tcp_ts_getticks(); 5991 tp->ts_recent = to->to_tsval; 5992 } 5993 /* 5994 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5995 * is on (half-synchronized state), then queue data for later 5996 * processing; else drop segment and return. 5997 */ 5998 if ((thflags & TH_ACK) == 0) { 5999 if (tp->t_flags & TF_NEEDSYN) { 6000 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6001 ti_locked, tiwin, thflags, nxt_pkt)); 6002 } else if (tp->t_flags & TF_ACKNOW) { 6003 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6004 return (ret_val); 6005 } else { 6006 rack_do_drop(m, NULL, ti_locked); 6007 return (0); 6008 } 6009 } 6010 /* 6011 * Ack processing. 6012 */ 6013 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6014 return (ret_val); 6015 } 6016 if (ourfinisacked) { 6017 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6018 tcp_twstart(tp); 6019 INP_INFO_RUNLOCK(&V_tcbinfo); 6020 *ti_locked = TI_UNLOCKED; 6021 m_freem(m); 6022 return (1); 6023 } 6024 if (sbavail(&so->so_snd)) { 6025 if (rack_progress_timeout_check(tp)) { 6026 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6027 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6028 return (1); 6029 } 6030 } 6031 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6032 ti_locked, tiwin, thflags, nxt_pkt)); 6033 } 6034 6035 /* 6036 * Return value of 1, the TCB is unlocked and most 6037 * likely gone, return value of 0, the TCP is still 6038 * locked. 
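 *
 * Callers follow that convention, roughly (illustrative sketch):
 *
 *	retval = (*rack->r_substate)(m, th, so, tp, ...);
 *	if (retval == 0)
 *		INP_WLOCK_ASSERT(tp->t_inpcb);	<- tcb is still locked
 *	else
 *		do not touch tp again		<- tcb unlocked, may be gone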
6039 */ 6040 static int 6041 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6042 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6043 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6044 { 6045 int32_t ret_val = 0; 6046 int32_t ourfinisacked = 0; 6047 6048 rack_calc_rwin(so, tp); 6049 6050 if (thflags & TH_RST) 6051 return (rack_process_rst(m, th, so, tp, ti_locked)); 6052 /* 6053 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6054 * synchronized state. 6055 */ 6056 if (thflags & TH_SYN) { 6057 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6058 return (ret_val); 6059 } 6060 /* 6061 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6062 * it's less than ts_recent, drop it. 6063 */ 6064 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6065 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6066 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6067 return (ret_val); 6068 } 6069 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6070 return (ret_val); 6071 } 6072 /* 6073 * If new data are received on a connection after the user processes 6074 * are gone, then RST the other end. 6075 */ 6076 if ((so->so_state & SS_NOFDREF) && tlen) { 6077 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6078 return (1); 6079 } 6080 /* 6081 * If last ACK falls within this segment's sequence numbers, record 6082 * its timestamp. NOTE: 1) That the test incorporates suggestions 6083 * from the latest proposal of the tcplw@cray.com list (Braden 6084 * 1993/04/26). 2) That updating only on newer timestamps interferes 6085 * with our earlier PAWS tests, so this check should be solely 6086 * predicated on the sequence space of this segment. 3) That we 6087 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6088 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6089 * SEG.Len, This modified check allows us to overcome RFC1323's 6090 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6091 * p.869. In such cases, we can still calculate the RTT correctly 6092 * when RCV.NXT == Last.ACK.Sent. 6093 */ 6094 if ((to->to_flags & TOF_TS) != 0 && 6095 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6096 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6097 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6098 tp->ts_recent_age = tcp_ts_getticks(); 6099 tp->ts_recent = to->to_tsval; 6100 } 6101 /* 6102 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6103 * is on (half-synchronized state), then queue data for later 6104 * processing; else drop segment and return. 6105 */ 6106 if ((thflags & TH_ACK) == 0) { 6107 if (tp->t_flags & TF_NEEDSYN) { 6108 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6109 ti_locked, tiwin, thflags, nxt_pkt)); 6110 } else if (tp->t_flags & TF_ACKNOW) { 6111 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6112 return (ret_val); 6113 } else { 6114 rack_do_drop(m, NULL, ti_locked); 6115 return (0); 6116 } 6117 } 6118 /* 6119 * case TCPS_LAST_ACK: Ack processing. 
6120 */ 6121 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6122 return (ret_val); 6123 } 6124 if (ourfinisacked) { 6125 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6126 tp = tcp_close(tp); 6127 rack_do_drop(m, tp, ti_locked); 6128 return (1); 6129 } 6130 if (sbavail(&so->so_snd)) { 6131 if (rack_progress_timeout_check(tp)) { 6132 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6133 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6134 return (1); 6135 } 6136 } 6137 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6138 ti_locked, tiwin, thflags, nxt_pkt)); 6139 } 6140 6141 6142 /* 6143 * Return value of 1, the TCB is unlocked and most 6144 * likely gone, return value of 0, the TCP is still 6145 * locked. 6146 */ 6147 static int 6148 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 6149 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6150 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6151 { 6152 int32_t ret_val = 0; 6153 int32_t ourfinisacked = 0; 6154 6155 rack_calc_rwin(so, tp); 6156 6157 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 6158 if (thflags & TH_RST) 6159 return (rack_process_rst(m, th, so, tp, ti_locked)); 6160 /* 6161 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6162 * synchronized state. 6163 */ 6164 if (thflags & TH_SYN) { 6165 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6166 return (ret_val); 6167 } 6168 /* 6169 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6170 * it's less than ts_recent, drop it. 6171 */ 6172 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6173 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6174 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6175 return (ret_val); 6176 } 6177 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6178 return (ret_val); 6179 } 6180 /* 6181 * If new data are received on a connection after the user processes 6182 * are gone, then RST the other end. 6183 */ 6184 if ((so->so_state & SS_NOFDREF) && 6185 tlen) { 6186 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6187 return (1); 6188 } 6189 /* 6190 * If last ACK falls within this segment's sequence numbers, record 6191 * its timestamp. NOTE: 1) That the test incorporates suggestions 6192 * from the latest proposal of the tcplw@cray.com list (Braden 6193 * 1993/04/26). 2) That updating only on newer timestamps interferes 6194 * with our earlier PAWS tests, so this check should be solely 6195 * predicated on the sequence space of this segment. 3) That we 6196 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6197 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6198 * SEG.Len, This modified check allows us to overcome RFC1323's 6199 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6200 * p.869. In such cases, we can still calculate the RTT correctly 6201 * when RCV.NXT == Last.ACK.Sent. 6202 */ 6203 if ((to->to_flags & TOF_TS) != 0 && 6204 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6205 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6206 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6207 tp->ts_recent_age = tcp_ts_getticks(); 6208 tp->ts_recent = to->to_tsval; 6209 } 6210 /* 6211 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6212 * is on (half-synchronized state), then queue data for later 6213 * processing; else drop segment and return. 
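 *
 * Concretely (illustrative summary of the branch below):
 *	TF_NEEDSYN set	-> keep the segment, fall into rack_process_data()
 *	TF_ACKNOW set	-> rack_do_dropafterack(), forcing an ACK out
 *	otherwise	-> rack_do_drop(), the segment is discarded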
6214 */
6215 if ((thflags & TH_ACK) == 0) {
6216 if (tp->t_flags & TF_NEEDSYN) {
6217 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6218 ti_locked, tiwin, thflags, nxt_pkt));
6219 } else if (tp->t_flags & TF_ACKNOW) {
6220 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val);
6221 return (ret_val);
6222 } else {
6223 rack_do_drop(m, NULL, ti_locked);
6224 return (0);
6225 }
6226 }
6227 /*
6228 * Ack processing.
6229 */
6230 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6231 return (ret_val);
6232 }
6233 if (sbavail(&so->so_snd)) {
6234 if (rack_progress_timeout_check(tp)) {
6235 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6236 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen);
6237 return (1);
6238 }
6239 }
6240 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6241 ti_locked, tiwin, thflags, nxt_pkt));
6242 }
6243
6244
6245 static void inline
6246 rack_clear_rate_sample(struct tcp_rack *rack)
6247 {
6248 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
6249 rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
6250 rack->r_ctl.rack_rs.rs_rtt_tot = 0;
6251 }
6252
6253 static int
6254 rack_init(struct tcpcb *tp)
6255 {
6256 struct tcp_rack *rack = NULL;
6257
6258 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
6259 if (tp->t_fb_ptr == NULL) {
6260 /*
6261 * We need to allocate memory but can't. The INP and INP_INFO
6262 * locks are held and they are recursive (this happens during
6263 * setup), so a scheme to drop the locks fails.
6264 *
6265 */
6266 return (ENOMEM);
6267 }
6268 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
6269
6270 rack = (struct tcp_rack *)tp->t_fb_ptr;
6271 TAILQ_INIT(&rack->r_ctl.rc_map);
6272 TAILQ_INIT(&rack->r_ctl.rc_free);
6273 TAILQ_INIT(&rack->r_ctl.rc_tmap);
6274 rack->rc_tp = tp;
6275 if (tp->t_inpcb) {
6276 rack->rc_inp = tp->t_inpcb;
6277 }
6278 /* Probably not needed but let's be sure */
6279 rack_clear_rate_sample(rack);
6280 rack->r_cpu = 0;
6281 rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
6282 rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
6283 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
6284 rack->rc_pace_reduce = rack_slot_reduction;
6285 if (V_tcp_delack_enabled)
6286 tp->t_delayed_ack = 1;
6287 else
6288 tp->t_delayed_ack = 0;
6289 rack->rc_pace_max_segs = rack_hptsi_segments;
6290 rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
6291 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
6292 rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
6293 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
6294 rack->r_idle_reduce_largest = rack_reduce_largest_on_idle;
6295 rack->r_enforce_min_pace = rack_min_pace_time;
6296 rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
6297 rack->r_ctl.rc_prop_rate = rack_proportional_rate;
6298 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
6299 rack->r_ctl.rc_early_recovery = rack_early_recovery;
6300 rack->rc_always_pace = rack_pace_every_seg;
6301 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
6302 rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
6303 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
6304 rack->r_ctl.rc_min_to = rack_min_to;
6305 rack->r_ctl.rc_prr_inc_var = rack_inc_var;
6306 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6307 if (tp->snd_una != tp->snd_max) {
6308 /* Create a send map for the current outstanding data */
6309 struct rack_sendmap *rsm;
6310
6311 rsm = rack_alloc(rack);
6312 if (rsm == NULL) {
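/*
 * No memory for the send map entry: undo the pcb-zone allocation
 * above and decline the switch to the RACK stack.
 */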
6313 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6314 tp->t_fb_ptr = NULL; 6315 return (ENOMEM); 6316 } 6317 rsm->r_flags = RACK_OVERMAX; 6318 rsm->r_tim_lastsent[0] = tcp_ts_getticks(); 6319 rsm->r_rtr_cnt = 1; 6320 rsm->r_rtr_bytes = 0; 6321 rsm->r_start = tp->snd_una; 6322 rsm->r_end = tp->snd_max; 6323 rsm->r_sndcnt = 0; 6324 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 6325 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6326 rsm->r_in_tmap = 1; 6327 } 6328 return (0); 6329 } 6330 6331 static int 6332 rack_handoff_ok(struct tcpcb *tp) 6333 { 6334 if ((tp->t_state == TCPS_CLOSED) || 6335 (tp->t_state == TCPS_LISTEN)) { 6336 /* Sure no problem though it may not stick */ 6337 return (0); 6338 } 6339 if ((tp->t_state == TCPS_SYN_SENT) || 6340 (tp->t_state == TCPS_SYN_RECEIVED)) { 6341 /* 6342 * We really don't know you have to get to ESTAB or beyond 6343 * to tell. 6344 */ 6345 return (EAGAIN); 6346 } 6347 if (tp->t_flags & TF_SACK_PERMIT) { 6348 return (0); 6349 } 6350 /* 6351 * If we reach here we don't do SACK on this connection so we can 6352 * never do rack. 6353 */ 6354 return (EINVAL); 6355 } 6356 6357 static void 6358 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 6359 { 6360 if (tp->t_fb_ptr) { 6361 struct tcp_rack *rack; 6362 struct rack_sendmap *rsm; 6363 6364 rack = (struct tcp_rack *)tp->t_fb_ptr; 6365 #ifdef TCP_BLACKBOX 6366 tcp_log_flowend(tp); 6367 #endif 6368 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6369 while (rsm) { 6370 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 6371 uma_zfree(rack_zone, rsm); 6372 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6373 } 6374 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6375 while (rsm) { 6376 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 6377 uma_zfree(rack_zone, rsm); 6378 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6379 } 6380 rack->rc_free_cnt = 0; 6381 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6382 tp->t_fb_ptr = NULL; 6383 } 6384 } 6385 6386 static void 6387 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 6388 { 6389 switch (tp->t_state) { 6390 case TCPS_SYN_SENT: 6391 rack->r_state = TCPS_SYN_SENT; 6392 rack->r_substate = rack_do_syn_sent; 6393 break; 6394 case TCPS_SYN_RECEIVED: 6395 rack->r_state = TCPS_SYN_RECEIVED; 6396 rack->r_substate = rack_do_syn_recv; 6397 break; 6398 case TCPS_ESTABLISHED: 6399 rack->r_state = TCPS_ESTABLISHED; 6400 rack->r_substate = rack_do_established; 6401 break; 6402 case TCPS_CLOSE_WAIT: 6403 rack->r_state = TCPS_CLOSE_WAIT; 6404 rack->r_substate = rack_do_close_wait; 6405 break; 6406 case TCPS_FIN_WAIT_1: 6407 rack->r_state = TCPS_FIN_WAIT_1; 6408 rack->r_substate = rack_do_fin_wait_1; 6409 break; 6410 case TCPS_CLOSING: 6411 rack->r_state = TCPS_CLOSING; 6412 rack->r_substate = rack_do_closing; 6413 break; 6414 case TCPS_LAST_ACK: 6415 rack->r_state = TCPS_LAST_ACK; 6416 rack->r_substate = rack_do_lastack; 6417 break; 6418 case TCPS_FIN_WAIT_2: 6419 rack->r_state = TCPS_FIN_WAIT_2; 6420 rack->r_substate = rack_do_fin_wait_2; 6421 break; 6422 case TCPS_LISTEN: 6423 case TCPS_CLOSED: 6424 case TCPS_TIME_WAIT: 6425 default: 6426 #ifdef INVARIANTS 6427 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); 6428 #endif 6429 break; 6430 }; 6431 } 6432 6433 6434 static void 6435 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 6436 { 6437 /* 6438 * We received an ack, and then did not 6439 * call send or were bounced out due to the 6440 * hpts was running. Now a timer is up as well, is 6441 * it the right timer? 
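 *
 * Roughly (illustrative summary of the checks below), the timer we
 * expect to find running is:
 *	in persist			-> the persist timer
 *	nothing outstanding or < ESTAB	-> the retransmit (RXT) timer
 *	delayed ack pending		-> the delayed-ack timer
 *	idle with keepalive enabled	-> the keepalive timer
 *	oldest rsm was SACK-passed	-> the RACK timer (or RXT for a FIN)
 *	data out, nothing SACK-passed	-> a TLP or RXT timer
 * Anything else gets cancelled and the hpts timer is restarted.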
6442 */ 6443 struct rack_sendmap *rsm; 6444 int tmr_up; 6445 6446 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 6447 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 6448 return; 6449 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6450 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 6451 (tmr_up == PACE_TMR_RXT)) { 6452 /* Should be an RXT */ 6453 return; 6454 } 6455 if (rsm == NULL) { 6456 /* Nothing outstanding? */ 6457 if (tp->t_flags & TF_DELACK) { 6458 if (tmr_up == PACE_TMR_DELACK) 6459 /* We are supposed to have delayed ack up and we do */ 6460 return; 6461 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 6462 /* 6463 * if we hit enobufs then we would expect the possiblity 6464 * of nothing outstanding and the RXT up (and the hptsi timer). 6465 */ 6466 return; 6467 } else if (((tcp_always_keepalive || 6468 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6469 (tp->t_state <= TCPS_CLOSING)) && 6470 (tmr_up == PACE_TMR_KEEP) && 6471 (tp->snd_max == tp->snd_una)) { 6472 /* We should have keep alive up and we do */ 6473 return; 6474 } 6475 } 6476 if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { 6477 if ((tp->t_flags & TF_SENTFIN) && 6478 ((tp->snd_max - tp->snd_una) == 1) && 6479 (rsm->r_flags & RACK_HAS_FIN)) { 6480 /* needs to be a RXT */ 6481 if (tmr_up == PACE_TMR_RXT) 6482 return; 6483 } else if (tmr_up == PACE_TMR_RACK) 6484 return; 6485 } else if (SEQ_GT(tp->snd_max,tp->snd_una) && 6486 ((tmr_up == PACE_TMR_TLP) || 6487 (tmr_up == PACE_TMR_RXT))) { 6488 /* 6489 * Either a TLP or RXT is fine if no sack-passed 6490 * is in place and data is outstanding. 6491 */ 6492 return; 6493 } else if (tmr_up == PACE_TMR_DELACK) { 6494 /* 6495 * If the delayed ack was going to go off 6496 * before the rtx/tlp/rack timer were going to 6497 * expire, then that would be the timer in control. 6498 * Note we don't check the time here trusting the 6499 * code is correct. 6500 */ 6501 return; 6502 } 6503 /* 6504 * Ok the timer originally started is not what we want now. 6505 * We will force the hpts to be stopped if any, and restart 6506 * with the slot set to what was in the saved slot. 6507 */ 6508 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6509 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6510 } 6511 6512 static void 6513 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6514 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6515 int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv) 6516 { 6517 int32_t thflags, retval, did_out = 0; 6518 int32_t way_out = 0; 6519 uint32_t cts; 6520 uint32_t tiwin; 6521 struct tcpopt to; 6522 struct tcp_rack *rack; 6523 struct rack_sendmap *rsm; 6524 int32_t prev_state = 0; 6525 6526 cts = tcp_tv_to_mssectick(tv); 6527 rack = (struct tcp_rack *)tp->t_fb_ptr; 6528 6529 kern_prefetch(rack, &prev_state); 6530 prev_state = 0; 6531 thflags = th->th_flags; 6532 /* 6533 * If this is either a state-changing packet or current state isn't 6534 * established, we require a read lock on tcbinfo. Otherwise, we 6535 * allow the tcbinfo to be in either locked or unlocked, as the 6536 * caller may have unnecessarily acquired a lock due to a race. 
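 *
 * In other words (illustrative summary of the assertions below):
 *	SYN, FIN or RST set, or state != ESTABLISHED
 *		-> ti_locked must be TI_RLOCKED
 *	plain ESTABLISHED traffic
 *		-> either TI_RLOCKED or TI_UNLOCKED is acceptable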
6537 */ 6538 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 6539 tp->t_state != TCPS_ESTABLISHED) { 6540 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 6541 "SYN/FIN/RST/!EST", __func__, ti_locked)); 6542 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6543 } else { 6544 #ifdef INVARIANTS 6545 if (ti_locked == TI_RLOCKED) { 6546 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6547 } else { 6548 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 6549 "ti_locked: %d", __func__, ti_locked)); 6550 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 6551 } 6552 #endif 6553 } 6554 INP_WLOCK_ASSERT(tp->t_inpcb); 6555 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 6556 __func__)); 6557 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 6558 __func__)); 6559 { 6560 union tcp_log_stackspecific log; 6561 6562 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6563 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 6564 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 6565 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 6566 tlen, &log, true); 6567 } 6568 /* 6569 * Segment received on connection. Reset idle time and keep-alive 6570 * timer. XXX: This should be done after segment validation to 6571 * ignore broken/spoofed segs. 6572 */ 6573 if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { 6574 #ifdef NETFLIX_CWV 6575 if ((tp->cwv_enabled) && 6576 ((tp->cwv_cwnd_valid == 0) && 6577 TCPS_HAVEESTABLISHED(tp->t_state) && 6578 (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { 6579 tcp_newcwv_nvp_closedown(tp); 6580 } else 6581 #endif 6582 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 6583 counter_u64_add(rack_input_idle_reduces, 1); 6584 rack_cc_after_idle(tp, 6585 (rack->r_idle_reduce_largest ? 1 :0)); 6586 } 6587 } 6588 rack->r_ctl.rc_rcvtime = cts; 6589 tp->t_rcvtime = ticks; 6590 6591 #ifdef NETFLIX_CWV 6592 if (tp->cwv_enabled) { 6593 if ((tp->cwv_cwnd_valid == 0) && 6594 TCPS_HAVEESTABLISHED(tp->t_state) && 6595 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 6596 tcp_newcwv_nvp_closedown(tp); 6597 } 6598 #endif 6599 /* 6600 * Unscale the window into a 32-bit value. For the SYN_SENT state 6601 * the scale is zero. 6602 */ 6603 tiwin = th->th_win << tp->snd_scale; 6604 #ifdef NETFLIX_STATS 6605 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 6606 #endif 6607 /* 6608 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 6609 * this to occur after we've validated the segment. 6610 */ 6611 if (tp->t_flags & TF_ECN_PERMIT) { 6612 if (thflags & TH_CWR) 6613 tp->t_flags &= ~TF_ECN_SND_ECE; 6614 switch (iptos & IPTOS_ECN_MASK) { 6615 case IPTOS_ECN_CE: 6616 tp->t_flags |= TF_ECN_SND_ECE; 6617 TCPSTAT_INC(tcps_ecn_ce); 6618 break; 6619 case IPTOS_ECN_ECT0: 6620 TCPSTAT_INC(tcps_ecn_ect0); 6621 break; 6622 case IPTOS_ECN_ECT1: 6623 TCPSTAT_INC(tcps_ecn_ect1); 6624 break; 6625 } 6626 /* Congestion experienced. */ 6627 if (thflags & TH_ECE) { 6628 rack_cong_signal(tp, th, CC_ECN); 6629 } 6630 } 6631 /* 6632 * Parse options on any incoming segment. 6633 */ 6634 tcp_dooptions(&to, (u_char *)(th + 1), 6635 (th->th_off << 2) - sizeof(struct tcphdr), 6636 (thflags & TH_SYN) ? TO_SYN : 0); 6637 6638 /* 6639 * If echoed timestamp is later than the current time, fall back to 6640 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 6641 * were used when this connection was established. 
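 *
 * Sketch of the normalization performed below (illustrative only):
 *
 *	if ((to.to_flags & TOF_TS) && to.to_tsecr != 0) {
 *		to.to_tsecr -= tp->ts_offset;	<- undo syncookie offset
 *		if (TSTMP_GT(to.to_tsecr, cts))
 *			to.to_tsecr = 0;	<- echo from the future, take no RTT sample
 *	}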
6642 */ 6643 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 6644 to.to_tsecr -= tp->ts_offset; 6645 if (TSTMP_GT(to.to_tsecr, cts)) 6646 to.to_tsecr = 0; 6647 } 6648 /* 6649 * If its the first time in we need to take care of options and 6650 * verify we can do SACK for rack! 6651 */ 6652 if (rack->r_state == 0) { 6653 /* Should be init'd by rack_init() */ 6654 KASSERT(rack->rc_inp != NULL, 6655 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 6656 if (rack->rc_inp == NULL) { 6657 rack->rc_inp = tp->t_inpcb; 6658 } 6659 6660 /* 6661 * Process options only when we get SYN/ACK back. The SYN 6662 * case for incoming connections is handled in tcp_syncache. 6663 * According to RFC1323 the window field in a SYN (i.e., a 6664 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 6665 * this is traditional behavior, may need to be cleaned up. 6666 */ 6667 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 6668 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 6669 if ((to.to_flags & TOF_SCALE) && 6670 (tp->t_flags & TF_REQ_SCALE)) { 6671 tp->t_flags |= TF_RCVD_SCALE; 6672 tp->snd_scale = to.to_wscale; 6673 } 6674 /* 6675 * Initial send window. It will be updated with the 6676 * next incoming segment to the scaled value. 6677 */ 6678 tp->snd_wnd = th->th_win; 6679 if (to.to_flags & TOF_TS) { 6680 tp->t_flags |= TF_RCVD_TSTMP; 6681 tp->ts_recent = to.to_tsval; 6682 tp->ts_recent_age = cts; 6683 } 6684 if (to.to_flags & TOF_MSS) 6685 tcp_mss(tp, to.to_mss); 6686 if ((tp->t_flags & TF_SACK_PERMIT) && 6687 (to.to_flags & TOF_SACKPERM) == 0) 6688 tp->t_flags &= ~TF_SACK_PERMIT; 6689 } 6690 /* 6691 * At this point we are at the initial call. Here we decide 6692 * if we are doing RACK or not. We do this by seeing if 6693 * TF_SACK_PERMIT is set, if not rack is *not* possible and 6694 * we switch to the default code. 6695 */ 6696 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 6697 tcp_switch_back_to_default(tp); 6698 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 6699 tlen, iptos, ti_locked); 6700 return; 6701 } 6702 /* Set the flag */ 6703 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 6704 tcp_set_hpts(tp->t_inpcb); 6705 rack_stop_all_timers(tp); 6706 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 6707 } 6708 /* 6709 * This is the one exception case where we set the rack state 6710 * always. All other times (timers etc) we must have a rack-state 6711 * set (so we assure we have done the checks above for SACK). 6712 */ 6713 if (rack->r_state != tp->t_state) 6714 rack_set_state(tp, rack); 6715 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) 6716 kern_prefetch(rsm, &prev_state); 6717 prev_state = rack->r_state; 6718 rack->r_ctl.rc_tlp_send_cnt = 0; 6719 rack_clear_rate_sample(rack); 6720 retval = (*rack->r_substate) (m, th, so, 6721 tp, &to, drop_hdrlen, 6722 tlen, &ti_locked, tiwin, thflags, nxt_pkt); 6723 #ifdef INVARIANTS 6724 if ((retval == 0) && 6725 (tp->t_inpcb == NULL)) { 6726 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 6727 retval, tp, prev_state); 6728 } 6729 #endif 6730 if (ti_locked != TI_UNLOCKED) { 6731 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6732 INP_INFO_RUNLOCK(&V_tcbinfo); 6733 ti_locked = TI_UNLOCKED; 6734 } 6735 if (retval == 0) { 6736 /* 6737 * If retval is 1 the tcb is unlocked and most likely the tp 6738 * is gone. 
6739 */ 6740 INP_WLOCK_ASSERT(tp->t_inpcb); 6741 tcp_rack_xmit_timer_commit(rack, tp); 6742 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && 6743 (rack->rc_in_persist == 0)){ 6744 /* 6745 * The peer shrunk its window on us to the point 6746 * where we have sent too much. The only thing 6747 * we can do here is stop any timers and 6748 * enter persist. We most likely lost the last 6749 * bytes we sent but oh well, we will have to 6750 * retransmit them after the peer is caught up. 6751 */ 6752 if (rack->rc_inp->inp_in_hpts) 6753 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6754 rack_timer_cancel(tp, rack, cts, __LINE__); 6755 rack_enter_persist(tp, rack, cts); 6756 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6757 way_out = 3; 6758 goto done_with_input; 6759 } 6760 if (nxt_pkt == 0) { 6761 if (rack->r_wanted_output != 0) { 6762 did_out = 1; 6763 (void)tp->t_fb->tfb_tcp_output(tp); 6764 } 6765 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 6766 } 6767 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 6768 (SEQ_GT(tp->snd_max, tp->snd_una) || 6769 (tp->t_flags & TF_DELACK) || 6770 ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6771 (tp->t_state <= TCPS_CLOSING)))) { 6772 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 6773 if ((tp->snd_max == tp->snd_una) && 6774 ((tp->t_flags & TF_DELACK) == 0) && 6775 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 6776 /* keep alive not needed if we are hptsi output yet */ 6777 ; 6778 } else { 6779 if (rack->rc_inp->inp_in_hpts) 6780 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6781 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6782 } 6783 way_out = 1; 6784 } else { 6785 /* Do we have the correct timer running? */ 6786 rack_timer_audit(tp, rack, &so->so_snd); 6787 way_out = 2; 6788 } 6789 done_with_input: 6790 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 6791 if (did_out) 6792 rack->r_wanted_output = 0; 6793 #ifdef INVARIANTS 6794 if (tp->t_inpcb == NULL) { 6795 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 6796 did_out, 6797 retval, tp, prev_state); 6798 } 6799 #endif 6800 INP_WUNLOCK(tp->t_inpcb); 6801 } 6802 } 6803 6804 void 6805 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6806 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6807 int32_t ti_locked) 6808 { 6809 struct timeval tv; 6810 #ifdef RSS 6811 struct tcp_function_block *tfb; 6812 struct tcp_rack *rack; 6813 struct inpcb *inp; 6814 6815 rack = (struct tcp_rack *)tp->t_fb_ptr; 6816 if (rack->r_state == 0) { 6817 /* 6818 * Initial input (ACK to SYN-ACK etc)lets go ahead and get 6819 * it processed 6820 */ 6821 if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo)) 6822 ti_locked = TI_RLOCKED; 6823 if (ti_locked != TI_RLOCKED) { 6824 inp = tp->t_inpcb; 6825 tfb = tp->t_fb; 6826 in_pcbref(inp); 6827 INP_WUNLOCK(inp); 6828 INP_INFO_RLOCK(&V_tcbinfo); 6829 ti_locked = TI_RLOCKED; 6830 INP_WLOCK(inp); 6831 if (in_pcbrele_wlocked(inp)) 6832 inp = NULL; 6833 if (inp == NULL || (inp->inp_flags2 & INP_FREED) || 6834 (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { 6835 /* The TCPCB went away. Free the packet. */ 6836 INP_INFO_RUNLOCK(&V_tcbinfo); 6837 if (inp) 6838 INP_WUNLOCK(inp); 6839 m_freem(m); 6840 return; 6841 } 6842 /* If the stack changed, call the correct stack. 
*/ 6843 if (tp->t_fb != tfb) { 6844 tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, 6845 drop_hdrlen, tlen, iptos, ti_locked); 6846 return; 6847 } 6848 } 6849 tcp_get_usecs(&tv); 6850 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6851 tlen, iptos, ti_locked, 0, &tv); 6852 return; 6853 } 6854 if (ti_locked == TI_RLOCKED) 6855 INP_INFO_RUNLOCK(&V_tcbinfo); 6856 tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked); 6857 INP_WUNLOCK(tp->t_inpcb); 6858 #else 6859 tcp_get_usecs(&tv); 6860 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6861 tlen, iptos, ti_locked, 0, &tv); 6862 #endif 6863 } 6864 6865 struct rack_sendmap * 6866 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 6867 { 6868 struct rack_sendmap *rsm = NULL; 6869 int32_t idx; 6870 uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; 6871 6872 /* Return the next guy to be re-transmitted */ 6873 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 6874 return (NULL); 6875 } 6876 if (tp->t_flags & TF_SENTFIN) { 6877 /* retran the end FIN? */ 6878 return (NULL); 6879 } 6880 /* ok lets look at this one */ 6881 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6882 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 6883 goto check_it; 6884 } 6885 rsm = rack_find_lowest_rsm(rack); 6886 if (rsm == NULL) { 6887 return (NULL); 6888 } 6889 check_it: 6890 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 6891 srtt = TICKS_2_MSEC(srtt_cur); 6892 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 6893 srtt = rack->rc_rack_rtt; 6894 if (rsm->r_flags & RACK_ACKED) { 6895 return (NULL); 6896 } 6897 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 6898 /* Its not yet ready */ 6899 return (NULL); 6900 } 6901 idx = rsm->r_rtr_cnt - 1; 6902 ts_low = rsm->r_tim_lastsent[idx]; 6903 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6904 if (tsused <= ts_low) { 6905 return (NULL); 6906 } 6907 if ((tsused - ts_low) >= thresh) { 6908 return (rsm); 6909 } 6910 return (NULL); 6911 } 6912 6913 static int 6914 rack_output(struct tcpcb *tp) 6915 { 6916 struct socket *so; 6917 uint32_t recwin, sendwin; 6918 uint32_t sb_offset; 6919 int32_t len, flags, error = 0; 6920 struct mbuf *m; 6921 struct mbuf *mb; 6922 uint32_t if_hw_tsomaxsegcount = 0; 6923 uint32_t if_hw_tsomaxsegsize; 6924 long tot_len_this_send = 0; 6925 struct ip *ip = NULL; 6926 #ifdef TCPDEBUG 6927 struct ipovly *ipov = NULL; 6928 #endif 6929 struct udphdr *udp = NULL; 6930 struct tcp_rack *rack; 6931 struct tcphdr *th; 6932 uint8_t pass = 0; 6933 u_char opt[TCP_MAXOLEN]; 6934 unsigned ipoptlen, optlen, hdrlen, ulen=0; 6935 uint32_t rack_seq; 6936 6937 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 6938 unsigned ipsec_optlen = 0; 6939 6940 #endif 6941 int32_t idle, sendalot; 6942 int32_t sub_from_prr = 0; 6943 volatile int32_t sack_rxmit; 6944 struct rack_sendmap *rsm = NULL; 6945 int32_t tso, mtu, would_have_fin = 0; 6946 struct tcpopt to; 6947 int32_t slot = 0; 6948 uint32_t cts; 6949 uint8_t hpts_calling, doing_tlp = 0; 6950 int32_t do_a_prefetch; 6951 int32_t prefetch_rsm = 0; 6952 int32_t prefetch_so_done = 0; 6953 struct tcp_log_buffer *lgb = NULL; 6954 struct inpcb *inp; 6955 struct sockbuf *sb; 6956 #ifdef INET6 6957 struct ip6_hdr *ip6 = NULL; 6958 int32_t isipv6; 6959 #endif 6960 /* setup and take the cache hits here */ 6961 rack = (struct tcp_rack *)tp->t_fb_ptr; 6962 inp = rack->rc_inp; 6963 so = inp->inp_socket; 6964 sb = &so->so_snd; 6965 kern_prefetch(sb, &do_a_prefetch); 6966 do_a_prefetch = 1; 6967 6968 INP_WLOCK_ASSERT(inp); 6969 #ifdef TCP_OFFLOAD 6970 if (tp->t_flags & 
TF_TOE) 6971 return (tcp_offload_output(tp)); 6972 #endif 6973 6974 #ifdef TCP_RFC7413 6975 /* 6976 * For TFO connections in SYN_RECEIVED, only allow the initial 6977 * SYN|ACK and those sent by the retransmit timer. 6978 */ 6979 if ((tp->t_flags & TF_FASTOPEN) && 6980 (tp->t_state == TCPS_SYN_RECEIVED) && 6981 SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ 6982 (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ 6983 return (0); 6984 #endif 6985 #ifdef INET6 6986 if (rack->r_state) { 6987 /* Use the cache line loaded if possible */ 6988 isipv6 = rack->r_is_v6; 6989 } else { 6990 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 6991 } 6992 #endif 6993 cts = tcp_ts_getticks(); 6994 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 6995 inp->inp_in_hpts) { 6996 /* 6997 * We are on the hpts for some timer but not hptsi output. 6998 * Remove from the hpts unconditionally. 6999 */ 7000 rack_timer_cancel(tp, rack, cts, __LINE__); 7001 } 7002 /* Mark that we have called rack_output(). */ 7003 if ((rack->r_timer_override) || 7004 (tp->t_flags & TF_FORCEDATA) || 7005 (tp->t_state < TCPS_ESTABLISHED)) { 7006 if (tp->t_inpcb->inp_in_hpts) 7007 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 7008 } else if (tp->t_inpcb->inp_in_hpts) { 7009 /* 7010 * On the hpts you can't pass even if ACKNOW is on, we will 7011 * when the hpts fires. 7012 */ 7013 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 7014 return (0); 7015 } 7016 hpts_calling = inp->inp_hpts_calls; 7017 inp->inp_hpts_calls = 0; 7018 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7019 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 7020 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 7021 return (0); 7022 } 7023 } 7024 rack->r_wanted_output = 0; 7025 rack->r_timer_override = 0; 7026 /* 7027 * Determine length of data that should be transmitted, and flags 7028 * that will be used. If there is some data or critical controls 7029 * (SYN, RST) to send, then transmit; otherwise, investigate 7030 * further. 7031 */ 7032 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 7033 #ifdef NETFLIX_CWV 7034 if (tp->cwv_enabled) { 7035 if ((tp->cwv_cwnd_valid == 0) && 7036 TCPS_HAVEESTABLISHED(tp->t_state) && 7037 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 7038 tcp_newcwv_nvp_closedown(tp); 7039 } else 7040 #endif 7041 if (tp->t_idle_reduce) { 7042 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 7043 rack_cc_after_idle(tp, 7044 (rack->r_idle_reduce_largest ? 1 :0)); 7045 } 7046 tp->t_flags &= ~TF_LASTIDLE; 7047 if (idle) { 7048 if (tp->t_flags & TF_MORETOCOME) { 7049 tp->t_flags |= TF_LASTIDLE; 7050 idle = 0; 7051 } 7052 } 7053 again: 7054 /* 7055 * If we've recently taken a timeout, snd_max will be greater than 7056 * snd_nxt. There may be SACK information that allows us to avoid 7057 * resending already delivered data. Adjust snd_nxt accordingly. 7058 */ 7059 sendalot = 0; 7060 cts = tcp_ts_getticks(); 7061 tso = 0; 7062 mtu = 0; 7063 sb_offset = tp->snd_max - tp->snd_una; 7064 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 7065 7066 flags = tcp_outflags[tp->t_state]; 7067 /* 7068 * Send any SACK-generated retransmissions. If we're explicitly 7069 * trying to send out new data (when sendalot is 1), bypass this 7070 * function. If we retransmit in fast recovery mode, decrement 7071 * snd_cwnd, since we're replacing a (future) new transmission with 7072 * a retransmission now, and we previously incremented snd_cwnd in 7073 * tcp_input(). 
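 *
 * A rough sketch of the selection order implemented below (not
 * compiled, it only mirrors the tests that follow):
 *
 *	if (rack->r_ctl.rc_tlpsend != NULL)
 *		send the tail loss probe segment;
 *	else if (rack->r_ctl.rc_resend != NULL)
 *		retransmit (the retransmission timer fired);
 *	else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)
 *		retransmit from the RACK scoreboard under PRR;
 *	else
 *		consider sending new data further below.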
7074 */ 7075 /* 7076 * Still in sack recovery , reset rxmit flag to zero. 7077 */ 7078 while (rack->rc_free_cnt < rack_free_cache) { 7079 rsm = rack_alloc(rack); 7080 if (rsm == NULL) { 7081 if (inp->inp_hpts_calls) 7082 /* Retry in a ms */ 7083 slot = 1; 7084 goto just_return_nolock; 7085 } 7086 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 7087 rack->rc_free_cnt++; 7088 rsm = NULL; 7089 } 7090 if (inp->inp_hpts_calls) 7091 inp->inp_hpts_calls = 0; 7092 sack_rxmit = 0; 7093 len = 0; 7094 rsm = NULL; 7095 if (flags & TH_RST) { 7096 SOCKBUF_LOCK(sb); 7097 goto send; 7098 } 7099 if (rack->r_ctl.rc_tlpsend) { 7100 /* Tail loss probe */ 7101 long cwin; 7102 long tlen; 7103 7104 doing_tlp = 1; 7105 rsm = rack->r_ctl.rc_tlpsend; 7106 rack->r_ctl.rc_tlpsend = NULL; 7107 sack_rxmit = 1; 7108 tlen = rsm->r_end - rsm->r_start; 7109 if (tlen > tp->t_maxseg) 7110 tlen = tp->t_maxseg; 7111 #ifdef INVARIANTS 7112 if (SEQ_GT(tp->snd_una, rsm->r_start)) { 7113 panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", 7114 tp, rack, tp->snd_una, rsm, rsm->r_start); 7115 } 7116 #endif 7117 sb_offset = rsm->r_start - tp->snd_una; 7118 cwin = min(tp->snd_wnd, tlen); 7119 len = cwin; 7120 } else if (rack->r_ctl.rc_resend) { 7121 /* Retransmit timer */ 7122 rsm = rack->r_ctl.rc_resend; 7123 rack->r_ctl.rc_resend = NULL; 7124 len = rsm->r_end - rsm->r_start; 7125 sack_rxmit = 1; 7126 sendalot = 0; 7127 sb_offset = rsm->r_start - tp->snd_una; 7128 if (len >= tp->t_maxseg) { 7129 len = tp->t_maxseg; 7130 } 7131 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7132 __func__, sb_offset)); 7133 } else if ((rack->rc_in_persist == 0) && 7134 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 7135 long tlen; 7136 7137 if ((!IN_RECOVERY(tp->t_flags)) && 7138 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 7139 /* Enter recovery if not induced by a time-out */ 7140 rack->r_ctl.rc_rsm_start = rsm->r_start; 7141 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7142 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7143 rack_cong_signal(tp, NULL, CC_NDUPACK); 7144 /* 7145 * When we enter recovery we need to assure we send 7146 * one packet. 7147 */ 7148 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 7149 } 7150 #ifdef INVARIANTS 7151 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 7152 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 7153 tp, rack, rsm, rsm->r_start, tp->snd_una); 7154 } 7155 #endif 7156 tlen = rsm->r_end - rsm->r_start; 7157 sb_offset = rsm->r_start - tp->snd_una; 7158 if (tlen > rack->r_ctl.rc_prr_sndcnt) { 7159 len = rack->r_ctl.rc_prr_sndcnt; 7160 } else { 7161 len = tlen; 7162 } 7163 if (len >= tp->t_maxseg) { 7164 sendalot = 1; 7165 len = tp->t_maxseg; 7166 } else { 7167 sendalot = 0; 7168 if ((rack->rc_timer_up == 0) && 7169 (len < tlen)) { 7170 /* 7171 * If its not a timer don't send a partial 7172 * segment. 7173 */ 7174 len = 0; 7175 goto just_return_nolock; 7176 } 7177 } 7178 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7179 __func__, sb_offset)); 7180 if (len > 0) { 7181 sub_from_prr = 1; 7182 sack_rxmit = 1; 7183 TCPSTAT_INC(tcps_sack_rexmits); 7184 TCPSTAT_ADD(tcps_sack_rexmit_bytes, 7185 min(len, tp->t_maxseg)); 7186 counter_u64_add(rack_rtm_prr_retran, 1); 7187 } 7188 } 7189 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 7190 /* we are retransmitting the fin */ 7191 len--; 7192 if (len) { 7193 /* 7194 * When retransmitting data do *not* include the 7195 * FIN. This could happen from a TLP probe. 
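 *
 * (The retransmit length here was derived from the sendmap entry and
 * counts the FIN's sequence space, so one byte is dropped above and,
 * when real payload remains, TH_FIN is cleared just below.)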
7196 */ 7197 flags &= ~TH_FIN; 7198 } 7199 } 7200 #ifdef INVARIANTS 7201 /* For debugging */ 7202 rack->r_ctl.rc_rsm_at_retran = rsm; 7203 #endif 7204 /* 7205 * Get standard flags, and add SYN or FIN if requested by 'hidden' 7206 * state flags. 7207 */ 7208 if (tp->t_flags & TF_NEEDFIN) 7209 flags |= TH_FIN; 7210 if (tp->t_flags & TF_NEEDSYN) 7211 flags |= TH_SYN; 7212 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 7213 void *end_rsm; 7214 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 7215 if (end_rsm) 7216 kern_prefetch(end_rsm, &prefetch_rsm); 7217 prefetch_rsm = 1; 7218 } 7219 SOCKBUF_LOCK(sb); 7220 /* 7221 * If in persist timeout with window of 0, send 1 byte. Otherwise, 7222 * if window is small but nonzero and time TF_SENTFIN expired, we 7223 * will send what we can and go to transmit state. 7224 */ 7225 if (tp->t_flags & TF_FORCEDATA) { 7226 if (sendwin == 0) { 7227 /* 7228 * If we still have some data to send, then clear 7229 * the FIN bit. Usually this would happen below 7230 * when it realizes that we aren't sending all the 7231 * data. However, if we have exactly 1 byte of 7232 * unsent data, then it won't clear the FIN bit 7233 * below, and if we are in persist state, we wind up 7234 * sending the packet without recording that we sent 7235 * the FIN bit. 7236 * 7237 * We can't just blindly clear the FIN bit, because 7238 * if we don't have any more data to send then the 7239 * probe will be the FIN itself. 7240 */ 7241 if (sb_offset < sbused(sb)) 7242 flags &= ~TH_FIN; 7243 sendwin = 1; 7244 } else { 7245 if (rack->rc_in_persist) 7246 rack_exit_persist(tp, rack); 7247 /* 7248 * If we are dropping persist mode then we need to 7249 * correct snd_nxt/snd_max and off. 7250 */ 7251 tp->snd_nxt = tp->snd_max; 7252 sb_offset = tp->snd_nxt - tp->snd_una; 7253 } 7254 } 7255 /* 7256 * If snd_nxt == snd_max and we have transmitted a FIN, the 7257 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 7258 * negative length. This can also occur when TCP opens up its 7259 * congestion window while receiving additional duplicate acks after 7260 * fast-retransmit because TCP will reset snd_nxt to snd_max after 7261 * the fast-retransmit. 7262 * 7263 * In the normal retransmit-FIN-only case, however, snd_nxt will be 7264 * set to snd_una, the sb_offset will be 0, and the length may wind 7265 * up 0. 7266 * 7267 * If sack_rxmit is true we are retransmitting from the scoreboard 7268 * in which case len is already set. 
7269 */ 7270 if (sack_rxmit == 0) { 7271 uint32_t avail; 7272 7273 avail = sbavail(sb); 7274 if (SEQ_GT(tp->snd_nxt, tp->snd_una)) 7275 sb_offset = tp->snd_nxt - tp->snd_una; 7276 else 7277 sb_offset = 0; 7278 if (IN_RECOVERY(tp->t_flags) == 0) { 7279 if (rack->r_ctl.rc_tlp_new_data) { 7280 /* TLP is forcing out new data */ 7281 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 7282 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 7283 } 7284 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 7285 len = tp->snd_wnd; 7286 else 7287 len = rack->r_ctl.rc_tlp_new_data; 7288 rack->r_ctl.rc_tlp_new_data = 0; 7289 doing_tlp = 1; 7290 } else { 7291 if (sendwin > avail) { 7292 /* use the available */ 7293 if (avail > sb_offset) { 7294 len = (int32_t)(avail - sb_offset); 7295 } else { 7296 len = 0; 7297 } 7298 } else { 7299 if (sendwin > sb_offset) { 7300 len = (int32_t)(sendwin - sb_offset); 7301 } else { 7302 len = 0; 7303 } 7304 } 7305 } 7306 } else { 7307 uint32_t outstanding; 7308 7309 /* 7310 * We are inside of a SACK recovery episode and are 7311 * sending new data, having retransmitted all the 7312 * data possible so far in the scoreboard. 7313 */ 7314 outstanding = tp->snd_max - tp->snd_una; 7315 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) 7316 len = 0; 7317 else if (avail > sb_offset) 7318 len = avail - sb_offset; 7319 else 7320 len = 0; 7321 if (len > 0) { 7322 if (len > rack->r_ctl.rc_prr_sndcnt) 7323 len = rack->r_ctl.rc_prr_sndcnt; 7324 7325 if (len > 0) { 7326 sub_from_prr = 1; 7327 counter_u64_add(rack_rtm_prr_newdata, 1); 7328 } 7329 } 7330 if (len > tp->t_maxseg) { 7331 /* 7332 * We should never send more than a MSS when 7333 * retransmitting or sending new data in prr 7334 * mode unless the override flag is on. Most 7335 * likely the PRR algorithm is not going to 7336 * let us send a lot as well :-) 7337 */ 7338 if (rack->r_ctl.rc_prr_sendalot == 0) 7339 len = tp->t_maxseg; 7340 } else if (len < tp->t_maxseg) { 7341 /* 7342 * Do we send any? The idea here is if the 7343 * send empty's the socket buffer we want to 7344 * do it. However if not then lets just wait 7345 * for our prr_sndcnt to get bigger. 7346 */ 7347 long leftinsb; 7348 7349 leftinsb = sbavail(sb) - sb_offset; 7350 if (leftinsb > len) { 7351 /* This send does not empty the sb */ 7352 len = 0; 7353 } 7354 } 7355 } 7356 } 7357 if (prefetch_so_done == 0) { 7358 kern_prefetch(so, &prefetch_so_done); 7359 prefetch_so_done = 1; 7360 } 7361 /* 7362 * Lop off SYN bit if it has already been sent. However, if this is 7363 * SYN-SENT state and if segment contains data and if we don't know 7364 * that foreign host supports TAO, suppress sending segment. 7365 */ 7366 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 7367 if ((tp->t_state != TCPS_SYN_RECEIVED) && 7368 (tp->t_state != TCPS_SYN_SENT)) 7369 flags &= ~TH_SYN; 7370 #ifdef TCP_RFC7413 7371 /* 7372 * When sending additional segments following a TFO SYN|ACK, 7373 * do not include the SYN bit. 7374 */ 7375 if ((tp->t_flags & TF_FASTOPEN) && 7376 (tp->t_state == TCPS_SYN_RECEIVED)) 7377 flags &= ~TH_SYN; 7378 #endif 7379 sb_offset--, len++; 7380 if (sbavail(sb) == 0) 7381 len = 0; 7382 } 7383 /* 7384 * Be careful not to send data and/or FIN on SYN segments. This 7385 * measure is needed to prevent interoperability problems with not 7386 * fully conformant TCP implementations. 
7387 */ 7388 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 7389 len = 0; 7390 flags &= ~TH_FIN; 7391 } 7392 #ifdef TCP_RFC7413 7393 /* 7394 * When retransmitting SYN|ACK on a passively-created TFO socket, 7395 * don't include data, as the presence of data may have caused the 7396 * original SYN|ACK to have been dropped by a middlebox. 7397 */ 7398 if ((tp->t_flags & TF_FASTOPEN) && 7399 ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) 7400 len = 0; 7401 #endif 7402 if (len <= 0) { 7403 /* 7404 * If FIN has been sent but not acked, but we haven't been 7405 * called to retransmit, len will be < 0. Otherwise, window 7406 * shrank after we sent into it. If window shrank to 0, 7407 * cancel pending retransmit, pull snd_nxt back to (closed) 7408 * window, and set the persist timer if it isn't already 7409 * going. If the window didn't close completely, just wait 7410 * for an ACK. 7411 * 7412 * We also do a general check here to ensure that we will 7413 * set the persist timer when we have data to send, but a 7414 * 0-byte window. This makes sure the persist timer is set 7415 * even if the packet hits one of the "goto send" lines 7416 * below. 7417 */ 7418 len = 0; 7419 if ((tp->snd_wnd == 0) && 7420 (TCPS_HAVEESTABLISHED(tp->t_state)) && 7421 (sb_offset < (int)sbavail(sb))) { 7422 tp->snd_nxt = tp->snd_una; 7423 rack_enter_persist(tp, rack, cts); 7424 } 7425 } 7426 /* len will be >= 0 after this point. */ 7427 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7428 tcp_sndbuf_autoscale(tp, so, sendwin); 7429 /* 7430 * Decide if we can use TCP Segmentation Offloading (if supported by 7431 * hardware). 7432 * 7433 * TSO may only be used if we are in a pure bulk sending state. The 7434 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 7435 * options prevent using TSO. With TSO the TCP header is the same 7436 * (except for the sequence number) for all generated packets. This 7437 * makes it impossible to transmit any options which vary per 7438 * generated segment or packet. 7439 * 7440 * IPv4 handling has a clear separation of ip options and ip header 7441 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 7442 * the right thing below to provide length of just ip options and thus 7443 * checking for ipoptlen is enough to decide if ip options are present. 7444 */ 7445 7446 #ifdef INET6 7447 if (isipv6) 7448 ipoptlen = ip6_optlen(tp->t_inpcb); 7449 else 7450 #endif 7451 if (tp->t_inpcb->inp_options) 7452 ipoptlen = tp->t_inpcb->inp_options->m_len - 7453 offsetof(struct ipoption, ipopt_list); 7454 else 7455 ipoptlen = 0; 7456 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7457 /* 7458 * Pre-calculate here as we save another lookup into the darknesses 7459 * of IPsec that way and can actually decide if TSO is ok. 
7460 */ 7461 #ifdef INET6 7462 if (isipv6 && IPSEC_ENABLED(ipv6)) 7463 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 7464 #ifdef INET 7465 else 7466 #endif 7467 #endif /* INET6 */ 7468 #ifdef INET 7469 if (IPSEC_ENABLED(ipv4)) 7470 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 7471 #endif /* INET */ 7472 #endif 7473 7474 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7475 ipoptlen += ipsec_optlen; 7476 #endif 7477 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 7478 (tp->t_port == 0) && 7479 ((tp->t_flags & TF_SIGNATURE) == 0) && 7480 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 7481 ipoptlen == 0) 7482 tso = 1; 7483 { 7484 uint32_t outstanding; 7485 7486 outstanding = tp->snd_max - tp->snd_una; 7487 if (tp->t_flags & TF_SENTFIN) { 7488 /* 7489 * If we sent a fin, snd_max is 1 higher than 7490 * snd_una 7491 */ 7492 outstanding--; 7493 } 7494 if (outstanding > 0) { 7495 /* 7496 * This is sub-optimal. We only send a stand alone 7497 * FIN on its own segment. 7498 */ 7499 if (flags & TH_FIN) { 7500 flags &= ~TH_FIN; 7501 would_have_fin = 1; 7502 } 7503 } else if (sack_rxmit) { 7504 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 7505 flags &= ~TH_FIN; 7506 } else { 7507 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 7508 sbused(sb))) 7509 flags &= ~TH_FIN; 7510 } 7511 } 7512 recwin = sbspace(&so->so_rcv); 7513 7514 /* 7515 * Sender silly window avoidance. We transmit under the following 7516 * conditions when len is non-zero: 7517 * 7518 * - We have a full segment (or more with TSO) - This is the last 7519 * buffer in a write()/send() and we are either idle or running 7520 * NODELAY - we've timed out (e.g. persist timer) - we have more 7521 * then 1/2 the maximum send window's worth of data (receiver may be 7522 * limited the window size) - we need to retransmit 7523 */ 7524 if (len) { 7525 if (len >= tp->t_maxseg) { 7526 pass = 1; 7527 goto send; 7528 } 7529 /* 7530 * NOTE! on localhost connections an 'ack' from the remote 7531 * end may occur synchronously with the output and cause us 7532 * to flush a buffer queued with moretocome. XXX 7533 * 7534 */ 7535 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 7536 (idle || (tp->t_flags & TF_NODELAY)) && 7537 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 7538 (tp->t_flags & TF_NOPUSH) == 0) { 7539 pass = 2; 7540 goto send; 7541 } 7542 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ 7543 pass = 3; 7544 goto send; 7545 } 7546 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 7547 goto send; 7548 } 7549 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 7550 pass = 4; 7551 goto send; 7552 } 7553 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 7554 pass = 5; 7555 goto send; 7556 } 7557 if (sack_rxmit) { 7558 pass = 6; 7559 goto send; 7560 } 7561 } 7562 /* 7563 * Sending of standalone window updates. 7564 * 7565 * Window updates are important when we close our window due to a 7566 * full socket buffer and are opening it again after the application 7567 * reads data from it. Once the window has opened again and the 7568 * remote end starts to send again the ACK clock takes over and 7569 * provides the most current window information. 7570 * 7571 * We must avoid the silly window syndrome whereas every read from 7572 * the receive buffer, no matter how small, causes a window update 7573 * to be sent. We also should avoid sending a flurry of window 7574 * updates when the socket buffer had queued a lot of data and the 7575 * application is doing small reads. 
7576 * 7577 * Prevent a flurry of pointless window updates by only sending an 7578 * update when we can increase the advertized window by more than 7579 * 1/4th of the socket buffer capacity. When the buffer is getting 7580 * full or is very small be more aggressive and send an update 7581 * whenever we can increase by two mss sized segments. In all other 7582 * situations the ACK's to new incoming data will carry further 7583 * window increases. 7584 * 7585 * Don't send an independent window update if a delayed ACK is 7586 * pending (it will get piggy-backed on it) or the remote side 7587 * already has done a half-close and won't send more data. Skip 7588 * this if the connection is in T/TCP half-open state. 7589 */ 7590 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 7591 !(tp->t_flags & TF_DELACK) && 7592 !TCPS_HAVERCVDFIN(tp->t_state)) { 7593 /* 7594 * "adv" is the amount we could increase the window, taking 7595 * into account that we are limited by TCP_MAXWIN << 7596 * tp->rcv_scale. 7597 */ 7598 int32_t adv; 7599 int oldwin; 7600 7601 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 7602 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 7603 oldwin = (tp->rcv_adv - tp->rcv_nxt); 7604 adv -= oldwin; 7605 } else 7606 oldwin = 0; 7607 7608 /* 7609 * If the new window size ends up being the same as the old 7610 * size when it is scaled, then don't force a window update. 7611 */ 7612 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 7613 goto dontupdate; 7614 7615 if (adv >= (int32_t)(2 * tp->t_maxseg) && 7616 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 7617 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 7618 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { 7619 pass = 7; 7620 goto send; 7621 } 7622 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 7623 goto send; 7624 } 7625 dontupdate: 7626 7627 /* 7628 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 7629 * is also a catch-all for the retransmit timer timeout case. 7630 */ 7631 if (tp->t_flags & TF_ACKNOW) { 7632 pass = 8; 7633 goto send; 7634 } 7635 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 7636 pass = 9; 7637 goto send; 7638 } 7639 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 7640 pass = 10; 7641 goto send; 7642 } 7643 /* 7644 * If our state indicates that FIN should be sent and we have not 7645 * yet done so, then we need to send. 7646 */ 7647 if (flags & TH_FIN) { 7648 if ((tp->t_flags & TF_SENTFIN) || 7649 (((tp->t_flags & TF_SENTFIN) == 0) && 7650 (tp->snd_nxt == tp->snd_una))) { 7651 pass = 11; 7652 goto send; 7653 } 7654 } 7655 /* 7656 * No reason to send a segment, just return. 7657 */ 7658 just_return: 7659 SOCKBUF_UNLOCK(sb); 7660 just_return_nolock: 7661 if (tot_len_this_send == 0) 7662 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 7663 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 7664 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 7665 tp->t_flags &= ~TF_FORCEDATA; 7666 return (0); 7667 7668 send: 7669 if (doing_tlp == 0) { 7670 /* 7671 * Data not a TLP, and its not the rxt firing. If it is the 7672 * rxt firing, we want to leave the tlp_in_progress flag on 7673 * so we don't send another TLP. It has to be a rack timer 7674 * or normal send (response to acked data) to clear the tlp 7675 * in progress flag. 
7676 */ 7677 rack->rc_tlp_in_progress = 0; 7678 } 7679 SOCKBUF_LOCK_ASSERT(sb); 7680 if (len > 0) { 7681 if (len >= tp->t_maxseg) 7682 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 7683 else 7684 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 7685 } 7686 /* 7687 * Before ESTABLISHED, force sending of initial options unless TCP 7688 * set not to do any options. NOTE: we assume that the IP/TCP header 7689 * plus TCP options always fit in a single mbuf, leaving room for a 7690 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 7691 * + optlen <= MCLBYTES 7692 */ 7693 optlen = 0; 7694 #ifdef INET6 7695 if (isipv6) 7696 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 7697 else 7698 #endif 7699 hdrlen = sizeof(struct tcpiphdr); 7700 7701 /* 7702 * Compute options for segment. We only have to care about SYN and 7703 * established connection segments. Options for SYN-ACK segments 7704 * are handled in TCP syncache. 7705 */ 7706 to.to_flags = 0; 7707 if ((tp->t_flags & TF_NOOPT) == 0) { 7708 /* Maximum segment size. */ 7709 if (flags & TH_SYN) { 7710 tp->snd_nxt = tp->iss; 7711 to.to_mss = tcp_mssopt(&inp->inp_inc); 7712 #ifdef NETFLIX_TCPOUDP 7713 if (tp->t_port) 7714 to.to_mss -= V_tcp_udp_tunneling_overhead; 7715 #endif 7716 to.to_flags |= TOF_MSS; 7717 #ifdef TCP_RFC7413 7718 /* 7719 * Only include the TFO option on the first 7720 * transmission of the SYN|ACK on a 7721 * passively-created TFO socket, as the presence of 7722 * the TFO option may have caused the original 7723 * SYN|ACK to have been dropped by a middlebox. 7724 */ 7725 if ((tp->t_flags & TF_FASTOPEN) && 7726 (tp->t_state == TCPS_SYN_RECEIVED) && 7727 (tp->t_rxtshift == 0)) { 7728 to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; 7729 to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; 7730 to.to_flags |= TOF_FASTOPEN; 7731 } 7732 #endif 7733 } 7734 /* Window scaling. */ 7735 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 7736 to.to_wscale = tp->request_r_scale; 7737 to.to_flags |= TOF_SCALE; 7738 } 7739 /* Timestamps. */ 7740 if ((tp->t_flags & TF_RCVD_TSTMP) || 7741 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 7742 to.to_tsval = cts + tp->ts_offset; 7743 to.to_tsecr = tp->ts_recent; 7744 to.to_flags |= TOF_TS; 7745 } 7746 /* Set receive buffer autosizing timestamp. */ 7747 if (tp->rfbuf_ts == 0 && 7748 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 7749 tp->rfbuf_ts = tcp_ts_getticks(); 7750 /* Selective ACK's. */ 7751 if (flags & TH_SYN) 7752 to.to_flags |= TOF_SACKPERM; 7753 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 7754 tp->rcv_numsacks > 0) { 7755 to.to_flags |= TOF_SACK; 7756 to.to_nsacks = tp->rcv_numsacks; 7757 to.to_sacks = (u_char *)tp->sackblks; 7758 } 7759 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 7760 /* TCP-MD5 (RFC2385). */ 7761 if (tp->t_flags & TF_SIGNATURE) 7762 to.to_flags |= TOF_SIGNATURE; 7763 #endif /* TCP_SIGNATURE */ 7764 7765 /* Processing the options. */ 7766 hdrlen += optlen = tcp_addoptions(&to, opt); 7767 } 7768 #ifdef NETFLIX_TCPOUDP 7769 if (tp->t_port) { 7770 if (V_tcp_udp_tunneling_port == 0) { 7771 /* The port was removed?? */ 7772 SOCKBUF_UNLOCK(&so->so_snd); 7773 return (EHOSTUNREACH); 7774 } 7775 hdrlen += sizeof(struct udphdr); 7776 } 7777 #endif 7778 ipoptlen = 0; 7779 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7780 ipoptlen += ipsec_optlen; 7781 #endif 7782 7783 /* 7784 * Adjust data length if insertion of options will bump the packet 7785 * length beyond the t_maxseg length. Clear the FIN bit because we 7786 * cut off the tail of the segment. 
7787 */ 7788 if (len + optlen + ipoptlen > tp->t_maxseg) { 7789 if (flags & TH_FIN) { 7790 would_have_fin = 1; 7791 flags &= ~TH_FIN; 7792 } 7793 if (tso) { 7794 uint32_t if_hw_tsomax; 7795 uint32_t moff; 7796 int32_t max_len; 7797 7798 /* extract TSO information */ 7799 if_hw_tsomax = tp->t_tsomax; 7800 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 7801 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 7802 KASSERT(ipoptlen == 0, 7803 ("%s: TSO can't do IP options", __func__)); 7804 7805 /* 7806 * Check if we should limit by maximum payload 7807 * length: 7808 */ 7809 if (if_hw_tsomax != 0) { 7810 /* compute maximum TSO length */ 7811 max_len = (if_hw_tsomax - hdrlen - 7812 max_linkhdr); 7813 if (max_len <= 0) { 7814 len = 0; 7815 } else if (len > max_len) { 7816 sendalot = 1; 7817 len = max_len; 7818 } 7819 } 7820 /* 7821 * Prevent the last segment from being fractional 7822 * unless the send sockbuf can be emptied: 7823 */ 7824 max_len = (tp->t_maxseg - optlen); 7825 if ((sb_offset + len) < sbavail(sb)) { 7826 moff = len % (u_int)max_len; 7827 if (moff != 0) { 7828 len -= moff; 7829 sendalot = 1; 7830 } 7831 } 7832 /* 7833 * In case there are too many small fragments don't 7834 * use TSO: 7835 */ 7836 if (len <= max_len) { 7837 len = max_len; 7838 sendalot = 1; 7839 tso = 0; 7840 } 7841 /* 7842 * Send the FIN in a separate segment after the bulk 7843 * sending is done. We don't trust the TSO 7844 * implementations to clear the FIN flag on all but 7845 * the last segment. 7846 */ 7847 if (tp->t_flags & TF_NEEDFIN) 7848 sendalot = 1; 7849 7850 } else { 7851 len = tp->t_maxseg - optlen - ipoptlen; 7852 sendalot = 1; 7853 } 7854 } else 7855 tso = 0; 7856 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 7857 ("%s: len > IP_MAXPACKET", __func__)); 7858 #ifdef DIAGNOSTIC 7859 #ifdef INET6 7860 if (max_linkhdr + hdrlen > MCLBYTES) 7861 #else 7862 if (max_linkhdr + hdrlen > MHLEN) 7863 #endif 7864 panic("tcphdr too big"); 7865 #endif 7866 7867 /* 7868 * This KASSERT is here to catch edge cases at a well defined place. 7869 * Before, those had triggered (random) panic conditions further 7870 * down. 7871 */ 7872 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7873 if ((len == 0) && 7874 (flags & TH_FIN) && 7875 (sbused(sb))) { 7876 /* 7877 * We have outstanding data, don't send a fin by itself!. 7878 */ 7879 goto just_return; 7880 } 7881 /* 7882 * Grab a header mbuf, attaching a copy of data to be transmitted, 7883 * and initialize the header from the template for sends on this 7884 * connection. 7885 */ 7886 if (len) { 7887 uint32_t max_val; 7888 uint32_t moff; 7889 7890 if (rack->rc_pace_max_segs) 7891 max_val = rack->rc_pace_max_segs * tp->t_maxseg; 7892 else 7893 max_val = len; 7894 /* 7895 * We allow a limit on sending with hptsi. 7896 */ 7897 if (len > max_val) { 7898 len = max_val; 7899 } 7900 #ifdef INET6 7901 if (MHLEN < hdrlen + max_linkhdr) 7902 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 7903 else 7904 #endif 7905 m = m_gethdr(M_NOWAIT, MT_DATA); 7906 7907 if (m == NULL) { 7908 SOCKBUF_UNLOCK(sb); 7909 error = ENOBUFS; 7910 sack_rxmit = 0; 7911 goto out; 7912 } 7913 m->m_data += max_linkhdr; 7914 m->m_len = hdrlen; 7915 7916 /* 7917 * Start the m_copy functions from the closest mbuf to the 7918 * sb_offset in the socket buffer chain. 
7919 */ 7920 mb = sbsndptr_noadv(sb, sb_offset, &moff); 7921 if (len <= MHLEN - hdrlen - max_linkhdr) { 7922 m_copydata(mb, moff, (int)len, 7923 mtod(m, caddr_t)+hdrlen); 7924 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7925 sbsndptr_adv(sb, mb, len); 7926 m->m_len += len; 7927 } else { 7928 struct sockbuf *msb; 7929 7930 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7931 msb = NULL; 7932 else 7933 msb = sb; 7934 m->m_next = tcp_m_copym(mb, moff, &len, 7935 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); 7936 if (len <= (tp->t_maxseg - optlen)) { 7937 /* 7938 * Must have ran out of mbufs for the copy 7939 * shorten it to no longer need tso. Lets 7940 * not put on sendalot since we are low on 7941 * mbufs. 7942 */ 7943 tso = 0; 7944 } 7945 if (m->m_next == NULL) { 7946 SOCKBUF_UNLOCK(sb); 7947 (void)m_free(m); 7948 error = ENOBUFS; 7949 sack_rxmit = 0; 7950 goto out; 7951 } 7952 } 7953 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 7954 TCPSTAT_INC(tcps_sndprobe); 7955 #ifdef NETFLIX_STATS 7956 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7957 stats_voi_update_abs_u32(tp->t_stats, 7958 VOI_TCP_RETXPB, len); 7959 else 7960 stats_voi_update_abs_u64(tp->t_stats, 7961 VOI_TCP_TXPB, len); 7962 #endif 7963 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 7964 if (rsm && (rsm->r_flags & RACK_TLP)) { 7965 /* 7966 * TLP should not count in retran count, but 7967 * in its own bin 7968 */ 7969 counter_u64_add(rack_tlp_retran, 1); 7970 counter_u64_add(rack_tlp_retran_bytes, len); 7971 } else { 7972 tp->t_sndrexmitpack++; 7973 TCPSTAT_INC(tcps_sndrexmitpack); 7974 TCPSTAT_ADD(tcps_sndrexmitbyte, len); 7975 } 7976 #ifdef NETFLIX_STATS 7977 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 7978 len); 7979 #endif 7980 } else { 7981 TCPSTAT_INC(tcps_sndpack); 7982 TCPSTAT_ADD(tcps_sndbyte, len); 7983 #ifdef NETFLIX_STATS 7984 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 7985 len); 7986 #endif 7987 } 7988 /* 7989 * If we're sending everything we've got, set PUSH. (This 7990 * will keep happy those implementations which only give 7991 * data to the user when a buffer fills or a PUSH comes in.) 7992 */ 7993 if (sb_offset + len == sbused(sb) && 7994 sbused(sb) && 7995 !(flags & TH_SYN)) 7996 flags |= TH_PUSH; 7997 7998 /* 7999 * Are we doing hptsi, if so we must calculate the slot. We 8000 * only do hptsi in ESTABLISHED and with no RESET being 8001 * sent where we have data to send. 8002 */ 8003 if (((tp->t_state == TCPS_ESTABLISHED) || 8004 (tp->t_state == TCPS_CLOSE_WAIT) || 8005 ((tp->t_state == TCPS_FIN_WAIT_1) && 8006 ((tp->t_flags & TF_SENTFIN) == 0) && 8007 ((flags & TH_FIN) == 0))) && 8008 ((flags & TH_RST) == 0) && 8009 (rack->rc_always_pace)) { 8010 /* 8011 * We use the most optimistic possible cwnd/srtt for 8012 * sending calculations. This will make our 8013 * calculation anticipate getting more through 8014 * quicker then possible. But thats ok we don't want 8015 * the peer to have a gap in data sending. 
8016 */ 8017 uint32_t srtt, cwnd, tr_perms = 0; 8018 8019 if (rack->r_ctl.rc_rack_min_rtt) 8020 srtt = rack->r_ctl.rc_rack_min_rtt; 8021 else 8022 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8023 if (rack->r_ctl.rc_rack_largest_cwnd) 8024 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8025 else 8026 cwnd = tp->snd_cwnd; 8027 tr_perms = cwnd / srtt; 8028 if (tr_perms == 0) { 8029 tr_perms = tp->t_maxseg; 8030 } 8031 tot_len_this_send += len; 8032 /* 8033 * Calculate how long this will take to drain, if 8034 * the calculation comes out to zero, thats ok we 8035 * will use send_a_lot to possibly spin around for 8036 * more increasing tot_len_this_send to the point 8037 * that its going to require a pace, or we hit the 8038 * cwnd. Which in that case we are just waiting for 8039 * a ACK. 8040 */ 8041 slot = tot_len_this_send / tr_perms; 8042 /* Now do we reduce the time so we don't run dry? */ 8043 if (slot && rack->rc_pace_reduce) { 8044 int32_t reduce; 8045 8046 reduce = (slot / rack->rc_pace_reduce); 8047 if (reduce < slot) { 8048 slot -= reduce; 8049 } else 8050 slot = 0; 8051 } 8052 if (rack->r_enforce_min_pace && 8053 (slot == 0) && 8054 (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { 8055 /* We are enforcing a minimum pace time of 1ms */ 8056 slot = rack->r_enforce_min_pace; 8057 } 8058 } 8059 SOCKBUF_UNLOCK(sb); 8060 } else { 8061 SOCKBUF_UNLOCK(sb); 8062 if (tp->t_flags & TF_ACKNOW) 8063 TCPSTAT_INC(tcps_sndacks); 8064 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 8065 TCPSTAT_INC(tcps_sndctrl); 8066 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 8067 TCPSTAT_INC(tcps_sndurg); 8068 else 8069 TCPSTAT_INC(tcps_sndwinup); 8070 8071 m = m_gethdr(M_NOWAIT, MT_DATA); 8072 if (m == NULL) { 8073 error = ENOBUFS; 8074 sack_rxmit = 0; 8075 goto out; 8076 } 8077 #ifdef INET6 8078 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 8079 MHLEN >= hdrlen) { 8080 M_ALIGN(m, hdrlen); 8081 } else 8082 #endif 8083 m->m_data += max_linkhdr; 8084 m->m_len = hdrlen; 8085 } 8086 SOCKBUF_UNLOCK_ASSERT(sb); 8087 m->m_pkthdr.rcvif = (struct ifnet *)0; 8088 #ifdef MAC 8089 mac_inpcb_create_mbuf(inp, m); 8090 #endif 8091 #ifdef INET6 8092 if (isipv6) { 8093 ip6 = mtod(m, struct ip6_hdr *); 8094 #ifdef NETFLIX_TCPOUDP 8095 if (tp->t_port) { 8096 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 8097 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8098 udp->uh_dport = tp->t_port; 8099 ulen = hdrlen + len - sizeof(struct ip6_hdr); 8100 udp->uh_ulen = htons(ulen); 8101 th = (struct tcphdr *)(udp + 1); 8102 } else 8103 #endif 8104 th = (struct tcphdr *)(ip6 + 1); 8105 tcpip_fillheaders(inp, ip6, th); 8106 } else 8107 #endif /* INET6 */ 8108 { 8109 ip = mtod(m, struct ip *); 8110 #ifdef TCPDEBUG 8111 ipov = (struct ipovly *)ip; 8112 #endif 8113 #ifdef NETFLIX_TCPOUDP 8114 if (tp->t_port) { 8115 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 8116 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8117 udp->uh_dport = tp->t_port; 8118 ulen = hdrlen + len - sizeof(struct ip); 8119 udp->uh_ulen = htons(ulen); 8120 th = (struct tcphdr *)(udp + 1); 8121 } else 8122 #endif 8123 th = (struct tcphdr *)(ip + 1); 8124 tcpip_fillheaders(inp, ip, th); 8125 } 8126 /* 8127 * Fill in fields, remembering maximum advertised window for use in 8128 * delaying messages about window sizes. If resending a FIN, be sure 8129 * not to use a new sequence number. 
8130 */ 8131 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 8132 tp->snd_nxt == tp->snd_max) 8133 tp->snd_nxt--; 8134 /* 8135 * If we are starting a connection, send ECN setup SYN packet. If we 8136 * are on a retransmit, we may resend those bits a number of times 8137 * as per RFC 3168. 8138 */ 8139 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 8140 if (tp->t_rxtshift >= 1) { 8141 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 8142 flags |= TH_ECE | TH_CWR; 8143 } else 8144 flags |= TH_ECE | TH_CWR; 8145 } 8146 if (tp->t_state == TCPS_ESTABLISHED && 8147 (tp->t_flags & TF_ECN_PERMIT)) { 8148 /* 8149 * If the peer has ECN, mark data packets with ECN capable 8150 * transmission (ECT). Ignore pure ack packets, 8151 * retransmissions and window probes. 8152 */ 8153 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 8154 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 8155 #ifdef INET6 8156 if (isipv6) 8157 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 8158 else 8159 #endif 8160 ip->ip_tos |= IPTOS_ECN_ECT0; 8161 TCPSTAT_INC(tcps_ecn_ect0); 8162 } 8163 /* 8164 * Reply with proper ECN notifications. 8165 */ 8166 if (tp->t_flags & TF_ECN_SND_CWR) { 8167 flags |= TH_CWR; 8168 tp->t_flags &= ~TF_ECN_SND_CWR; 8169 } 8170 if (tp->t_flags & TF_ECN_SND_ECE) 8171 flags |= TH_ECE; 8172 } 8173 /* 8174 * If we are doing retransmissions, then snd_nxt will not reflect 8175 * the first unsent octet. For ACK only packets, we do not want the 8176 * sequence number of the retransmitted packet, we want the sequence 8177 * number of the next unsent octet. So, if there is no data (and no 8178 * SYN or FIN), use snd_max instead of snd_nxt when filling in 8179 * ti_seq. But if we are in persist state, snd_max might reflect 8180 * one byte beyond the right edge of the window, so use snd_nxt in 8181 * that case, since we know we aren't doing a retransmission. 8182 * (retransmit and persist are mutually exclusive...) 8183 */ 8184 if (sack_rxmit == 0) { 8185 if (len || (flags & (TH_SYN | TH_FIN)) || 8186 rack->rc_in_persist) { 8187 th->th_seq = htonl(tp->snd_nxt); 8188 rack_seq = tp->snd_nxt; 8189 } else if (flags & TH_RST) { 8190 /* 8191 * For a Reset send the last cum ack in sequence 8192 * (this like any other choice may still generate a 8193 * challenge ack, if a ack-update packet is in 8194 * flight). 8195 */ 8196 th->th_seq = htonl(tp->snd_una); 8197 rack_seq = tp->snd_una; 8198 } else { 8199 th->th_seq = htonl(tp->snd_max); 8200 rack_seq = tp->snd_max; 8201 } 8202 } else { 8203 th->th_seq = htonl(rsm->r_start); 8204 rack_seq = rsm->r_start; 8205 } 8206 th->th_ack = htonl(tp->rcv_nxt); 8207 if (optlen) { 8208 bcopy(opt, th + 1, optlen); 8209 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 8210 } 8211 th->th_flags = flags; 8212 /* 8213 * Calculate receive window. Don't shrink window, but avoid silly 8214 * window syndrome. 8215 */ 8216 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 8217 recwin < (long)tp->t_maxseg) 8218 recwin = 0; 8219 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 8220 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 8221 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 8222 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 8223 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 8224 8225 /* 8226 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 8227 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 8228 * handled in syncache. 
8229 */ 8230 if (flags & TH_SYN) 8231 th->th_win = htons((u_short) 8232 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 8233 else 8234 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 8235 /* 8236 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 8237 * window. This may cause the remote transmitter to stall. This 8238 * flag tells soreceive() to disable delayed acknowledgements when 8239 * draining the buffer. This can occur if the receiver is 8240 * attempting to read more data than can be buffered prior to 8241 * transmitting on the connection. 8242 */ 8243 if (th->th_win == 0) { 8244 tp->t_sndzerowin++; 8245 tp->t_flags |= TF_RXWIN0SENT; 8246 } else 8247 tp->t_flags &= ~TF_RXWIN0SENT; 8248 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 8249 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 8250 th->th_flags |= TH_URG; 8251 } else 8252 /* 8253 * If no urgent pointer to send, then we pull the urgent 8254 * pointer to the left edge of the send window so that it 8255 * doesn't drift into the send window on sequence number 8256 * wraparound. 8257 */ 8258 tp->snd_up = tp->snd_una; /* drag it along */ 8259 8260 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 8261 if (to.to_flags & TOF_SIGNATURE) { 8262 /* 8263 * Calculate MD5 signature and put it into the place 8264 * determined before. 8265 * NOTE: since TCP options buffer doesn't point into 8266 * mbuf's data, calculate offset and use it. 8267 */ 8268 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 8269 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 8270 /* 8271 * Do not send segment if the calculation of MD5 8272 * digest has failed. 8273 */ 8274 goto out; 8275 } 8276 } 8277 #endif 8278 8279 /* 8280 * Put TCP length in extended header, and then checksum extended 8281 * header and data. 8282 */ 8283 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 8284 #ifdef INET6 8285 if (isipv6) { 8286 /* 8287 * ip6_plen is not need to be filled now, and will be filled 8288 * in ip6_output. 8289 */ 8290 if (tp->t_port) { 8291 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 8292 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8293 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 8294 th->th_sum = htons(0); 8295 } else { 8296 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 8297 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8298 th->th_sum = in6_cksum_pseudo(ip6, 8299 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 8300 0); 8301 } 8302 } 8303 #endif 8304 #if defined(INET6) && defined(INET) 8305 else 8306 #endif 8307 #ifdef INET 8308 { 8309 if (tp->t_port) { 8310 m->m_pkthdr.csum_flags = CSUM_UDP; 8311 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8312 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 8313 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 8314 th->th_sum = htons(0); 8315 } else { 8316 m->m_pkthdr.csum_flags = CSUM_TCP; 8317 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8318 th->th_sum = in_pseudo(ip->ip_src.s_addr, 8319 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 8320 IPPROTO_TCP + len + optlen)); 8321 } 8322 /* IP version must be set here for ipv4/ipv6 checking later */ 8323 KASSERT(ip->ip_v == IPVERSION, 8324 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 8325 } 8326 #endif 8327 8328 /* 8329 * Enable TSO and specify the size of the segments. The TCP pseudo 8330 * header checksum is always provided. XXX: Fixme: This is currently 8331 * not the case for IPv6. 
8332 */ 8333 if (tso) { 8334 KASSERT(len > tp->t_maxseg - optlen, 8335 ("%s: len <= tso_segsz", __func__)); 8336 m->m_pkthdr.csum_flags |= CSUM_TSO; 8337 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 8338 } 8339 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8340 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 8341 ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", 8342 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 8343 #else 8344 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 8345 ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", 8346 __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 8347 #endif 8348 8349 #ifdef TCP_HHOOK 8350 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 8351 hhook_run_tcp_est_out(tp, th, &to, len, tso); 8352 #endif 8353 8354 #ifdef TCPDEBUG 8355 /* 8356 * Trace. 8357 */ 8358 if (so->so_options & SO_DEBUG) { 8359 u_short save = 0; 8360 8361 #ifdef INET6 8362 if (!isipv6) 8363 #endif 8364 { 8365 save = ipov->ih_len; 8366 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 8367 * (th->th_off << 2) */ ); 8368 } 8369 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 8370 #ifdef INET6 8371 if (!isipv6) 8372 #endif 8373 ipov->ih_len = save; 8374 } 8375 #endif /* TCPDEBUG */ 8376 8377 /* We're getting ready to send; log now. */ 8378 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 8379 union tcp_log_stackspecific log; 8380 8381 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 8382 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 8383 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 8384 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 8385 if (rsm || sack_rxmit) { 8386 log.u_bbr.flex8 = 1; 8387 } else { 8388 log.u_bbr.flex8 = 0; 8389 } 8390 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 8391 len, &log, false, NULL, NULL, 0, NULL); 8392 } else 8393 lgb = NULL; 8394 8395 /* 8396 * Fill in IP length and desired time to live and send to IP level. 8397 * There should be a better way to handle ttl and tos; we could keep 8398 * them in the template, but need a way to checksum without them. 8399 */ 8400 /* 8401 * m->m_pkthdr.len should have been set before cksum calcuration, 8402 * because in6_cksum() need it. 8403 */ 8404 #ifdef INET6 8405 if (isipv6) { 8406 /* 8407 * we separately set hoplimit for every segment, since the 8408 * user might want to change the value via setsockopt. Also, 8409 * desired default hop limit might be changed via Neighbor 8410 * Discovery. 8411 */ 8412 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 8413 8414 /* 8415 * Set the packet size here for the benefit of DTrace 8416 * probes. ip6_output() will set it properly; it's supposed 8417 * to include the option header lengths as well. 8418 */ 8419 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 8420 8421 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 8422 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8423 else 8424 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8425 8426 if (tp->t_state == TCPS_SYN_SENT) 8427 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 8428 8429 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 8430 /* TODO: IPv6 IP6TOS_ECT bit on */ 8431 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 8432 &inp->inp_route6, 8433 ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 8434 NULL, NULL, inp); 8435 8436 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) 8437 mtu = inp->inp_route6.ro_rt->rt_mtu; 8438 } 8439 #endif /* INET6 */ 8440 #if defined(INET) && defined(INET6) 8441 else 8442 #endif 8443 #ifdef INET 8444 { 8445 ip->ip_len = htons(m->m_pkthdr.len); 8446 #ifdef INET6 8447 if (inp->inp_vflag & INP_IPV6PROTO) 8448 ip->ip_ttl = in6_selecthlim(inp, NULL); 8449 #endif /* INET6 */ 8450 /* 8451 * If we do path MTU discovery, then we set DF on every 8452 * packet. This might not be the best thing to do according 8453 * to RFC3390 Section 2. However the tcp hostcache migitates 8454 * the problem so it affects only the first tcp connection 8455 * with a host. 8456 * 8457 * NB: Don't set DF on small MTU/MSS to have a safe 8458 * fallback. 8459 */ 8460 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 8461 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8462 if (tp->t_port == 0 || len < V_tcp_minmss) { 8463 ip->ip_off |= htons(IP_DF); 8464 } 8465 } else { 8466 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8467 } 8468 8469 if (tp->t_state == TCPS_SYN_SENT) 8470 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 8471 8472 TCP_PROBE5(send, NULL, tp, ip, tp, th); 8473 8474 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 8475 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 8476 inp); 8477 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) 8478 mtu = inp->inp_route.ro_rt->rt_mtu; 8479 } 8480 #endif /* INET */ 8481 8482 out: 8483 if (lgb) { 8484 lgb->tlb_errno = error; 8485 lgb = NULL; 8486 } 8487 /* 8488 * In transmit state, time the transmission and arrange for the 8489 * retransmit. In persist state, just set snd_max. 8490 */ 8491 if (error == 0) { 8492 if (len == 0) 8493 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 8494 else if (len == 1) { 8495 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8496 } else if (len > 1) { 8497 int idx; 8498 8499 idx = (len / tp->t_maxseg) + 3; 8500 if (idx >= TCP_MSS_ACCT_ATIMER) 8501 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 8502 else 8503 counter_u64_add(rack_out_size[idx], 1); 8504 } 8505 } 8506 if (sub_from_prr && (error == 0)) { 8507 rack->r_ctl.rc_prr_sndcnt -= len; 8508 } 8509 sub_from_prr = 0; 8510 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 8511 pass, rsm); 8512 if ((tp->t_flags & TF_FORCEDATA) == 0 || 8513 (rack->rc_in_persist == 0)) { 8514 #ifdef NETFLIX_STATS 8515 tcp_seq startseq = tp->snd_nxt; 8516 #endif 8517 8518 /* 8519 * Advance snd_nxt over sequence space of this segment. 8520 */ 8521 if (error) 8522 /* We don't log or do anything with errors */ 8523 goto timer; 8524 8525 if (flags & (TH_SYN | TH_FIN)) { 8526 if (flags & TH_SYN) 8527 tp->snd_nxt++; 8528 if (flags & TH_FIN) { 8529 tp->snd_nxt++; 8530 tp->t_flags |= TF_SENTFIN; 8531 } 8532 } 8533 /* In the ENOBUFS case we do *not* update snd_max */ 8534 if (sack_rxmit) 8535 goto timer; 8536 8537 tp->snd_nxt += len; 8538 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 8539 if (tp->snd_una == tp->snd_max) { 8540 /* 8541 * Update the time we just added data since 8542 * none was outstanding. 
8543 */ 8544 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8545 tp->t_acktime = ticks; 8546 } 8547 tp->snd_max = tp->snd_nxt; 8548 #ifdef NETFLIX_STATS 8549 if (!(tp->t_flags & TF_GPUTINPROG) && len) { 8550 tp->t_flags |= TF_GPUTINPROG; 8551 tp->gput_seq = startseq; 8552 tp->gput_ack = startseq + 8553 ulmin(sbavail(sb) - sb_offset, sendwin); 8554 tp->gput_ts = tcp_ts_getticks(); 8555 } 8556 #endif 8557 } 8558 /* 8559 * Set retransmit timer if not currently set, and not doing 8560 * a pure ack or a keep-alive probe. Initial value for 8561 * retransmit timer is smoothed round-trip time + 2 * 8562 * round-trip time variance. Initialize shift counter which 8563 * is used for backoff of retransmit time. 8564 */ 8565 timer: 8566 if ((tp->snd_wnd == 0) && 8567 TCPS_HAVEESTABLISHED(tp->t_state)) { 8568 /* 8569 * If the persists timer was set above (right before 8570 * the goto send), and still needs to be on. Lets 8571 * make sure all is canceled. If the persist timer 8572 * is not running, we want to get it up. 8573 */ 8574 if (rack->rc_in_persist == 0) { 8575 rack_enter_persist(tp, rack, cts); 8576 } 8577 } 8578 } else { 8579 /* 8580 * Persist case, update snd_max but since we are in persist 8581 * mode (no window) we do not update snd_nxt. 8582 */ 8583 int32_t xlen = len; 8584 8585 if (error) 8586 goto nomore; 8587 8588 if (flags & TH_SYN) 8589 ++xlen; 8590 if (flags & TH_FIN) { 8591 ++xlen; 8592 tp->t_flags |= TF_SENTFIN; 8593 } 8594 /* In the ENOBUFS case we do *not* update snd_max */ 8595 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 8596 if (tp->snd_una == tp->snd_max) { 8597 /* 8598 * Update the time we just added data since 8599 * none was outstanding. 8600 */ 8601 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8602 tp->t_acktime = ticks; 8603 } 8604 tp->snd_max = tp->snd_nxt + len; 8605 } 8606 } 8607 nomore: 8608 if (error) { 8609 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 8610 /* 8611 * Failures do not advance the seq counter above. For the 8612 * case of ENOBUFS we will fall out and retry in 1ms with 8613 * the hpts. Everything else will just have to retransmit 8614 * with the timer. 8615 * 8616 * In any case, we do not want to loop around for another 8617 * send without a good reason. 8618 */ 8619 sendalot = 0; 8620 switch (error) { 8621 case EPERM: 8622 tp->t_flags &= ~TF_FORCEDATA; 8623 tp->t_softerror = error; 8624 return (error); 8625 case ENOBUFS: 8626 if (slot == 0) { 8627 /* 8628 * Pace us right away to retry in a some 8629 * time 8630 */ 8631 slot = 1 + rack->rc_enobuf; 8632 if (rack->rc_enobuf < 255) 8633 rack->rc_enobuf++; 8634 if (slot > (rack->rc_rack_rtt / 2)) { 8635 slot = rack->rc_rack_rtt / 2; 8636 } 8637 if (slot < 10) 8638 slot = 10; 8639 } 8640 counter_u64_add(rack_saw_enobuf, 1); 8641 error = 0; 8642 goto enobufs; 8643 case EMSGSIZE: 8644 /* 8645 * For some reason the interface we used initially 8646 * to send segments changed to another or lowered 8647 * its MTU. If TSO was active we either got an 8648 * interface without TSO capabilits or TSO was 8649 * turned off. If we obtained mtu from ip_output() 8650 * then update it and try again. 
8651 */ 8652 if (tso) 8653 tp->t_flags &= ~TF_TSO; 8654 if (mtu != 0) { 8655 tcp_mss_update(tp, -1, mtu, NULL, NULL); 8656 goto again; 8657 } 8658 slot = 10; 8659 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8660 tp->t_flags &= ~TF_FORCEDATA; 8661 return (error); 8662 case ENETUNREACH: 8663 counter_u64_add(rack_saw_enetunreach, 1); 8664 case EHOSTDOWN: 8665 case EHOSTUNREACH: 8666 case ENETDOWN: 8667 if (TCPS_HAVERCVDSYN(tp->t_state)) { 8668 tp->t_softerror = error; 8669 } 8670 /* FALLTHROUGH */ 8671 default: 8672 slot = 10; 8673 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8674 tp->t_flags &= ~TF_FORCEDATA; 8675 return (error); 8676 } 8677 } else { 8678 rack->rc_enobuf = 0; 8679 } 8680 TCPSTAT_INC(tcps_sndtotal); 8681 8682 /* 8683 * Data sent (as far as we can tell). If this advertises a larger 8684 * window than any other segment, then remember the size of the 8685 * advertised window. Any pending ACK has now been sent. 8686 */ 8687 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 8688 tp->rcv_adv = tp->rcv_nxt + recwin; 8689 tp->last_ack_sent = tp->rcv_nxt; 8690 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 8691 enobufs: 8692 rack->r_tlp_running = 0; 8693 if ((flags & TH_RST) || (would_have_fin == 1)) { 8694 /* 8695 * We don't send again after a RST. We also do *not* send 8696 * again if we would have had a find, but now have 8697 * outstanding data. 8698 */ 8699 slot = 0; 8700 sendalot = 0; 8701 } 8702 if (slot) { 8703 /* set the rack tcb into the slot N */ 8704 counter_u64_add(rack_paced_segments, 1); 8705 } else if (sendalot) { 8706 if (len) 8707 counter_u64_add(rack_unpaced_segments, 1); 8708 sack_rxmit = 0; 8709 tp->t_flags &= ~TF_FORCEDATA; 8710 goto again; 8711 } else if (len) { 8712 counter_u64_add(rack_unpaced_segments, 1); 8713 } 8714 tp->t_flags &= ~TF_FORCEDATA; 8715 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 8716 return (error); 8717 } 8718 8719 /* 8720 * rack_ctloutput() must drop the inpcb lock before performing copyin on 8721 * socket option arguments. When it re-acquires the lock after the copy, it 8722 * has to revalidate that the connection is still valid for the socket 8723 * option. 
8724 */ 8725 static int 8726 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 8727 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8728 { 8729 int32_t error = 0, optval; 8730 8731 switch (sopt->sopt_name) { 8732 case TCP_RACK_PROP_RATE: 8733 case TCP_RACK_PROP: 8734 case TCP_RACK_TLP_REDUCE: 8735 case TCP_RACK_EARLY_RECOV: 8736 case TCP_RACK_PACE_ALWAYS: 8737 case TCP_DELACK: 8738 case TCP_RACK_PACE_REDUCE: 8739 case TCP_RACK_PACE_MAX_SEG: 8740 case TCP_RACK_PRR_SENDALOT: 8741 case TCP_RACK_MIN_TO: 8742 case TCP_RACK_EARLY_SEG: 8743 case TCP_RACK_REORD_THRESH: 8744 case TCP_RACK_REORD_FADE: 8745 case TCP_RACK_TLP_THRESH: 8746 case TCP_RACK_PKT_DELAY: 8747 case TCP_RACK_TLP_USE: 8748 case TCP_RACK_TLP_INC_VAR: 8749 case TCP_RACK_IDLE_REDUCE_HIGH: 8750 case TCP_RACK_MIN_PACE: 8751 case TCP_RACK_MIN_PACE_SEG: 8752 case TCP_BBR_RACK_RTT_USE: 8753 case TCP_DATA_AFTER_CLOSE: 8754 break; 8755 default: 8756 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8757 break; 8758 } 8759 INP_WUNLOCK(inp); 8760 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 8761 if (error) 8762 return (error); 8763 INP_WLOCK(inp); 8764 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 8765 INP_WUNLOCK(inp); 8766 return (ECONNRESET); 8767 } 8768 tp = intotcpcb(inp); 8769 rack = (struct tcp_rack *)tp->t_fb_ptr; 8770 switch (sopt->sopt_name) { 8771 case TCP_RACK_PROP_RATE: 8772 if ((optval <= 0) || (optval >= 100)) { 8773 error = EINVAL; 8774 break; 8775 } 8776 RACK_OPTS_INC(tcp_rack_prop_rate); 8777 rack->r_ctl.rc_prop_rate = optval; 8778 break; 8779 case TCP_RACK_TLP_USE: 8780 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 8781 error = EINVAL; 8782 break; 8783 } 8784 RACK_OPTS_INC(tcp_tlp_use); 8785 rack->rack_tlp_threshold_use = optval; 8786 break; 8787 case TCP_RACK_PROP: 8788 /* RACK proportional rate reduction (bool) */ 8789 RACK_OPTS_INC(tcp_rack_prop); 8790 rack->r_ctl.rc_prop_reduce = optval; 8791 break; 8792 case TCP_RACK_TLP_REDUCE: 8793 /* RACK TLP cwnd reduction (bool) */ 8794 RACK_OPTS_INC(tcp_rack_tlp_reduce); 8795 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 8796 break; 8797 case TCP_RACK_EARLY_RECOV: 8798 /* Should recovery happen early (bool) */ 8799 RACK_OPTS_INC(tcp_rack_early_recov); 8800 rack->r_ctl.rc_early_recovery = optval; 8801 break; 8802 case TCP_RACK_PACE_ALWAYS: 8803 /* Use the always pace method (bool) */ 8804 RACK_OPTS_INC(tcp_rack_pace_always); 8805 if (optval > 0) 8806 rack->rc_always_pace = 1; 8807 else 8808 rack->rc_always_pace = 0; 8809 break; 8810 case TCP_RACK_PACE_REDUCE: 8811 /* RACK Hptsi reduction factor (divisor) */ 8812 RACK_OPTS_INC(tcp_rack_pace_reduce); 8813 if (optval) 8814 /* Must be non-zero */ 8815 rack->rc_pace_reduce = optval; 8816 else 8817 error = EINVAL; 8818 break; 8819 case TCP_RACK_PACE_MAX_SEG: 8820 /* Max segments in a pace */ 8821 RACK_OPTS_INC(tcp_rack_max_seg); 8822 rack->rc_pace_max_segs = optval; 8823 break; 8824 case TCP_RACK_PRR_SENDALOT: 8825 /* Allow PRR to send more than one seg */ 8826 RACK_OPTS_INC(tcp_rack_prr_sendalot); 8827 rack->r_ctl.rc_prr_sendalot = optval; 8828 break; 8829 case TCP_RACK_MIN_TO: 8830 /* Minimum time between rack t-o's in ms */ 8831 RACK_OPTS_INC(tcp_rack_min_to); 8832 rack->r_ctl.rc_min_to = optval; 8833 break; 8834 case TCP_RACK_EARLY_SEG: 8835 /* If early recovery max segments */ 8836 RACK_OPTS_INC(tcp_rack_early_seg); 8837 rack->r_ctl.rc_early_recovery_segs = optval; 8838 break; 8839 case TCP_RACK_REORD_THRESH: 8840 /* RACK reorder threshold (shift amount) */ 8841 
		RACK_OPTS_INC(tcp_rack_reord_thresh);
		if ((optval > 0) && (optval < 31))
			rack->r_ctl.rc_reorder_shift = optval;
		else
			error = EINVAL;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		RACK_OPTS_INC(tcp_rack_reord_fade);
		rack->r_ctl.rc_reorder_fade = optval;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		RACK_OPTS_INC(tcp_rack_tlp_thresh);
		if (optval)
			rack->r_ctl.rc_tlp_threshold = optval;
		else
			error = EINVAL;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		RACK_OPTS_INC(tcp_rack_pkt_delay);
		rack->r_ctl.rc_pkt_delay = optval;
		break;
	case TCP_RACK_TLP_INC_VAR:
		/* Does TLP include rtt variance in t-o */
		RACK_OPTS_INC(tcp_rack_tlp_inc_var);
		rack->r_ctl.rc_prr_inc_var = optval;
		break;
	case TCP_RACK_IDLE_REDUCE_HIGH:
		RACK_OPTS_INC(tcp_rack_idle_reduce_high);
		if (optval)
			rack->r_idle_reduce_largest = 1;
		else
			rack->r_idle_reduce_largest = 0;
		break;
	case TCP_DELACK:
		if (optval == 0)
			tp->t_delayed_ack = 0;
		else
			tp->t_delayed_ack = 1;
		if (tp->t_flags & TF_DELACK) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_flags |= TF_ACKNOW;
			rack_output(tp);
		}
		break;
	case TCP_RACK_MIN_PACE:
		RACK_OPTS_INC(tcp_rack_min_pace);
		if (optval > 3)
			rack->r_enforce_min_pace = 3;
		else
			rack->r_enforce_min_pace = optval;
		break;
	case TCP_RACK_MIN_PACE_SEG:
		RACK_OPTS_INC(tcp_rack_min_pace_seg);
		if (optval >= 16)
			rack->r_min_pace_seg_thresh = 15;
		else
			rack->r_min_pace_seg_thresh = optval;
		break;
	case TCP_BBR_RACK_RTT_USE:
		if ((optval != USE_RTT_HIGH) &&
		    (optval != USE_RTT_LOW) &&
		    (optval != USE_RTT_AVG))
			error = EINVAL;
		else
			rack->r_ctl.rc_rate_sample_method = optval;
		break;
	case TCP_DATA_AFTER_CLOSE:
		if (optval)
			rack->rc_allow_data_af_clo = 1;
		else
			rack->rc_allow_data_af_clo = 0;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
#ifdef NETFLIX_STATS
	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
#endif
	INP_WUNLOCK(inp);
	return (error);
}

static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	int32_t error, optval;

	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
8938 */ 8939 switch (sopt->sopt_name) { 8940 case TCP_RACK_PROP_RATE: 8941 optval = rack->r_ctl.rc_prop_rate; 8942 break; 8943 case TCP_RACK_PROP: 8944 /* RACK proportional rate reduction (bool) */ 8945 optval = rack->r_ctl.rc_prop_reduce; 8946 break; 8947 case TCP_RACK_TLP_REDUCE: 8948 /* RACK TLP cwnd reduction (bool) */ 8949 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 8950 break; 8951 case TCP_RACK_EARLY_RECOV: 8952 /* Should recovery happen early (bool) */ 8953 optval = rack->r_ctl.rc_early_recovery; 8954 break; 8955 case TCP_RACK_PACE_REDUCE: 8956 /* RACK Hptsi reduction factor (divisor) */ 8957 optval = rack->rc_pace_reduce; 8958 break; 8959 case TCP_RACK_PACE_MAX_SEG: 8960 /* Max segments in a pace */ 8961 optval = rack->rc_pace_max_segs; 8962 break; 8963 case TCP_RACK_PACE_ALWAYS: 8964 /* Use the always pace method */ 8965 optval = rack->rc_always_pace; 8966 break; 8967 case TCP_RACK_PRR_SENDALOT: 8968 /* Allow PRR to send more than one seg */ 8969 optval = rack->r_ctl.rc_prr_sendalot; 8970 break; 8971 case TCP_RACK_MIN_TO: 8972 /* Minimum time between rack t-o's in ms */ 8973 optval = rack->r_ctl.rc_min_to; 8974 break; 8975 case TCP_RACK_EARLY_SEG: 8976 /* If early recovery max segments */ 8977 optval = rack->r_ctl.rc_early_recovery_segs; 8978 break; 8979 case TCP_RACK_REORD_THRESH: 8980 /* RACK reorder threshold (shift amount) */ 8981 optval = rack->r_ctl.rc_reorder_shift; 8982 break; 8983 case TCP_RACK_REORD_FADE: 8984 /* Does reordering fade after ms time */ 8985 optval = rack->r_ctl.rc_reorder_fade; 8986 break; 8987 case TCP_RACK_TLP_THRESH: 8988 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 8989 optval = rack->r_ctl.rc_tlp_threshold; 8990 break; 8991 case TCP_RACK_PKT_DELAY: 8992 /* RACK added ms i.e. rack-rtt + reord + N */ 8993 optval = rack->r_ctl.rc_pkt_delay; 8994 break; 8995 case TCP_RACK_TLP_USE: 8996 optval = rack->rack_tlp_threshold_use; 8997 break; 8998 case TCP_RACK_TLP_INC_VAR: 8999 /* Does TLP include rtt variance in t-o */ 9000 optval = rack->r_ctl.rc_prr_inc_var; 9001 break; 9002 case TCP_RACK_IDLE_REDUCE_HIGH: 9003 optval = rack->r_idle_reduce_largest; 9004 break; 9005 case TCP_RACK_MIN_PACE: 9006 optval = rack->r_enforce_min_pace; 9007 break; 9008 case TCP_RACK_MIN_PACE_SEG: 9009 optval = rack->r_min_pace_seg_thresh; 9010 break; 9011 case TCP_BBR_RACK_RTT_USE: 9012 optval = rack->r_ctl.rc_rate_sample_method; 9013 break; 9014 case TCP_DELACK: 9015 optval = tp->t_delayed_ack; 9016 break; 9017 case TCP_DATA_AFTER_CLOSE: 9018 optval = rack->rc_allow_data_af_clo; 9019 break; 9020 default: 9021 return (tcp_default_ctloutput(so, sopt, inp, tp)); 9022 break; 9023 } 9024 INP_WUNLOCK(inp); 9025 error = sooptcopyout(sopt, &optval, sizeof optval); 9026 return (error); 9027 } 9028 9029 static int 9030 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 9031 { 9032 int32_t error = EINVAL; 9033 struct tcp_rack *rack; 9034 9035 rack = (struct tcp_rack *)tp->t_fb_ptr; 9036 if (rack == NULL) { 9037 /* Huh? 
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		/* Huh? */
		goto out;
	}
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(so, sopt, inp, tp, rack));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(so, sopt, inp, tp, rack));
	}
out:
	INP_WUNLOCK(inp);
	return (error);
}


struct tcp_function_block __tcp_rack = {
	.tfb_tcp_block_name = __XSTRING(STACKNAME),
	.tfb_tcp_output = rack_output,
	.tfb_tcp_do_segment = rack_do_segment,
	.tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
	.tfb_tcp_ctloutput = rack_ctloutput,
	.tfb_tcp_fb_init = rack_init,
	.tfb_tcp_fb_fini = rack_fini,
	.tfb_tcp_timer_stop_all = rack_stopall,
	.tfb_tcp_timer_activate = rack_timer_activate,
	.tfb_tcp_timer_active = rack_timer_active,
	.tfb_tcp_timer_stop = rack_timer_stop,
	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
	.tfb_tcp_handoff_ok = rack_handoff_ok
};

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
		    __XSTRING(STACKNAME),
		    CTLFLAG_RW, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
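/*
 * Usage note (not compiled): once this module is built and loaded, the stack
 * registered above is selected either system-wide or per socket.  The module
 * file name and sysctl spellings below reflect the usual FreeBSD arrangement
 * for alternate TCP stacks and should be read as an assumption for this tree,
 * not a guarantee:
 *
 *	# kldload tcp_rack
 *	# sysctl net.inet.tcp.functions_available	(should now list "rack")
 *	# sysctl net.inet.tcp.functions_default=rack	(use it for new sockets)
 *
 * Per socket, before the connection is established, the stack can be chosen
 * with the TCP_FUNCTION_BLK socket option:
 *
 *	struct tcp_function_set tfs;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 */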