/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef NETFLIX_STATS
#include <sys/stats.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#ifdef NETFLIX_CWV
#include <netinet/tcp_newcwv.h>
#endif
#include <netinet/tcp_fastopen.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC 4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_precache = 1;
static int32_t rack_tlp_thresh = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
static int32_t rack_pkt_delay = 1;
static int32_t rack_inc_var = 0;	/* For TLP */
static int32_t rack_reduce_largest_on_idle = 0;
static int32_t rack_min_pace_time = 0;
static int32_t rack_min_pace_time_seg_req = 6;
static int32_t rack_early_recovery = 1;
static int32_t rack_early_recovery_max_seg = 6;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
/*
 * Currently regular tcp has a rto_min of 30ms; the backoff goes 12
 * times, so that ends up being a total of 122.850 seconds before a
 * connection is killed.
 */
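/*
 * (A worked version of that arithmetic, assuming the backoff is treated
 *  as a pure doubling series across all 12 retransmits:
 *  30ms * (2^0 + 2^1 + ... + 2^11) = 30ms * 4095 = 122,850ms ~= 122.85s.)
 */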
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 30000;	/* 30 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 1;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_sack_block_limit = 128;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_runt_sacks;
counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_tlp_does_nada;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
static int32_t rack_output(struct tcpcb *tp);
static void
rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static void
rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t * ret_val);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_do_drop(struct mbuf *m, struct tcpcb *tp);
static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
    struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
    struct tcphdr *th, int32_t rstreason, int32_t tlen);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_drop_checks(struct tcpopt *to, struct mbuf *m,
    struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
    int32_t * drop_hdrlen, int32_t * ret_val);
static int
rack_process_rst(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

static int
rack_ts_check(struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);

int32_t rack_clear_counter = 0;


static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_find_high);
		counter_u64_zero(rack_runt_sacks);
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
	}
	rack_clear_counter = 0;
	return (0);
}
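
/*
 * Usage note (a sketch, not part of the handler above): writing 1 to the
 * "clear" OID registered below zeroes every RACK counter, e.g.
 *     sysctl net.inet.tcp.rack.clear=1
 * assuming rack_sysctl_root ends up attached under net.inet.tcp.rack; the
 * exact path depends on where the stack registration hangs the root node.
 * Reads always report 0 since the handler resets rack_clear_counter.
 */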

static void
rack_init_sysctls()
{
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low ");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "data_after_close", CTLFLAG_RW,
	    &rack_ignore_data_after_close, 0,
	    "Do we hold off sending a RST until all pending data is ack'd");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "min_pace_time", CTLFLAG_RW,
	    &rack_min_pace_time, 0,
	    "Should we enforce a minimum pace time of 1ms");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "min_pace_segs", CTLFLAG_RW,
	    &rack_min_pace_time_seg_req, 6,
	    "How many segments have to be in the len to enforce min-pace-time");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
	    &rack_reduce_largest_on_idle, 0,
	    "Should we reduce the largest cwnd seen to IW on idle reduction");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
	    &rack_verbose_logging, 0,
	    "Should RACK black box logging be verbose");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sackfiltering", CTLFLAG_RW,
	    &rack_use_sack_filter, 1,
	    "Do we use sack filtering?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
	    &rack_delayed_ack_time, 200,
	    "Delayed ack time (200ms)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlpminto", CTLFLAG_RW,
	    &rack_tlp_min, 10,
	    "TLP minimum timeout per the specification (10ms)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "precache", CTLFLAG_RW,
	    &rack_precache, 0,
	    "Where should we precache the mcopy (0 is not at all)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sblklimit", CTLFLAG_RW,
	    &rack_sack_block_limit, 128,
	    "When do we start paying attention to small sack blocks");
blocks"); 524 SYSCTL_ADD_S32(&rack_sysctl_ctx, 525 SYSCTL_CHILDREN(rack_sysctl_root), 526 OID_AUTO, "send_oldest", CTLFLAG_RW, 527 &rack_always_send_oldest, 1, 528 "Should we always send the oldest TLP and RACK-TLP"); 529 SYSCTL_ADD_S32(&rack_sysctl_ctx, 530 SYSCTL_CHILDREN(rack_sysctl_root), 531 OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, 532 &rack_tlp_in_recovery, 1, 533 "Can we do a TLP during recovery?"); 534 SYSCTL_ADD_S32(&rack_sysctl_ctx, 535 SYSCTL_CHILDREN(rack_sysctl_root), 536 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 537 &rack_limited_retran, 0, 538 "How many times can a rack timeout drive out sends"); 539 SYSCTL_ADD_S32(&rack_sysctl_ctx, 540 SYSCTL_CHILDREN(rack_sysctl_root), 541 OID_AUTO, "minrto", CTLFLAG_RW, 542 &rack_rto_min, 0, 543 "Minimum RTO in ms -- set with caution below 1000 due to TLP"); 544 SYSCTL_ADD_S32(&rack_sysctl_ctx, 545 SYSCTL_CHILDREN(rack_sysctl_root), 546 OID_AUTO, "maxrto", CTLFLAG_RW, 547 &rack_rto_max, 0, 548 "Maxiumum RTO in ms -- should be at least as large as min_rto"); 549 SYSCTL_ADD_S32(&rack_sysctl_ctx, 550 SYSCTL_CHILDREN(rack_sysctl_root), 551 OID_AUTO, "tlp_retry", CTLFLAG_RW, 552 &rack_tlp_max_resend, 2, 553 "How many times does TLP retry a single segment or multiple with no ACK"); 554 SYSCTL_ADD_S32(&rack_sysctl_ctx, 555 SYSCTL_CHILDREN(rack_sysctl_root), 556 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, 557 &rack_use_proportional_reduce, 0, 558 "Should we proportionaly reduce cwnd based on the number of losses "); 559 SYSCTL_ADD_S32(&rack_sysctl_ctx, 560 SYSCTL_CHILDREN(rack_sysctl_root), 561 OID_AUTO, "recovery_prop", CTLFLAG_RW, 562 &rack_proportional_rate, 10, 563 "What percent reduction per loss"); 564 SYSCTL_ADD_S32(&rack_sysctl_ctx, 565 SYSCTL_CHILDREN(rack_sysctl_root), 566 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 567 &rack_lower_cwnd_at_tlp, 0, 568 "When a TLP completes a retran should we enter recovery?"); 569 SYSCTL_ADD_S32(&rack_sysctl_ctx, 570 SYSCTL_CHILDREN(rack_sysctl_root), 571 OID_AUTO, "hptsi_reduces", CTLFLAG_RW, 572 &rack_slot_reduction, 4, 573 "When setting a slot should we reduce by divisor"); 574 SYSCTL_ADD_S32(&rack_sysctl_ctx, 575 SYSCTL_CHILDREN(rack_sysctl_root), 576 OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, 577 &rack_pace_every_seg, 1, 578 "Should we pace out every segment hptsi"); 579 SYSCTL_ADD_S32(&rack_sysctl_ctx, 580 SYSCTL_CHILDREN(rack_sysctl_root), 581 OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, 582 &rack_hptsi_segments, 6, 583 "Should we pace out only a limited size of segments"); 584 SYSCTL_ADD_S32(&rack_sysctl_ctx, 585 SYSCTL_CHILDREN(rack_sysctl_root), 586 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 587 &rack_send_a_lot_in_prr, 1, 588 "Send a lot in prr"); 589 SYSCTL_ADD_S32(&rack_sysctl_ctx, 590 SYSCTL_CHILDREN(rack_sysctl_root), 591 OID_AUTO, "minto", CTLFLAG_RW, 592 &rack_min_to, 1, 593 "Minimum rack timeout in milliseconds"); 594 SYSCTL_ADD_S32(&rack_sysctl_ctx, 595 SYSCTL_CHILDREN(rack_sysctl_root), 596 OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, 597 &rack_early_recovery_max_seg, 6, 598 "Max segments in early recovery"); 599 SYSCTL_ADD_S32(&rack_sysctl_ctx, 600 SYSCTL_CHILDREN(rack_sysctl_root), 601 OID_AUTO, "earlyrecovery", CTLFLAG_RW, 602 &rack_early_recovery, 1, 603 "Do we do early recovery with rack"); 604 SYSCTL_ADD_S32(&rack_sysctl_ctx, 605 SYSCTL_CHILDREN(rack_sysctl_root), 606 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 607 &rack_reorder_thresh, 2, 608 "What factor for rack will be added when seeing reordering (shift right)"); 609 SYSCTL_ADD_S32(&rack_sysctl_ctx, 610 SYSCTL_CHILDREN(rack_sysctl_root), 611 
OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 612 &rack_tlp_thresh, 1, 613 "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 614 SYSCTL_ADD_S32(&rack_sysctl_ctx, 615 SYSCTL_CHILDREN(rack_sysctl_root), 616 OID_AUTO, "reorder_fade", CTLFLAG_RW, 617 &rack_reorder_fade, 0, 618 "Does reorder detection fade, if so how many ms (0 means never)"); 619 SYSCTL_ADD_S32(&rack_sysctl_ctx, 620 SYSCTL_CHILDREN(rack_sysctl_root), 621 OID_AUTO, "pktdelay", CTLFLAG_RW, 622 &rack_pkt_delay, 1, 623 "Extra RACK time (in ms) besides reordering thresh"); 624 SYSCTL_ADD_S32(&rack_sysctl_ctx, 625 SYSCTL_CHILDREN(rack_sysctl_root), 626 OID_AUTO, "inc_var", CTLFLAG_RW, 627 &rack_inc_var, 0, 628 "Should rack add to the TLP timer the variance in rtt calculation"); 629 rack_badfr = counter_u64_alloc(M_WAITOK); 630 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 631 SYSCTL_CHILDREN(rack_sysctl_root), 632 OID_AUTO, "badfr", CTLFLAG_RD, 633 &rack_badfr, "Total number of bad FRs"); 634 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 635 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 636 SYSCTL_CHILDREN(rack_sysctl_root), 637 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 638 &rack_badfr_bytes, "Total number of bad FRs"); 639 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 640 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 641 SYSCTL_CHILDREN(rack_sysctl_root), 642 OID_AUTO, "prrsndret", CTLFLAG_RD, 643 &rack_rtm_prr_retran, 644 "Total number of prr based retransmits"); 645 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 646 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 647 SYSCTL_CHILDREN(rack_sysctl_root), 648 OID_AUTO, "prrsndnew", CTLFLAG_RD, 649 &rack_rtm_prr_newdata, 650 "Total number of prr based new transmits"); 651 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 652 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 653 SYSCTL_CHILDREN(rack_sysctl_root), 654 OID_AUTO, "tsnf", CTLFLAG_RD, 655 &rack_timestamp_mismatch, 656 "Total number of timestamps that we could not find the reported ts"); 657 rack_find_high = counter_u64_alloc(M_WAITOK); 658 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 659 SYSCTL_CHILDREN(rack_sysctl_root), 660 OID_AUTO, "findhigh", CTLFLAG_RD, 661 &rack_find_high, 662 "Total number of FIN causing find-high"); 663 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 664 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 665 SYSCTL_CHILDREN(rack_sysctl_root), 666 OID_AUTO, "reordering", CTLFLAG_RD, 667 &rack_reorder_seen, 668 "Total number of times we added delay due to reordering"); 669 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 670 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 671 SYSCTL_CHILDREN(rack_sysctl_root), 672 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 673 &rack_tlp_tot, 674 "Total number of tail loss probe expirations"); 675 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 676 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 677 SYSCTL_CHILDREN(rack_sysctl_root), 678 OID_AUTO, "tlp_new", CTLFLAG_RD, 679 &rack_tlp_newdata, 680 "Total number of tail loss probe sending new data"); 681 682 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 683 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 684 SYSCTL_CHILDREN(rack_sysctl_root), 685 OID_AUTO, "tlp_retran", CTLFLAG_RD, 686 &rack_tlp_retran, 687 "Total number of tail loss probe sending retransmitted data"); 688 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 689 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 690 SYSCTL_CHILDREN(rack_sysctl_root), 691 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 692 &rack_tlp_retran_bytes, 693 "Total bytes of tail loss probe sending retransmitted data"); 694 
	rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
	    &rack_tlp_retran_fail,
	    "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
	rack_to_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
	    &rack_to_tot,
	    "Total number of times the rack timeout expired");
	rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "arm_rack", CTLFLAG_RD,
	    &rack_to_arm_rack,
	    "Total number of times the rack timer was armed");
	rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "arm_tlp", CTLFLAG_RD,
	    &rack_to_arm_tlp,
	    "Total number of times the tlp timer was armed");
	rack_paced_segments = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "paced", CTLFLAG_RD,
	    &rack_paced_segments,
	    "Total number of times a segment send caused hptsi");
	rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "unpaced", CTLFLAG_RD,
	    &rack_unpaced_segments,
	    "Total number of times a segment did not cause hptsi");
	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
	    &rack_saw_enobuf,
	    "Total number of times we saw ENOBUFS");
	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
	    &rack_saw_enetunreach,
	    "Total number of times we saw ENETUNREACH");
	rack_to_alloc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allocs", CTLFLAG_RD,
	    &rack_to_alloc,
	    "Total allocations of tracking structures");
	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allochard", CTLFLAG_RD,
	    &rack_to_alloc_hard,
	    "Total allocations done with sleeping the hard way");
	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allocemerg", CTLFLAG_RD,
	    &rack_to_alloc_emerg,
	    "Total allocations done from emergency cache");
	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_long", CTLFLAG_RD,
	    &rack_sack_proc_all,
	    "Total times we had to walk whole list for sack processing");

	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_restart", CTLFLAG_RD,
	    &rack_sack_proc_restart,
	    "Total times we had to walk whole list due to a restart");
	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_short", CTLFLAG_RD,
	    &rack_sack_proc_short,
	    "Total times we took shortcut for sack processing");
"sack_short", CTLFLAG_RD, 777 &rack_sack_proc_short, 778 "Total times we took shortcut for sack processing"); 779 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 781 SYSCTL_CHILDREN(rack_sysctl_root), 782 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 783 &rack_enter_tlp_calc, 784 "Total times we called calc-tlp"); 785 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 787 SYSCTL_CHILDREN(rack_sysctl_root), 788 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 789 &rack_used_tlpmethod, 790 "Total number of runt sacks"); 791 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 793 SYSCTL_CHILDREN(rack_sysctl_root), 794 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 795 &rack_used_tlpmethod2, 796 "Total number of runt sacks 2"); 797 rack_runt_sacks = counter_u64_alloc(M_WAITOK); 798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 799 SYSCTL_CHILDREN(rack_sysctl_root), 800 OID_AUTO, "runtsacks", CTLFLAG_RD, 801 &rack_runt_sacks, 802 "Total number of runt sacks"); 803 rack_progress_drops = counter_u64_alloc(M_WAITOK); 804 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 805 SYSCTL_CHILDREN(rack_sysctl_root), 806 OID_AUTO, "prog_drops", CTLFLAG_RD, 807 &rack_progress_drops, 808 "Total number of progress drops"); 809 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 810 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 811 SYSCTL_CHILDREN(rack_sysctl_root), 812 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 813 &rack_input_idle_reduces, 814 "Total number of idle reductions on input"); 815 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 816 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 817 SYSCTL_CHILDREN(rack_sysctl_root), 818 OID_AUTO, "tlp_nada", CTLFLAG_RD, 819 &rack_tlp_does_nada, 820 "Total number of nada tlp calls"); 821 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 822 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 823 OID_AUTO, "outsize", CTLFLAG_RD, 824 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 825 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 826 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 827 OID_AUTO, "opts", CTLFLAG_RD, 828 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 829 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 830 SYSCTL_CHILDREN(rack_sysctl_root), 831 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 832 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 833 } 834 835 static inline int32_t 836 rack_progress_timeout_check(struct tcpcb *tp) 837 { 838 if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { 839 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { 840 /* 841 * There is an assumption that the caller 842 * will drop the connection so we will 843 * increment the counters here. 
static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
			/*
			 * There is an assumption that the caller
			 * will drop the connection so we will
			 * increment the counters here.
			 */
			struct tcp_rack *rack;
			rack = (struct tcp_rack *)tp->t_fb_ptr;
			counter_u64_add(rack_progress_drops, 1);
#ifdef NETFLIX_STATS
			TCPSTAT_INC(tcps_progdrops);
#endif
			rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
			return (1);
		}
	}
	return (0);
}


static void
rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
		log.u_bbr.flex2 = to;
		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex4 = slot;
		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
		log.u_bbr.flex8 = which;
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TIMERSTAR, 0,
		    0, &log, false);
	}
}

static void
rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex8 = to_num;
		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
		log.u_bbr.flex2 = rack->rc_rack_rtt;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_RTO, 0,
		    0, &log, false);
	}
}

static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
    uint32_t o_srtt, uint32_t o_var)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = t;
		log.u_bbr.flex2 = o_srtt;
		log.u_bbr.flex3 = o_var;
		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
		log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
		TCP_LOG_EVENT(tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_BBRRTT, 0,
		    0, &log, false);
	}
}

static void
rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
{
	/*
	 * Log the rtt sample we are
	 * applying to the srtt algorithm in
	 * useconds.
	 */
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		/* Convert our ms to a microsecond */
		log.u_bbr.flex1 = rtt * 1000;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_LOG_RTT, 0,
		    0, &log, false, &tv);
	}
}


static inline void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
{
	if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = line;
		log.u_bbr.flex2 = tick;
		log.u_bbr.flex3 = tp->t_maxunacktime;
		log.u_bbr.flex4 = tp->t_acktime;
		log.u_bbr.flex8 = event;
		TCP_LOG_EVENT(tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_PROGRESS, 0,
		    0, &log, false);
	}
}

static void
rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = slot;
		log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
		log.u_bbr.flex8 = rack->rc_in_persist;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_BBRSND, 0,
		    0, &log, false);
	}
}

static void
rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.flex1 = did_out;
		log.u_bbr.flex2 = nxt_pkt;
		log.u_bbr.flex3 = way_out;
		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex7 = rack->r_wanted_output;
		log.u_bbr.flex8 = rack->rc_in_persist;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_DOSEG_DONE, 0,
		    0, &log, false);
	}
}


static void
rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = slot;
		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex7 = hpts_calling;
		log.u_bbr.flex8 = rack->rc_in_persist;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_JUSTRET, 0,
		    tlen, &log, false);
	}
}

static void
rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = line;
		log.u_bbr.flex2 = 0;
		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex4 = 0;
		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
		log.u_bbr.flex8 = hpts_removed;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TIMERCANC, 0,
		    0, &log, false);
	}
}

static void
rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.flex1 = timers;
		log.u_bbr.flex2 = ret;
		log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex5 = cts;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TO_PROCESS, 0,
		    0, &log, false);
	}
}

static void
rack_counter_destroy()
{
	counter_u64_free(rack_badfr);
	counter_u64_free(rack_badfr_bytes);
	counter_u64_free(rack_rtm_prr_retran);
	counter_u64_free(rack_rtm_prr_newdata);
	counter_u64_free(rack_timestamp_mismatch);
	counter_u64_free(rack_reorder_seen);
	counter_u64_free(rack_tlp_tot);
	counter_u64_free(rack_tlp_newdata);
	counter_u64_free(rack_tlp_retran);
	counter_u64_free(rack_tlp_retran_bytes);
	counter_u64_free(rack_tlp_retran_fail);
	counter_u64_free(rack_to_tot);
	counter_u64_free(rack_to_arm_rack);
	counter_u64_free(rack_to_arm_tlp);
	counter_u64_free(rack_paced_segments);
	counter_u64_free(rack_unpaced_segments);
	counter_u64_free(rack_saw_enobuf);
	counter_u64_free(rack_saw_enetunreach);
	counter_u64_free(rack_to_alloc_hard);
	counter_u64_free(rack_to_alloc_emerg);
	counter_u64_free(rack_sack_proc_all);
	counter_u64_free(rack_sack_proc_short);
	counter_u64_free(rack_sack_proc_restart);
	counter_u64_free(rack_to_alloc);
	counter_u64_free(rack_find_high);
	counter_u64_free(rack_runt_sacks);
	counter_u64_free(rack_enter_tlp_calc);
	counter_u64_free(rack_used_tlpmethod);
	counter_u64_free(rack_used_tlpmethod2);
	counter_u64_free(rack_progress_drops);
	counter_u64_free(rack_input_idle_reduces);
	counter_u64_free(rack_tlp_does_nada);
	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
}

static struct rack_sendmap *
rack_alloc(struct tcp_rack *rack)
{
	struct rack_sendmap *rsm;

	counter_u64_add(rack_to_alloc, 1);
	rack->r_ctl.rc_num_maps_alloced++;
	rsm = uma_zalloc(rack_zone, M_NOWAIT);
	if (rsm) {
		return (rsm);
	}
	if (rack->rc_free_cnt) {
		counter_u64_add(rack_to_alloc_emerg, 1);
		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
		rack->rc_free_cnt--;
		return (rsm);
	}
	return (NULL);
}
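
/*
 * Note on the allocator pair: rack_free() below feeds the small
 * per-connection rc_free list (capped at rack_free_cache entries) that
 * rack_alloc() above falls back on when uma_zalloc(..., M_NOWAIT) fails,
 * so an allocation failure can often still be satisfied from recently
 * freed entries.
 */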
static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
	rack->r_ctl.rc_num_maps_alloced--;
	if (rack->r_ctl.rc_tlpsend == rsm)
		rack->r_ctl.rc_tlpsend = NULL;
	if (rack->r_ctl.rc_next == rsm)
		rack->r_ctl.rc_next = NULL;
	if (rack->r_ctl.rc_sacklast == rsm)
		rack->r_ctl.rc_sacklast = NULL;
	if (rack->rc_free_cnt < rack_free_cache) {
		memset(rsm, 0, sizeof(struct rack_sendmap));
		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
		rack->rc_free_cnt++;
		return;
	}
	uma_zfree(rack_zone, rsm);
}

/*
 * CC wrapper hook functions
 */
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
    uint16_t type, int32_t recovery)
{
#ifdef NETFLIX_STATS
	int32_t gput;
#endif
#ifdef NETFLIX_CWV
	u_long old_cwnd = tp->snd_cwnd;
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);
	tp->ccv->nsegs = nsegs;
	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
	if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
		uint32_t max;

		max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
		if (tp->ccv->bytes_this_ack > max) {
			tp->ccv->bytes_this_ack = max;
		}
	}
	if (tp->snd_cwnd <= tp->snd_wnd)
		tp->ccv->flags |= CCF_CWND_LIMITED;
	else
		tp->ccv->flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef NETFLIX_STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an
			 * API to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
#ifdef NETFLIX_CWV
			if (tp->t_maxpeakrate) {
				/*
				 * We update t_peakrate_thr. This gives us roughly
				 * one update per round trip time.
				 */
				tcp_update_peakrate_thr(tp);
			}
#endif
		}
#endif
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
			    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->ccv->flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}
	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->ack_received(tp->ccv, type);
	}
#ifdef NETFLIX_STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
	if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
		rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
	}
#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		/*
		 * Per RFC 7661: The behaviour in the non-validated phase is
		 * specified as: o A sender determines whether to increase
		 * the cwnd based upon whether it is cwnd-limited (see
		 * Section 4.5.3): * A sender that is cwnd-limited MAY use
		 * the standard TCP method to increase cwnd (i.e., the
		 * standard method permits a TCP sender that fully utilises
		 * the cwnd to increase the cwnd each time it receives an
		 * ACK). * A sender that is not cwnd-limited MUST NOT
		 * increase the cwnd when ACK packets are received in this
		 * phase (i.e., needs to avoid growing the cwnd when it has
		 * not recently sent using the current size of cwnd).
		 */
		if ((tp->snd_cwnd > old_cwnd) &&
		    (tp->cwv_cwnd_valid == 0) &&
		    (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
			tp->snd_cwnd = old_cwnd;
		}
		/* Try to update pipeAck and NCWV state */
		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
		    !IN_RECOVERY(tp->t_flags)) {
			uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));

			tcp_newcwv_update_pipeack(tp, data);
		}
	}
	/* we enforce max peak rate if it is set. */
	if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
		tp->snd_cwnd = tp->t_peakrate_thr;
	}
#endif
}

static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	INP_WLOCK_ASSERT(tp->t_inpcb);
	if (rack->r_ctl.rc_prr_sndcnt > 0)
		rack->r_wanted_output++;
}

static void
rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	struct tcp_rack *rack;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (CC_ALGO(tp)->post_recovery != NULL) {
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(tp->ccv);
	}
	/*
	 * Here we can in theory adjust cwnd to be based on the number of
	 * losses in the window (rack->r_ctl.rc_loss_count). This is done
	 * based on the rack_use_proportional flag.
	 */
	if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
		int32_t reduce;

		reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
		if (reduce > 50) {
			reduce = 50;
		}
		tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
	} else {
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			/* Drop us down to the ssthresh (1/2 cwnd at loss) */
			tp->snd_cwnd = tp->snd_ssthresh;
		}
	}
	if (rack->r_ctl.rc_prr_sndcnt > 0) {
		/* Suck the next prr cnt back into cwnd */
		tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
		rack->r_ctl.rc_prr_sndcnt = 0;
	}
	EXIT_RECOVERY(tp->t_flags);


#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		if ((tp->cwv_cwnd_valid == 0) &&
		    (tp->snd_cwv.in_recovery))
			tcp_newcwv_end_recovery(tp);
	}
#endif
}
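
/*
 * A worked example of the proportional reduction in rack_post_recovery()
 * above: with a prop rate of 10 (the rack_proportional_rate default) and
 * 3 losses counted in the window, reduce = 30, so cwnd is cut by 30%;
 * the cap keeps any single reduction at or below 50% of cwnd.
 */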

static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	struct tcp_rack *rack;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	switch (type) {
	case CC_NDUPACK:
		/* rack->r_ctl.rc_ssthresh_set = 1;*/
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			rack->r_ctl.rc_tlp_rtx_out = 0;
			rack->r_ctl.rc_prr_delivered = 0;
			rack->r_ctl.rc_prr_out = 0;
			rack->r_ctl.rc_loss_count = 0;
			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags)) {
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
		    tp->t_maxseg) * tp->t_maxseg;
		tp->snd_cwnd = tp->t_maxseg;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_badrxtwin = 0;
		break;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(tp->ccv, type);
	}
#ifdef NETFLIX_CWV
	if (tp->cwv_enabled) {
		if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
			tcp_newcwv_enter_recovery(tp);
		}
		if (type == CC_RTO) {
			tcp_newcwv_reset(tp);
		}
	}
#endif
}



static inline void
rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
{
	uint32_t i_cwnd;

	INP_WLOCK_ASSERT(tp->t_inpcb);

#ifdef NETFLIX_STATS
	TCPSTAT_INC(tcps_idle_restarts);
	if (tp->t_state == TCPS_ESTABLISHED)
		TCPSTAT_INC(tcps_idle_estrestarts);
#endif
	if (CC_ALGO(tp)->after_idle != NULL)
		CC_ALGO(tp)->after_idle(tp->ccv);

	if (tp->snd_cwnd == 1)
		i_cwnd = tp->t_maxseg;	/* SYN(-ACK) lost */
	else
		i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));

	if (reduce_largest) {
		/*
		 * Do we reduce the largest cwnd to make
		 * rack play nice on restart hptsi wise?
		 */
		if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd)
			((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
	}
	/*
	 * Being idle is no different than the initial window. If the cc
	 * clamps it down below the initial window, raise it to the initial
	 * window.
	 */
	if (tp->snd_cwnd < i_cwnd) {
		tp->snd_cwnd = i_cwnd;
	}
}


/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window. We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment. We make sure by checking that the
 *	  segment size is not larger than the MSS.
 *	- Delayed acks are enabled or this is a half-synchronized T/TCP
 *	  connection.
 */
#define DELAY_ACK(tp, tlen)			 \
	(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
	((tp->t_flags & TF_DELACK) == 0) &&	 \
	(tlen <= tp->t_maxseg) &&		 \
	(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))

static inline void
rack_calc_rwin(struct socket *so, struct tcpcb *tp)
{
	int32_t win;

	/*
	 * Calculate amount of space in receive window, and then do TCP
	 * input processing. Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}
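
/*
 * Note on rack_calc_rwin() above: taking the max with (rcv_adv - rcv_nxt)
 * keeps rcv_wnd from dropping below the window we have already advertised,
 * since shrinking an advertised window is strongly discouraged by TCP.
 */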

static void
rack_do_drop(struct mbuf *m, struct tcpcb *tp)
{
	/*
	 * Drop space held by incoming segment and return.
	 */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	if (m)
		m_freem(m);
}

static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
    int32_t rstreason, int32_t tlen)
{
	if (tp != NULL) {
		tcp_dropwithreset(m, th, tp, tlen, rstreason);
		INP_WUNLOCK(tp->t_inpcb);
	} else
		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
}

/*
 * The value in ret_val informs the caller
 * if we dropped the tcb (and lock) or not.
 * 1 = we dropped it, 0 = the TCB is still locked
 * and valid.
 */
static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
{
	/*
	 * Generate an ACK dropping incoming segment if it occupies sequence
	 * space, where the ACK reflects our state.
	 *
	 * We can now skip the test for the RST flag since all paths to this
	 * code happen after packets containing RST have been dropped.
	 *
	 * In the SYN-RECEIVED state, don't send an ACK unless the segment
	 * we received passes the SYN-RECEIVED ACK test. If it fails send a
	 * RST.  This breaks the loop in the "LAND" DoS attack, and also
	 * prevents an ACK storm between two listening ports that have been
	 * sent forged SYN segments, each with the source address of the
	 * other.
	 */
	struct tcp_rack *rack;

	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
	    (SEQ_GT(tp->snd_una, th->th_ack) ||
	    SEQ_GT(th->th_ack, tp->snd_max))) {
		*ret_val = 1;
		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
		return;
	} else
		*ret_val = 0;
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	rack->r_wanted_output++;
	tp->t_flags |= TF_ACKNOW;
	if (m)
		m_freem(m);
}


static int
rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
{
	/*
	 * RFC5961 Section 3.2
	 *
	 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
	 * window, we send challenge ACK.
	 *
	 * Note: to take into account delayed ACKs, we should test against
	 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
	 * of closed window, not covered by the RFC.
	 */
	int dropped = 0;

	if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {

		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
		KASSERT(tp->t_state != TCPS_SYN_SENT,
		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
		    __func__, th, tp));

		if (V_tcp_insecure_rst ||
		    (tp->last_ack_sent == th->th_seq) ||
		    (tp->rcv_nxt == th->th_seq) ||
		    ((tp->last_ack_sent - 1) == th->th_seq)) {
			TCPSTAT_INC(tcps_drops);
			/* Drop the connection. */
			switch (tp->t_state) {
			case TCPS_SYN_RECEIVED:
				so->so_error = ECONNREFUSED;
				goto close;
			case TCPS_ESTABLISHED:
			case TCPS_FIN_WAIT_1:
			case TCPS_FIN_WAIT_2:
			case TCPS_CLOSE_WAIT:
			case TCPS_CLOSING:
			case TCPS_LAST_ACK:
				so->so_error = ECONNRESET;
		close:
				tcp_state_change(tp, TCPS_CLOSED);
				/* FALLTHROUGH */
			default:
				tp = tcp_close(tp);
			}
			dropped = 1;
			rack_do_drop(m, tp);
		} else {
			TCPSTAT_INC(tcps_badrst);
			/* Send challenge ACK. */
			tcp_respond(tp, mtod(m, void *), th, m,
			    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
			tp->last_ack_sent = tp->rcv_nxt;
		}
	} else {
		m_freem(m);
	}
	return (dropped);
}
*/ 1598 tcp_respond(tp, mtod(m, void *), th, m, 1599 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 1600 tp->last_ack_sent = tp->rcv_nxt; 1601 } 1602 } else { 1603 m_freem(m); 1604 } 1605 return (dropped); 1606 } 1607 1608 /* 1609 * The value in ret_val informs the caller 1610 * if we dropped the tcb (and lock) or not. 1611 * 1 = we dropped it, 0 = the TCB is still locked 1612 * and valid. 1613 */ 1614 static void 1615 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) 1616 { 1617 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1618 1619 TCPSTAT_INC(tcps_badsyn); 1620 if (V_tcp_insecure_syn && 1621 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1622 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1623 tp = tcp_drop(tp, ECONNRESET); 1624 *ret_val = 1; 1625 rack_do_drop(m, tp); 1626 } else { 1627 /* Send challenge ACK. */ 1628 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 1629 tp->snd_nxt, TH_ACK); 1630 tp->last_ack_sent = tp->rcv_nxt; 1631 m = NULL; 1632 *ret_val = 0; 1633 rack_do_drop(m, NULL); 1634 } 1635 } 1636 1637 /* 1638 * rack_ts_check returns 1 for you should not proceed. It places 1639 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1640 * that the TCB is unlocked and probably dropped. The 0 indicates the 1641 * TCB is still valid and locked. 1642 */ 1643 static int 1644 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) 1645 { 1646 1647 /* Check to see if ts_recent is over 24 days old. */ 1648 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 1649 /* 1650 * Invalidate ts_recent. If this segment updates ts_recent, 1651 * the age will be reset later and ts_recent will get a 1652 * valid value. If it does not, setting ts_recent to zero 1653 * will at least satisfy the requirement that zero be placed 1654 * in the timestamp echo reply when ts_recent isn't valid. 1655 * The age isn't reset until we get a valid ts_recent 1656 * because we don't want out-of-order segments to be dropped 1657 * when ts_recent is old. 1658 */ 1659 tp->ts_recent = 0; 1660 } else { 1661 TCPSTAT_INC(tcps_rcvduppack); 1662 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 1663 TCPSTAT_INC(tcps_pawsdrop); 1664 *ret_val = 0; 1665 if (tlen) { 1666 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 1667 } else { 1668 rack_do_drop(m, NULL); 1669 } 1670 return (1); 1671 } 1672 return (0); 1673 } 1674 1675 /* 1676 * rack_drop_checks returns 1 for you should not proceed. It places 1677 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1678 * that the TCB is unlocked and probably dropped. The 0 indicates the 1679 * TCB is still valid and locked. 1680 */ 1681 static int 1682 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 1683 { 1684 int32_t todrop; 1685 int32_t thflags; 1686 int32_t tlen; 1687 1688 thflags = *thf; 1689 tlen = *tlenp; 1690 todrop = tp->rcv_nxt - th->th_seq; 1691 if (todrop > 0) { 1692 if (thflags & TH_SYN) { 1693 thflags &= ~TH_SYN; 1694 th->th_seq++; 1695 if (th->th_urp > 1) 1696 th->th_urp--; 1697 else 1698 thflags &= ~TH_URG; 1699 todrop--; 1700 } 1701 /* 1702 * Following if statement from Stevens, vol. 2, p. 960. 1703 */ 1704 if (todrop > tlen 1705 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1706 /* 1707 * Any valid FIN must be to the left of the window. 1708 * At this point the FIN must be a duplicate or out 1709 * of sequence; drop it. 
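/*
 * rack_ts_check() above treats a PAWS failure differently depending on how
 * stale ts_recent is: a very old value is simply invalidated, anything newer
 * marks the segment as a duplicate. A minimal sketch of that branch, assuming
 * a millisecond tick source and an illustrative 24-day constant (the stack's
 * own constant and clock are defined elsewhere).
 */
#include <stdint.h>
#include <stdbool.h>

#define PAWS_IDLE_MS	(24u * 24u * 60u * 60u * 1000u)	/* illustrative: 24 days */

/*
 * Returns true when the PAWS failure should be treated as a duplicate and the
 * segment dropped; returns false (and clears *ts_recent) when ts_recent is
 * merely stale and must be re-learned from a later in-order segment.
 */
static bool
paws_failure_is_duplicate(uint32_t now_ms, uint32_t ts_recent_age_ms,
    uint32_t *ts_recent)
{
	if (now_ms - ts_recent_age_ms > PAWS_IDLE_MS) {
		*ts_recent = 0;		/* stale: invalidate, keep the segment */
		return (false);
	}
	return (true);			/* genuine duplicate */
}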
1710 */ 1711 thflags &= ~TH_FIN; 1712 /* 1713 * Send an ACK to resynchronize and drop any data. 1714 * But keep on processing for RST or ACK. 1715 */ 1716 tp->t_flags |= TF_ACKNOW; 1717 todrop = tlen; 1718 TCPSTAT_INC(tcps_rcvduppack); 1719 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 1720 } else { 1721 TCPSTAT_INC(tcps_rcvpartduppack); 1722 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 1723 } 1724 /* 1725 * DSACK - add SACK block for dropped range 1726 */ 1727 if (tp->t_flags & TF_SACK_PERMIT) { 1728 tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); 1729 /* 1730 * ACK now, as the next in-sequence segment 1731 * will clear the DSACK block again 1732 */ 1733 tp->t_flags |= TF_ACKNOW; 1734 } 1735 *drop_hdrlen += todrop; /* drop from the top afterwards */ 1736 th->th_seq += todrop; 1737 tlen -= todrop; 1738 if (th->th_urp > todrop) 1739 th->th_urp -= todrop; 1740 else { 1741 thflags &= ~TH_URG; 1742 th->th_urp = 0; 1743 } 1744 } 1745 /* 1746 * If segment ends after window, drop trailing data (and PUSH and 1747 * FIN); if nothing left, just ACK. 1748 */ 1749 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1750 if (todrop > 0) { 1751 TCPSTAT_INC(tcps_rcvpackafterwin); 1752 if (todrop >= tlen) { 1753 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 1754 /* 1755 * If window is closed can only take segments at 1756 * window edge, and have to drop data and PUSH from 1757 * incoming segments. Continue processing, but 1758 * remember to ack. Otherwise, drop segment and 1759 * ack. 1760 */ 1761 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1762 tp->t_flags |= TF_ACKNOW; 1763 TCPSTAT_INC(tcps_rcvwinprobe); 1764 } else { 1765 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 1766 return (1); 1767 } 1768 } else 1769 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1770 m_adj(m, -todrop); 1771 tlen -= todrop; 1772 thflags &= ~(TH_PUSH | TH_FIN); 1773 } 1774 *thf = thflags; 1775 *tlenp = tlen; 1776 return (0); 1777 } 1778 1779 static struct rack_sendmap * 1780 rack_find_lowest_rsm(struct tcp_rack *rack) 1781 { 1782 struct rack_sendmap *rsm; 1783 1784 /* 1785 * Walk the time-order transmitted list looking for an rsm that is 1786 * not acked. This will be the one that was sent the longest time 1787 * ago that is still outstanding. 1788 */ 1789 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 1790 if (rsm->r_flags & RACK_ACKED) { 1791 continue; 1792 } 1793 goto finish; 1794 } 1795 finish: 1796 return (rsm); 1797 } 1798 1799 static struct rack_sendmap * 1800 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 1801 { 1802 struct rack_sendmap *prsm; 1803 1804 /* 1805 * Walk the sequence order list backward until we hit and arrive at 1806 * the highest seq not acked. In theory when this is called it 1807 * should be the last segment (which it was not). 1808 */ 1809 counter_u64_add(rack_find_high, 1); 1810 prsm = rsm; 1811 TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { 1812 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 1813 continue; 1814 } 1815 return (prsm); 1816 } 1817 return (NULL); 1818 } 1819 1820 1821 static uint32_t 1822 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 1823 { 1824 int32_t lro; 1825 uint32_t thresh; 1826 1827 /* 1828 * lro is the flag we use to determine if we have seen reordering. 1829 * If it gets set we have seen reordering. The reorder logic either 1830 * works in one of two ways: 1831 * 1832 * If reorder-fade is configured, then we track the last time we saw 1833 * re-ordering occur. 
If we reach the point where enough time has
 1833 * passed, we no longer consider reordering to be occurring.
 1835 *
 1836 * Or if reorder-fade is 0, then once we see reordering we consider
 1837 * the connection to always be subject to reordering and just set lro
 1838 * to 1.
 1839 *
 1840 * In the end if lro is non-zero we add the extra time for
 1841 * reordering in.
 1842 */
 1843 	if (srtt == 0)
 1844 		srtt = 1;
 1845 	if (rack->r_ctl.rc_reorder_ts) {
 1846 		if (rack->r_ctl.rc_reorder_fade) {
 1847 			if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
 1848 				lro = cts - rack->r_ctl.rc_reorder_ts;
 1849 				if (lro == 0) {
 1850 					/*
 1851 					 * No time has passed since the last
 1852 					 * reorder, mark it as reordering.
 1853 					 */
 1854 					lro = 1;
 1855 				}
 1856 			} else {
 1857 				/* Negative time? */
 1858 				lro = 0;
 1859 			}
 1860 			if (lro > rack->r_ctl.rc_reorder_fade) {
 1861 				/* Turn off reordering seen too */
 1862 				rack->r_ctl.rc_reorder_ts = 0;
 1863 				lro = 0;
 1864 			}
 1865 		} else {
 1866 			/* Reordering does not fade */
 1867 			lro = 1;
 1868 		}
 1869 	} else {
 1870 		lro = 0;
 1871 	}
 1872 	thresh = srtt + rack->r_ctl.rc_pkt_delay;
 1873 	if (lro) {
 1874 		/* It must be set, if not you get 1/4 rtt */
 1875 		if (rack->r_ctl.rc_reorder_shift)
 1876 			thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
 1877 		else
 1878 			thresh += (srtt >> 2);
 1879 	} else {
 1880 		thresh += 1;
 1881 	}
 1882 	/* We don't let the rack timeout be above an RTO */
 1883 
 1884 	if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
 1885 		thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
 1886 	}
 1887 	/* And we don't want it above the RTO max either */
 1888 	if (thresh > rack_rto_max) {
 1889 		thresh = rack_rto_max;
 1890 	}
 1891 	return (thresh);
 1892 }
 1893 
 1894 static uint32_t
 1895 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
 1896     struct rack_sendmap *rsm, uint32_t srtt)
 1897 {
 1898 	struct rack_sendmap *prsm;
 1899 	uint32_t thresh, len;
 1900 	int maxseg;
 1901 
 1902 	if (srtt == 0)
 1903 		srtt = 1;
 1904 	if (rack->r_ctl.rc_tlp_threshold)
 1905 		thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
 1906 	else
 1907 		thresh = (srtt * 2);
 1908 
 1909 	/* Get the previous sent packet, if any */
 1910 	maxseg = tcp_maxseg(tp);
 1911 	counter_u64_add(rack_enter_tlp_calc, 1);
 1912 	len = rsm->r_end - rsm->r_start;
 1913 	if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
 1914 		/* Exactly like the ID */
 1915 		if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
 1916 			uint32_t alt_thresh;
 1917 			/*
 1918 			 * Compensate for delayed-ack with the d-ack time.
 1919 			 */
 1920 			counter_u64_add(rack_used_tlpmethod, 1);
 1921 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
 1922 			if (alt_thresh > thresh)
 1923 				thresh = alt_thresh;
 1924 		}
 1925 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
 1926 		/* 2.1 behavior */
 1927 		prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
 1928 		if (prsm && (len <= maxseg)) {
 1929 			/*
 1930 			 * Two packets outstanding, thresh should be (2*srtt) +
 1931 			 * possible inter-packet delay (if any).
 1932 			 */
 1933 			uint32_t inter_gap = 0;
 1934 			int idx, nidx;
 1935 
 1936 			counter_u64_add(rack_used_tlpmethod, 1);
 1937 			idx = rsm->r_rtr_cnt - 1;
 1938 			nidx = prsm->r_rtr_cnt - 1;
 1939 			if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], prsm->r_tim_lastsent[nidx])) {
 1940 				/* Yes it was sent later (or at the same time) */
 1941 				inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
 1942 			}
 1943 			thresh += inter_gap;
 1944 		} else if (len <= maxseg) {
 1945 			/*
 1946 			 * Possibly compensate for delayed-ack.
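/*
 * A compact restatement of the rack threshold computed above: srtt plus the
 * per-packet delay allowance, plus an extra srtt >> shift while reordering is
 * considered live, clamped so it never exceeds the current RTO or the RTO
 * ceiling. The structure and field names here are simplified stand-ins for
 * the r_ctl values, with all times in milliseconds as in the code.
 */
#include <stdint.h>

struct reorder_state {
	uint32_t reorder_seen;	/* non-zero while reordering is considered live */
	uint32_t pkt_delay;	/* per-packet delay allowance (ms) */
	uint32_t reorder_shift;	/* extra srtt >> shift when reordering is live */
	uint32_t rto_cur;	/* current RTO (ms), upper bound */
	uint32_t rto_max;	/* absolute ceiling (ms) */
};

static uint32_t
rack_threshold(const struct reorder_state *rs, uint32_t srtt)
{
	uint32_t thresh;

	if (srtt == 0)
		srtt = 1;
	thresh = srtt + rs->pkt_delay;
	if (rs->reorder_seen)
		thresh += srtt >> (rs->reorder_shift ? rs->reorder_shift : 2);
	else
		thresh += 1;
	if (thresh > rs->rto_cur)
		thresh = rs->rto_cur;	/* never beyond the RTO ... */
	if (thresh > rs->rto_max)
		thresh = rs->rto_max;	/* ... nor the RTO ceiling */
	return (thresh);
}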
1947 */ 1948 uint32_t alt_thresh; 1949 1950 counter_u64_add(rack_used_tlpmethod2, 1); 1951 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1952 if (alt_thresh > thresh) 1953 thresh = alt_thresh; 1954 } 1955 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 1956 /* 2.2 behavior */ 1957 if (len <= maxseg) { 1958 uint32_t alt_thresh; 1959 /* 1960 * Compensate for delayed-ack with the d-ack time. 1961 */ 1962 counter_u64_add(rack_used_tlpmethod, 1); 1963 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1964 if (alt_thresh > thresh) 1965 thresh = alt_thresh; 1966 } 1967 } 1968 /* Not above an RTO */ 1969 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 1970 thresh = TICKS_2_MSEC(tp->t_rxtcur); 1971 } 1972 /* Not above a RTO max */ 1973 if (thresh > rack_rto_max) { 1974 thresh = rack_rto_max; 1975 } 1976 /* Apply user supplied min TLP */ 1977 if (thresh < rack_tlp_min) { 1978 thresh = rack_tlp_min; 1979 } 1980 return (thresh); 1981 } 1982 1983 static struct rack_sendmap * 1984 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 1985 { 1986 /* 1987 * Check to see that we don't need to fall into recovery. We will 1988 * need to do so if our oldest transmit is past the time we should 1989 * have had an ack. 1990 */ 1991 struct tcp_rack *rack; 1992 struct rack_sendmap *rsm; 1993 int32_t idx; 1994 uint32_t srtt_cur, srtt, thresh; 1995 1996 rack = (struct tcp_rack *)tp->t_fb_ptr; 1997 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 1998 return (NULL); 1999 } 2000 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 2001 srtt = TICKS_2_MSEC(srtt_cur); 2002 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 2003 srtt = rack->rc_rack_rtt; 2004 2005 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2006 if (rsm == NULL) 2007 return (NULL); 2008 2009 if (rsm->r_flags & RACK_ACKED) { 2010 rsm = rack_find_lowest_rsm(rack); 2011 if (rsm == NULL) 2012 return (NULL); 2013 } 2014 idx = rsm->r_rtr_cnt - 1; 2015 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2016 if (tsused < rsm->r_tim_lastsent[idx]) { 2017 return (NULL); 2018 } 2019 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2020 return (NULL); 2021 } 2022 /* Ok if we reach here we are over-due */ 2023 rack->r_ctl.rc_rsm_start = rsm->r_start; 2024 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2025 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2026 rack_cong_signal(tp, NULL, CC_NDUPACK); 2027 return (rsm); 2028 } 2029 2030 static uint32_t 2031 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2032 { 2033 int32_t t; 2034 int32_t tt; 2035 uint32_t ret_val; 2036 2037 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2038 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2039 tcp_persmin, tcp_persmax); 2040 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2041 tp->t_rxtshift++; 2042 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2043 ret_val = (uint32_t)tt; 2044 return (ret_val); 2045 } 2046 2047 static uint32_t 2048 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2049 { 2050 /* 2051 * Start the FR timer, we do this based on getting the first one in 2052 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2053 * events we need to stop the running timer (if its running) before 2054 * starting the new one. 
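/*
 * rack_get_persists_timer_val() above is the classical persist backoff: base
 * the interval on srtt + 4*rttvar, scale by a backoff table indexed by the
 * shift count, clamp to the persist min/max, and bump the shift for the next
 * probe. A sketch of that calculation; the table and clamp values below are
 * illustrative, not the stack's tunables.
 */
#include <stdint.h>

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
#define PERSMIN_MS	5000u
#define PERSMAX_MS	60000u
#define MAXRXTSHIFT	12

static uint32_t
persist_timeout_ms(uint32_t srtt_ms, uint32_t rttvar_ms, int *rxtshift)
{
	uint32_t t = srtt_ms + 4 * rttvar_ms;
	uint32_t tt = t * (uint32_t)backoff[*rxtshift];

	if (tt < PERSMIN_MS)
		tt = PERSMIN_MS;
	if (tt > PERSMAX_MS)
		tt = PERSMAX_MS;
	if (*rxtshift < MAXRXTSHIFT)
		(*rxtshift)++;		/* the next probe backs off further */
	return (tt);
}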
2055 */ 2056 uint32_t thresh, exp, to, srtt, time_since_sent; 2057 uint32_t srtt_cur; 2058 int32_t idx; 2059 int32_t is_tlp_timer = 0; 2060 struct rack_sendmap *rsm; 2061 2062 if (rack->t_timers_stopped) { 2063 /* All timers have been stopped none are to run */ 2064 return (0); 2065 } 2066 if (rack->rc_in_persist) { 2067 /* We can't start any timer in persists */ 2068 return (rack_get_persists_timer_val(tp, rack)); 2069 } 2070 if (tp->t_state < TCPS_ESTABLISHED) 2071 goto activate_rxt; 2072 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2073 if (rsm == NULL) { 2074 /* Nothing on the send map */ 2075 activate_rxt: 2076 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2077 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2078 to = TICKS_2_MSEC(tp->t_rxtcur); 2079 if (to == 0) 2080 to = 1; 2081 return (to); 2082 } 2083 return (0); 2084 } 2085 if (rsm->r_flags & RACK_ACKED) { 2086 rsm = rack_find_lowest_rsm(rack); 2087 if (rsm == NULL) { 2088 /* No lowest? */ 2089 goto activate_rxt; 2090 } 2091 } 2092 /* Convert from ms to usecs */ 2093 if (rsm->r_flags & RACK_SACK_PASSED) { 2094 if ((tp->t_flags & TF_SENTFIN) && 2095 ((tp->snd_max - tp->snd_una) == 1) && 2096 (rsm->r_flags & RACK_HAS_FIN)) { 2097 /* 2098 * We don't start a rack timer if all we have is a 2099 * FIN outstanding. 2100 */ 2101 goto activate_rxt; 2102 } 2103 if (tp->t_srtt) { 2104 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2105 srtt = TICKS_2_MSEC(srtt_cur); 2106 } else 2107 srtt = RACK_INITIAL_RTO; 2108 2109 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2110 idx = rsm->r_rtr_cnt - 1; 2111 exp = rsm->r_tim_lastsent[idx] + thresh; 2112 if (SEQ_GEQ(exp, cts)) { 2113 to = exp - cts; 2114 if (to < rack->r_ctl.rc_min_to) { 2115 to = rack->r_ctl.rc_min_to; 2116 } 2117 } else { 2118 to = rack->r_ctl.rc_min_to; 2119 } 2120 } else { 2121 /* Ok we need to do a TLP not RACK */ 2122 if ((rack->rc_tlp_in_progress != 0) || 2123 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2124 /* 2125 * The previous send was a TLP or a tlp_rtx is in 2126 * process. 2127 */ 2128 goto activate_rxt; 2129 } 2130 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2131 if (rsm == NULL) { 2132 /* We found no rsm to TLP with. */ 2133 goto activate_rxt; 2134 } 2135 if (rsm->r_flags & RACK_HAS_FIN) { 2136 /* If its a FIN we dont do TLP */ 2137 rsm = NULL; 2138 goto activate_rxt; 2139 } 2140 idx = rsm->r_rtr_cnt - 1; 2141 if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) 2142 time_since_sent = cts - rsm->r_tim_lastsent[idx]; 2143 else 2144 time_since_sent = 0; 2145 is_tlp_timer = 1; 2146 if (tp->t_srtt) { 2147 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2148 srtt = TICKS_2_MSEC(srtt_cur); 2149 } else 2150 srtt = RACK_INITIAL_RTO; 2151 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2152 if (thresh > time_since_sent) 2153 to = thresh - time_since_sent; 2154 else 2155 to = rack->r_ctl.rc_min_to; 2156 if (to > TCPTV_REXMTMAX) { 2157 /* 2158 * If the TLP time works out to larger than the max 2159 * RTO lets not do TLP.. just RTO. 
2160 */ 2161 goto activate_rxt; 2162 } 2163 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2164 /* 2165 * The tail is no longer the last one I did a probe 2166 * on 2167 */ 2168 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2169 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2170 } 2171 } 2172 if (is_tlp_timer == 0) { 2173 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2174 } else { 2175 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2176 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2177 /* 2178 * We have exceeded how many times we can retran the 2179 * current TLP timer, switch to the RTO timer. 2180 */ 2181 goto activate_rxt; 2182 } else { 2183 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2184 } 2185 } 2186 if (to == 0) 2187 to = 1; 2188 return (to); 2189 } 2190 2191 static void 2192 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2193 { 2194 if (rack->rc_in_persist == 0) { 2195 if (((tp->t_flags & TF_SENTFIN) == 0) && 2196 (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) 2197 /* Must need to send more data to enter persist */ 2198 return; 2199 rack->r_ctl.rc_went_idle_time = cts; 2200 rack_timer_cancel(tp, rack, cts, __LINE__); 2201 tp->t_rxtshift = 0; 2202 rack->rc_in_persist = 1; 2203 } 2204 } 2205 2206 static void 2207 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2208 { 2209 if (rack->rc_inp->inp_in_hpts) { 2210 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2211 rack->r_ctl.rc_hpts_flags = 0; 2212 } 2213 rack->rc_in_persist = 0; 2214 rack->r_ctl.rc_went_idle_time = 0; 2215 tp->t_flags &= ~TF_FORCEDATA; 2216 tp->t_rxtshift = 0; 2217 } 2218 2219 static void 2220 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, 2221 int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) 2222 { 2223 struct inpcb *inp; 2224 uint32_t delayed_ack = 0; 2225 uint32_t hpts_timeout; 2226 uint8_t stopped; 2227 uint32_t left = 0; 2228 2229 inp = tp->t_inpcb; 2230 if (inp->inp_in_hpts) { 2231 /* A previous call is already set up */ 2232 return; 2233 } 2234 if (tp->t_state == TCPS_CLOSED) { 2235 return; 2236 } 2237 stopped = rack->rc_tmr_stopped; 2238 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2239 left = rack->r_ctl.rc_timer_exp - cts; 2240 } 2241 rack->r_ctl.rc_timer_exp = 0; 2242 if (rack->rc_inp->inp_in_hpts == 0) { 2243 rack->r_ctl.rc_hpts_flags = 0; 2244 } 2245 if (slot) { 2246 /* We are hptsi too */ 2247 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2248 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2249 /* 2250 * We are still left on the hpts when the to goes 2251 * it will be for output. 2252 */ 2253 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) 2254 slot = cts - rack->r_ctl.rc_last_output_to; 2255 else 2256 slot = 1; 2257 } 2258 if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2259 /* No send window.. we must enter persist */ 2260 rack_enter_persist(tp, rack, cts); 2261 } else if ((frm_out_sbavail && 2262 (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && 2263 (tp->snd_wnd < tp->t_maxseg)) && 2264 TCPS_HAVEESTABLISHED(tp->t_state)) { 2265 /* 2266 * If we have no window or we can't send a segment (and have 2267 * data to send.. we cheat here and frm_out_sbavail is 2268 * passed in with the sbavail(sb) only from bbr_output) and 2269 * we are established, then we must enter persits (if not 2270 * already in persits). 
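/*
 * The two persist-entry conditions above boil down to: the peer's window is
 * closed, or there is new data queued but the window is smaller than one
 * segment, and in either case only on an established connection. A boiled
 * down sketch of that test; the parameter names are illustrative rather than
 * the tcpcb fields.
 */
#include <stdint.h>
#include <stdbool.h>

static bool
should_enter_persist(uint32_t snd_wnd, uint32_t maxseg, uint32_t inflight,
    uint32_t sb_avail, bool established)
{
	if (!established)
		return (false);
	if (snd_wnd == 0)
		return (true);			/* window fully closed */
	/* new data queued but window < 1 MSS: probe instead of stalling */
	return (sb_avail > inflight && snd_wnd < maxseg);
}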
2271 */ 2272 rack_enter_persist(tp, rack, cts); 2273 } 2274 hpts_timeout = rack_timer_start(tp, rack, cts); 2275 if (tp->t_flags & TF_DELACK) { 2276 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 2277 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2278 } 2279 if (delayed_ack && ((hpts_timeout == 0) || 2280 (delayed_ack < hpts_timeout))) 2281 hpts_timeout = delayed_ack; 2282 else 2283 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2284 /* 2285 * If no timers are going to run and we will fall off the hptsi 2286 * wheel, we resort to a keep-alive timer if its configured. 2287 */ 2288 if ((hpts_timeout == 0) && 2289 (slot == 0)) { 2290 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2291 (tp->t_state <= TCPS_CLOSING)) { 2292 /* 2293 * Ok we have no timer (persists, rack, tlp, rxt or 2294 * del-ack), we don't have segments being paced. So 2295 * all that is left is the keepalive timer. 2296 */ 2297 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2298 /* Get the established keep-alive time */ 2299 hpts_timeout = TP_KEEPIDLE(tp); 2300 } else { 2301 /* Get the initial setup keep-alive time */ 2302 hpts_timeout = TP_KEEPINIT(tp); 2303 } 2304 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2305 } 2306 } 2307 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2308 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2309 /* 2310 * RACK, TLP, persists and RXT timers all are restartable 2311 * based on actions input .. i.e we received a packet (ack 2312 * or sack) and that changes things (rw, or snd_una etc). 2313 * Thus we can restart them with a new value. For 2314 * keep-alive, delayed_ack we keep track of what was left 2315 * and restart the timer with a smaller value. 2316 */ 2317 if (left < hpts_timeout) 2318 hpts_timeout = left; 2319 } 2320 if (hpts_timeout) { 2321 /* 2322 * Hack alert for now we can't time-out over 2,147,483 2323 * seconds (a bit more than 596 hours), which is probably ok 2324 * :). 2325 */ 2326 if (hpts_timeout > 0x7ffffffe) 2327 hpts_timeout = 0x7ffffffe; 2328 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2329 } 2330 if (slot) { 2331 rack->r_ctl.rc_last_output_to = cts + slot; 2332 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2333 if (rack->rc_inp->inp_in_hpts == 0) 2334 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2335 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2336 } else { 2337 /* 2338 * Arrange for the hpts to kick back in after the 2339 * t-o if the t-o does not cause a send. 2340 */ 2341 if (rack->rc_inp->inp_in_hpts == 0) 2342 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2343 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2344 } 2345 } else if (hpts_timeout) { 2346 if (rack->rc_inp->inp_in_hpts == 0) 2347 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2348 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2349 } else { 2350 /* No timer starting */ 2351 #ifdef INVARIANTS 2352 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2353 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2354 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2355 } 2356 #endif 2357 } 2358 rack->rc_tmr_stopped = 0; 2359 if (slot) 2360 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2361 } 2362 2363 /* 2364 * RACK Timer, here we simply do logging and house keeping. 2365 * the normal rack_output() function will call the 2366 * appropriate thing to check if we need to do a RACK retransmit. 
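/*
 * Selecting the hpts timeout above amounts to: take whatever protocol timer
 * rack_timer_start() armed, let a shorter delayed-ACK deadline preempt it,
 * and fall back to the keep-alive interval when nothing else is pending and
 * no output is being paced. A sketch of that selection; all values are in
 * milliseconds and the keep-alive handling is simplified.
 */
#include <stdint.h>
#include <stdbool.h>

static uint32_t
pick_hpts_timeout_ms(uint32_t timer_ms, uint32_t delack_ms, bool pacing,
    bool keepalive_on, uint32_t keepidle_ms)
{
	uint32_t to = timer_ms;

	if (delack_ms && (to == 0 || delack_ms < to))
		to = delack_ms;		/* delayed ACK fires first */
	if (to == 0 && !pacing && keepalive_on)
		to = keepidle_ms;	/* nothing else armed: keep-alive */
	return (to);
}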
2367 * We return 1, saying don't proceed with rack_output only 2368 * when all timers have been stopped (destroyed PCB?). 2369 */ 2370 static int 2371 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2372 { 2373 /* 2374 * This timer simply provides an internal trigger to send out data. 2375 * The check_recovery_mode call will see if there are needed 2376 * retransmissions, if so we will enter fast-recovery. The output 2377 * call may or may not do the same thing depending on sysctl 2378 * settings. 2379 */ 2380 struct rack_sendmap *rsm; 2381 int32_t recovery; 2382 2383 if (tp->t_timers->tt_flags & TT_STOPPED) { 2384 return (1); 2385 } 2386 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2387 /* Its not time yet */ 2388 return (0); 2389 } 2390 rack_log_to_event(rack, RACK_TO_FRM_RACK); 2391 recovery = IN_RECOVERY(tp->t_flags); 2392 counter_u64_add(rack_to_tot, 1); 2393 if (rack->r_state && (rack->r_state != tp->t_state)) 2394 rack_set_state(tp, rack); 2395 rsm = rack_check_recovery_mode(tp, cts); 2396 if (rsm) { 2397 uint32_t rtt; 2398 2399 rtt = rack->rc_rack_rtt; 2400 if (rtt == 0) 2401 rtt = 1; 2402 if ((recovery == 0) && 2403 (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { 2404 /* 2405 * The rack-timeout that enter's us into recovery 2406 * will force out one MSS and set us up so that we 2407 * can do one more send in 2*rtt (transitioning the 2408 * rack timeout into a rack-tlp). 2409 */ 2410 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2411 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && 2412 ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { 2413 /* 2414 * When a rack timer goes, we have to send at 2415 * least one segment. They will be paced a min of 1ms 2416 * apart via the next rack timer (or further 2417 * if the rack timer dictates it). 2418 */ 2419 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2420 } 2421 } else { 2422 /* This is a case that should happen rarely if ever */ 2423 counter_u64_add(rack_tlp_does_nada, 1); 2424 #ifdef TCP_BLACKBOX 2425 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2426 #endif 2427 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2428 } 2429 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2430 return (0); 2431 } 2432 2433 /* 2434 * TLP Timer, here we simply setup what segment we want to 2435 * have the TLP expire on, the normal rack_output() will then 2436 * send it out. 2437 * 2438 * We return 1, saying don't proceed with rack_output only 2439 * when all timers have been stopped (destroyed PCB?). 2440 */ 2441 static int 2442 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2443 { 2444 /* 2445 * Tail Loss Probe. 2446 */ 2447 struct rack_sendmap *rsm = NULL; 2448 struct socket *so; 2449 uint32_t amm, old_prr_snd = 0; 2450 uint32_t out, avail; 2451 2452 if (tp->t_timers->tt_flags & TT_STOPPED) { 2453 return (1); 2454 } 2455 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2456 /* Its not time yet */ 2457 return (0); 2458 } 2459 if (rack_progress_timeout_check(tp)) { 2460 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2461 return (1); 2462 } 2463 /* 2464 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2465 * need to figure out how to force a full MSS segment out. 
2466 */ 2467 rack_log_to_event(rack, RACK_TO_FRM_TLP); 2468 counter_u64_add(rack_tlp_tot, 1); 2469 if (rack->r_state && (rack->r_state != tp->t_state)) 2470 rack_set_state(tp, rack); 2471 so = tp->t_inpcb->inp_socket; 2472 avail = sbavail(&so->so_snd); 2473 out = tp->snd_max - tp->snd_una; 2474 rack->rc_timer_up = 1; 2475 /* 2476 * If we are in recovery we can jazz out a segment if new data is 2477 * present simply by setting rc_prr_sndcnt to a segment. 2478 */ 2479 if ((avail > out) && 2480 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2481 /* New data is available */ 2482 amm = avail - out; 2483 if (amm > tp->t_maxseg) { 2484 amm = tp->t_maxseg; 2485 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 2486 /* not enough to fill a MTU and no-delay is off */ 2487 goto need_retran; 2488 } 2489 if (IN_RECOVERY(tp->t_flags)) { 2490 /* Unlikely */ 2491 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2492 if (out + amm <= tp->snd_wnd) 2493 rack->r_ctl.rc_prr_sndcnt = amm; 2494 else 2495 goto need_retran; 2496 } else { 2497 /* Set the send-new override */ 2498 if (out + amm <= tp->snd_wnd) 2499 rack->r_ctl.rc_tlp_new_data = amm; 2500 else 2501 goto need_retran; 2502 } 2503 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2504 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2505 rack->r_ctl.rc_tlpsend = NULL; 2506 counter_u64_add(rack_tlp_newdata, 1); 2507 goto send; 2508 } 2509 need_retran: 2510 /* 2511 * Ok we need to arrange the last un-acked segment to be re-sent, or 2512 * optionally the first un-acked segment. 2513 */ 2514 if (rack_always_send_oldest) 2515 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2516 else { 2517 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 2518 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2519 rsm = rack_find_high_nonack(rack, rsm); 2520 } 2521 } 2522 if (rsm == NULL) { 2523 counter_u64_add(rack_tlp_does_nada, 1); 2524 #ifdef TCP_BLACKBOX 2525 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2526 #endif 2527 goto out; 2528 } 2529 if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { 2530 /* 2531 * We need to split this the last segment in two. 2532 */ 2533 int32_t idx; 2534 struct rack_sendmap *nrsm; 2535 2536 nrsm = rack_alloc(rack); 2537 if (nrsm == NULL) { 2538 /* 2539 * No memory to split, we will just exit and punt 2540 * off to the RXT timer. 
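/*
 * When the tail-loss-probe candidate below is larger than one MSS, the code
 * carves the final maxseg bytes into a new map entry so that only those bytes
 * are probed. An isolated sketch of that split on a plain half-open
 * [start, end) range, with illustrative types.
 */
#include <stdint.h>

struct range { uint32_t start, end; };	/* [start, end), end is exclusive */

/*
 * Shrink *whole so it ends maxseg bytes early and return the tail piece that
 * a TLP would (re)send. Caller guarantees end - start > maxseg.
 */
static struct range
split_tail_for_tlp(struct range *whole, uint32_t maxseg)
{
	struct range tail;

	tail.start = whole->end - maxseg;
	tail.end = whole->end;
	whole->end = tail.start;	/* original piece no longer owns the tail */
	return (tail);
}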
2541 */ 2542 counter_u64_add(rack_tlp_does_nada, 1); 2543 goto out; 2544 } 2545 nrsm->r_start = (rsm->r_end - tp->t_maxseg); 2546 nrsm->r_end = rsm->r_end; 2547 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2548 nrsm->r_flags = rsm->r_flags; 2549 nrsm->r_sndcnt = rsm->r_sndcnt; 2550 nrsm->r_rtr_bytes = 0; 2551 rsm->r_end = nrsm->r_start; 2552 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2553 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2554 } 2555 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 2556 if (rsm->r_in_tmap) { 2557 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2558 nrsm->r_in_tmap = 1; 2559 } 2560 rsm->r_flags &= (~RACK_HAS_FIN); 2561 rsm = nrsm; 2562 } 2563 rack->r_ctl.rc_tlpsend = rsm; 2564 rack->r_ctl.rc_tlp_rtx_out = 1; 2565 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2566 rack->r_ctl.rc_tlp_seg_send_cnt++; 2567 tp->t_rxtshift++; 2568 } else { 2569 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2570 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2571 } 2572 send: 2573 rack->r_ctl.rc_tlp_send_cnt++; 2574 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2575 /* 2576 * Can't [re]/transmit a segment we have not heard from the 2577 * peer in max times. We need the retransmit timer to take 2578 * over. 2579 */ 2580 restore: 2581 rack->r_ctl.rc_tlpsend = NULL; 2582 if (rsm) 2583 rsm->r_flags &= ~RACK_TLP; 2584 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2585 counter_u64_add(rack_tlp_retran_fail, 1); 2586 goto out; 2587 } else if (rsm) { 2588 rsm->r_flags |= RACK_TLP; 2589 } 2590 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2591 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2592 /* 2593 * We don't want to send a single segment more than the max 2594 * either. 2595 */ 2596 goto restore; 2597 } 2598 rack->r_timer_override = 1; 2599 rack->r_tlp_running = 1; 2600 rack->rc_tlp_in_progress = 1; 2601 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2602 return (0); 2603 out: 2604 rack->rc_timer_up = 0; 2605 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2606 return (0); 2607 } 2608 2609 /* 2610 * Delayed ack Timer, here we simply need to setup the 2611 * ACK_NOW flag and remove the DELACK flag. From there 2612 * the output routine will send the ack out. 2613 * 2614 * We only return 1, saying don't proceed, if all timers 2615 * are stopped (destroyed PCB?). 2616 */ 2617 static int 2618 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2619 { 2620 if (tp->t_timers->tt_flags & TT_STOPPED) { 2621 return (1); 2622 } 2623 rack_log_to_event(rack, RACK_TO_FRM_DELACK); 2624 tp->t_flags &= ~TF_DELACK; 2625 tp->t_flags |= TF_ACKNOW; 2626 TCPSTAT_INC(tcps_delack); 2627 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2628 return (0); 2629 } 2630 2631 /* 2632 * Persists timer, here we simply need to setup the 2633 * FORCE-DATA flag the output routine will send 2634 * the one byte send. 2635 * 2636 * We only return 1, saying don't proceed, if all timers 2637 * are stopped (destroyed PCB?). 2638 */ 2639 static int 2640 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2641 { 2642 struct inpcb *inp; 2643 int32_t retval = 0; 2644 2645 inp = tp->t_inpcb; 2646 2647 if (tp->t_timers->tt_flags & TT_STOPPED) { 2648 return (1); 2649 } 2650 if (rack->rc_in_persist == 0) 2651 return (0); 2652 if (rack_progress_timeout_check(tp)) { 2653 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2654 return (1); 2655 } 2656 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2657 /* 2658 * Persistence timer into zero window. 
Force a byte to be output, if 2659 * possible. 2660 */ 2661 TCPSTAT_INC(tcps_persisttimeo); 2662 /* 2663 * Hack: if the peer is dead/unreachable, we do not time out if the 2664 * window is closed. After a full backoff, drop the connection if 2665 * the idle time (no responses to probes) reaches the maximum 2666 * backoff that we would use if retransmitting. 2667 */ 2668 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2669 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2670 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2671 TCPSTAT_INC(tcps_persistdrop); 2672 retval = 1; 2673 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2674 goto out; 2675 } 2676 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2677 tp->snd_una == tp->snd_max) 2678 rack_exit_persist(tp, rack); 2679 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2680 /* 2681 * If the user has closed the socket then drop a persisting 2682 * connection after a much reduced timeout. 2683 */ 2684 if (tp->t_state > TCPS_CLOSE_WAIT && 2685 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2686 retval = 1; 2687 TCPSTAT_INC(tcps_persistdrop); 2688 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2689 goto out; 2690 } 2691 tp->t_flags |= TF_FORCEDATA; 2692 out: 2693 rack_log_to_event(rack, RACK_TO_FRM_PERSIST); 2694 return (retval); 2695 } 2696 2697 /* 2698 * If a keepalive goes off, we had no other timers 2699 * happening. We always return 1 here since this 2700 * routine either drops the connection or sends 2701 * out a segment with respond. 2702 */ 2703 static int 2704 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2705 { 2706 struct tcptemp *t_template; 2707 struct inpcb *inp; 2708 2709 if (tp->t_timers->tt_flags & TT_STOPPED) { 2710 return (1); 2711 } 2712 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 2713 inp = tp->t_inpcb; 2714 rack_log_to_event(rack, RACK_TO_FRM_KEEP); 2715 /* 2716 * Keep-alive timer went off; send something or drop connection if 2717 * idle for too long. 2718 */ 2719 TCPSTAT_INC(tcps_keeptimeo); 2720 if (tp->t_state < TCPS_ESTABLISHED) 2721 goto dropit; 2722 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2723 tp->t_state <= TCPS_CLOSING) { 2724 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 2725 goto dropit; 2726 /* 2727 * Send a packet designed to force a response if the peer is 2728 * up and reachable: either an ACK if the connection is 2729 * still alive, or an RST if the peer has closed the 2730 * connection due to timeout or reboot. Using sequence 2731 * number tp->snd_una-1 causes the transmitted zero-length 2732 * segment to lie outside the receive window; by the 2733 * protocol spec, this requires the correspondent TCP to 2734 * respond. 2735 */ 2736 TCPSTAT_INC(tcps_keepprobe); 2737 t_template = tcpip_maketemplate(inp); 2738 if (t_template) { 2739 tcp_respond(tp, t_template->tt_ipgen, 2740 &t_template->tt_t, (struct mbuf *)NULL, 2741 tp->rcv_nxt, tp->snd_una - 1, 0); 2742 free(t_template, M_TEMP); 2743 } 2744 } 2745 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 2746 return (1); 2747 dropit: 2748 TCPSTAT_INC(tcps_keepdrops); 2749 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2750 return (1); 2751 } 2752 2753 /* 2754 * Retransmit helper function, clear up all the ack 2755 * flags and take care of important book keeping. 2756 */ 2757 static void 2758 rack_remxt_tmr(struct tcpcb *tp) 2759 { 2760 /* 2761 * The retransmit timer went off, all sack'd blocks must be 2762 * un-acked. 
2763 */ 2764 struct rack_sendmap *rsm, *trsm = NULL; 2765 struct tcp_rack *rack; 2766 int32_t cnt = 0; 2767 2768 rack = (struct tcp_rack *)tp->t_fb_ptr; 2769 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 2770 rack_log_to_event(rack, RACK_TO_FRM_TMR); 2771 if (rack->r_state && (rack->r_state != tp->t_state)) 2772 rack_set_state(tp, rack); 2773 /* 2774 * Ideally we would like to be able to 2775 * mark SACK-PASS on anything not acked here. 2776 * However, if we do that we would burst out 2777 * all that data 1ms apart. This would be unwise, 2778 * so for now we will just let the normal rxt timer 2779 * and tlp timer take care of it. 2780 */ 2781 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 2782 if (rsm->r_flags & RACK_ACKED) { 2783 cnt++; 2784 rsm->r_sndcnt = 0; 2785 if (rsm->r_in_tmap == 0) { 2786 /* We must re-add it back to the tlist */ 2787 if (trsm == NULL) { 2788 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 2789 } else { 2790 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 2791 } 2792 rsm->r_in_tmap = 1; 2793 trsm = rsm; 2794 } 2795 } 2796 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 2797 } 2798 /* Clear the count (we just un-acked them) */ 2799 rack->r_ctl.rc_sacked = 0; 2800 /* Clear the tlp rtx mark */ 2801 rack->r_ctl.rc_tlp_rtx_out = 0; 2802 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2803 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); 2804 /* Setup so we send one segment */ 2805 if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) 2806 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2807 rack->r_timer_override = 1; 2808 } 2809 2810 /* 2811 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 2812 * we will setup to retransmit the lowest seq number outstanding. 2813 */ 2814 static int 2815 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2816 { 2817 int32_t rexmt; 2818 struct inpcb *inp; 2819 int32_t retval = 0; 2820 2821 inp = tp->t_inpcb; 2822 if (tp->t_timers->tt_flags & TT_STOPPED) { 2823 return (1); 2824 } 2825 if (rack_progress_timeout_check(tp)) { 2826 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2827 return (1); 2828 } 2829 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 2830 if (TCPS_HAVEESTABLISHED(tp->t_state) && 2831 (tp->snd_una == tp->snd_max)) { 2832 /* Nothing outstanding .. nothing to do */ 2833 return (0); 2834 } 2835 /* 2836 * Retransmission timer went off. Message has not been acked within 2837 * retransmit interval. Back off to a longer retransmit interval 2838 * and retransmit one segment. 2839 */ 2840 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 2841 tp->t_rxtshift = TCP_MAXRXTSHIFT; 2842 TCPSTAT_INC(tcps_timeoutdrop); 2843 retval = 1; 2844 tcp_set_inp_to_drop(rack->rc_inp, 2845 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 2846 goto out; 2847 } 2848 rack_remxt_tmr(tp); 2849 if (tp->t_state == TCPS_SYN_SENT) { 2850 /* 2851 * If the SYN was retransmitted, indicate CWND to be limited 2852 * to 1 segment in cc_conn_init(). 2853 */ 2854 tp->snd_cwnd = 1; 2855 } else if (tp->t_rxtshift == 1) { 2856 /* 2857 * first retransmit; record ssthresh and cwnd so they can be 2858 * recovered if this turns out to be a "bad" retransmit. A 2859 * retransmit is considered "bad" if an ACK for this segment 2860 * is received within RTT/2 interval; the assumption here is 2861 * that the ACK was already in flight. See "On Estimating 2862 * End-to-End Network Path Properties" by Allman and Paxson 2863 * for more details. 
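/*
 * The "bad retransmit" heuristic described below snapshots cwnd, ssthresh and
 * the recovery point on the first RTO and arms a short window of roughly
 * RTT/2; an ACK arriving inside that window suggests the RTO was spurious and
 * CC_RTO_ERR restores the snapshot. A sketch of the snapshot and the window
 * test, with illustrative structure and field names.
 */
#include <stdint.h>
#include <stdbool.h>

struct rxt_snapshot {
	uint32_t cwnd_prev, ssthresh_prev;
	uint32_t badrxtwin;	/* tick value before which an ACK means "spurious" */
};

static void
take_snapshot(struct rxt_snapshot *s, uint32_t cwnd, uint32_t ssthresh,
    uint32_t now_ticks, uint32_t srtt_ticks)
{
	s->cwnd_prev = cwnd;
	s->ssthresh_prev = ssthresh;
	s->badrxtwin = now_ticks + srtt_ticks / 2;	/* ACK likely already in flight */
}

static bool
rto_was_spurious(const struct rxt_snapshot *s, uint32_t now_ticks)
{
	return (s->badrxtwin != 0 && (int32_t)(s->badrxtwin - now_ticks) > 0);
}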
2864 */ 2865 tp->snd_cwnd_prev = tp->snd_cwnd; 2866 tp->snd_ssthresh_prev = tp->snd_ssthresh; 2867 tp->snd_recover_prev = tp->snd_recover; 2868 if (IN_FASTRECOVERY(tp->t_flags)) 2869 tp->t_flags |= TF_WASFRECOVERY; 2870 else 2871 tp->t_flags &= ~TF_WASFRECOVERY; 2872 if (IN_CONGRECOVERY(tp->t_flags)) 2873 tp->t_flags |= TF_WASCRECOVERY; 2874 else 2875 tp->t_flags &= ~TF_WASCRECOVERY; 2876 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 2877 tp->t_flags |= TF_PREVVALID; 2878 } else 2879 tp->t_flags &= ~TF_PREVVALID; 2880 TCPSTAT_INC(tcps_rexmttimeo); 2881 if ((tp->t_state == TCPS_SYN_SENT) || 2882 (tp->t_state == TCPS_SYN_RECEIVED)) 2883 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 2884 else 2885 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 2886 TCPT_RANGESET(tp->t_rxtcur, rexmt, 2887 max(MSEC_2_TICKS(rack_rto_min), rexmt), 2888 MSEC_2_TICKS(rack_rto_max)); 2889 /* 2890 * We enter the path for PLMTUD if connection is established or, if 2891 * connection is FIN_WAIT_1 status, reason for the last is that if 2892 * amount of data we send is very small, we could send it in couple 2893 * of packets and process straight to FIN. In that case we won't 2894 * catch ESTABLISHED state. 2895 */ 2896 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 2897 || (tp->t_state == TCPS_FIN_WAIT_1))) { 2898 #ifdef INET6 2899 int32_t isipv6; 2900 #endif 2901 2902 /* 2903 * Idea here is that at each stage of mtu probe (usually, 2904 * 1448 -> 1188 -> 524) should be given 2 chances to recover 2905 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 2906 * should take care of that. 2907 */ 2908 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 2909 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 2910 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 2911 tp->t_rxtshift % 2 == 0)) { 2912 /* 2913 * Enter Path MTU Black-hole Detection mechanism: - 2914 * Disable Path MTU Discovery (IP "DF" bit). - 2915 * Reduce MTU to lower value than what we negotiated 2916 * with peer. 2917 */ 2918 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 2919 /* Record that we may have found a black hole. */ 2920 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 2921 /* Keep track of previous MSS. */ 2922 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 2923 } 2924 2925 /* 2926 * Reduce the MSS to blackhole value or to the 2927 * default in an attempt to retransmit. 2928 */ 2929 #ifdef INET6 2930 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 2931 if (isipv6 && 2932 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 2933 /* Use the sysctl tuneable blackhole MSS. */ 2934 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 2935 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2936 } else if (isipv6) { 2937 /* Use the default MSS. */ 2938 tp->t_maxseg = V_tcp_v6mssdflt; 2939 /* 2940 * Disable Path MTU Discovery when we switch 2941 * to minmss. 2942 */ 2943 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2944 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2945 } 2946 #endif 2947 #if defined(INET6) && defined(INET) 2948 else 2949 #endif 2950 #ifdef INET 2951 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 2952 /* Use the sysctl tuneable blackhole MSS. */ 2953 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 2954 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2955 } else { 2956 /* Use the default MSS. */ 2957 tp->t_maxseg = V_tcp_mssdflt; 2958 /* 2959 * Disable Path MTU Discovery when we switch 2960 * to minmss. 
2961 */ 2962 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2963 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2964 } 2965 #endif 2966 } else { 2967 /* 2968 * If further retransmissions are still unsuccessful 2969 * with a lowered MTU, maybe this isn't a blackhole 2970 * and we restore the previous MSS and blackhole 2971 * detection flags. The limit '6' is determined by 2972 * giving each probe stage (1448, 1188, 524) 2 2973 * chances to recover. 2974 */ 2975 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 2976 (tp->t_rxtshift >= 6)) { 2977 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 2978 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 2979 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 2980 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 2981 } 2982 } 2983 } 2984 /* 2985 * Disable RFC1323 and SACK if we haven't got any response to our 2986 * third SYN to work-around some broken terminal servers (most of 2987 * which have hopefully been retired) that have bad VJ header 2988 * compression code which trashes TCP segments containing 2989 * unknown-to-them TCP options. 2990 */ 2991 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 2992 (tp->t_rxtshift == 3)) 2993 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 2994 /* 2995 * If we backed off this far, our srtt estimate is probably bogus. 2996 * Clobber it so we'll take the next rtt measurement as our srtt; 2997 * move the current srtt into rttvar to keep the current retransmit 2998 * times until then. 2999 */ 3000 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3001 #ifdef INET6 3002 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3003 in6_losing(tp->t_inpcb); 3004 else 3005 #endif 3006 in_losing(tp->t_inpcb); 3007 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3008 tp->t_srtt = 0; 3009 } 3010 if (rack_use_sack_filter) 3011 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3012 tp->snd_recover = tp->snd_max; 3013 tp->t_flags |= TF_ACKNOW; 3014 tp->t_rtttime = 0; 3015 rack_cong_signal(tp, NULL, CC_RTO); 3016 out: 3017 return (retval); 3018 } 3019 3020 static int 3021 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3022 { 3023 int32_t ret = 0; 3024 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3025 3026 if (timers == 0) { 3027 return (0); 3028 } 3029 if (tp->t_state == TCPS_LISTEN) { 3030 /* no timers on listen sockets */ 3031 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3032 return (0); 3033 return (1); 3034 } 3035 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3036 uint32_t left; 3037 3038 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3039 ret = -1; 3040 rack_log_to_processing(rack, cts, ret, 0); 3041 return (0); 3042 } 3043 if (hpts_calling == 0) { 3044 ret = -2; 3045 rack_log_to_processing(rack, cts, ret, 0); 3046 return (0); 3047 } 3048 /* 3049 * Ok our timer went off early and we are not paced false 3050 * alarm, go back to sleep. 
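/*
 * rack_process_timers() below dispatches on whichever PACE_TMR_* bit is set;
 * only one of the mutually exclusive timer bits is armed at a time, and the
 * checks run in the same order as the if/else chain in the code. A sketch of
 * that dispatch with illustrative flag values (the kernel's masks differ).
 */
#include <stdint.h>

#define TMR_DELACK	0x01
#define TMR_RACK	0x02
#define TMR_TLP		0x04
#define TMR_RXT		0x08
#define TMR_PERSIT	0x10
#define TMR_KEEP	0x20

static const char *
expired_timer_name(uint32_t flags)
{
	if (flags & TMR_DELACK)	return ("delayed-ack");
	if (flags & TMR_RACK)	return ("rack-reorder");
	if (flags & TMR_TLP)	return ("tail-loss-probe");
	if (flags & TMR_RXT)	return ("retransmit");
	if (flags & TMR_PERSIT)	return ("persist");
	if (flags & TMR_KEEP)	return ("keep-alive");
	return ("none");
}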
3051 */ 3052 ret = -3; 3053 left = rack->r_ctl.rc_timer_exp - cts; 3054 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3055 rack_log_to_processing(rack, cts, ret, left); 3056 rack->rc_last_pto_set = 0; 3057 return (1); 3058 } 3059 rack->rc_tmr_stopped = 0; 3060 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3061 if (timers & PACE_TMR_DELACK) { 3062 ret = rack_timeout_delack(tp, rack, cts); 3063 } else if (timers & PACE_TMR_RACK) { 3064 ret = rack_timeout_rack(tp, rack, cts); 3065 } else if (timers & PACE_TMR_TLP) { 3066 ret = rack_timeout_tlp(tp, rack, cts); 3067 } else if (timers & PACE_TMR_RXT) { 3068 ret = rack_timeout_rxt(tp, rack, cts); 3069 } else if (timers & PACE_TMR_PERSIT) { 3070 ret = rack_timeout_persist(tp, rack, cts); 3071 } else if (timers & PACE_TMR_KEEP) { 3072 ret = rack_timeout_keepalive(tp, rack, cts); 3073 } 3074 rack_log_to_processing(rack, cts, ret, timers); 3075 return (ret); 3076 } 3077 3078 static void 3079 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3080 { 3081 uint8_t hpts_removed = 0; 3082 3083 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3084 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3085 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3086 hpts_removed = 1; 3087 } 3088 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3089 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3090 if (rack->rc_inp->inp_in_hpts && 3091 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3092 /* 3093 * Canceling timer's when we have no output being 3094 * paced. We also must remove ourselves from the 3095 * hpts. 3096 */ 3097 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3098 hpts_removed = 1; 3099 } 3100 rack_log_to_cancel(rack, hpts_removed, line); 3101 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3102 } 3103 } 3104 3105 static void 3106 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3107 { 3108 return; 3109 } 3110 3111 static int 3112 rack_stopall(struct tcpcb *tp) 3113 { 3114 struct tcp_rack *rack; 3115 rack = (struct tcp_rack *)tp->t_fb_ptr; 3116 rack->t_timers_stopped = 1; 3117 return (0); 3118 } 3119 3120 static void 3121 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3122 { 3123 return; 3124 } 3125 3126 static int 3127 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3128 { 3129 return (0); 3130 } 3131 3132 static void 3133 rack_stop_all_timers(struct tcpcb *tp) 3134 { 3135 struct tcp_rack *rack; 3136 3137 /* 3138 * Assure no timers are running. 
3139 */ 3140 if (tcp_timer_active(tp, TT_PERSIST)) { 3141 /* We enter in persists, set the flag appropriately */ 3142 rack = (struct tcp_rack *)tp->t_fb_ptr; 3143 rack->rc_in_persist = 1; 3144 } 3145 tcp_timer_suspend(tp, TT_PERSIST); 3146 tcp_timer_suspend(tp, TT_REXMT); 3147 tcp_timer_suspend(tp, TT_KEEP); 3148 tcp_timer_suspend(tp, TT_DELACK); 3149 } 3150 3151 static void 3152 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3153 struct rack_sendmap *rsm, uint32_t ts) 3154 { 3155 int32_t idx; 3156 3157 rsm->r_rtr_cnt++; 3158 rsm->r_sndcnt++; 3159 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3160 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3161 rsm->r_flags |= RACK_OVERMAX; 3162 } 3163 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3164 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3165 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3166 } 3167 idx = rsm->r_rtr_cnt - 1; 3168 rsm->r_tim_lastsent[idx] = ts; 3169 if (rsm->r_flags & RACK_ACKED) { 3170 /* Problably MTU discovery messing with us */ 3171 rsm->r_flags &= ~RACK_ACKED; 3172 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3173 } 3174 if (rsm->r_in_tmap) { 3175 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3176 } 3177 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3178 rsm->r_in_tmap = 1; 3179 if (rsm->r_flags & RACK_SACK_PASSED) { 3180 /* We have retransmitted due to the SACK pass */ 3181 rsm->r_flags &= ~RACK_SACK_PASSED; 3182 rsm->r_flags |= RACK_WAS_SACKPASS; 3183 } 3184 /* Update memory for next rtr */ 3185 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3186 } 3187 3188 3189 static uint32_t 3190 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3191 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) 3192 { 3193 /* 3194 * We (re-)transmitted starting at rsm->r_start for some length 3195 * (possibly less than r_end. 3196 */ 3197 struct rack_sendmap *nrsm; 3198 uint32_t c_end; 3199 int32_t len; 3200 int32_t idx; 3201 3202 len = *lenp; 3203 c_end = rsm->r_start + len; 3204 if (SEQ_GEQ(c_end, rsm->r_end)) { 3205 /* 3206 * We retransmitted the whole piece or more than the whole 3207 * slopping into the next rsm. 3208 */ 3209 rack_update_rsm(tp, rack, rsm, ts); 3210 if (c_end == rsm->r_end) { 3211 *lenp = 0; 3212 return (0); 3213 } else { 3214 int32_t act_len; 3215 3216 /* Hangs over the end return whats left */ 3217 act_len = rsm->r_end - rsm->r_start; 3218 *lenp = (len - act_len); 3219 return (rsm->r_end); 3220 } 3221 /* We don't get out of this block. */ 3222 } 3223 /* 3224 * Here we retransmitted less than the whole thing which means we 3225 * have to split this into what was transmitted and what was not. 3226 */ 3227 nrsm = rack_alloc(rack); 3228 if (nrsm == NULL) { 3229 /* 3230 * We can't get memory, so lets not proceed. 3231 */ 3232 *lenp = 0; 3233 return (0); 3234 } 3235 /* 3236 * So here we are going to take the original rsm and make it what we 3237 * retransmitted. nrsm will be the tail portion we did not 3238 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3239 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3240 * 1, 6 and the new piece will be 6, 11. 
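/*
 * A standalone illustration of the split just described: a map entry covering
 * [1, 11) from which only 5 bytes were retransmitted ends up as [1, 6)
 * retransmitted plus [6, 11) untouched. The helper below performs that
 * arithmetic on plain integers; names and types are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

struct piece { uint32_t start, end; };	/* [start, end), end exclusive */

/* Split 'orig' at start + sent_len; the returned piece is the unsent tail. */
static struct piece
split_after_partial_send(struct piece *orig, uint32_t sent_len)
{
	struct piece tail = { orig->start + sent_len, orig->end };

	orig->end = tail.start;
	return (tail);
}

int
main(void)
{
	struct piece p = { 1, 11 };
	struct piece tail = split_after_partial_send(&p, 5);

	/* Prints: sent [1,6) tail [6,11) */
	printf("sent [%u,%u) tail [%u,%u)\n", p.start, p.end, tail.start, tail.end);
	return (0);
}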
3241 */ 3242 nrsm->r_start = c_end; 3243 nrsm->r_end = rsm->r_end; 3244 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3245 nrsm->r_flags = rsm->r_flags; 3246 nrsm->r_sndcnt = rsm->r_sndcnt; 3247 nrsm->r_rtr_bytes = 0; 3248 rsm->r_end = c_end; 3249 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3250 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3251 } 3252 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3253 if (rsm->r_in_tmap) { 3254 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3255 nrsm->r_in_tmap = 1; 3256 } 3257 rsm->r_flags &= (~RACK_HAS_FIN); 3258 rack_update_rsm(tp, rack, rsm, ts); 3259 *lenp = 0; 3260 return (0); 3261 } 3262 3263 3264 static void 3265 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3266 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3267 uint8_t pass, struct rack_sendmap *hintrsm) 3268 { 3269 struct tcp_rack *rack; 3270 struct rack_sendmap *rsm, *nrsm; 3271 register uint32_t snd_max, snd_una; 3272 int32_t idx; 3273 3274 /* 3275 * Add to the RACK log of packets in flight or retransmitted. If 3276 * there is a TS option we will use the TS echoed, if not we will 3277 * grab a TS. 3278 * 3279 * Retransmissions will increment the count and move the ts to its 3280 * proper place. Note that if options do not include TS's then we 3281 * won't be able to effectively use the ACK for an RTT on a retran. 3282 * 3283 * Notes about r_start and r_end. Lets consider a send starting at 3284 * sequence 1 for 10 bytes. In such an example the r_start would be 3285 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3286 * This means that r_end is actually the first sequence for the next 3287 * slot (11). 3288 * 3289 */ 3290 /* 3291 * If err is set what do we do XXXrrs? should we not add the thing? 3292 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3293 * i.e. proceed with add ** do this for now. 3294 */ 3295 INP_WLOCK_ASSERT(tp->t_inpcb); 3296 if (err) 3297 /* 3298 * We don't log errors -- we could but snd_max does not 3299 * advance in this case either. 3300 */ 3301 return; 3302 3303 if (th_flags & TH_RST) { 3304 /* 3305 * We don't log resets and we return immediately from 3306 * sending 3307 */ 3308 return; 3309 } 3310 rack = (struct tcp_rack *)tp->t_fb_ptr; 3311 snd_una = tp->snd_una; 3312 if (SEQ_LEQ((seq_out + len), snd_una)) { 3313 /* Are sending an old segment to induce an ack (keep-alive)? */ 3314 return; 3315 } 3316 if (SEQ_LT(seq_out, snd_una)) { 3317 /* huh? should we panic? */ 3318 uint32_t end; 3319 3320 end = seq_out + len; 3321 seq_out = snd_una; 3322 len = end - seq_out; 3323 } 3324 snd_max = tp->snd_max; 3325 if (th_flags & (TH_SYN | TH_FIN)) { 3326 /* 3327 * The call to rack_log_output is made before bumping 3328 * snd_max. This means we can record one extra byte on a SYN 3329 * or FIN if seq_out is adding more on and a FIN is present 3330 * (and we are not resending). 3331 */ 3332 if (th_flags & TH_SYN) 3333 len++; 3334 if (th_flags & TH_FIN) 3335 len++; 3336 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3337 /* 3338 * The add/update as not been done for the FIN/SYN 3339 * yet. 3340 */ 3341 snd_max = tp->snd_nxt; 3342 } 3343 } 3344 if (len == 0) { 3345 /* We don't log zero window probes */ 3346 return; 3347 } 3348 rack->r_ctl.rc_time_last_sent = ts; 3349 if (IN_RECOVERY(tp->t_flags)) { 3350 rack->r_ctl.rc_prr_out += len; 3351 } 3352 /* First question is it a retransmission? 
*/ 3353 if (seq_out == snd_max) { 3354 again: 3355 rsm = rack_alloc(rack); 3356 if (rsm == NULL) { 3357 /* 3358 * Hmm out of memory and the tcb got destroyed while 3359 * we tried to wait. 3360 */ 3361 #ifdef INVARIANTS 3362 panic("Out of memory when we should not be rack:%p", rack); 3363 #endif 3364 return; 3365 } 3366 if (th_flags & TH_FIN) { 3367 rsm->r_flags = RACK_HAS_FIN; 3368 } else { 3369 rsm->r_flags = 0; 3370 } 3371 rsm->r_tim_lastsent[0] = ts; 3372 rsm->r_rtr_cnt = 1; 3373 rsm->r_rtr_bytes = 0; 3374 if (th_flags & TH_SYN) { 3375 /* The data space is one beyond snd_una */ 3376 rsm->r_start = seq_out + 1; 3377 rsm->r_end = rsm->r_start + (len - 1); 3378 } else { 3379 /* Normal case */ 3380 rsm->r_start = seq_out; 3381 rsm->r_end = rsm->r_start + len; 3382 } 3383 rsm->r_sndcnt = 0; 3384 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 3385 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3386 rsm->r_in_tmap = 1; 3387 return; 3388 } 3389 /* 3390 * If we reach here its a retransmission and we need to find it. 3391 */ 3392 more: 3393 if (hintrsm && (hintrsm->r_start == seq_out)) { 3394 rsm = hintrsm; 3395 hintrsm = NULL; 3396 } else if (rack->r_ctl.rc_next) { 3397 /* We have a hint from a previous run */ 3398 rsm = rack->r_ctl.rc_next; 3399 } else { 3400 /* No hints sorry */ 3401 rsm = NULL; 3402 } 3403 if ((rsm) && (rsm->r_start == seq_out)) { 3404 /* 3405 * We used rc_next or hintrsm to retransmit, hopefully the 3406 * likely case. 3407 */ 3408 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3409 if (len == 0) { 3410 return; 3411 } else { 3412 goto more; 3413 } 3414 } 3415 /* Ok it was not the last pointer go through it the hard way. */ 3416 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3417 if (rsm->r_start == seq_out) { 3418 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3419 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3420 if (len == 0) { 3421 return; 3422 } else { 3423 continue; 3424 } 3425 } 3426 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3427 /* Transmitted within this piece */ 3428 /* 3429 * Ok we must split off the front and then let the 3430 * update do the rest 3431 */ 3432 nrsm = rack_alloc(rack); 3433 if (nrsm == NULL) { 3434 #ifdef INVARIANTS 3435 panic("Ran out of memory that was preallocated? rack:%p", rack); 3436 #endif 3437 rack_update_rsm(tp, rack, rsm, ts); 3438 return; 3439 } 3440 /* 3441 * copy rsm to nrsm and then trim the front of rsm 3442 * to not include this part. 3443 */ 3444 nrsm->r_start = seq_out; 3445 nrsm->r_end = rsm->r_end; 3446 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3447 nrsm->r_flags = rsm->r_flags; 3448 nrsm->r_sndcnt = rsm->r_sndcnt; 3449 nrsm->r_rtr_bytes = 0; 3450 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3451 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3452 } 3453 rsm->r_end = nrsm->r_start; 3454 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3455 if (rsm->r_in_tmap) { 3456 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3457 nrsm->r_in_tmap = 1; 3458 } 3459 rsm->r_flags &= (~RACK_HAS_FIN); 3460 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3461 if (len == 0) { 3462 return; 3463 } 3464 } 3465 } 3466 /* 3467 * Hmm not found in map did they retransmit both old and on into the 3468 * new? 
3469 */ 3470 if (seq_out == tp->snd_max) { 3471 goto again; 3472 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3473 #ifdef INVARIANTS 3474 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3475 seq_out, len, tp->snd_una, tp->snd_max); 3476 printf("Starting Dump of all rack entries\n"); 3477 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3478 printf("rsm:%p start:%u end:%u\n", 3479 rsm, rsm->r_start, rsm->r_end); 3480 } 3481 printf("Dump complete\n"); 3482 panic("seq_out not found rack:%p tp:%p", 3483 rack, tp); 3484 #endif 3485 } else { 3486 #ifdef INVARIANTS 3487 /* 3488 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3489 * flag) 3490 */ 3491 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3492 seq_out, len, tp->snd_max, tp); 3493 #endif 3494 } 3495 } 3496 3497 /* 3498 * Record one of the RTT updates from an ack into 3499 * our sample structure. 3500 */ 3501 static void 3502 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3503 { 3504 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3505 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3506 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3507 } 3508 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3509 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3510 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3511 } 3512 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3513 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3514 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3515 } 3516 3517 /* 3518 * Collect new round-trip time estimate 3519 * and update averages and current timeout. 3520 */ 3521 static void 3522 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3523 { 3524 int32_t delta; 3525 uint32_t o_srtt, o_var; 3526 int32_t rtt; 3527 3528 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3529 /* No valid sample */ 3530 return; 3531 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3532 /* We are to use the lowest RTT seen in a single ack */ 3533 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3534 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3535 /* We are to use the highest RTT seen in a single ack */ 3536 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3537 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3538 /* We are to use the average RTT seen in a single ack */ 3539 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3540 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3541 } else { 3542 #ifdef INVARIANTS 3543 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3544 #endif 3545 return; 3546 } 3547 if (rtt == 0) 3548 rtt = 1; 3549 rack_log_rtt_sample(rack, rtt); 3550 o_srtt = tp->t_srtt; 3551 o_var = tp->t_rttvar; 3552 rack = (struct tcp_rack *)tp->t_fb_ptr; 3553 if (tp->t_srtt != 0) { 3554 /* 3555 * srtt is stored as fixed point with 5 bits after the 3556 * binary point (i.e., scaled by 8). The following magic is 3557 * equivalent to the smoothing algorithm in rfc793 with an 3558 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3559 * Adjust rtt to origin 0. 3560 */ 3561 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3562 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3563 3564 tp->t_srtt += delta; 3565 if (tp->t_srtt <= 0) 3566 tp->t_srtt = 1; 3567 3568 /* 3569 * We accumulate a smoothed rtt variance (actually, a 3570 * smoothed mean difference), then set the retransmit timer 3571 * to smoothed rtt + 4 times the smoothed variance. rttvar 3572 * is stored as fixed point with 4 bits after the binary 3573 * point (scaled by 16). 
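(As a worked check of the srtt update above, assuming the stock TCP_RTT_SHIFT of 5 and TCP_DELTA_SHIFT of 2: t_srtt == 3200, i.e. 100 ticks, and a new rtt of 132 ticks give delta == (131 << 2) - (3200 >> 3) == 124, so t_srtt becomes 3324, which is 7/8 of the old estimate plus 1/8 of the origin-adjusted sample.)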
The following is equivalent to 3574 * rfc793 smoothing with an alpha of .75 (rttvar = 3575 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3576 * wired-in beta. 3577 */ 3578 if (delta < 0) 3579 delta = -delta; 3580 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3581 tp->t_rttvar += delta; 3582 if (tp->t_rttvar <= 0) 3583 tp->t_rttvar = 1; 3584 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3585 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3586 } else { 3587 /* 3588 * No rtt measurement yet - use the unsmoothed rtt. Set the 3589 * variance to half the rtt (so our first retransmit happens 3590 * at 3*rtt). 3591 */ 3592 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3593 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3594 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3595 } 3596 TCPSTAT_INC(tcps_rttupdated); 3597 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3598 tp->t_rttupdated++; 3599 #ifdef NETFLIX_STATS 3600 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3601 #endif 3602 tp->t_rxtshift = 0; 3603 3604 /* 3605 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3606 * way we do the smoothing, srtt and rttvar will each average +1/2 3607 * tick of bias. When we compute the retransmit timer, we want 1/2 3608 * tick of rounding and 1 extra tick because of +-1/2 tick 3609 * uncertainty in the firing of the timer. The bias will give us 3610 * exactly the 1.5 tick we need. But, because the bias is 3611 * statistical, we have to test that we don't drop below the minimum 3612 * feasible timer (which is 2 ticks). 3613 */ 3614 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3615 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3616 tp->t_softerror = 0; 3617 } 3618 3619 static void 3620 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3621 uint32_t t, uint32_t cts) 3622 { 3623 /* 3624 * For this RSM, we acknowledged the data from a previous 3625 * transmission, not the last one we made. This means we did a false 3626 * retransmit. 3627 */ 3628 struct tcp_rack *rack; 3629 3630 if (rsm->r_flags & RACK_HAS_FIN) { 3631 /* 3632 * The sending of the FIN often is multiple sent when we 3633 * have everything outstanding ack'd. We ignore this case 3634 * since its over now. 3635 */ 3636 return; 3637 } 3638 if (rsm->r_flags & RACK_TLP) { 3639 /* 3640 * We expect TLP's to have this occur. 3641 */ 3642 return; 3643 } 3644 rack = (struct tcp_rack *)tp->t_fb_ptr; 3645 /* should we undo cc changes and exit recovery? */ 3646 if (IN_RECOVERY(tp->t_flags)) { 3647 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3648 /* 3649 * Undo what we ratched down and exit recovery if 3650 * possible 3651 */ 3652 EXIT_RECOVERY(tp->t_flags); 3653 tp->snd_recover = tp->snd_una; 3654 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3655 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3656 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3657 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3658 } 3659 } 3660 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3661 /* 3662 * We retransmitted based on a sack and the earlier 3663 * retransmission ack'd it - re-ordering is occuring. 
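Count it and stamp rc_reorder_ts so the reordering state can later fade out if no further reordering is observed.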
3664 */ 3665 counter_u64_add(rack_reorder_seen, 1); 3666 rack->r_ctl.rc_reorder_ts = cts; 3667 } 3668 counter_u64_add(rack_badfr, 1); 3669 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3670 } 3671 3672 3673 static int 3674 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3675 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3676 { 3677 int32_t i; 3678 uint32_t t; 3679 3680 if (rsm->r_flags & RACK_ACKED) 3681 /* Already done */ 3682 return (0); 3683 3684 3685 if ((rsm->r_rtr_cnt == 1) || 3686 ((ack_type == CUM_ACKED) && 3687 (to->to_flags & TOF_TS) && 3688 (to->to_tsecr) && 3689 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 3690 ) { 3691 /* 3692 * We will only find a matching timestamp if its cum-acked. 3693 * But if its only one retransmission its for-sure matching 3694 * :-) 3695 */ 3696 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3697 if ((int)t <= 0) 3698 t = 1; 3699 if (!tp->t_rttlow || tp->t_rttlow > t) 3700 tp->t_rttlow = t; 3701 if (!rack->r_ctl.rc_rack_min_rtt || 3702 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3703 rack->r_ctl.rc_rack_min_rtt = t; 3704 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3705 rack->r_ctl.rc_rack_min_rtt = 1; 3706 } 3707 } 3708 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); 3709 if ((rsm->r_flags & RACK_TLP) && 3710 (!IN_RECOVERY(tp->t_flags))) { 3711 /* Segment was a TLP and our retrans matched */ 3712 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 3713 rack->r_ctl.rc_rsm_start = tp->snd_max; 3714 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 3715 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 3716 rack_cong_signal(tp, NULL, CC_NDUPACK); 3717 /* 3718 * When we enter recovery we need to assure 3719 * we send one packet. 3720 */ 3721 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 3722 } else 3723 rack->r_ctl.rc_tlp_rtx_out = 0; 3724 } 3725 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3726 /* New more recent rack_tmit_time */ 3727 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3728 rack->rc_rack_rtt = t; 3729 } 3730 return (1); 3731 } 3732 /* 3733 * We clear the soft/rxtshift since we got an ack. 3734 * There is no assurance we will call the commit() function 3735 * so we need to clear these to avoid incorrect handling. 3736 */ 3737 tp->t_rxtshift = 0; 3738 tp->t_softerror = 0; 3739 if ((to->to_flags & TOF_TS) && 3740 (ack_type == CUM_ACKED) && 3741 (to->to_tsecr) && 3742 ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { 3743 /* 3744 * Now which timestamp does it match? In this block the ACK 3745 * must be coming from a previous transmission. 3746 */ 3747 for (i = 0; i < rsm->r_rtr_cnt; i++) { 3748 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 3749 t = cts - rsm->r_tim_lastsent[i]; 3750 if ((int)t <= 0) 3751 t = 1; 3752 if ((i + 1) < rsm->r_rtr_cnt) { 3753 /* Likely */ 3754 rack_earlier_retran(tp, rsm, t, cts); 3755 } 3756 if (!tp->t_rttlow || tp->t_rttlow > t) 3757 tp->t_rttlow = t; 3758 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3759 rack->r_ctl.rc_rack_min_rtt = t; 3760 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3761 rack->r_ctl.rc_rack_min_rtt = 1; 3762 } 3763 } 3764 /* 3765 * Note the following calls to 3766 * tcp_rack_xmit_timer() are being commented 3767 * out for now. They give us no more accuracy 3768 * and often lead to a wrong choice. We have 3769 * enough samples that have not been 3770 * retransmitted. 
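(This is stricter than Karn's rule: even though the echoed timestamp tells us exactly which transmission was acked, the sample is only fed to the rack timing state below, never to srtt/rttvar.)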
I leave the commented out 3771 * code in here in case in the future we 3772 * decide to add it back (though I can't foresee 3773 * doing that). That way we will easily see 3774 * where they need to be placed. 3775 */ 3776 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 3777 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3778 /* New more recent rack_tmit_time */ 3779 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3780 rack->rc_rack_rtt = t; 3781 } 3782 return (1); 3783 } 3784 } 3785 goto ts_not_found; 3786 } else { 3787 /* 3788 * Ok, either it is a SACK block that we retransmitted, or a Windows 3789 * machine without timestamps. We can tell nothing from the 3790 * time-stamp since it is not there, or it is the time the peer last 3791 * received a segment that moved forward its cum-ack point. 3792 */ 3793 ts_not_found: 3794 i = rsm->r_rtr_cnt - 1; 3795 t = cts - rsm->r_tim_lastsent[i]; 3796 if ((int)t <= 0) 3797 t = 1; 3798 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3799 /* 3800 * We retransmitted and the ack came back in less 3801 * than the smallest rtt we have observed. We most 3802 * likely did an improper retransmit as outlined in 3803 * 4.2 Step 3 point 2 in the rack-draft. 3804 */ 3805 i = rsm->r_rtr_cnt - 2; 3806 t = cts - rsm->r_tim_lastsent[i]; 3807 rack_earlier_retran(tp, rsm, t, cts); 3808 } else if (rack->r_ctl.rc_rack_min_rtt) { 3809 /* 3810 * We retransmitted it and the retransmit did the 3811 * job. 3812 */ 3813 if (!rack->r_ctl.rc_rack_min_rtt || 3814 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3815 rack->r_ctl.rc_rack_min_rtt = t; 3816 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3817 rack->r_ctl.rc_rack_min_rtt = 1; 3818 } 3819 } 3820 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 3821 /* New more recent rack_tmit_time */ 3822 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 3823 rack->rc_rack_rtt = t; 3824 } 3825 return (1); 3826 } 3827 } 3828 return (0); 3829 } 3830 3831 /* 3832 * Mark the SACK_PASSED flag on all entries sent prior to rsm. 3833 */ 3834 static void 3835 rack_log_sack_passed(struct tcpcb *tp, 3836 struct tcp_rack *rack, struct rack_sendmap *rsm) 3837 { 3838 struct rack_sendmap *nrsm; 3839 uint32_t ts; 3840 int32_t idx; 3841 3842 idx = rsm->r_rtr_cnt - 1; 3843 ts = rsm->r_tim_lastsent[idx]; 3844 nrsm = rsm; 3845 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 3846 rack_head, r_tnext) { 3847 if (nrsm == rsm) { 3848 /* Skip the original segment, it is acked */ 3849 continue; 3850 } 3851 if (nrsm->r_flags & RACK_ACKED) { 3852 /* Skip ack'd segments */ 3853 continue; 3854 } 3855 idx = nrsm->r_rtr_cnt - 1; 3856 if (ts == nrsm->r_tim_lastsent[idx]) { 3857 /* 3858 * For this case let's use the seq no; if we sent in a 3859 * big block (TSO) we would have a bunch of segments 3860 * sent at the same time. 3861 * 3862 * We would only get a report if its SEQ is earlier. 3863 * If we have done multiple retransmits the times 3864 * would not be equal. 3865 */ 3866 if (SEQ_LT(nrsm->r_start, rsm->r_start)) { 3867 nrsm->r_flags |= RACK_SACK_PASSED; 3868 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3869 } 3870 } else { 3871 /* 3872 * Here they were sent at different times, not a big 3873 * block.
Since we transmitted this one later and 3874 * see it sack'd then this must also be missing (or 3875 * we would have gotten a sack block for it) 3876 */ 3877 nrsm->r_flags |= RACK_SACK_PASSED; 3878 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3879 } 3880 } 3881 } 3882 3883 static uint32_t 3884 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 3885 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) 3886 { 3887 int32_t idx; 3888 int32_t times = 0; 3889 uint32_t start, end, changed = 0; 3890 struct rack_sendmap *rsm, *nrsm; 3891 int32_t used_ref = 1; 3892 3893 start = sack->start; 3894 end = sack->end; 3895 rsm = *prsm; 3896 if (rsm && SEQ_LT(start, rsm->r_start)) { 3897 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { 3898 if (SEQ_GEQ(start, rsm->r_start) && 3899 SEQ_LT(start, rsm->r_end)) { 3900 goto do_rest_ofb; 3901 } 3902 } 3903 } 3904 if (rsm == NULL) { 3905 start_at_beginning: 3906 rsm = NULL; 3907 used_ref = 0; 3908 } 3909 /* First lets locate the block where this guy is */ 3910 TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { 3911 if (SEQ_GEQ(start, rsm->r_start) && 3912 SEQ_LT(start, rsm->r_end)) { 3913 break; 3914 } 3915 } 3916 do_rest_ofb: 3917 if (rsm == NULL) { 3918 /* 3919 * This happens when we get duplicate sack blocks with the 3920 * same end. For example SACK 4: 100 SACK 3: 100 The sort 3921 * will not change there location so we would just start at 3922 * the end of the first one and get lost. 3923 */ 3924 if (tp->t_flags & TF_SENTFIN) { 3925 /* 3926 * Check to see if we have not logged the FIN that 3927 * went out. 3928 */ 3929 nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 3930 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 3931 /* 3932 * Ok we did not get the FIN logged. 3933 */ 3934 nrsm->r_end++; 3935 rsm = nrsm; 3936 goto do_rest_ofb; 3937 } 3938 } 3939 if (times == 1) { 3940 #ifdef INVARIANTS 3941 panic("tp:%p rack:%p sack:%p to:%p prsm:%p", 3942 tp, rack, sack, to, prsm); 3943 #else 3944 goto out; 3945 #endif 3946 } 3947 times++; 3948 counter_u64_add(rack_sack_proc_restart, 1); 3949 goto start_at_beginning; 3950 } 3951 /* Ok we have an ACK for some piece of rsm */ 3952 if (rsm->r_start != start) { 3953 /* 3954 * Need to split this in two pieces the before and after. 3955 */ 3956 nrsm = rack_alloc(rack); 3957 if (nrsm == NULL) { 3958 /* 3959 * failed XXXrrs what can we do but loose the sack 3960 * info? 3961 */ 3962 goto out; 3963 } 3964 nrsm->r_start = start; 3965 nrsm->r_rtr_bytes = 0; 3966 nrsm->r_end = rsm->r_end; 3967 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3968 nrsm->r_flags = rsm->r_flags; 3969 nrsm->r_sndcnt = rsm->r_sndcnt; 3970 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3971 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3972 } 3973 rsm->r_end = nrsm->r_start; 3974 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3975 if (rsm->r_in_tmap) { 3976 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3977 nrsm->r_in_tmap = 1; 3978 } 3979 rsm->r_flags &= (~RACK_HAS_FIN); 3980 rsm = nrsm; 3981 } 3982 if (SEQ_GEQ(end, rsm->r_end)) { 3983 /* 3984 * The end of this block is either beyond this guy or right 3985 * at this guy. 3986 */ 3987 3988 if ((rsm->r_flags & RACK_ACKED) == 0) { 3989 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 3990 changed += (rsm->r_end - rsm->r_start); 3991 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 3992 rack_log_sack_passed(tp, rack, rsm); 3993 /* Is Reordering occuring? 
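RACK_SACK_PASSED on this rsm means a block we sent after it was SACKed first; seeing this one show up only now is the out-of-order signature we record below.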
*/ 3994 if (rsm->r_flags & RACK_SACK_PASSED) { 3995 counter_u64_add(rack_reorder_seen, 1); 3996 rack->r_ctl.rc_reorder_ts = cts; 3997 } 3998 rsm->r_flags |= RACK_ACKED; 3999 rsm->r_flags &= ~RACK_TLP; 4000 if (rsm->r_in_tmap) { 4001 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4002 rsm->r_in_tmap = 0; 4003 } 4004 } 4005 if (end == rsm->r_end) { 4006 /* This block only - done */ 4007 goto out; 4008 } 4009 /* There is more not covered by this rsm, move on */ 4010 start = rsm->r_end; 4011 nrsm = TAILQ_NEXT(rsm, r_next); 4012 rsm = nrsm; 4013 times = 0; 4014 goto do_rest_ofb; 4015 } 4016 /* Ok we need to split off this one at the tail */ 4017 nrsm = rack_alloc(rack); 4018 if (nrsm == NULL) { 4019 /* failed rrs what can we do but lose the sack info? */ 4020 goto out; 4021 } 4022 /* Clone it */ 4023 nrsm->r_start = end; 4024 nrsm->r_end = rsm->r_end; 4025 nrsm->r_rtr_bytes = 0; 4026 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4027 nrsm->r_flags = rsm->r_flags; 4028 nrsm->r_sndcnt = rsm->r_sndcnt; 4029 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4030 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4031 } 4032 /* The sack block does not cover this guy fully */ 4033 rsm->r_flags &= (~RACK_HAS_FIN); 4034 rsm->r_end = end; 4035 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4036 if (rsm->r_in_tmap) { 4037 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4038 nrsm->r_in_tmap = 1; 4039 } 4040 if (rsm->r_flags & RACK_ACKED) { 4041 /* Been here done that */ 4042 goto out; 4043 } 4044 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4045 changed += (rsm->r_end - rsm->r_start); 4046 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4047 rack_log_sack_passed(tp, rack, rsm); 4048 /* Is reordering occurring? */ 4049 if (rsm->r_flags & RACK_SACK_PASSED) { 4050 counter_u64_add(rack_reorder_seen, 1); 4051 rack->r_ctl.rc_reorder_ts = cts; 4052 } 4053 rsm->r_flags |= RACK_ACKED; 4054 rsm->r_flags &= ~RACK_TLP; 4055 if (rsm->r_in_tmap) { 4056 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4057 rsm->r_in_tmap = 0; 4058 } 4059 out: 4060 if (used_ref == 0) { 4061 counter_u64_add(rack_sack_proc_all, 1); 4062 } else { 4063 counter_u64_add(rack_sack_proc_short, 1); 4064 } 4065 /* Save off where we last were */ 4066 if (rsm) 4067 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 4068 else 4069 rack->r_ctl.rc_sacklast = NULL; 4070 *prsm = rsm; 4071 return (changed); 4072 } 4073 4074 static void inline 4075 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4076 { 4077 struct rack_sendmap *tmap; 4078 4079 tmap = NULL; 4080 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4081 /* It's no longer sacked, mark it so */ 4082 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4083 #ifdef INVARIANTS 4084 if (rsm->r_in_tmap) { 4085 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4086 rack, rsm, rsm->r_flags); 4087 } 4088 #endif 4089 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4090 /* Rebuild it into our tmap */ 4091 if (tmap == NULL) { 4092 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4093 tmap = rsm; 4094 } else { 4095 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4096 tmap = rsm; 4097 } 4098 tmap->r_in_tmap = 1; 4099 rsm = TAILQ_NEXT(rsm, r_next); 4100 } 4101 /* 4102 * Now let's possibly clear the sack filter so we start 4103 * recognizing sacks that cover this area.
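Otherwise the filter would keep suppressing those blocks as already seen, even though the reneged data must now be retransmitted and SACKed all over again.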
4104 */ 4105 if (rack_use_sack_filter) 4106 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4107 4108 } 4109 4110 static void 4111 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4112 { 4113 uint32_t changed, last_seq, entered_recovery = 0; 4114 struct tcp_rack *rack; 4115 struct rack_sendmap *rsm; 4116 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4117 register uint32_t th_ack; 4118 int32_t i, j, k, num_sack_blks = 0; 4119 uint32_t cts, acked, ack_point, sack_changed = 0; 4120 4121 INP_WLOCK_ASSERT(tp->t_inpcb); 4122 if (th->th_flags & TH_RST) { 4123 /* We don't log resets */ 4124 return; 4125 } 4126 rack = (struct tcp_rack *)tp->t_fb_ptr; 4127 cts = tcp_ts_getticks(); 4128 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4129 changed = 0; 4130 th_ack = th->th_ack; 4131 4132 if (SEQ_GT(th_ack, tp->snd_una)) { 4133 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4134 tp->t_acktime = ticks; 4135 } 4136 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4137 changed = th_ack - rsm->r_start; 4138 if (changed) { 4139 /* 4140 * The ACK point is advancing to th_ack, we must drop off 4141 * the packets in the rack log and calculate any eligble 4142 * RTT's. 4143 */ 4144 rack->r_wanted_output++; 4145 more: 4146 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4147 if (rsm == NULL) { 4148 if ((th_ack - 1) == tp->iss) { 4149 /* 4150 * For the SYN incoming case we will not 4151 * have called tcp_output for the sending of 4152 * the SYN, so there will be no map. All 4153 * other cases should probably be a panic. 4154 */ 4155 goto proc_sack; 4156 } 4157 if (tp->t_flags & TF_SENTFIN) { 4158 /* if we send a FIN we will not hav a map */ 4159 goto proc_sack; 4160 } 4161 #ifdef INVARIANTS 4162 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4163 tp, 4164 th, tp->t_state, rack, 4165 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4166 #endif 4167 goto proc_sack; 4168 } 4169 if (SEQ_LT(th_ack, rsm->r_start)) { 4170 /* Huh map is missing this */ 4171 #ifdef INVARIANTS 4172 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 4173 rsm->r_start, 4174 th_ack, tp->t_state, rack->r_state); 4175 #endif 4176 goto proc_sack; 4177 } 4178 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4179 /* Now do we consume the whole thing? */ 4180 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4181 /* Its all consumed. */ 4182 uint32_t left; 4183 4184 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4185 rsm->r_rtr_bytes = 0; 4186 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 4187 if (rsm->r_in_tmap) { 4188 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4189 rsm->r_in_tmap = 0; 4190 } 4191 if (rack->r_ctl.rc_next == rsm) { 4192 /* scoot along the marker */ 4193 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); 4194 } 4195 if (rsm->r_flags & RACK_ACKED) { 4196 /* 4197 * It was acked on the scoreboard -- remove 4198 * it from total 4199 */ 4200 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4201 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4202 /* 4203 * There are acked segments ACKED on the 4204 * scoreboard further up. We are seeing 4205 * reordering. 4206 */ 4207 counter_u64_add(rack_reorder_seen, 1); 4208 rsm->r_flags |= RACK_ACKED; 4209 rack->r_ctl.rc_reorder_ts = cts; 4210 } 4211 left = th_ack - rsm->r_end; 4212 if (rsm->r_rtr_cnt > 1) { 4213 /* 4214 * Technically we should make r_rtr_cnt be 4215 * monotonicly increasing and just mod it to 4216 * the timestamp it is replacing.. that way 4217 * we would have the last 3 retransmits. 
Now 4218 * rc_loss_count will be wrong if we 4219 * retransmit something more than 2 times in 4220 * recovery :( 4221 */ 4222 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4223 } 4224 /* Free back to zone */ 4225 rack_free(rack, rsm); 4226 if (left) { 4227 goto more; 4228 } 4229 goto proc_sack; 4230 } 4231 if (rsm->r_flags & RACK_ACKED) { 4232 /* 4233 * It was acked on the scoreboard -- remove it from 4234 * total for the part being cum-acked. 4235 */ 4236 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4237 } 4238 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4239 rsm->r_rtr_bytes = 0; 4240 rsm->r_start = th_ack; 4241 } 4242 proc_sack: 4243 /* Check for reneging */ 4244 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4245 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4246 /* 4247 * The peer has moved snd_una up to 4248 * the edge of this send, i.e. one 4249 * that it had previously acked. The only 4250 * way that can be true if the peer threw 4251 * away data (space issues) that it had 4252 * previously sacked (else it would have 4253 * given us snd_una up to (rsm->r_end). 4254 * We need to undo the acked markings here. 4255 * 4256 * Note we have to look to make sure th_ack is 4257 * our rsm->r_start in case we get an old ack 4258 * where th_ack is behind snd_una. 4259 */ 4260 rack_peer_reneges(rack, rsm, th->th_ack); 4261 } 4262 if ((to->to_flags & TOF_SACK) == 0) { 4263 /* We are done nothing left to log */ 4264 goto out; 4265 } 4266 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4267 if (rsm) { 4268 last_seq = rsm->r_end; 4269 } else { 4270 last_seq = tp->snd_max; 4271 } 4272 /* Sack block processing */ 4273 if (SEQ_GT(th_ack, tp->snd_una)) 4274 ack_point = th_ack; 4275 else 4276 ack_point = tp->snd_una; 4277 for (i = 0; i < to->to_nsacks; i++) { 4278 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4279 &sack, sizeof(sack)); 4280 sack.start = ntohl(sack.start); 4281 sack.end = ntohl(sack.end); 4282 if (SEQ_GT(sack.end, sack.start) && 4283 SEQ_GT(sack.start, ack_point) && 4284 SEQ_LT(sack.start, tp->snd_max) && 4285 SEQ_GT(sack.end, ack_point) && 4286 SEQ_LEQ(sack.end, tp->snd_max)) { 4287 if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && 4288 (SEQ_LT(sack.end, last_seq)) && 4289 ((sack.end - sack.start) < (tp->t_maxseg / 8))) { 4290 /* 4291 * Not the last piece and its smaller than 4292 * 1/8th of a MSS. We ignore this. 4293 */ 4294 counter_u64_add(rack_runt_sacks, 1); 4295 continue; 4296 } 4297 sack_blocks[num_sack_blks] = sack; 4298 num_sack_blks++; 4299 #ifdef NETFLIX_STATS 4300 } else if (SEQ_LEQ(sack.start, th_ack) && 4301 SEQ_LEQ(sack.end, th_ack)) { 4302 /* 4303 * Its a D-SACK block. 4304 */ 4305 tcp_record_dsack(sack.start, sack.end); 4306 #endif 4307 } 4308 4309 } 4310 if (num_sack_blks == 0) 4311 goto out; 4312 /* 4313 * Sort the SACK blocks so we can update the rack scoreboard with 4314 * just one pass. 4315 */ 4316 if (rack_use_sack_filter) { 4317 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); 4318 } 4319 if (num_sack_blks < 2) { 4320 goto do_sack_work; 4321 } 4322 /* Sort the sacks */ 4323 for (i = 0; i < num_sack_blks; i++) { 4324 for (j = i + 1; j < num_sack_blks; j++) { 4325 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4326 sack = sack_blocks[i]; 4327 sack_blocks[i] = sack_blocks[j]; 4328 sack_blocks[j] = sack; 4329 } 4330 } 4331 } 4332 /* 4333 * Now are any of the sack block ends the same (yes some 4334 * implememtations send these)? 
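For example two reported blocks 4-100 and 3-100 share the end 100; the loop below keeps the smaller start (3, the larger coverage), collapses out the other block, and restarts the scan with one fewer block.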
4335 */ 4336 again: 4337 if (num_sack_blks > 1) { 4338 for (i = 0; i < num_sack_blks; i++) { 4339 for (j = i + 1; j < num_sack_blks; j++) { 4340 if (sack_blocks[i].end == sack_blocks[j].end) { 4341 /* 4342 * Ok these two have the same end we 4343 * want the smallest end and then 4344 * throw away the larger and start 4345 * again. 4346 */ 4347 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4348 /* 4349 * The second block covers 4350 * more area use that 4351 */ 4352 sack_blocks[i].start = sack_blocks[j].start; 4353 } 4354 /* 4355 * Now collapse out the dup-sack and 4356 * lower the count 4357 */ 4358 for (k = (j + 1); k < num_sack_blks; k++) { 4359 sack_blocks[j].start = sack_blocks[k].start; 4360 sack_blocks[j].end = sack_blocks[k].end; 4361 j++; 4362 } 4363 num_sack_blks--; 4364 goto again; 4365 } 4366 } 4367 } 4368 } 4369 do_sack_work: 4370 rsm = rack->r_ctl.rc_sacklast; 4371 for (i = 0; i < num_sack_blks; i++) { 4372 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); 4373 if (acked) { 4374 rack->r_wanted_output++; 4375 changed += acked; 4376 sack_changed += acked; 4377 } 4378 } 4379 out: 4380 if (changed) { 4381 /* Something changed cancel the rack timer */ 4382 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4383 } 4384 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 4385 /* 4386 * Ok we have a high probability that we need to go in to 4387 * recovery since we have data sack'd 4388 */ 4389 struct rack_sendmap *rsm; 4390 uint32_t tsused; 4391 4392 tsused = tcp_ts_getticks(); 4393 rsm = tcp_rack_output(tp, rack, tsused); 4394 if (rsm) { 4395 /* Enter recovery */ 4396 rack->r_ctl.rc_rsm_start = rsm->r_start; 4397 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4398 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4399 entered_recovery = 1; 4400 rack_cong_signal(tp, NULL, CC_NDUPACK); 4401 /* 4402 * When we enter recovery we need to assure we send 4403 * one packet. 
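Seeding prr_sndcnt with one MSS and setting the timer override below forces an immediate trip through the output path; on subsequent acks in recovery the PRR code that follows recomputes prr_sndcnt, roughly ssthresh scaled by the fraction of the recovery flight that has been delivered while pipe is above ssthresh, otherwise just enough to build pipe back toward ssthresh.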
4404 */ 4405 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 4406 rack->r_timer_override = 1; 4407 } 4408 } 4409 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 4410 /* Deal with changed an PRR here (in recovery only) */ 4411 uint32_t pipe, snd_una; 4412 4413 rack->r_ctl.rc_prr_delivered += changed; 4414 /* Compute prr_sndcnt */ 4415 if (SEQ_GT(tp->snd_una, th_ack)) { 4416 snd_una = tp->snd_una; 4417 } else { 4418 snd_una = th_ack; 4419 } 4420 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 4421 if (pipe > tp->snd_ssthresh) { 4422 long sndcnt; 4423 4424 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 4425 if (rack->r_ctl.rc_prr_recovery_fs > 0) 4426 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 4427 else { 4428 rack->r_ctl.rc_prr_sndcnt = 0; 4429 sndcnt = 0; 4430 } 4431 sndcnt++; 4432 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 4433 sndcnt -= rack->r_ctl.rc_prr_out; 4434 else 4435 sndcnt = 0; 4436 rack->r_ctl.rc_prr_sndcnt = sndcnt; 4437 } else { 4438 uint32_t limit; 4439 4440 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 4441 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 4442 else 4443 limit = 0; 4444 if (changed > limit) 4445 limit = changed; 4446 limit += tp->t_maxseg; 4447 if (tp->snd_ssthresh > pipe) { 4448 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 4449 } else { 4450 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 4451 } 4452 } 4453 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { 4454 rack->r_timer_override = 1; 4455 } 4456 } 4457 } 4458 4459 /* 4460 * Return value of 1, we do not need to call rack_process_data(). 4461 * return value of 0, rack_process_data can be called. 4462 * For ret_val if its 0 the TCP is locked, if its non-zero 4463 * its unlocked and probably unsafe to touch the TCB. 4464 */ 4465 static int 4466 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 4467 struct tcpcb *tp, struct tcpopt *to, 4468 uint32_t tiwin, int32_t tlen, 4469 int32_t * ofia, int32_t thflags, int32_t * ret_val) 4470 { 4471 int32_t ourfinisacked = 0; 4472 int32_t nsegs, acked_amount; 4473 int32_t acked; 4474 struct mbuf *mfree; 4475 struct tcp_rack *rack; 4476 int32_t recovery = 0; 4477 4478 rack = (struct tcp_rack *)tp->t_fb_ptr; 4479 if (SEQ_GT(th->th_ack, tp->snd_max)) { 4480 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 4481 return (1); 4482 } 4483 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 4484 rack_log_ack(tp, to, th); 4485 } 4486 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 4487 /* 4488 * Old ack, behind (or duplicate to) the last one rcv'd 4489 * Note: Should mark reordering is occuring! We should also 4490 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 4491 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 4492 * retran and> ack 3 4493 */ 4494 return (0); 4495 } 4496 /* 4497 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 4498 * something we sent. 4499 */ 4500 if (tp->t_flags & TF_NEEDSYN) { 4501 /* 4502 * T/TCP: Connection was half-synchronized, and our SYN has 4503 * been ACK'd (so connection is now fully synchronized). Go 4504 * to non-starred state, increment snd_una for ACK of SYN, 4505 * and check if we can do window scaling. 4506 */ 4507 tp->t_flags &= ~TF_NEEDSYN; 4508 tp->snd_una++; 4509 /* Do window scaling? */ 4510 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 4511 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 4512 tp->rcv_scale = tp->request_r_scale; 4513 /* Send window already scaled. 
*/ 4514 } 4515 } 4516 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4517 INP_WLOCK_ASSERT(tp->t_inpcb); 4518 4519 acked = BYTES_THIS_ACK(tp, th); 4520 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 4521 TCPSTAT_ADD(tcps_rcvackbyte, acked); 4522 4523 /* 4524 * If we just performed our first retransmit, and the ACK arrives 4525 * within our recovery window, then it was a mistake to do the 4526 * retransmit in the first place. Recover our original cwnd and 4527 * ssthresh, and proceed to transmit where we left off. 4528 */ 4529 if (tp->t_flags & TF_PREVVALID) { 4530 tp->t_flags &= ~TF_PREVVALID; 4531 if (tp->t_rxtshift == 1 && 4532 (int)(ticks - tp->t_badrxtwin) < 0) 4533 rack_cong_signal(tp, th, CC_RTO_ERR); 4534 } 4535 /* 4536 * If we have a timestamp reply, update smoothed round trip time. If 4537 * no timestamp is present but transmit timer is running and timed 4538 * sequence number was acked, update smoothed round trip time. Since 4539 * we now have an rtt measurement, cancel the timer backoff (cf., 4540 * Phil Karn's retransmit alg.). Recompute the initial retransmit 4541 * timer. 4542 * 4543 * Some boxes send broken timestamp replies during the SYN+ACK 4544 * phase, ignore timestamps of 0 or we could calculate a huge RTT 4545 * and blow up the retransmit timer. 4546 */ 4547 /* 4548 * If all outstanding data is acked, stop retransmit timer and 4549 * remember to restart (more output or persist). If there is more 4550 * data to be acked, restart retransmit timer, using current 4551 * (possibly backed-off) value. 4552 */ 4553 if (th->th_ack == tp->snd_max) { 4554 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4555 rack->r_wanted_output++; 4556 } 4557 /* 4558 * If no data (only SYN) was ACK'd, skip rest of ACK processing. 4559 */ 4560 if (acked == 0) { 4561 if (ofia) 4562 *ofia = ourfinisacked; 4563 return (0); 4564 } 4565 if (rack->r_ctl.rc_early_recovery) { 4566 if (IN_FASTRECOVERY(tp->t_flags)) { 4567 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4568 tcp_rack_partialack(tp, th); 4569 } else { 4570 rack_post_recovery(tp, th); 4571 recovery = 1; 4572 } 4573 } 4574 } 4575 /* 4576 * Let the congestion control algorithm update congestion control 4577 * related information. This typically means increasing the 4578 * congestion window. 4579 */ 4580 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 4581 SOCKBUF_LOCK(&so->so_snd); 4582 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 4583 tp->snd_wnd -= acked_amount; 4584 mfree = sbcut_locked(&so->so_snd, acked_amount); 4585 if ((sbused(&so->so_snd) == 0) && 4586 (acked > acked_amount) && 4587 (tp->t_state >= TCPS_FIN_WAIT_1)) { 4588 ourfinisacked = 1; 4589 } 4590 /* NB: sowwakeup_locked() does an implicit unlock. 
*/ 4591 sowwakeup_locked(so); 4592 m_freem(mfree); 4593 if (rack->r_ctl.rc_early_recovery == 0) { 4594 if (IN_FASTRECOVERY(tp->t_flags)) { 4595 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4596 tcp_rack_partialack(tp, th); 4597 } else { 4598 rack_post_recovery(tp, th); 4599 } 4600 } 4601 } 4602 tp->snd_una = th->th_ack; 4603 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 4604 tp->snd_recover = tp->snd_una; 4605 4606 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 4607 tp->snd_nxt = tp->snd_una; 4608 } 4609 if (tp->snd_una == tp->snd_max) { 4610 /* Nothing left outstanding */ 4611 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 4612 tp->t_acktime = 0; 4613 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4614 /* Set need output so persist might get set */ 4615 rack->r_wanted_output++; 4616 if (rack_use_sack_filter) 4617 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 4618 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 4619 (sbavail(&so->so_snd) == 0) && 4620 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 4621 /* 4622 * The socket was gone and the 4623 * peer sent data, time to 4624 * reset him. 4625 */ 4626 *ret_val = 1; 4627 tp = tcp_close(tp); 4628 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 4629 return (1); 4630 } 4631 } 4632 if (ofia) 4633 *ofia = ourfinisacked; 4634 return (0); 4635 } 4636 4637 4638 /* 4639 * Return value of 1, the TCB is unlocked and most 4640 * likely gone, return value of 0, the TCP is still 4641 * locked. 4642 */ 4643 static int 4644 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 4645 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 4646 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 4647 { 4648 /* 4649 * Update window information. Don't look at window if no ACK: TAC's 4650 * send garbage on first SYN. 4651 */ 4652 int32_t nsegs; 4653 int32_t tfo_syn; 4654 struct tcp_rack *rack; 4655 4656 rack = (struct tcp_rack *)tp->t_fb_ptr; 4657 INP_WLOCK_ASSERT(tp->t_inpcb); 4658 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4659 if ((thflags & TH_ACK) && 4660 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4661 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4662 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4663 /* keep track of pure window updates */ 4664 if (tlen == 0 && 4665 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4666 TCPSTAT_INC(tcps_rcvwinupd); 4667 tp->snd_wnd = tiwin; 4668 tp->snd_wl1 = th->th_seq; 4669 tp->snd_wl2 = th->th_ack; 4670 if (tp->snd_wnd > tp->max_sndwnd) 4671 tp->max_sndwnd = tp->snd_wnd; 4672 rack->r_wanted_output++; 4673 } else if (thflags & TH_ACK) { 4674 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 4675 tp->snd_wnd = tiwin; 4676 tp->snd_wl1 = th->th_seq; 4677 tp->snd_wl2 = th->th_ack; 4678 } 4679 } 4680 /* Was persist timer active and now we have window space? */ 4681 if ((rack->rc_in_persist != 0) && tp->snd_wnd) { 4682 rack_exit_persist(tp, rack); 4683 tp->snd_nxt = tp->snd_max; 4684 /* Make sure we output to start the timer */ 4685 rack->r_wanted_output++; 4686 } 4687 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 4688 m_freem(m); 4689 return (0); 4690 } 4691 /* 4692 * Process segments with URG. 4693 */ 4694 if ((thflags & TH_URG) && th->th_urp && 4695 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4696 /* 4697 * This is a kludge, but if we receive and accept random 4698 * urgent pointers, we'll crash in soreceive. It's hard to 4699 * imagine someone actually wanting to send this much urgent 4700 * data. 
4701 */ 4702 SOCKBUF_LOCK(&so->so_rcv); 4703 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 4704 th->th_urp = 0; /* XXX */ 4705 thflags &= ~TH_URG; /* XXX */ 4706 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 4707 goto dodata; /* XXX */ 4708 } 4709 /* 4710 * If this segment advances the known urgent pointer, then 4711 * mark the data stream. This should not happen in 4712 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 4713 * FIN has been received from the remote side. In these 4714 * states we ignore the URG. 4715 * 4716 * According to RFC961 (Assigned Protocols), the urgent 4717 * pointer points to the last octet of urgent data. We 4718 * continue, however, to consider it to indicate the first 4719 * octet of data past the urgent section as the original 4720 * spec states (in one of two places). 4721 */ 4722 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 4723 tp->rcv_up = th->th_seq + th->th_urp; 4724 so->so_oobmark = sbavail(&so->so_rcv) + 4725 (tp->rcv_up - tp->rcv_nxt) - 1; 4726 if (so->so_oobmark == 0) 4727 so->so_rcv.sb_state |= SBS_RCVATMARK; 4728 sohasoutofband(so); 4729 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4730 } 4731 SOCKBUF_UNLOCK(&so->so_rcv); 4732 /* 4733 * Remove out of band data so doesn't get presented to user. 4734 * This can happen independent of advancing the URG pointer, 4735 * but if two URG's are pending at once, some out-of-band 4736 * data may creep in... ick. 4737 */ 4738 if (th->th_urp <= (uint32_t) tlen && 4739 !(so->so_options & SO_OOBINLINE)) { 4740 /* hdr drop is delayed */ 4741 tcp_pulloutofband(so, th, m, drop_hdrlen); 4742 } 4743 } else { 4744 /* 4745 * If no out of band data is expected, pull receive urgent 4746 * pointer along with the receive window. 4747 */ 4748 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4749 tp->rcv_up = tp->rcv_nxt; 4750 } 4751 dodata: /* XXX */ 4752 INP_WLOCK_ASSERT(tp->t_inpcb); 4753 4754 /* 4755 * Process the segment text, merging it into the TCP sequencing 4756 * queue, and arranging for acknowledgment of receipt if necessary. 4757 * This process logically involves adjusting tp->rcv_wnd as data is 4758 * presented to the user (this happens in tcp_usrreq.c, case 4759 * PRU_RCVD). If a FIN has already been received on this connection 4760 * then we just ignore the text. 4761 */ 4762 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 4763 IS_FASTOPEN(tp->t_flags)); 4764 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 4765 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4766 tcp_seq save_start = th->th_seq; 4767 tcp_seq save_rnxt = tp->rcv_nxt; 4768 int save_tlen = tlen; 4769 4770 m_adj(m, drop_hdrlen); /* delayed header drop */ 4771 /* 4772 * Insert segment which includes th into TCP reassembly 4773 * queue with control block tp. Set thflags to whether 4774 * reassembly now includes a segment with FIN. This handles 4775 * the common case inline (segment is the next to be 4776 * received on an established connection, and the queue is 4777 * empty), avoiding linkage into and removal from the queue 4778 * and repetition of various conversions. Set DELACK for 4779 * segments received in order, but ack immediately when 4780 * segments are out of order (so fast retransmit can work). 
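The in-order test below (th_seq equal to rcv_nxt with an empty reassembly queue) is what lets us append straight to the socket buffer and skip tcp_reass().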
4781 */ 4782 if (th->th_seq == tp->rcv_nxt && 4783 SEGQ_EMPTY(tp) && 4784 (TCPS_HAVEESTABLISHED(tp->t_state) || 4785 tfo_syn)) { 4786 if (DELAY_ACK(tp, tlen) || tfo_syn) { 4787 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4788 tp->t_flags |= TF_DELACK; 4789 } else { 4790 rack->r_wanted_output++; 4791 tp->t_flags |= TF_ACKNOW; 4792 } 4793 tp->rcv_nxt += tlen; 4794 thflags = th->th_flags & TH_FIN; 4795 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4796 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4797 SOCKBUF_LOCK(&so->so_rcv); 4798 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4799 m_freem(m); 4800 else 4801 sbappendstream_locked(&so->so_rcv, m, 0); 4802 /* NB: sorwakeup_locked() does an implicit unlock. */ 4803 sorwakeup_locked(so); 4804 } else { 4805 /* 4806 * XXX: Due to the header drop above "th" is 4807 * theoretically invalid by now. Fortunately 4808 * m_adj() doesn't actually frees any mbufs when 4809 * trimming from the head. 4810 */ 4811 tcp_seq temp = save_start; 4812 thflags = tcp_reass(tp, th, &temp, &tlen, m); 4813 tp->t_flags |= TF_ACKNOW; 4814 } 4815 if (((tlen == 0) && (save_tlen > 0) && 4816 (SEQ_LT(save_start, save_rnxt)))) { 4817 /* 4818 * DSACK actually handled in the fastpath 4819 * above. 4820 */ 4821 tcp_update_sack_list(tp, save_start, save_start + save_tlen); 4822 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 4823 /* 4824 * Cleaning sackblks by using zero length 4825 * update. 4826 */ 4827 tcp_update_sack_list(tp, save_start, save_start); 4828 } else if ((tlen > 0) && (tlen >= save_tlen)) { 4829 /* Update of sackblks. */ 4830 tcp_update_sack_list(tp, save_start, save_start + save_tlen); 4831 } else if (tlen > 0) { 4832 tcp_update_sack_list(tp, save_start, save_start+tlen); 4833 } 4834 } else { 4835 m_freem(m); 4836 thflags &= ~TH_FIN; 4837 } 4838 4839 /* 4840 * If FIN is received ACK the FIN and let the user know that the 4841 * connection is closing. 4842 */ 4843 if (thflags & TH_FIN) { 4844 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4845 socantrcvmore(so); 4846 /* 4847 * If connection is half-synchronized (ie NEEDSYN 4848 * flag on) then delay ACK, so it may be piggybacked 4849 * when SYN is sent. Otherwise, since we received a 4850 * FIN then no more input can be expected, send ACK 4851 * now. 4852 */ 4853 if (tp->t_flags & TF_NEEDSYN) { 4854 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4855 tp->t_flags |= TF_DELACK; 4856 } else { 4857 tp->t_flags |= TF_ACKNOW; 4858 } 4859 tp->rcv_nxt++; 4860 } 4861 switch (tp->t_state) { 4862 4863 /* 4864 * In SYN_RECEIVED and ESTABLISHED STATES enter the 4865 * CLOSE_WAIT state. 4866 */ 4867 case TCPS_SYN_RECEIVED: 4868 tp->t_starttime = ticks; 4869 /* FALLTHROUGH */ 4870 case TCPS_ESTABLISHED: 4871 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4872 tcp_state_change(tp, TCPS_CLOSE_WAIT); 4873 break; 4874 4875 /* 4876 * If still in FIN_WAIT_1 STATE FIN has not been 4877 * acked so enter the CLOSING state. 4878 */ 4879 case TCPS_FIN_WAIT_1: 4880 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4881 tcp_state_change(tp, TCPS_CLOSING); 4882 break; 4883 4884 /* 4885 * In FIN_WAIT_2 state enter the TIME_WAIT state, 4886 * starting the time-wait timer, turning off the 4887 * other standard timers. 4888 */ 4889 case TCPS_FIN_WAIT_2: 4890 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4891 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 4892 tcp_twstart(tp); 4893 return (1); 4894 } 4895 } 4896 /* 4897 * Return any desired output. 
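We want output if an ACK is owed or if the send buffer holds more data than is currently in flight.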
4898 */ 4899 if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 4900 rack->r_wanted_output++; 4901 } 4902 INP_WLOCK_ASSERT(tp->t_inpcb); 4903 return (0); 4904 } 4905 4906 /* 4907 * Here nothing is really faster, its just that we 4908 * have broken out the fast-data path also just like 4909 * the fast-ack. 4910 */ 4911 static int 4912 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 4913 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 4914 uint32_t tiwin, int32_t nxt_pkt) 4915 { 4916 int32_t nsegs; 4917 int32_t newsize = 0; /* automatic sockbuf scaling */ 4918 struct tcp_rack *rack; 4919 #ifdef TCPDEBUG 4920 /* 4921 * The size of tcp_saveipgen must be the size of the max ip header, 4922 * now IPv6. 4923 */ 4924 u_char tcp_saveipgen[IP6_HDR_LEN]; 4925 struct tcphdr tcp_savetcp; 4926 short ostate = 0; 4927 4928 #endif 4929 /* 4930 * If last ACK falls within this segment's sequence numbers, record 4931 * the timestamp. NOTE that the test is modified according to the 4932 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 4933 */ 4934 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 4935 return (0); 4936 } 4937 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 4938 return (0); 4939 } 4940 if (tiwin && tiwin != tp->snd_wnd) { 4941 return (0); 4942 } 4943 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 4944 return (0); 4945 } 4946 if (__predict_false((to->to_flags & TOF_TS) && 4947 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 4948 return (0); 4949 } 4950 if (__predict_false((th->th_ack != tp->snd_una))) { 4951 return (0); 4952 } 4953 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 4954 return (0); 4955 } 4956 if ((to->to_flags & TOF_TS) != 0 && 4957 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 4958 tp->ts_recent_age = tcp_ts_getticks(); 4959 tp->ts_recent = to->to_tsval; 4960 } 4961 rack = (struct tcp_rack *)tp->t_fb_ptr; 4962 /* 4963 * This is a pure, in-sequence data packet with nothing on the 4964 * reassembly queue and we have enough buffer space to take it. 4965 */ 4966 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4967 4968 4969 /* Clean receiver SACK report if present */ 4970 if (tp->rcv_numsacks) 4971 tcp_clean_sackreport(tp); 4972 TCPSTAT_INC(tcps_preddat); 4973 tp->rcv_nxt += tlen; 4974 /* 4975 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 4976 */ 4977 tp->snd_wl1 = th->th_seq; 4978 /* 4979 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 4980 */ 4981 tp->rcv_up = tp->rcv_nxt; 4982 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4983 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4984 #ifdef TCPDEBUG 4985 if (so->so_options & SO_DEBUG) 4986 tcp_trace(TA_INPUT, ostate, tp, 4987 (void *)tcp_saveipgen, &tcp_savetcp, 0); 4988 #endif 4989 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 4990 4991 /* Add data to socket buffer. */ 4992 SOCKBUF_LOCK(&so->so_rcv); 4993 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 4994 m_freem(m); 4995 } else { 4996 /* 4997 * Set new socket buffer size. Give up when limit is 4998 * reached. 4999 */ 5000 if (newsize) 5001 if (!sbreserve_locked(&so->so_rcv, 5002 newsize, so, NULL)) 5003 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 5004 m_adj(m, drop_hdrlen); /* delayed header drop */ 5005 sbappendstream_locked(&so->so_rcv, m, 0); 5006 rack_calc_rwin(so, tp); 5007 } 5008 /* NB: sorwakeup_locked() does an implicit unlock. 
*/ 5009 sorwakeup_locked(so); 5010 if (DELAY_ACK(tp, tlen)) { 5011 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5012 tp->t_flags |= TF_DELACK; 5013 } else { 5014 tp->t_flags |= TF_ACKNOW; 5015 rack->r_wanted_output++; 5016 } 5017 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 5018 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5019 return (1); 5020 } 5021 5022 /* 5023 * This subfunction is used to try to highly optimize the 5024 * fast path. We again allow window updates that are 5025 * in sequence to remain in the fast-path. We also add 5026 * in the __predict's to attempt to help the compiler. 5027 * Note that if we return a 0, then we can *not* process 5028 * it and the caller should push the packet into the 5029 * slow-path. 5030 */ 5031 static int 5032 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5033 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5034 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 5035 { 5036 int32_t acked; 5037 int32_t nsegs; 5038 5039 #ifdef TCPDEBUG 5040 /* 5041 * The size of tcp_saveipgen must be the size of the max ip header, 5042 * now IPv6. 5043 */ 5044 u_char tcp_saveipgen[IP6_HDR_LEN]; 5045 struct tcphdr tcp_savetcp; 5046 short ostate = 0; 5047 5048 #endif 5049 struct tcp_rack *rack; 5050 5051 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5052 /* Old ack, behind (or duplicate to) the last one rcv'd */ 5053 return (0); 5054 } 5055 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 5056 /* Above what we have sent? */ 5057 return (0); 5058 } 5059 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5060 /* We are retransmitting */ 5061 return (0); 5062 } 5063 if (__predict_false(tiwin == 0)) { 5064 /* zero window */ 5065 return (0); 5066 } 5067 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 5068 /* We need a SYN or a FIN, unlikely.. */ 5069 return (0); 5070 } 5071 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 5072 /* Timestamp is behind .. old ack with seq wrap? */ 5073 return (0); 5074 } 5075 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 5076 /* Still recovering */ 5077 return (0); 5078 } 5079 rack = (struct tcp_rack *)tp->t_fb_ptr; 5080 if (rack->r_ctl.rc_sacked) { 5081 /* We have sack holes on our scoreboard */ 5082 return (0); 5083 } 5084 /* Ok if we reach here, we can process a fast-ack */ 5085 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5086 rack_log_ack(tp, to, th); 5087 /* Did the window get updated? */ 5088 if (tiwin != tp->snd_wnd) { 5089 tp->snd_wnd = tiwin; 5090 tp->snd_wl1 = th->th_seq; 5091 if (tp->snd_wnd > tp->max_sndwnd) 5092 tp->max_sndwnd = tp->snd_wnd; 5093 } 5094 if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { 5095 rack_exit_persist(tp, rack); 5096 } 5097 /* 5098 * If last ACK falls within this segment's sequence numbers, record 5099 * the timestamp. NOTE that the test is modified according to the 5100 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 5101 */ 5102 if ((to->to_flags & TOF_TS) != 0 && 5103 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5104 tp->ts_recent_age = tcp_ts_getticks(); 5105 tp->ts_recent = to->to_tsval; 5106 } 5107 /* 5108 * This is a pure ack for outstanding data. 5109 */ 5110 TCPSTAT_INC(tcps_predack); 5111 5112 /* 5113 * "bad retransmit" recovery. 
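If the timed ACK arrives inside t_badrxtwin after a single RTO retransmission, that retransmit was almost certainly spurious, so CC_RTO_ERR is signalled to restore the pre-RTO congestion state.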
5114 */ 5115 if (tp->t_flags & TF_PREVVALID) { 5116 tp->t_flags &= ~TF_PREVVALID; 5117 if (tp->t_rxtshift == 1 && 5118 (int)(ticks - tp->t_badrxtwin) < 0) 5119 rack_cong_signal(tp, th, CC_RTO_ERR); 5120 } 5121 /* 5122 * Recalculate the transmit timer / rtt. 5123 * 5124 * Some boxes send broken timestamp replies during the SYN+ACK 5125 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5126 * and blow up the retransmit timer. 5127 */ 5128 acked = BYTES_THIS_ACK(tp, th); 5129 5130 #ifdef TCP_HHOOK 5131 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 5132 hhook_run_tcp_est_in(tp, th, to); 5133 #endif 5134 5135 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5136 TCPSTAT_ADD(tcps_rcvackbyte, acked); 5137 sbdrop(&so->so_snd, acked); 5138 /* 5139 * Let the congestion control algorithm update congestion control 5140 * related information. This typically means increasing the 5141 * congestion window. 5142 */ 5143 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 5144 5145 tp->snd_una = th->th_ack; 5146 /* 5147 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 5148 */ 5149 tp->snd_wl2 = th->th_ack; 5150 tp->t_dupacks = 0; 5151 m_freem(m); 5152 /* ND6_HINT(tp); *//* Some progress has been made. */ 5153 5154 /* 5155 * If all outstanding data are acked, stop retransmit timer, 5156 * otherwise restart timer using current (possibly backed-off) 5157 * value. If process is waiting for space, wakeup/selwakeup/signal. 5158 * If data are ready to send, let tcp_output decide between more 5159 * output or persist. 5160 */ 5161 #ifdef TCPDEBUG 5162 if (so->so_options & SO_DEBUG) 5163 tcp_trace(TA_INPUT, ostate, tp, 5164 (void *)tcp_saveipgen, 5165 &tcp_savetcp, 0); 5166 #endif 5167 if (tp->snd_una == tp->snd_max) { 5168 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5169 tp->t_acktime = 0; 5170 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5171 } 5172 /* Wake up the socket if we have room to write more */ 5173 sowwakeup(so); 5174 if (sbavail(&so->so_snd)) { 5175 rack->r_wanted_output++; 5176 } 5177 return (1); 5178 } 5179 5180 /* 5181 * Return value of 1, the TCB is unlocked and most 5182 * likely gone, return value of 0, the TCP is still 5183 * locked. 5184 */ 5185 static int 5186 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 5187 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5188 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5189 { 5190 int32_t ret_val = 0; 5191 int32_t todrop; 5192 int32_t ourfinisacked = 0; 5193 5194 rack_calc_rwin(so, tp); 5195 /* 5196 * If the state is SYN_SENT: if seg contains an ACK, but not for our 5197 * SYN, drop the input. if seg contains a RST, then drop the 5198 * connection. if seg does not contain SYN, then drop it. Otherwise 5199 * this is an acceptable SYN segment initialize tp->rcv_nxt and 5200 * tp->irs if seg contains ack then advance tp->snd_una if seg 5201 * contains an ECE and ECN support is enabled, the stream is ECN 5202 * capable. 
if SYN has been acked change to ESTABLISHED else 5203 * SYN_RCVD state arrange for segment to be acked (eventually) 5204 * continue processing rest of data/controls, beginning with URG 5205 */ 5206 if ((thflags & TH_ACK) && 5207 (SEQ_LEQ(th->th_ack, tp->iss) || 5208 SEQ_GT(th->th_ack, tp->snd_max))) { 5209 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5210 return (1); 5211 } 5212 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 5213 TCP_PROBE5(connect__refused, NULL, tp, 5214 mtod(m, const char *), tp, th); 5215 tp = tcp_drop(tp, ECONNREFUSED); 5216 rack_do_drop(m, tp); 5217 return (1); 5218 } 5219 if (thflags & TH_RST) { 5220 rack_do_drop(m, tp); 5221 return (1); 5222 } 5223 if (!(thflags & TH_SYN)) { 5224 rack_do_drop(m, tp); 5225 return (1); 5226 } 5227 tp->irs = th->th_seq; 5228 tcp_rcvseqinit(tp); 5229 if (thflags & TH_ACK) { 5230 int tfo_partial = 0; 5231 5232 TCPSTAT_INC(tcps_connects); 5233 soisconnected(so); 5234 #ifdef MAC 5235 mac_socketpeer_set_from_mbuf(m, so); 5236 #endif 5237 /* Do window scaling on this connection? */ 5238 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5239 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5240 tp->rcv_scale = tp->request_r_scale; 5241 } 5242 tp->rcv_adv += min(tp->rcv_wnd, 5243 TCP_MAXWIN << tp->rcv_scale); 5244 /* 5245 * If not all the data that was sent in the TFO SYN 5246 * has been acked, resend the remainder right away. 5247 */ 5248 if (IS_FASTOPEN(tp->t_flags) && 5249 (tp->snd_una != tp->snd_max)) { 5250 tp->snd_nxt = th->th_ack; 5251 tfo_partial = 1; 5252 } 5253 /* 5254 * If there's data, delay ACK; if there's also a FIN ACKNOW 5255 * will be turned on later. 5256 */ 5257 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { 5258 rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, 5259 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); 5260 tp->t_flags |= TF_DELACK; 5261 } else { 5262 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 5263 tp->t_flags |= TF_ACKNOW; 5264 } 5265 5266 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 5267 V_tcp_do_ecn) { 5268 tp->t_flags |= TF_ECN_PERMIT; 5269 TCPSTAT_INC(tcps_ecn_shs); 5270 } 5271 if (SEQ_GT(th->th_ack, tp->snd_una)) { 5272 /* 5273 * We advance snd_una for the 5274 * fast open case. If th_ack is 5275 * acknowledging data beyond 5276 * snd_una we can't just call 5277 * ack-processing since the 5278 * data stream in our send-map 5279 * will start at snd_una + 1 (one 5280 * beyond the SYN). If its just 5281 * equal we don't need to do that 5282 * and there is no send_map. 5283 */ 5284 tp->snd_una++; 5285 } 5286 /* 5287 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 5288 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 5289 */ 5290 tp->t_starttime = ticks; 5291 if (tp->t_flags & TF_NEEDFIN) { 5292 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5293 tp->t_flags &= ~TF_NEEDFIN; 5294 thflags &= ~TH_SYN; 5295 } else { 5296 tcp_state_change(tp, TCPS_ESTABLISHED); 5297 TCP_PROBE5(connect__established, NULL, tp, 5298 mtod(m, const char *), tp, th); 5299 cc_conn_init(tp); 5300 } 5301 } else { 5302 /* 5303 * Received initial SYN in SYN-SENT[*] state => simultaneous 5304 * open. If segment contains CC option and there is a 5305 * cached CC, apply TAO test. If it succeeds, connection is * 5306 * half-synchronized. Otherwise, do 3-way handshake: 5307 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 5308 * there was no CC option, clear cached CC value. 
5309 */ 5310 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 5311 tcp_state_change(tp, TCPS_SYN_RECEIVED); 5312 } 5313 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5314 INP_WLOCK_ASSERT(tp->t_inpcb); 5315 /* 5316 * Advance th->th_seq to correspond to first data byte. If data, 5317 * trim to stay within window, dropping FIN if necessary. 5318 */ 5319 th->th_seq++; 5320 if (tlen > tp->rcv_wnd) { 5321 todrop = tlen - tp->rcv_wnd; 5322 m_adj(m, -todrop); 5323 tlen = tp->rcv_wnd; 5324 thflags &= ~TH_FIN; 5325 TCPSTAT_INC(tcps_rcvpackafterwin); 5326 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 5327 } 5328 tp->snd_wl1 = th->th_seq - 1; 5329 tp->rcv_up = th->th_seq; 5330 /* 5331 * Client side of transaction: already sent SYN and data. If the 5332 * remote host used T/TCP to validate the SYN, our data will be 5333 * ACK'd; if so, enter normal data segment processing in the middle 5334 * of step 5, ack processing. Otherwise, goto step 6. 5335 */ 5336 if (thflags & TH_ACK) { 5337 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 5338 return (ret_val); 5339 /* We may have changed to FIN_WAIT_1 above */ 5340 if (tp->t_state == TCPS_FIN_WAIT_1) { 5341 /* 5342 * In FIN_WAIT_1 STATE in addition to the processing 5343 * for the ESTABLISHED state if our FIN is now 5344 * acknowledged then enter FIN_WAIT_2. 5345 */ 5346 if (ourfinisacked) { 5347 /* 5348 * If we can't receive any more data, then 5349 * closing user can proceed. Starting the 5350 * timer is contrary to the specification, 5351 * but if we don't get a FIN we'll hang 5352 * forever. 5353 * 5354 * XXXjl: we should release the tp also, and 5355 * use a compressed state. 5356 */ 5357 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5358 soisdisconnected(so); 5359 tcp_timer_activate(tp, TT_2MSL, 5360 (tcp_fast_finwait2_recycle ? 5361 tcp_finwait2_timeout : 5362 TP_MAXIDLE(tp))); 5363 } 5364 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5365 } 5366 } 5367 } 5368 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5369 tiwin, thflags, nxt_pkt)); 5370 } 5371 5372 /* 5373 * Return value of 1, the TCB is unlocked and most 5374 * likely gone, return value of 0, the TCP is still 5375 * locked. 5376 */ 5377 static int 5378 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 5379 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5380 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5381 { 5382 int32_t ret_val = 0; 5383 int32_t ourfinisacked = 0; 5384 5385 rack_calc_rwin(so, tp); 5386 5387 if ((thflags & TH_ACK) && 5388 (SEQ_LEQ(th->th_ack, tp->snd_una) || 5389 SEQ_GT(th->th_ack, tp->snd_max))) { 5390 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5391 return (1); 5392 } 5393 if (IS_FASTOPEN(tp->t_flags)) { 5394 /* 5395 * When a TFO connection is in SYN_RECEIVED, the 5396 * only valid packets are the initial SYN, a 5397 * retransmit/copy of the initial SYN (possibly with 5398 * a subset of the original data), a valid ACK, a 5399 * FIN, or a RST. 
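 * The checks below enforce this: a segment carrying both SYN and ACK
 * is answered with a reset, a bare retransmitted SYN is dropped while a
 * RXT/TLP/RACK timer is pending, and anything without ACK, FIN or RST
 * is silently dropped.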
5400 */ 5401 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 5402 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5403 return (1); 5404 } else if (thflags & TH_SYN) { 5405 /* non-initial SYN is ignored */ 5406 struct tcp_rack *rack; 5407 5408 rack = (struct tcp_rack *)tp->t_fb_ptr; 5409 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 5410 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 5411 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 5412 rack_do_drop(m, NULL); 5413 return (0); 5414 } 5415 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 5416 rack_do_drop(m, NULL); 5417 return (0); 5418 } 5419 } 5420 if (thflags & TH_RST) 5421 return (rack_process_rst(m, th, so, tp)); 5422 /* 5423 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5424 * it's less than ts_recent, drop it. 5425 */ 5426 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5427 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5428 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5429 return (ret_val); 5430 } 5431 /* 5432 * In the SYN-RECEIVED state, validate that the packet belongs to 5433 * this connection before trimming the data to fit the receive 5434 * window. Check the sequence number versus IRS since we know the 5435 * sequence numbers haven't wrapped. This is a partial fix for the 5436 * "LAND" DoS attack. 5437 */ 5438 if (SEQ_LT(th->th_seq, tp->irs)) { 5439 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5440 return (1); 5441 } 5442 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5443 return (ret_val); 5444 } 5445 /* 5446 * If last ACK falls within this segment's sequence numbers, record 5447 * its timestamp. NOTE: 1) That the test incorporates suggestions 5448 * from the latest proposal of the tcplw@cray.com list (Braden 5449 * 1993/04/26). 2) That updating only on newer timestamps interferes 5450 * with our earlier PAWS tests, so this check should be solely 5451 * predicated on the sequence space of this segment. 3) That we 5452 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5453 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5454 * SEG.Len, This modified check allows us to overcome RFC1323's 5455 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5456 * p.869. In such cases, we can still calculate the RTT correctly 5457 * when RCV.NXT == Last.ACK.Sent. 5458 */ 5459 if ((to->to_flags & TOF_TS) != 0 && 5460 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5461 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5462 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5463 tp->ts_recent_age = tcp_ts_getticks(); 5464 tp->ts_recent = to->to_tsval; 5465 } 5466 tp->snd_wnd = tiwin; 5467 /* 5468 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5469 * is on (half-synchronized state), then queue data for later 5470 * processing; else drop segment and return. 5471 */ 5472 if ((thflags & TH_ACK) == 0) { 5473 if (IS_FASTOPEN(tp->t_flags)) { 5474 cc_conn_init(tp); 5475 } 5476 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5477 tiwin, thflags, nxt_pkt)); 5478 } 5479 TCPSTAT_INC(tcps_connects); 5480 soisconnected(so); 5481 /* Do window scaling? 
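 * (the scale factor is only applied when both sides negotiated it,
 * i.e. TF_RCVD_SCALE and TF_REQ_SCALE are both set)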
*/ 5482 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5483 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5484 tp->rcv_scale = tp->request_r_scale; 5485 } 5486 /* 5487 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 5488 * FIN-WAIT-1 5489 */ 5490 tp->t_starttime = ticks; 5491 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 5492 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 5493 tp->t_tfo_pending = NULL; 5494 5495 /* 5496 * Account for the ACK of our SYN prior to 5497 * regular ACK processing below. 5498 */ 5499 tp->snd_una++; 5500 } 5501 if (tp->t_flags & TF_NEEDFIN) { 5502 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5503 tp->t_flags &= ~TF_NEEDFIN; 5504 } else { 5505 tcp_state_change(tp, TCPS_ESTABLISHED); 5506 TCP_PROBE5(accept__established, NULL, tp, 5507 mtod(m, const char *), tp, th); 5508 /* 5509 * TFO connections call cc_conn_init() during SYN 5510 * processing. Calling it again here for such connections 5511 * is not harmless as it would undo the snd_cwnd reduction 5512 * that occurs when a TFO SYN|ACK is retransmitted. 5513 */ 5514 if (!IS_FASTOPEN(tp->t_flags)) 5515 cc_conn_init(tp); 5516 } 5517 /* 5518 * If segment contains data or ACK, will call tcp_reass() later; if 5519 * not, do so now to pass queued data to user. 5520 */ 5521 if (tlen == 0 && (thflags & TH_FIN) == 0) 5522 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 5523 (struct mbuf *)0); 5524 tp->snd_wl1 = th->th_seq - 1; 5525 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5526 return (ret_val); 5527 } 5528 if (tp->t_state == TCPS_FIN_WAIT_1) { 5529 /* We could have went to FIN_WAIT_1 (or EST) above */ 5530 /* 5531 * In FIN_WAIT_1 STATE in addition to the processing for the 5532 * ESTABLISHED state if our FIN is now acknowledged then 5533 * enter FIN_WAIT_2. 5534 */ 5535 if (ourfinisacked) { 5536 /* 5537 * If we can't receive any more data, then closing 5538 * user can proceed. Starting the timer is contrary 5539 * to the specification, but if we don't get a FIN 5540 * we'll hang forever. 5541 * 5542 * XXXjl: we should release the tp also, and use a 5543 * compressed state. 5544 */ 5545 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5546 soisdisconnected(so); 5547 tcp_timer_activate(tp, TT_2MSL, 5548 (tcp_fast_finwait2_recycle ? 5549 tcp_finwait2_timeout : 5550 TP_MAXIDLE(tp))); 5551 } 5552 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5553 } 5554 } 5555 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5556 tiwin, thflags, nxt_pkt)); 5557 } 5558 5559 /* 5560 * Return value of 1, the TCB is unlocked and most 5561 * likely gone, return value of 0, the TCP is still 5562 * locked. 5563 */ 5564 static int 5565 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 5566 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5567 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5568 { 5569 int32_t ret_val = 0; 5570 5571 /* 5572 * Header prediction: check for the two common cases of a 5573 * uni-directional data xfer. If the packet has no control flags, 5574 * is in-sequence, the window didn't change and we're not 5575 * retransmitting, it's a candidate. If the length is zero and the 5576 * ack moved forward, we're the sender side of the xfer. Just free 5577 * the data acked & wake any higher level process that was blocked 5578 * waiting for space. If the length is non-zero and the ack didn't 5579 * move, we're the receiver side. 
If we're getting packets in-order 5580 * (the reassembly queue is empty), add the data to the socket 5581 * buffer and note that we need a delayed ack. Make sure that the 5582 * hidden state-flags are also off. Since we check for 5583 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 5584 */ 5585 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 5586 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 5587 __predict_true(SEGQ_EMPTY(tp)) && 5588 __predict_true(th->th_seq == tp->rcv_nxt)) { 5589 struct tcp_rack *rack; 5590 5591 rack = (struct tcp_rack *)tp->t_fb_ptr; 5592 if (tlen == 0) { 5593 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 5594 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 5595 return (0); 5596 } 5597 } else { 5598 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 5599 tiwin, nxt_pkt)) { 5600 return (0); 5601 } 5602 } 5603 } 5604 rack_calc_rwin(so, tp); 5605 5606 if (thflags & TH_RST) 5607 return (rack_process_rst(m, th, so, tp)); 5608 5609 /* 5610 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5611 * synchronized state. 5612 */ 5613 if (thflags & TH_SYN) { 5614 rack_challenge_ack(m, th, tp, &ret_val); 5615 return (ret_val); 5616 } 5617 /* 5618 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5619 * it's less than ts_recent, drop it. 5620 */ 5621 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5622 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5623 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5624 return (ret_val); 5625 } 5626 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5627 return (ret_val); 5628 } 5629 /* 5630 * If last ACK falls within this segment's sequence numbers, record 5631 * its timestamp. NOTE: 1) That the test incorporates suggestions 5632 * from the latest proposal of the tcplw@cray.com list (Braden 5633 * 1993/04/26). 2) That updating only on newer timestamps interferes 5634 * with our earlier PAWS tests, so this check should be solely 5635 * predicated on the sequence space of this segment. 3) That we 5636 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5637 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5638 * SEG.Len, This modified check allows us to overcome RFC1323's 5639 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5640 * p.869. In such cases, we can still calculate the RTT correctly 5641 * when RCV.NXT == Last.ACK.Sent. 5642 */ 5643 if ((to->to_flags & TOF_TS) != 0 && 5644 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5645 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5646 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5647 tp->ts_recent_age = tcp_ts_getticks(); 5648 tp->ts_recent = to->to_tsval; 5649 } 5650 /* 5651 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5652 * is on (half-synchronized state), then queue data for later 5653 * processing; else drop segment and return. 5654 */ 5655 if ((thflags & TH_ACK) == 0) { 5656 if (tp->t_flags & TF_NEEDSYN) { 5657 5658 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5659 tiwin, thflags, nxt_pkt)); 5660 5661 } else if (tp->t_flags & TF_ACKNOW) { 5662 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5663 return (ret_val); 5664 } else { 5665 rack_do_drop(m, NULL); 5666 return (0); 5667 } 5668 } 5669 /* 5670 * Ack processing.
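 * A non-zero return from rack_process_ack() means the segment has been
 * fully handled; ret_val then follows the usual convention (1 == TCB
 * unlocked and possibly gone, 0 == still locked).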
5671 */ 5672 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 5673 return (ret_val); 5674 } 5675 if (sbavail(&so->so_snd)) { 5676 if (rack_progress_timeout_check(tp)) { 5677 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5678 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5679 return (1); 5680 } 5681 } 5682 /* State changes only happen in rack_process_data() */ 5683 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5684 tiwin, thflags, nxt_pkt)); 5685 } 5686 5687 /* 5688 * Return value of 1, the TCB is unlocked and most 5689 * likely gone, return value of 0, the TCP is still 5690 * locked. 5691 */ 5692 static int 5693 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 5694 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5695 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5696 { 5697 int32_t ret_val = 0; 5698 5699 rack_calc_rwin(so, tp); 5700 if (thflags & TH_RST) 5701 return (rack_process_rst(m, th, so, tp)); 5702 /* 5703 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5704 * synchronized state. 5705 */ 5706 if (thflags & TH_SYN) { 5707 rack_challenge_ack(m, th, tp, &ret_val); 5708 return (ret_val); 5709 } 5710 /* 5711 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5712 * it's less than ts_recent, drop it. 5713 */ 5714 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5715 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5716 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5717 return (ret_val); 5718 } 5719 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5720 return (ret_val); 5721 } 5722 /* 5723 * If last ACK falls within this segment's sequence numbers, record 5724 * its timestamp. NOTE: 1) That the test incorporates suggestions 5725 * from the latest proposal of the tcplw@cray.com list (Braden 5726 * 1993/04/26). 2) That updating only on newer timestamps interferes 5727 * with our earlier PAWS tests, so this check should be solely 5728 * predicated on the sequence space of this segment. 3) That we 5729 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5730 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5731 * SEG.Len, This modified check allows us to overcome RFC1323's 5732 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5733 * p.869. In such cases, we can still calculate the RTT correctly 5734 * when RCV.NXT == Last.ACK.Sent. 5735 */ 5736 if ((to->to_flags & TOF_TS) != 0 && 5737 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5738 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5739 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5740 tp->ts_recent_age = tcp_ts_getticks(); 5741 tp->ts_recent = to->to_tsval; 5742 } 5743 /* 5744 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5745 * is on (half-synchronized state), then queue data for later 5746 * processing; else drop segment and return. 5747 */ 5748 if ((thflags & TH_ACK) == 0) { 5749 if (tp->t_flags & TF_NEEDSYN) { 5750 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5751 tiwin, thflags, nxt_pkt)); 5752 5753 } else if (tp->t_flags & TF_ACKNOW) { 5754 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5755 return (ret_val); 5756 } else { 5757 rack_do_drop(m, NULL); 5758 return (0); 5759 } 5760 } 5761 /* 5762 * Ack processing. 
5763 */ 5764 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 5765 return (ret_val); 5766 } 5767 if (sbavail(&so->so_snd)) { 5768 if (rack_progress_timeout_check(tp)) { 5769 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5770 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5771 return (1); 5772 } 5773 } 5774 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5775 tiwin, thflags, nxt_pkt)); 5776 } 5777 5778 static int 5779 rack_check_data_after_close(struct mbuf *m, 5780 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 5781 { 5782 struct tcp_rack *rack; 5783 5784 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5785 rack = (struct tcp_rack *)tp->t_fb_ptr; 5786 if (rack->rc_allow_data_af_clo == 0) { 5787 close_now: 5788 tp = tcp_close(tp); 5789 TCPSTAT_INC(tcps_rcvafterclose); 5790 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 5791 return (1); 5792 } 5793 if (sbavail(&so->so_snd) == 0) 5794 goto close_now; 5795 /* Ok we allow data that is ignored and a followup reset */ 5796 tp->rcv_nxt = th->th_seq + *tlen; 5797 tp->t_flags2 |= TF2_DROP_AF_DATA; 5798 rack->r_wanted_output = 1; 5799 *tlen = 0; 5800 return (0); 5801 } 5802 5803 /* 5804 * Return value of 1, the TCB is unlocked and most 5805 * likely gone, return value of 0, the TCP is still 5806 * locked. 5807 */ 5808 static int 5809 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 5810 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5811 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5812 { 5813 int32_t ret_val = 0; 5814 int32_t ourfinisacked = 0; 5815 5816 rack_calc_rwin(so, tp); 5817 5818 if (thflags & TH_RST) 5819 return (rack_process_rst(m, th, so, tp)); 5820 /* 5821 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5822 * synchronized state. 5823 */ 5824 if (thflags & TH_SYN) { 5825 rack_challenge_ack(m, th, tp, &ret_val); 5826 return (ret_val); 5827 } 5828 /* 5829 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5830 * it's less than ts_recent, drop it. 5831 */ 5832 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5833 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5834 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5835 return (ret_val); 5836 } 5837 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5838 return (ret_val); 5839 } 5840 /* 5841 * If new data are received on a connection after the user processes 5842 * are gone, then RST the other end. 5843 */ 5844 if ((so->so_state & SS_NOFDREF) && tlen) { 5845 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 5846 return (1); 5847 } 5848 /* 5849 * If last ACK falls within this segment's sequence numbers, record 5850 * its timestamp. NOTE: 1) That the test incorporates suggestions 5851 * from the latest proposal of the tcplw@cray.com list (Braden 5852 * 1993/04/26). 2) That updating only on newer timestamps interferes 5853 * with our earlier PAWS tests, so this check should be solely 5854 * predicated on the sequence space of this segment. 3) That we 5855 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5856 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5857 * SEG.Len, This modified check allows us to overcome RFC1323's 5858 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5859 * p.869. In such cases, we can still calculate the RTT correctly 5860 * when RCV.NXT == Last.ACK.Sent. 
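 * For example, a pure ACK that carries a timestamp option and has
 * SEG.SEQ == Last.ACK.Sent (tlen of 0) still passes the test below and
 * refreshes ts_recent.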
5861 */ 5862 if ((to->to_flags & TOF_TS) != 0 && 5863 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5864 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5865 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5866 tp->ts_recent_age = tcp_ts_getticks(); 5867 tp->ts_recent = to->to_tsval; 5868 } 5869 /* 5870 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5871 * is on (half-synchronized state), then queue data for later 5872 * processing; else drop segment and return. 5873 */ 5874 if ((thflags & TH_ACK) == 0) { 5875 if (tp->t_flags & TF_NEEDSYN) { 5876 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5877 tiwin, thflags, nxt_pkt)); 5878 } else if (tp->t_flags & TF_ACKNOW) { 5879 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5880 return (ret_val); 5881 } else { 5882 rack_do_drop(m, NULL); 5883 return (0); 5884 } 5885 } 5886 /* 5887 * Ack processing. 5888 */ 5889 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5890 return (ret_val); 5891 } 5892 if (ourfinisacked) { 5893 /* 5894 * If we can't receive any more data, then closing user can 5895 * proceed. Starting the timer is contrary to the 5896 * specification, but if we don't get a FIN we'll hang 5897 * forever. 5898 * 5899 * XXXjl: we should release the tp also, and use a 5900 * compressed state. 5901 */ 5902 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5903 soisdisconnected(so); 5904 tcp_timer_activate(tp, TT_2MSL, 5905 (tcp_fast_finwait2_recycle ? 5906 tcp_finwait2_timeout : 5907 TP_MAXIDLE(tp))); 5908 } 5909 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5910 } 5911 if (sbavail(&so->so_snd)) { 5912 if (rack_progress_timeout_check(tp)) { 5913 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5914 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5915 return (1); 5916 } 5917 } 5918 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5919 tiwin, thflags, nxt_pkt)); 5920 } 5921 5922 /* 5923 * Return value of 1, the TCB is unlocked and most 5924 * likely gone, return value of 0, the TCP is still 5925 * locked. 5926 */ 5927 static int 5928 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 5929 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5930 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5931 { 5932 int32_t ret_val = 0; 5933 int32_t ourfinisacked = 0; 5934 5935 rack_calc_rwin(so, tp); 5936 5937 if (thflags & TH_RST) 5938 return (rack_process_rst(m, th, so, tp)); 5939 /* 5940 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5941 * synchronized state. 5942 */ 5943 if (thflags & TH_SYN) { 5944 rack_challenge_ack(m, th, tp, &ret_val); 5945 return (ret_val); 5946 } 5947 /* 5948 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5949 * it's less than ts_recent, drop it. 5950 */ 5951 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5952 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5953 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5954 return (ret_val); 5955 } 5956 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5957 return (ret_val); 5958 } 5959 /* 5960 * If new data are received on a connection after the user processes 5961 * are gone, then RST the other end. 5962 */ 5963 if ((so->so_state & SS_NOFDREF) && tlen) { 5964 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 5965 return (1); 5966 } 5967 /* 5968 * If last ACK falls within this segment's sequence numbers, record 5969 * its timestamp. 
NOTE: 1) That the test incorporates suggestions 5970 * from the latest proposal of the tcplw@cray.com list (Braden 5971 * 1993/04/26). 2) That updating only on newer timestamps interferes 5972 * with our earlier PAWS tests, so this check should be solely 5973 * predicated on the sequence space of this segment. 3) That we 5974 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5975 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5976 * SEG.Len, This modified check allows us to overcome RFC1323's 5977 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5978 * p.869. In such cases, we can still calculate the RTT correctly 5979 * when RCV.NXT == Last.ACK.Sent. 5980 */ 5981 if ((to->to_flags & TOF_TS) != 0 && 5982 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5983 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5984 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5985 tp->ts_recent_age = tcp_ts_getticks(); 5986 tp->ts_recent = to->to_tsval; 5987 } 5988 /* 5989 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5990 * is on (half-synchronized state), then queue data for later 5991 * processing; else drop segment and return. 5992 */ 5993 if ((thflags & TH_ACK) == 0) { 5994 if (tp->t_flags & TF_NEEDSYN) { 5995 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5996 tiwin, thflags, nxt_pkt)); 5997 } else if (tp->t_flags & TF_ACKNOW) { 5998 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5999 return (ret_val); 6000 } else { 6001 rack_do_drop(m, NULL); 6002 return (0); 6003 } 6004 } 6005 /* 6006 * Ack processing. 6007 */ 6008 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6009 return (ret_val); 6010 } 6011 if (ourfinisacked) { 6012 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6013 tcp_twstart(tp); 6014 m_freem(m); 6015 return (1); 6016 } 6017 if (sbavail(&so->so_snd)) { 6018 if (rack_progress_timeout_check(tp)) { 6019 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6020 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6021 return (1); 6022 } 6023 } 6024 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6025 tiwin, thflags, nxt_pkt)); 6026 } 6027 6028 /* 6029 * Return value of 1, the TCB is unlocked and most 6030 * likely gone, return value of 0, the TCP is still 6031 * locked. 6032 */ 6033 static int 6034 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6035 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6036 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6037 { 6038 int32_t ret_val = 0; 6039 int32_t ourfinisacked = 0; 6040 6041 rack_calc_rwin(so, tp); 6042 6043 if (thflags & TH_RST) 6044 return (rack_process_rst(m, th, so, tp)); 6045 /* 6046 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6047 * synchronized state. 6048 */ 6049 if (thflags & TH_SYN) { 6050 rack_challenge_ack(m, th, tp, &ret_val); 6051 return (ret_val); 6052 } 6053 /* 6054 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6055 * it's less than ts_recent, drop it. 6056 */ 6057 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6058 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6059 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6060 return (ret_val); 6061 } 6062 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6063 return (ret_val); 6064 } 6065 /* 6066 * If new data are received on a connection after the user processes 6067 * are gone, then RST the other end. 
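 * rack_check_data_after_close() below either closes the connection and
 * answers with a reset, or, when rc_allow_data_af_clo is set and we
 * still have data to send, quietly discards the new data and arranges a
 * follow-up reset.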
6068 */ 6069 if ((so->so_state & SS_NOFDREF) && tlen) { 6070 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 6071 return (1); 6072 } 6073 /* 6074 * If last ACK falls within this segment's sequence numbers, record 6075 * its timestamp. NOTE: 1) That the test incorporates suggestions 6076 * from the latest proposal of the tcplw@cray.com list (Braden 6077 * 1993/04/26). 2) That updating only on newer timestamps interferes 6078 * with our earlier PAWS tests, so this check should be solely 6079 * predicated on the sequence space of this segment. 3) That we 6080 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6081 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6082 * SEG.Len, This modified check allows us to overcome RFC1323's 6083 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6084 * p.869. In such cases, we can still calculate the RTT correctly 6085 * when RCV.NXT == Last.ACK.Sent. 6086 */ 6087 if ((to->to_flags & TOF_TS) != 0 && 6088 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6089 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6090 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6091 tp->ts_recent_age = tcp_ts_getticks(); 6092 tp->ts_recent = to->to_tsval; 6093 } 6094 /* 6095 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6096 * is on (half-synchronized state), then queue data for later 6097 * processing; else drop segment and return. 6098 */ 6099 if ((thflags & TH_ACK) == 0) { 6100 if (tp->t_flags & TF_NEEDSYN) { 6101 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6102 tiwin, thflags, nxt_pkt)); 6103 } else if (tp->t_flags & TF_ACKNOW) { 6104 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6105 return (ret_val); 6106 } else { 6107 rack_do_drop(m, NULL); 6108 return (0); 6109 } 6110 } 6111 /* 6112 * case TCPS_LAST_ACK: Ack processing. 6113 */ 6114 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6115 return (ret_val); 6116 } 6117 if (ourfinisacked) { 6118 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6119 tp = tcp_close(tp); 6120 rack_do_drop(m, tp); 6121 return (1); 6122 } 6123 if (sbavail(&so->so_snd)) { 6124 if (rack_progress_timeout_check(tp)) { 6125 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6126 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6127 return (1); 6128 } 6129 } 6130 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6131 tiwin, thflags, nxt_pkt)); 6132 } 6133 6134 6135 /* 6136 * Return value of 1, the TCB is unlocked and most 6137 * likely gone, return value of 0, the TCP is still 6138 * locked. 6139 */ 6140 static int 6141 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 6142 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6143 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6144 { 6145 int32_t ret_val = 0; 6146 int32_t ourfinisacked = 0; 6147 6148 rack_calc_rwin(so, tp); 6149 6150 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 6151 if (thflags & TH_RST) 6152 return (rack_process_rst(m, th, so, tp)); 6153 /* 6154 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6155 * synchronized state. 6156 */ 6157 if (thflags & TH_SYN) { 6158 rack_challenge_ack(m, th, tp, &ret_val); 6159 return (ret_val); 6160 } 6161 /* 6162 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6163 * it's less than ts_recent, drop it. 
6164 */ 6165 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6166 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6167 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6168 return (ret_val); 6169 } 6170 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6171 return (ret_val); 6172 } 6173 /* 6174 * If new data are received on a connection after the user processes 6175 * are gone, then RST the other end. 6176 */ 6177 if ((so->so_state & SS_NOFDREF) && 6178 tlen) { 6179 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 6180 return (1); 6181 } 6182 /* 6183 * If last ACK falls within this segment's sequence numbers, record 6184 * its timestamp. NOTE: 1) That the test incorporates suggestions 6185 * from the latest proposal of the tcplw@cray.com list (Braden 6186 * 1993/04/26). 2) That updating only on newer timestamps interferes 6187 * with our earlier PAWS tests, so this check should be solely 6188 * predicated on the sequence space of this segment. 3) That we 6189 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6190 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6191 * SEG.Len, This modified check allows us to overcome RFC1323's 6192 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6193 * p.869. In such cases, we can still calculate the RTT correctly 6194 * when RCV.NXT == Last.ACK.Sent. 6195 */ 6196 if ((to->to_flags & TOF_TS) != 0 && 6197 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6198 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6199 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6200 tp->ts_recent_age = tcp_ts_getticks(); 6201 tp->ts_recent = to->to_tsval; 6202 } 6203 /* 6204 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6205 * is on (half-synchronized state), then queue data for later 6206 * processing; else drop segment and return. 6207 */ 6208 if ((thflags & TH_ACK) == 0) { 6209 if (tp->t_flags & TF_NEEDSYN) { 6210 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6211 tiwin, thflags, nxt_pkt)); 6212 } else if (tp->t_flags & TF_ACKNOW) { 6213 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6214 return (ret_val); 6215 } else { 6216 rack_do_drop(m, NULL); 6217 return (0); 6218 } 6219 } 6220 /* 6221 * Ack processing. 6222 */ 6223 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6224 return (ret_val); 6225 } 6226 if (sbavail(&so->so_snd)) { 6227 if (rack_progress_timeout_check(tp)) { 6228 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6229 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6230 return (1); 6231 } 6232 } 6233 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6234 tiwin, thflags, nxt_pkt)); 6235 } 6236 6237 6238 static void inline 6239 rack_clear_rate_sample(struct tcp_rack *rack) 6240 { 6241 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 6242 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 6243 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 6244 } 6245 6246 static int 6247 rack_init(struct tcpcb *tp) 6248 { 6249 struct tcp_rack *rack = NULL; 6250 6251 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 6252 if (tp->t_fb_ptr == NULL) { 6253 /* 6254 * We need to allocate memory but cant. The INP and INP_INFO 6255 * locks and they are recusive (happens during setup. 
So a 6256 * scheme to drop the locks fails :( 6257 * 6258 */ 6259 return (ENOMEM); 6260 } 6261 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 6262 6263 rack = (struct tcp_rack *)tp->t_fb_ptr; 6264 TAILQ_INIT(&rack->r_ctl.rc_map); 6265 TAILQ_INIT(&rack->r_ctl.rc_free); 6266 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6267 rack->rc_tp = tp; 6268 if (tp->t_inpcb) { 6269 rack->rc_inp = tp->t_inpcb; 6270 } 6271 /* Probably not needed but lets be sure */ 6272 rack_clear_rate_sample(rack); 6273 rack->r_cpu = 0; 6274 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 6275 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 6276 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 6277 rack->rc_pace_reduce = rack_slot_reduction; 6278 if (V_tcp_delack_enabled) 6279 tp->t_delayed_ack = 1; 6280 else 6281 tp->t_delayed_ack = 0; 6282 rack->rc_pace_max_segs = rack_hptsi_segments; 6283 rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; 6284 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 6285 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 6286 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 6287 rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; 6288 rack->r_enforce_min_pace = rack_min_pace_time; 6289 rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; 6290 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 6291 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 6292 rack->r_ctl.rc_early_recovery = rack_early_recovery; 6293 rack->rc_always_pace = rack_pace_every_seg; 6294 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 6295 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 6296 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 6297 rack->r_ctl.rc_min_to = rack_min_to; 6298 rack->r_ctl.rc_prr_inc_var = rack_inc_var; 6299 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6300 if (tp->snd_una != tp->snd_max) { 6301 /* Create a send map for the current outstanding data */ 6302 struct rack_sendmap *rsm; 6303 6304 rsm = rack_alloc(rack); 6305 if (rsm == NULL) { 6306 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6307 tp->t_fb_ptr = NULL; 6308 return (ENOMEM); 6309 } 6310 rsm->r_flags = RACK_OVERMAX; 6311 rsm->r_tim_lastsent[0] = tcp_ts_getticks(); 6312 rsm->r_rtr_cnt = 1; 6313 rsm->r_rtr_bytes = 0; 6314 rsm->r_start = tp->snd_una; 6315 rsm->r_end = tp->snd_max; 6316 rsm->r_sndcnt = 0; 6317 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 6318 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6319 rsm->r_in_tmap = 1; 6320 } 6321 return (0); 6322 } 6323 6324 static int 6325 rack_handoff_ok(struct tcpcb *tp) 6326 { 6327 if ((tp->t_state == TCPS_CLOSED) || 6328 (tp->t_state == TCPS_LISTEN)) { 6329 /* Sure no problem though it may not stick */ 6330 return (0); 6331 } 6332 if ((tp->t_state == TCPS_SYN_SENT) || 6333 (tp->t_state == TCPS_SYN_RECEIVED)) { 6334 /* 6335 * We really don't know you have to get to ESTAB or beyond 6336 * to tell. 6337 */ 6338 return (EAGAIN); 6339 } 6340 if (tp->t_flags & TF_SACK_PERMIT) { 6341 return (0); 6342 } 6343 /* 6344 * If we reach here we don't do SACK on this connection so we can 6345 * never do rack. 
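 * In short: 0 means the handoff can proceed, EAGAIN means it is too
 * early to tell (still in the handshake), and EINVAL means SACK was not
 * negotiated so RACK can never be used on this connection.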
6346 */ 6347 return (EINVAL); 6348 } 6349 6350 static void 6351 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 6352 { 6353 if (tp->t_fb_ptr) { 6354 struct tcp_rack *rack; 6355 struct rack_sendmap *rsm; 6356 6357 rack = (struct tcp_rack *)tp->t_fb_ptr; 6358 #ifdef TCP_BLACKBOX 6359 tcp_log_flowend(tp); 6360 #endif 6361 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6362 while (rsm) { 6363 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 6364 uma_zfree(rack_zone, rsm); 6365 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6366 } 6367 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6368 while (rsm) { 6369 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 6370 uma_zfree(rack_zone, rsm); 6371 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6372 } 6373 rack->rc_free_cnt = 0; 6374 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6375 tp->t_fb_ptr = NULL; 6376 } 6377 } 6378 6379 static void 6380 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 6381 { 6382 switch (tp->t_state) { 6383 case TCPS_SYN_SENT: 6384 rack->r_state = TCPS_SYN_SENT; 6385 rack->r_substate = rack_do_syn_sent; 6386 break; 6387 case TCPS_SYN_RECEIVED: 6388 rack->r_state = TCPS_SYN_RECEIVED; 6389 rack->r_substate = rack_do_syn_recv; 6390 break; 6391 case TCPS_ESTABLISHED: 6392 rack->r_state = TCPS_ESTABLISHED; 6393 rack->r_substate = rack_do_established; 6394 break; 6395 case TCPS_CLOSE_WAIT: 6396 rack->r_state = TCPS_CLOSE_WAIT; 6397 rack->r_substate = rack_do_close_wait; 6398 break; 6399 case TCPS_FIN_WAIT_1: 6400 rack->r_state = TCPS_FIN_WAIT_1; 6401 rack->r_substate = rack_do_fin_wait_1; 6402 break; 6403 case TCPS_CLOSING: 6404 rack->r_state = TCPS_CLOSING; 6405 rack->r_substate = rack_do_closing; 6406 break; 6407 case TCPS_LAST_ACK: 6408 rack->r_state = TCPS_LAST_ACK; 6409 rack->r_substate = rack_do_lastack; 6410 break; 6411 case TCPS_FIN_WAIT_2: 6412 rack->r_state = TCPS_FIN_WAIT_2; 6413 rack->r_substate = rack_do_fin_wait_2; 6414 break; 6415 case TCPS_LISTEN: 6416 case TCPS_CLOSED: 6417 case TCPS_TIME_WAIT: 6418 default: 6419 #ifdef INVARIANTS 6420 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); 6421 #endif 6422 break; 6423 }; 6424 } 6425 6426 6427 static void 6428 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 6429 { 6430 /* 6431 * We received an ack, and then did not 6432 * call send or were bounced out due to the 6433 * hpts was running. Now a timer is up as well, is 6434 * it the right timer? 6435 */ 6436 struct rack_sendmap *rsm; 6437 int tmr_up; 6438 6439 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 6440 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 6441 return; 6442 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6443 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 6444 (tmr_up == PACE_TMR_RXT)) { 6445 /* Should be an RXT */ 6446 return; 6447 } 6448 if (rsm == NULL) { 6449 /* Nothing outstanding? */ 6450 if (tp->t_flags & TF_DELACK) { 6451 if (tmr_up == PACE_TMR_DELACK) 6452 /* We are supposed to have delayed ack up and we do */ 6453 return; 6454 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 6455 /* 6456 * if we hit enobufs then we would expect the possiblity 6457 * of nothing outstanding and the RXT up (and the hptsi timer). 
6458 */ 6459 return; 6460 } else if (((tcp_always_keepalive || 6461 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6462 (tp->t_state <= TCPS_CLOSING)) && 6463 (tmr_up == PACE_TMR_KEEP) && 6464 (tp->snd_max == tp->snd_una)) { 6465 /* We should have keep alive up and we do */ 6466 return; 6467 } 6468 } 6469 if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { 6470 if ((tp->t_flags & TF_SENTFIN) && 6471 ((tp->snd_max - tp->snd_una) == 1) && 6472 (rsm->r_flags & RACK_HAS_FIN)) { 6473 /* needs to be a RXT */ 6474 if (tmr_up == PACE_TMR_RXT) 6475 return; 6476 } else if (tmr_up == PACE_TMR_RACK) 6477 return; 6478 } else if (SEQ_GT(tp->snd_max,tp->snd_una) && 6479 ((tmr_up == PACE_TMR_TLP) || 6480 (tmr_up == PACE_TMR_RXT))) { 6481 /* 6482 * Either a TLP or RXT is fine if no sack-passed 6483 * is in place and data is outstanding. 6484 */ 6485 return; 6486 } else if (tmr_up == PACE_TMR_DELACK) { 6487 /* 6488 * If the delayed ack was going to go off 6489 * before the rtx/tlp/rack timer were going to 6490 * expire, then that would be the timer in control. 6491 * Note we don't check the time here trusting the 6492 * code is correct. 6493 */ 6494 return; 6495 } 6496 /* 6497 * Ok the timer originally started is not what we want now. 6498 * We will force the hpts to be stopped if any, and restart 6499 * with the slot set to what was in the saved slot. 6500 */ 6501 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6502 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6503 } 6504 6505 static void 6506 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6507 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6508 int32_t nxt_pkt, struct timeval *tv) 6509 { 6510 int32_t thflags, retval, did_out = 0; 6511 int32_t way_out = 0; 6512 uint32_t cts; 6513 uint32_t tiwin; 6514 struct tcpopt to; 6515 struct tcp_rack *rack; 6516 struct rack_sendmap *rsm; 6517 int32_t prev_state = 0; 6518 6519 cts = tcp_tv_to_mssectick(tv); 6520 rack = (struct tcp_rack *)tp->t_fb_ptr; 6521 6522 kern_prefetch(rack, &prev_state); 6523 prev_state = 0; 6524 thflags = th->th_flags; 6525 /* 6526 * If this is either a state-changing packet or current state isn't 6527 * established, we require a read lock on tcbinfo. Otherwise, we 6528 * allow the tcbinfo to be in either locked or unlocked, as the 6529 * caller may have unnecessarily acquired a lock due to a race. 6530 */ 6531 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 6532 tp->t_state != TCPS_ESTABLISHED) { 6533 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6534 } 6535 INP_WLOCK_ASSERT(tp->t_inpcb); 6536 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 6537 __func__)); 6538 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 6539 __func__)); 6540 { 6541 union tcp_log_stackspecific log; 6542 6543 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6544 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 6545 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 6546 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 6547 tlen, &log, true); 6548 } 6549 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 6550 way_out = 4; 6551 goto done_with_input; 6552 } 6553 /* 6554 * If a segment with the ACK-bit set arrives in the SYN-SENT state 6555 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 
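 * (The same check is made again in rack_do_syn_sent(); here it is
 * applied before the per-state handler is dispatched.)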
6556 */ 6557 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 6558 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 6559 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6560 return; 6561 } 6562 /* 6563 * Segment received on connection. Reset idle time and keep-alive 6564 * timer. XXX: This should be done after segment validation to 6565 * ignore broken/spoofed segs. 6566 */ 6567 if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { 6568 #ifdef NETFLIX_CWV 6569 if ((tp->cwv_enabled) && 6570 ((tp->cwv_cwnd_valid == 0) && 6571 TCPS_HAVEESTABLISHED(tp->t_state) && 6572 (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { 6573 tcp_newcwv_nvp_closedown(tp); 6574 } else 6575 #endif 6576 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 6577 counter_u64_add(rack_input_idle_reduces, 1); 6578 rack_cc_after_idle(tp, 6579 (rack->r_idle_reduce_largest ? 1 :0)); 6580 } 6581 } 6582 rack->r_ctl.rc_rcvtime = cts; 6583 tp->t_rcvtime = ticks; 6584 6585 #ifdef NETFLIX_CWV 6586 if (tp->cwv_enabled) { 6587 if ((tp->cwv_cwnd_valid == 0) && 6588 TCPS_HAVEESTABLISHED(tp->t_state) && 6589 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 6590 tcp_newcwv_nvp_closedown(tp); 6591 } 6592 #endif 6593 /* 6594 * Unscale the window into a 32-bit value. For the SYN_SENT state 6595 * the scale is zero. 6596 */ 6597 tiwin = th->th_win << tp->snd_scale; 6598 #ifdef NETFLIX_STATS 6599 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 6600 #endif 6601 /* 6602 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 6603 * this to occur after we've validated the segment. 6604 */ 6605 if (tp->t_flags & TF_ECN_PERMIT) { 6606 if (thflags & TH_CWR) 6607 tp->t_flags &= ~TF_ECN_SND_ECE; 6608 switch (iptos & IPTOS_ECN_MASK) { 6609 case IPTOS_ECN_CE: 6610 tp->t_flags |= TF_ECN_SND_ECE; 6611 TCPSTAT_INC(tcps_ecn_ce); 6612 break; 6613 case IPTOS_ECN_ECT0: 6614 TCPSTAT_INC(tcps_ecn_ect0); 6615 break; 6616 case IPTOS_ECN_ECT1: 6617 TCPSTAT_INC(tcps_ecn_ect1); 6618 break; 6619 } 6620 /* Congestion experienced. */ 6621 if (thflags & TH_ECE) { 6622 rack_cong_signal(tp, th, CC_ECN); 6623 } 6624 } 6625 /* 6626 * Parse options on any incoming segment. 6627 */ 6628 tcp_dooptions(&to, (u_char *)(th + 1), 6629 (th->th_off << 2) - sizeof(struct tcphdr), 6630 (thflags & TH_SYN) ? TO_SYN : 0); 6631 6632 /* 6633 * If echoed timestamp is later than the current time, fall back to 6634 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 6635 * were used when this connection was established. 6636 */ 6637 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 6638 to.to_tsecr -= tp->ts_offset; 6639 if (TSTMP_GT(to.to_tsecr, cts)) 6640 to.to_tsecr = 0; 6641 } 6642 /* 6643 * If its the first time in we need to take care of options and 6644 * verify we can do SACK for rack! 6645 */ 6646 if (rack->r_state == 0) { 6647 /* Should be init'd by rack_init() */ 6648 KASSERT(rack->rc_inp != NULL, 6649 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 6650 if (rack->rc_inp == NULL) { 6651 rack->rc_inp = tp->t_inpcb; 6652 } 6653 6654 /* 6655 * Process options only when we get SYN/ACK back. The SYN 6656 * case for incoming connections is handled in tcp_syncache. 6657 * According to RFC1323 the window field in a SYN (i.e., a 6658 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 6659 * this is traditional behavior, may need to be cleaned up. 
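 * For a SYN|ACK received in SYN_SENT the block below records the
 * peer's window scale and timestamp, applies the advertised MSS, clears
 * TF_SACK_PERMIT if the peer did not echo SACK-permitted, and updates
 * (or disables) the TCP fast open cookie cache.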
6660 */ 6661 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 6662 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 6663 if ((to.to_flags & TOF_SCALE) && 6664 (tp->t_flags & TF_REQ_SCALE)) { 6665 tp->t_flags |= TF_RCVD_SCALE; 6666 tp->snd_scale = to.to_wscale; 6667 } 6668 /* 6669 * Initial send window. It will be updated with the 6670 * next incoming segment to the scaled value. 6671 */ 6672 tp->snd_wnd = th->th_win; 6673 if (to.to_flags & TOF_TS) { 6674 tp->t_flags |= TF_RCVD_TSTMP; 6675 tp->ts_recent = to.to_tsval; 6676 tp->ts_recent_age = cts; 6677 } 6678 if (to.to_flags & TOF_MSS) 6679 tcp_mss(tp, to.to_mss); 6680 if ((tp->t_flags & TF_SACK_PERMIT) && 6681 (to.to_flags & TOF_SACKPERM) == 0) 6682 tp->t_flags &= ~TF_SACK_PERMIT; 6683 if (IS_FASTOPEN(tp->t_flags)) { 6684 if (to.to_flags & TOF_FASTOPEN) { 6685 uint16_t mss; 6686 6687 if (to.to_flags & TOF_MSS) 6688 mss = to.to_mss; 6689 else 6690 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 6691 mss = TCP6_MSS; 6692 else 6693 mss = TCP_MSS; 6694 tcp_fastopen_update_cache(tp, mss, 6695 to.to_tfo_len, to.to_tfo_cookie); 6696 } else 6697 tcp_fastopen_disable_path(tp); 6698 } 6699 } 6700 /* 6701 * At this point we are at the initial call. Here we decide 6702 * if we are doing RACK or not. We do this by seeing if 6703 * TF_SACK_PERMIT is set, if not rack is *not* possible and 6704 * we switch to the default code. 6705 */ 6706 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 6707 tcp_switch_back_to_default(tp); 6708 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 6709 tlen, iptos); 6710 return; 6711 } 6712 /* Set the flag */ 6713 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 6714 tcp_set_hpts(tp->t_inpcb); 6715 rack_stop_all_timers(tp); 6716 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 6717 } 6718 /* 6719 * This is the one exception case where we set the rack state 6720 * always. All other times (timers etc) we must have a rack-state 6721 * set (so we assure we have done the checks above for SACK). 6722 */ 6723 if (rack->r_state != tp->t_state) 6724 rack_set_state(tp, rack); 6725 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) 6726 kern_prefetch(rsm, &prev_state); 6727 prev_state = rack->r_state; 6728 rack->r_ctl.rc_tlp_send_cnt = 0; 6729 rack_clear_rate_sample(rack); 6730 retval = (*rack->r_substate) (m, th, so, 6731 tp, &to, drop_hdrlen, 6732 tlen, tiwin, thflags, nxt_pkt); 6733 #ifdef INVARIANTS 6734 if ((retval == 0) && 6735 (tp->t_inpcb == NULL)) { 6736 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 6737 retval, tp, prev_state); 6738 } 6739 #endif 6740 if (retval == 0) { 6741 /* 6742 * If retval is 1 the tcb is unlocked and most likely the tp 6743 * is gone. 6744 */ 6745 INP_WLOCK_ASSERT(tp->t_inpcb); 6746 tcp_rack_xmit_timer_commit(rack, tp); 6747 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && 6748 (rack->rc_in_persist == 0)){ 6749 /* 6750 * The peer shrunk its window on us to the point 6751 * where we have sent too much. The only thing 6752 * we can do here is stop any timers and 6753 * enter persist. We most likely lost the last 6754 * bytes we sent but oh well, we will have to 6755 * retransmit them after the peer is caught up. 
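 * Concretely: pull ourselves off the hpts if we are queued for output,
 * cancel any pending rack timer, enter persist and restart the hpts
 * timer.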
6756 */ 6757 if (rack->rc_inp->inp_in_hpts) 6758 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6759 rack_timer_cancel(tp, rack, cts, __LINE__); 6760 rack_enter_persist(tp, rack, cts); 6761 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6762 way_out = 3; 6763 goto done_with_input; 6764 } 6765 if (nxt_pkt == 0) { 6766 if (rack->r_wanted_output != 0) { 6767 did_out = 1; 6768 (void)tp->t_fb->tfb_tcp_output(tp); 6769 } 6770 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 6771 } 6772 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 6773 (SEQ_GT(tp->snd_max, tp->snd_una) || 6774 (tp->t_flags & TF_DELACK) || 6775 ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6776 (tp->t_state <= TCPS_CLOSING)))) { 6777 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 6778 if ((tp->snd_max == tp->snd_una) && 6779 ((tp->t_flags & TF_DELACK) == 0) && 6780 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 6781 /* keep alive not needed if we are hptsi output yet */ 6782 ; 6783 } else { 6784 if (rack->rc_inp->inp_in_hpts) 6785 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6786 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6787 } 6788 way_out = 1; 6789 } else { 6790 /* Do we have the correct timer running? */ 6791 rack_timer_audit(tp, rack, &so->so_snd); 6792 way_out = 2; 6793 } 6794 done_with_input: 6795 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 6796 if (did_out) 6797 rack->r_wanted_output = 0; 6798 #ifdef INVARIANTS 6799 if (tp->t_inpcb == NULL) { 6800 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 6801 did_out, 6802 retval, tp, prev_state); 6803 } 6804 #endif 6805 INP_WUNLOCK(tp->t_inpcb); 6806 } 6807 } 6808 6809 void 6810 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6811 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 6812 { 6813 struct timeval tv; 6814 #ifdef RSS 6815 struct tcp_function_block *tfb; 6816 struct tcp_rack *rack; 6817 struct epoch_tracker et; 6818 6819 rack = (struct tcp_rack *)tp->t_fb_ptr; 6820 if (rack->r_state == 0) { 6821 /* 6822 * Initial input (ACK to SYN-ACK etc)lets go ahead and get 6823 * it processed 6824 */ 6825 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 6826 tcp_get_usecs(&tv); 6827 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6828 tlen, iptos, 0, &tv); 6829 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 6830 return; 6831 } 6832 tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); 6833 INP_WUNLOCK(tp->t_inpcb); 6834 #else 6835 tcp_get_usecs(&tv); 6836 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6837 tlen, iptos, 0, &tv); 6838 #endif 6839 } 6840 6841 struct rack_sendmap * 6842 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 6843 { 6844 struct rack_sendmap *rsm = NULL; 6845 int32_t idx; 6846 uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; 6847 6848 /* Return the next guy to be re-transmitted */ 6849 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 6850 return (NULL); 6851 } 6852 if (tp->t_flags & TF_SENTFIN) { 6853 /* retran the end FIN? 
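 * With TF_SENTFIN set we never pick a retransmission from the map
 * here; the retransmit timer covers the tail of the connection.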
*/ 6854 return (NULL); 6855 } 6856 /* ok lets look at this one */ 6857 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6858 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 6859 goto check_it; 6860 } 6861 rsm = rack_find_lowest_rsm(rack); 6862 if (rsm == NULL) { 6863 return (NULL); 6864 } 6865 check_it: 6866 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 6867 srtt = TICKS_2_MSEC(srtt_cur); 6868 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 6869 srtt = rack->rc_rack_rtt; 6870 if (rsm->r_flags & RACK_ACKED) { 6871 return (NULL); 6872 } 6873 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 6874 /* Its not yet ready */ 6875 return (NULL); 6876 } 6877 idx = rsm->r_rtr_cnt - 1; 6878 ts_low = rsm->r_tim_lastsent[idx]; 6879 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6880 if (tsused <= ts_low) { 6881 return (NULL); 6882 } 6883 if ((tsused - ts_low) >= thresh) { 6884 return (rsm); 6885 } 6886 return (NULL); 6887 } 6888 6889 static int 6890 rack_output(struct tcpcb *tp) 6891 { 6892 struct socket *so; 6893 uint32_t recwin, sendwin; 6894 uint32_t sb_offset; 6895 int32_t len, flags, error = 0; 6896 struct mbuf *m; 6897 struct mbuf *mb; 6898 uint32_t if_hw_tsomaxsegcount = 0; 6899 uint32_t if_hw_tsomaxsegsize; 6900 long tot_len_this_send = 0; 6901 struct ip *ip = NULL; 6902 #ifdef TCPDEBUG 6903 struct ipovly *ipov = NULL; 6904 #endif 6905 struct udphdr *udp = NULL; 6906 struct tcp_rack *rack; 6907 struct tcphdr *th; 6908 uint8_t pass = 0; 6909 uint8_t wanted_cookie = 0; 6910 u_char opt[TCP_MAXOLEN]; 6911 unsigned ipoptlen, optlen, hdrlen, ulen=0; 6912 uint32_t rack_seq; 6913 6914 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 6915 unsigned ipsec_optlen = 0; 6916 6917 #endif 6918 int32_t idle, sendalot; 6919 int32_t sub_from_prr = 0; 6920 volatile int32_t sack_rxmit; 6921 struct rack_sendmap *rsm = NULL; 6922 int32_t tso, mtu, would_have_fin = 0; 6923 struct tcpopt to; 6924 int32_t slot = 0; 6925 uint32_t cts; 6926 uint8_t hpts_calling, doing_tlp = 0; 6927 int32_t do_a_prefetch; 6928 int32_t prefetch_rsm = 0; 6929 int32_t prefetch_so_done = 0; 6930 struct tcp_log_buffer *lgb = NULL; 6931 struct inpcb *inp; 6932 struct sockbuf *sb; 6933 #ifdef INET6 6934 struct ip6_hdr *ip6 = NULL; 6935 int32_t isipv6; 6936 #endif 6937 /* setup and take the cache hits here */ 6938 rack = (struct tcp_rack *)tp->t_fb_ptr; 6939 inp = rack->rc_inp; 6940 so = inp->inp_socket; 6941 sb = &so->so_snd; 6942 kern_prefetch(sb, &do_a_prefetch); 6943 do_a_prefetch = 1; 6944 6945 INP_WLOCK_ASSERT(inp); 6946 #ifdef TCP_OFFLOAD 6947 if (tp->t_flags & TF_TOE) 6948 return (tcp_offload_output(tp)); 6949 #endif 6950 #ifdef INET6 6951 if (rack->r_state) { 6952 /* Use the cache line loaded if possible */ 6953 isipv6 = rack->r_is_v6; 6954 } else { 6955 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 6956 } 6957 #endif 6958 cts = tcp_ts_getticks(); 6959 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 6960 inp->inp_in_hpts) { 6961 /* 6962 * We are on the hpts for some timer but not hptsi output. 6963 * Remove from the hpts unconditionally. 6964 */ 6965 rack_timer_cancel(tp, rack, cts, __LINE__); 6966 } 6967 /* Mark that we have called rack_output(). */ 6968 if ((rack->r_timer_override) || 6969 (tp->t_flags & TF_FORCEDATA) || 6970 (tp->t_state < TCPS_ESTABLISHED)) { 6971 if (tp->t_inpcb->inp_in_hpts) 6972 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 6973 } else if (tp->t_inpcb->inp_in_hpts) { 6974 /* 6975 * On the hpts you can't pass even if ACKNOW is on, we will 6976 * when the hpts fires. 
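 * (i.e. the output we want will happen when the hpts timer fires; all
 * we do here is count the attempt below and return).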
6977 */ 6978 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 6979 return (0); 6980 } 6981 hpts_calling = inp->inp_hpts_calls; 6982 inp->inp_hpts_calls = 0; 6983 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 6984 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 6985 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 6986 return (0); 6987 } 6988 } 6989 rack->r_wanted_output = 0; 6990 rack->r_timer_override = 0; 6991 /* 6992 * For TFO connections in SYN_SENT or SYN_RECEIVED, 6993 * only allow the initial SYN or SYN|ACK and those sent 6994 * by the retransmit timer. 6995 */ 6996 if (IS_FASTOPEN(tp->t_flags) && 6997 ((tp->t_state == TCPS_SYN_RECEIVED) || 6998 (tp->t_state == TCPS_SYN_SENT)) && 6999 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 7000 (tp->t_rxtshift == 0)) /* not a retransmit */ 7001 return (0); 7002 /* 7003 * Determine length of data that should be transmitted, and flags 7004 * that will be used. If there is some data or critical controls 7005 * (SYN, RST) to send, then transmit; otherwise, investigate 7006 * further. 7007 */ 7008 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 7009 #ifdef NETFLIX_CWV 7010 if (tp->cwv_enabled) { 7011 if ((tp->cwv_cwnd_valid == 0) && 7012 TCPS_HAVEESTABLISHED(tp->t_state) && 7013 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 7014 tcp_newcwv_nvp_closedown(tp); 7015 } else 7016 #endif 7017 if (tp->t_idle_reduce) { 7018 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 7019 rack_cc_after_idle(tp, 7020 (rack->r_idle_reduce_largest ? 1 :0)); 7021 } 7022 tp->t_flags &= ~TF_LASTIDLE; 7023 if (idle) { 7024 if (tp->t_flags & TF_MORETOCOME) { 7025 tp->t_flags |= TF_LASTIDLE; 7026 idle = 0; 7027 } 7028 } 7029 again: 7030 /* 7031 * If we've recently taken a timeout, snd_max will be greater than 7032 * snd_nxt. There may be SACK information that allows us to avoid 7033 * resending already delivered data. Adjust snd_nxt accordingly. 7034 */ 7035 sendalot = 0; 7036 cts = tcp_ts_getticks(); 7037 tso = 0; 7038 mtu = 0; 7039 sb_offset = tp->snd_max - tp->snd_una; 7040 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 7041 7042 flags = tcp_outflags[tp->t_state]; 7043 /* 7044 * Send any SACK-generated retransmissions. If we're explicitly 7045 * trying to send out new data (when sendalot is 1), bypass this 7046 * function. If we retransmit in fast recovery mode, decrement 7047 * snd_cwnd, since we're replacing a (future) new transmission with 7048 * a retransmission now, and we previously incremented snd_cwnd in 7049 * tcp_input(). 7050 */ 7051 /* 7052 * Still in sack recovery , reset rxmit flag to zero. 
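 * (The loop that follows simply tops up the connection's cache of free
 * sendmap entries before we decide what to send; sack_rxmit itself is
 * cleared just below it.)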
7053 */ 7054 while (rack->rc_free_cnt < rack_free_cache) { 7055 rsm = rack_alloc(rack); 7056 if (rsm == NULL) { 7057 if (inp->inp_hpts_calls) 7058 /* Retry in a ms */ 7059 slot = 1; 7060 goto just_return_nolock; 7061 } 7062 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 7063 rack->rc_free_cnt++; 7064 rsm = NULL; 7065 } 7066 if (inp->inp_hpts_calls) 7067 inp->inp_hpts_calls = 0; 7068 sack_rxmit = 0; 7069 len = 0; 7070 rsm = NULL; 7071 if (flags & TH_RST) { 7072 SOCKBUF_LOCK(sb); 7073 goto send; 7074 } 7075 if (rack->r_ctl.rc_tlpsend) { 7076 /* Tail loss probe */ 7077 long cwin; 7078 long tlen; 7079 7080 doing_tlp = 1; 7081 rsm = rack->r_ctl.rc_tlpsend; 7082 rack->r_ctl.rc_tlpsend = NULL; 7083 sack_rxmit = 1; 7084 tlen = rsm->r_end - rsm->r_start; 7085 if (tlen > tp->t_maxseg) 7086 tlen = tp->t_maxseg; 7087 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 7088 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 7089 __func__, __LINE__, 7090 rsm->r_start, tp->snd_una, tp, rack, rsm)); 7091 sb_offset = rsm->r_start - tp->snd_una; 7092 cwin = min(tp->snd_wnd, tlen); 7093 len = cwin; 7094 } else if (rack->r_ctl.rc_resend) { 7095 /* Retransmit timer */ 7096 rsm = rack->r_ctl.rc_resend; 7097 rack->r_ctl.rc_resend = NULL; 7098 len = rsm->r_end - rsm->r_start; 7099 sack_rxmit = 1; 7100 sendalot = 0; 7101 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 7102 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 7103 __func__, __LINE__, 7104 rsm->r_start, tp->snd_una, tp, rack, rsm)); 7105 sb_offset = rsm->r_start - tp->snd_una; 7106 if (len >= tp->t_maxseg) { 7107 len = tp->t_maxseg; 7108 } 7109 } else if ((rack->rc_in_persist == 0) && 7110 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 7111 long tlen; 7112 7113 if ((!IN_RECOVERY(tp->t_flags)) && 7114 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 7115 /* Enter recovery if not induced by a time-out */ 7116 rack->r_ctl.rc_rsm_start = rsm->r_start; 7117 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7118 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7119 rack_cong_signal(tp, NULL, CC_NDUPACK); 7120 /* 7121 * When we enter recovery we need to assure we send 7122 * one packet. 7123 */ 7124 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 7125 } 7126 #ifdef INVARIANTS 7127 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 7128 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 7129 tp, rack, rsm, rsm->r_start, tp->snd_una); 7130 } 7131 #endif 7132 tlen = rsm->r_end - rsm->r_start; 7133 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 7134 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 7135 __func__, __LINE__, 7136 rsm->r_start, tp->snd_una, tp, rack, rsm)); 7137 sb_offset = rsm->r_start - tp->snd_una; 7138 if (tlen > rack->r_ctl.rc_prr_sndcnt) { 7139 len = rack->r_ctl.rc_prr_sndcnt; 7140 } else { 7141 len = tlen; 7142 } 7143 if (len >= tp->t_maxseg) { 7144 sendalot = 1; 7145 len = tp->t_maxseg; 7146 } else { 7147 sendalot = 0; 7148 if ((rack->rc_timer_up == 0) && 7149 (len < tlen)) { 7150 /* 7151 * If its not a timer don't send a partial 7152 * segment. 7153 */ 7154 len = 0; 7155 goto just_return_nolock; 7156 } 7157 } 7158 if (len > 0) { 7159 sub_from_prr = 1; 7160 sack_rxmit = 1; 7161 TCPSTAT_INC(tcps_sack_rexmits); 7162 TCPSTAT_ADD(tcps_sack_rexmit_bytes, 7163 min(len, tp->t_maxseg)); 7164 counter_u64_add(rack_rtm_prr_retran, 1); 7165 } 7166 } 7167 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 7168 /* we are retransmitting the fin */ 7169 len--; 7170 if (len) { 7171 /* 7172 * When retransmitting data do *not* include the 7173 * FIN. 
This could happen from a TLP probe. 7174 */ 7175 flags &= ~TH_FIN; 7176 } 7177 } 7178 #ifdef INVARIANTS 7179 /* For debugging */ 7180 rack->r_ctl.rc_rsm_at_retran = rsm; 7181 #endif 7182 /* 7183 * Get standard flags, and add SYN or FIN if requested by 'hidden' 7184 * state flags. 7185 */ 7186 if (tp->t_flags & TF_NEEDFIN) 7187 flags |= TH_FIN; 7188 if (tp->t_flags & TF_NEEDSYN) 7189 flags |= TH_SYN; 7190 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 7191 void *end_rsm; 7192 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 7193 if (end_rsm) 7194 kern_prefetch(end_rsm, &prefetch_rsm); 7195 prefetch_rsm = 1; 7196 } 7197 SOCKBUF_LOCK(sb); 7198 /* 7199 * If in persist timeout with window of 0, send 1 byte. Otherwise, 7200 * if window is small but nonzero and time TF_SENTFIN expired, we 7201 * will send what we can and go to transmit state. 7202 */ 7203 if (tp->t_flags & TF_FORCEDATA) { 7204 if (sendwin == 0) { 7205 /* 7206 * If we still have some data to send, then clear 7207 * the FIN bit. Usually this would happen below 7208 * when it realizes that we aren't sending all the 7209 * data. However, if we have exactly 1 byte of 7210 * unsent data, then it won't clear the FIN bit 7211 * below, and if we are in persist state, we wind up 7212 * sending the packet without recording that we sent 7213 * the FIN bit. 7214 * 7215 * We can't just blindly clear the FIN bit, because 7216 * if we don't have any more data to send then the 7217 * probe will be the FIN itself. 7218 */ 7219 if (sb_offset < sbused(sb)) 7220 flags &= ~TH_FIN; 7221 sendwin = 1; 7222 } else { 7223 if (rack->rc_in_persist) 7224 rack_exit_persist(tp, rack); 7225 /* 7226 * If we are dropping persist mode then we need to 7227 * correct snd_nxt/snd_max and off. 7228 */ 7229 tp->snd_nxt = tp->snd_max; 7230 sb_offset = tp->snd_nxt - tp->snd_una; 7231 } 7232 } 7233 /* 7234 * If snd_nxt == snd_max and we have transmitted a FIN, the 7235 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 7236 * negative length. This can also occur when TCP opens up its 7237 * congestion window while receiving additional duplicate acks after 7238 * fast-retransmit because TCP will reset snd_nxt to snd_max after 7239 * the fast-retransmit. 7240 * 7241 * In the normal retransmit-FIN-only case, however, snd_nxt will be 7242 * set to snd_una, the sb_offset will be 0, and the length may wind 7243 * up 0. 7244 * 7245 * If sack_rxmit is true we are retransmitting from the scoreboard 7246 * in which case len is already set. 
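 * Otherwise len is derived below from sbavail() and the send
 * window; while in recovery any new data is further capped by
 * the PRR send count (rc_prr_sndcnt).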
7247 */ 7248 if (sack_rxmit == 0) { 7249 uint32_t avail; 7250 7251 avail = sbavail(sb); 7252 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 7253 sb_offset = tp->snd_nxt - tp->snd_una; 7254 else 7255 sb_offset = 0; 7256 if (IN_RECOVERY(tp->t_flags) == 0) { 7257 if (rack->r_ctl.rc_tlp_new_data) { 7258 /* TLP is forcing out new data */ 7259 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 7260 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 7261 } 7262 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 7263 len = tp->snd_wnd; 7264 else 7265 len = rack->r_ctl.rc_tlp_new_data; 7266 rack->r_ctl.rc_tlp_new_data = 0; 7267 doing_tlp = 1; 7268 } else { 7269 if (sendwin > avail) { 7270 /* use the available */ 7271 if (avail > sb_offset) { 7272 len = (int32_t)(avail - sb_offset); 7273 } else { 7274 len = 0; 7275 } 7276 } else { 7277 if (sendwin > sb_offset) { 7278 len = (int32_t)(sendwin - sb_offset); 7279 } else { 7280 len = 0; 7281 } 7282 } 7283 } 7284 } else { 7285 uint32_t outstanding; 7286 7287 /* 7288 * We are inside of a SACK recovery episode and are 7289 * sending new data, having retransmitted all the 7290 * data possible so far in the scoreboard. 7291 */ 7292 outstanding = tp->snd_max - tp->snd_una; 7293 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) 7294 len = 0; 7295 else if (avail > sb_offset) 7296 len = avail - sb_offset; 7297 else 7298 len = 0; 7299 if (len > 0) { 7300 if (len > rack->r_ctl.rc_prr_sndcnt) 7301 len = rack->r_ctl.rc_prr_sndcnt; 7302 7303 if (len > 0) { 7304 sub_from_prr = 1; 7305 counter_u64_add(rack_rtm_prr_newdata, 1); 7306 } 7307 } 7308 if (len > tp->t_maxseg) { 7309 /* 7310 * We should never send more than a MSS when 7311 * retransmitting or sending new data in prr 7312 * mode unless the override flag is on. Most 7313 * likely the PRR algorithm is not going to 7314 * let us send a lot as well :-) 7315 */ 7316 if (rack->r_ctl.rc_prr_sendalot == 0) 7317 len = tp->t_maxseg; 7318 } else if (len < tp->t_maxseg) { 7319 /* 7320 * Do we send any? The idea here is if the 7321 * send empty's the socket buffer we want to 7322 * do it. However if not then lets just wait 7323 * for our prr_sndcnt to get bigger. 7324 */ 7325 long leftinsb; 7326 7327 leftinsb = sbavail(sb) - sb_offset; 7328 if (leftinsb > len) { 7329 /* This send does not empty the sb */ 7330 len = 0; 7331 } 7332 } 7333 } 7334 } 7335 if (prefetch_so_done == 0) { 7336 kern_prefetch(so, &prefetch_so_done); 7337 prefetch_so_done = 1; 7338 } 7339 /* 7340 * Lop off SYN bit if it has already been sent. However, if this is 7341 * SYN-SENT state and if segment contains data and if we don't know 7342 * that foreign host supports TAO, suppress sending segment. 7343 */ 7344 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 7345 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 7346 if (tp->t_state != TCPS_SYN_RECEIVED) 7347 flags &= ~TH_SYN; 7348 /* 7349 * When sending additional segments following a TFO SYN|ACK, 7350 * do not include the SYN bit. 7351 */ 7352 if (IS_FASTOPEN(tp->t_flags) && 7353 (tp->t_state == TCPS_SYN_RECEIVED)) 7354 flags &= ~TH_SYN; 7355 sb_offset--, len++; 7356 } 7357 /* 7358 * Be careful not to send data and/or FIN on SYN segments. This 7359 * measure is needed to prevent interoperability problems with not 7360 * fully conformant TCP implementations. 
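 * Hence, when TF_NOOPT is set on a SYN we zero the payload
 * length and clear the FIN flag below.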
7361 */ 7362 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 7363 len = 0; 7364 flags &= ~TH_FIN; 7365 } 7366 /* 7367 * On TFO sockets, ensure no data is sent in the following cases: 7368 * 7369 * - When retransmitting SYN|ACK on a passively-created socket 7370 * 7371 * - When retransmitting SYN on an actively created socket 7372 * 7373 * - When sending a zero-length cookie (cookie request) on an 7374 * actively created socket 7375 * 7376 * - When the socket is in the CLOSED state (RST is being sent) 7377 */ 7378 if (IS_FASTOPEN(tp->t_flags) && 7379 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 7380 ((tp->t_state == TCPS_SYN_SENT) && 7381 (tp->t_tfo_client_cookie_len == 0)) || 7382 (flags & TH_RST))) { 7383 sack_rxmit = 0; 7384 len = 0; 7385 } 7386 /* Without fast-open there should never be data sent on a SYN */ 7387 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) 7388 len = 0; 7389 if (len <= 0) { 7390 /* 7391 * If FIN has been sent but not acked, but we haven't been 7392 * called to retransmit, len will be < 0. Otherwise, window 7393 * shrank after we sent into it. If window shrank to 0, 7394 * cancel pending retransmit, pull snd_nxt back to (closed) 7395 * window, and set the persist timer if it isn't already 7396 * going. If the window didn't close completely, just wait 7397 * for an ACK. 7398 * 7399 * We also do a general check here to ensure that we will 7400 * set the persist timer when we have data to send, but a 7401 * 0-byte window. This makes sure the persist timer is set 7402 * even if the packet hits one of the "goto send" lines 7403 * below. 7404 */ 7405 len = 0; 7406 if ((tp->snd_wnd == 0) && 7407 (TCPS_HAVEESTABLISHED(tp->t_state)) && 7408 (sb_offset < (int)sbavail(sb))) { 7409 tp->snd_nxt = tp->snd_una; 7410 rack_enter_persist(tp, rack, cts); 7411 } 7412 } 7413 /* len will be >= 0 after this point. */ 7414 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7415 tcp_sndbuf_autoscale(tp, so, sendwin); 7416 /* 7417 * Decide if we can use TCP Segmentation Offloading (if supported by 7418 * hardware). 7419 * 7420 * TSO may only be used if we are in a pure bulk sending state. The 7421 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 7422 * options prevent using TSO. With TSO the TCP header is the same 7423 * (except for the sequence number) for all generated packets. This 7424 * makes it impossible to transmit any options which vary per 7425 * generated segment or packet. 7426 * 7427 * IPv4 handling has a clear separation of ip options and ip header 7428 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 7429 * the right thing below to provide length of just ip options and thus 7430 * checking for ipoptlen is enough to decide if ip options are present. 7431 */ 7432 7433 #ifdef INET6 7434 if (isipv6) 7435 ipoptlen = ip6_optlen(tp->t_inpcb); 7436 else 7437 #endif 7438 if (tp->t_inpcb->inp_options) 7439 ipoptlen = tp->t_inpcb->inp_options->m_len - 7440 offsetof(struct ipoption, ipopt_list); 7441 else 7442 ipoptlen = 0; 7443 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7444 /* 7445 * Pre-calculate here as we save another lookup into the darknesses 7446 * of IPsec that way and can actually decide if TSO is ok. 
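 * The IPsec header size is folded into ipoptlen a little
 * further down, and TSO is only enabled when ipoptlen ends up
 * being zero.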
7447 */ 7448 #ifdef INET6 7449 if (isipv6 && IPSEC_ENABLED(ipv6)) 7450 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 7451 #ifdef INET 7452 else 7453 #endif 7454 #endif /* INET6 */ 7455 #ifdef INET 7456 if (IPSEC_ENABLED(ipv4)) 7457 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 7458 #endif /* INET */ 7459 #endif 7460 7461 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7462 ipoptlen += ipsec_optlen; 7463 #endif 7464 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 7465 (tp->t_port == 0) && 7466 ((tp->t_flags & TF_SIGNATURE) == 0) && 7467 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 7468 ipoptlen == 0) 7469 tso = 1; 7470 { 7471 uint32_t outstanding; 7472 7473 outstanding = tp->snd_max - tp->snd_una; 7474 if (tp->t_flags & TF_SENTFIN) { 7475 /* 7476 * If we sent a fin, snd_max is 1 higher than 7477 * snd_una 7478 */ 7479 outstanding--; 7480 } 7481 if (outstanding > 0) { 7482 /* 7483 * This is sub-optimal. We only send a stand alone 7484 * FIN on its own segment. 7485 */ 7486 if (flags & TH_FIN) { 7487 flags &= ~TH_FIN; 7488 would_have_fin = 1; 7489 } 7490 } else if (sack_rxmit) { 7491 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 7492 flags &= ~TH_FIN; 7493 } else { 7494 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 7495 sbused(sb))) 7496 flags &= ~TH_FIN; 7497 } 7498 } 7499 recwin = sbspace(&so->so_rcv); 7500 7501 /* 7502 * Sender silly window avoidance. We transmit under the following 7503 * conditions when len is non-zero: 7504 * 7505 * - We have a full segment (or more with TSO) - This is the last 7506 * buffer in a write()/send() and we are either idle or running 7507 * NODELAY - we've timed out (e.g. persist timer) - we have more 7508 * then 1/2 the maximum send window's worth of data (receiver may be 7509 * limited the window size) - we need to retransmit 7510 */ 7511 if (len) { 7512 if (len >= tp->t_maxseg) { 7513 pass = 1; 7514 goto send; 7515 } 7516 /* 7517 * NOTE! on localhost connections an 'ack' from the remote 7518 * end may occur synchronously with the output and cause us 7519 * to flush a buffer queued with moretocome. XXX 7520 * 7521 */ 7522 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 7523 (idle || (tp->t_flags & TF_NODELAY)) && 7524 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 7525 (tp->t_flags & TF_NOPUSH) == 0) { 7526 pass = 2; 7527 goto send; 7528 } 7529 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ 7530 pass = 3; 7531 goto send; 7532 } 7533 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 7534 goto send; 7535 } 7536 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 7537 pass = 4; 7538 goto send; 7539 } 7540 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 7541 pass = 5; 7542 goto send; 7543 } 7544 if (sack_rxmit) { 7545 pass = 6; 7546 goto send; 7547 } 7548 } 7549 /* 7550 * Sending of standalone window updates. 7551 * 7552 * Window updates are important when we close our window due to a 7553 * full socket buffer and are opening it again after the application 7554 * reads data from it. Once the window has opened again and the 7555 * remote end starts to send again the ACK clock takes over and 7556 * provides the most current window information. 7557 * 7558 * We must avoid the silly window syndrome whereas every read from 7559 * the receive buffer, no matter how small, causes a window update 7560 * to be sent. We also should avoid sending a flurry of window 7561 * updates when the socket buffer had queued a lot of data and the 7562 * application is doing small reads. 
7563 * 7564 * Prevent a flurry of pointless window updates by only sending an 7565 * update when we can increase the advertized window by more than 7566 * 1/4th of the socket buffer capacity. When the buffer is getting 7567 * full or is very small be more aggressive and send an update 7568 * whenever we can increase by two mss sized segments. In all other 7569 * situations the ACK's to new incoming data will carry further 7570 * window increases. 7571 * 7572 * Don't send an independent window update if a delayed ACK is 7573 * pending (it will get piggy-backed on it) or the remote side 7574 * already has done a half-close and won't send more data. Skip 7575 * this if the connection is in T/TCP half-open state. 7576 */ 7577 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 7578 !(tp->t_flags & TF_DELACK) && 7579 !TCPS_HAVERCVDFIN(tp->t_state)) { 7580 /* 7581 * "adv" is the amount we could increase the window, taking 7582 * into account that we are limited by TCP_MAXWIN << 7583 * tp->rcv_scale. 7584 */ 7585 int32_t adv; 7586 int oldwin; 7587 7588 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 7589 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 7590 oldwin = (tp->rcv_adv - tp->rcv_nxt); 7591 adv -= oldwin; 7592 } else 7593 oldwin = 0; 7594 7595 /* 7596 * If the new window size ends up being the same as the old 7597 * size when it is scaled, then don't force a window update. 7598 */ 7599 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 7600 goto dontupdate; 7601 7602 if (adv >= (int32_t)(2 * tp->t_maxseg) && 7603 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 7604 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 7605 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { 7606 pass = 7; 7607 goto send; 7608 } 7609 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 7610 goto send; 7611 } 7612 dontupdate: 7613 7614 /* 7615 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 7616 * is also a catch-all for the retransmit timer timeout case. 7617 */ 7618 if (tp->t_flags & TF_ACKNOW) { 7619 pass = 8; 7620 goto send; 7621 } 7622 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 7623 pass = 9; 7624 goto send; 7625 } 7626 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 7627 pass = 10; 7628 goto send; 7629 } 7630 /* 7631 * If our state indicates that FIN should be sent and we have not 7632 * yet done so, then we need to send. 7633 */ 7634 if ((flags & TH_FIN) && 7635 (tp->snd_nxt == tp->snd_una)) { 7636 pass = 11; 7637 goto send; 7638 } 7639 /* 7640 * No reason to send a segment, just return. 7641 */ 7642 just_return: 7643 SOCKBUF_UNLOCK(sb); 7644 just_return_nolock: 7645 if (tot_len_this_send == 0) 7646 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 7647 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 7648 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 7649 tp->t_flags &= ~TF_FORCEDATA; 7650 return (0); 7651 7652 send: 7653 if (doing_tlp == 0) { 7654 /* 7655 * Data not a TLP, and its not the rxt firing. If it is the 7656 * rxt firing, we want to leave the tlp_in_progress flag on 7657 * so we don't send another TLP. It has to be a rack timer 7658 * or normal send (response to acked data) to clear the tlp 7659 * in progress flag. 
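 * (doing_tlp was only set above when we pulled a probe off
 * rc_tlpsend or when rc_tlp_new_data forced out new data.)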
7660 */ 7661 rack->rc_tlp_in_progress = 0; 7662 } 7663 SOCKBUF_LOCK_ASSERT(sb); 7664 if (len > 0) { 7665 if (len >= tp->t_maxseg) 7666 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 7667 else 7668 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 7669 } 7670 /* 7671 * Before ESTABLISHED, force sending of initial options unless TCP 7672 * set not to do any options. NOTE: we assume that the IP/TCP header 7673 * plus TCP options always fit in a single mbuf, leaving room for a 7674 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 7675 * + optlen <= MCLBYTES 7676 */ 7677 optlen = 0; 7678 #ifdef INET6 7679 if (isipv6) 7680 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 7681 else 7682 #endif 7683 hdrlen = sizeof(struct tcpiphdr); 7684 7685 /* 7686 * Compute options for segment. We only have to care about SYN and 7687 * established connection segments. Options for SYN-ACK segments 7688 * are handled in TCP syncache. 7689 */ 7690 to.to_flags = 0; 7691 if ((tp->t_flags & TF_NOOPT) == 0) { 7692 /* Maximum segment size. */ 7693 if (flags & TH_SYN) { 7694 tp->snd_nxt = tp->iss; 7695 to.to_mss = tcp_mssopt(&inp->inp_inc); 7696 #ifdef NETFLIX_TCPOUDP 7697 if (tp->t_port) 7698 to.to_mss -= V_tcp_udp_tunneling_overhead; 7699 #endif 7700 to.to_flags |= TOF_MSS; 7701 7702 /* 7703 * On SYN or SYN|ACK transmits on TFO connections, 7704 * only include the TFO option if it is not a 7705 * retransmit, as the presence of the TFO option may 7706 * have caused the original SYN or SYN|ACK to have 7707 * been dropped by a middlebox. 7708 */ 7709 if (IS_FASTOPEN(tp->t_flags) && 7710 (tp->t_rxtshift == 0)) { 7711 if (tp->t_state == TCPS_SYN_RECEIVED) { 7712 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 7713 to.to_tfo_cookie = 7714 (u_int8_t *)&tp->t_tfo_cookie.server; 7715 to.to_flags |= TOF_FASTOPEN; 7716 wanted_cookie = 1; 7717 } else if (tp->t_state == TCPS_SYN_SENT) { 7718 to.to_tfo_len = 7719 tp->t_tfo_client_cookie_len; 7720 to.to_tfo_cookie = 7721 tp->t_tfo_cookie.client; 7722 to.to_flags |= TOF_FASTOPEN; 7723 wanted_cookie = 1; 7724 /* 7725 * If we wind up having more data to 7726 * send with the SYN than can fit in 7727 * one segment, don't send any more 7728 * until the SYN|ACK comes back from 7729 * the other end. 7730 */ 7731 sendalot = 0; 7732 } 7733 } 7734 } 7735 /* Window scaling. */ 7736 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 7737 to.to_wscale = tp->request_r_scale; 7738 to.to_flags |= TOF_SCALE; 7739 } 7740 /* Timestamps. */ 7741 if ((tp->t_flags & TF_RCVD_TSTMP) || 7742 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 7743 to.to_tsval = cts + tp->ts_offset; 7744 to.to_tsecr = tp->ts_recent; 7745 to.to_flags |= TOF_TS; 7746 } 7747 /* Set receive buffer autosizing timestamp. */ 7748 if (tp->rfbuf_ts == 0 && 7749 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 7750 tp->rfbuf_ts = tcp_ts_getticks(); 7751 /* Selective ACK's. */ 7752 if (flags & TH_SYN) 7753 to.to_flags |= TOF_SACKPERM; 7754 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 7755 tp->rcv_numsacks > 0) { 7756 to.to_flags |= TOF_SACK; 7757 to.to_nsacks = tp->rcv_numsacks; 7758 to.to_sacks = (u_char *)tp->sackblks; 7759 } 7760 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 7761 /* TCP-MD5 (RFC2385). */ 7762 if (tp->t_flags & TF_SIGNATURE) 7763 to.to_flags |= TOF_SIGNATURE; 7764 #endif /* TCP_SIGNATURE */ 7765 7766 /* Processing the options. */ 7767 hdrlen += optlen = tcp_addoptions(&to, opt); 7768 /* 7769 * If we wanted a TFO option to be added, but it was unable 7770 * to fit, ensure no data is sent. 
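 * (wanted_cookie was set above when we asked for the TFO
 * option; TOF_FASTOPEN no longer being set in to.to_flags
 * tells us tcp_addoptions() could not fit the cookie.)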
7771 */ 7772 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 7773 !(to.to_flags & TOF_FASTOPEN)) 7774 len = 0; 7775 } 7776 #ifdef NETFLIX_TCPOUDP 7777 if (tp->t_port) { 7778 if (V_tcp_udp_tunneling_port == 0) { 7779 /* The port was removed?? */ 7780 SOCKBUF_UNLOCK(&so->so_snd); 7781 return (EHOSTUNREACH); 7782 } 7783 hdrlen += sizeof(struct udphdr); 7784 } 7785 #endif 7786 ipoptlen = 0; 7787 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7788 ipoptlen += ipsec_optlen; 7789 #endif 7790 7791 /* 7792 * Adjust data length if insertion of options will bump the packet 7793 * length beyond the t_maxseg length. Clear the FIN bit because we 7794 * cut off the tail of the segment. 7795 */ 7796 if (len + optlen + ipoptlen > tp->t_maxseg) { 7797 if (flags & TH_FIN) { 7798 would_have_fin = 1; 7799 flags &= ~TH_FIN; 7800 } 7801 if (tso) { 7802 uint32_t if_hw_tsomax; 7803 uint32_t moff; 7804 int32_t max_len; 7805 7806 /* extract TSO information */ 7807 if_hw_tsomax = tp->t_tsomax; 7808 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 7809 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 7810 KASSERT(ipoptlen == 0, 7811 ("%s: TSO can't do IP options", __func__)); 7812 7813 /* 7814 * Check if we should limit by maximum payload 7815 * length: 7816 */ 7817 if (if_hw_tsomax != 0) { 7818 /* compute maximum TSO length */ 7819 max_len = (if_hw_tsomax - hdrlen - 7820 max_linkhdr); 7821 if (max_len <= 0) { 7822 len = 0; 7823 } else if (len > max_len) { 7824 sendalot = 1; 7825 len = max_len; 7826 } 7827 } 7828 /* 7829 * Prevent the last segment from being fractional 7830 * unless the send sockbuf can be emptied: 7831 */ 7832 max_len = (tp->t_maxseg - optlen); 7833 if ((sb_offset + len) < sbavail(sb)) { 7834 moff = len % (u_int)max_len; 7835 if (moff != 0) { 7836 len -= moff; 7837 sendalot = 1; 7838 } 7839 } 7840 /* 7841 * In case there are too many small fragments don't 7842 * use TSO: 7843 */ 7844 if (len <= max_len) { 7845 len = max_len; 7846 sendalot = 1; 7847 tso = 0; 7848 } 7849 /* 7850 * Send the FIN in a separate segment after the bulk 7851 * sending is done. We don't trust the TSO 7852 * implementations to clear the FIN flag on all but 7853 * the last segment. 7854 */ 7855 if (tp->t_flags & TF_NEEDFIN) 7856 sendalot = 1; 7857 7858 } else { 7859 len = tp->t_maxseg - optlen - ipoptlen; 7860 sendalot = 1; 7861 } 7862 } else 7863 tso = 0; 7864 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 7865 ("%s: len > IP_MAXPACKET", __func__)); 7866 #ifdef DIAGNOSTIC 7867 #ifdef INET6 7868 if (max_linkhdr + hdrlen > MCLBYTES) 7869 #else 7870 if (max_linkhdr + hdrlen > MHLEN) 7871 #endif 7872 panic("tcphdr too big"); 7873 #endif 7874 7875 /* 7876 * This KASSERT is here to catch edge cases at a well defined place. 7877 * Before, those had triggered (random) panic conditions further 7878 * down. 7879 */ 7880 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7881 if ((len == 0) && 7882 (flags & TH_FIN) && 7883 (sbused(sb))) { 7884 /* 7885 * We have outstanding data, don't send a fin by itself!. 7886 */ 7887 goto just_return; 7888 } 7889 /* 7890 * Grab a header mbuf, attaching a copy of data to be transmitted, 7891 * and initialize the header from the template for sends on this 7892 * connection. 7893 */ 7894 if (len) { 7895 uint32_t max_val; 7896 uint32_t moff; 7897 7898 if (rack->rc_pace_max_segs) 7899 max_val = rack->rc_pace_max_segs * tp->t_maxseg; 7900 else 7901 max_val = len; 7902 /* 7903 * We allow a limit on sending with hptsi. 
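 * When rc_pace_max_segs is configured, max_val caps this send
 * at that many full sized segments; otherwise max_val is just
 * len and no clamp is applied.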
7904 */ 7905 if (len > max_val) { 7906 len = max_val; 7907 } 7908 #ifdef INET6 7909 if (MHLEN < hdrlen + max_linkhdr) 7910 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 7911 else 7912 #endif 7913 m = m_gethdr(M_NOWAIT, MT_DATA); 7914 7915 if (m == NULL) { 7916 SOCKBUF_UNLOCK(sb); 7917 error = ENOBUFS; 7918 sack_rxmit = 0; 7919 goto out; 7920 } 7921 m->m_data += max_linkhdr; 7922 m->m_len = hdrlen; 7923 7924 /* 7925 * Start the m_copy functions from the closest mbuf to the 7926 * sb_offset in the socket buffer chain. 7927 */ 7928 mb = sbsndptr_noadv(sb, sb_offset, &moff); 7929 if (len <= MHLEN - hdrlen - max_linkhdr) { 7930 m_copydata(mb, moff, (int)len, 7931 mtod(m, caddr_t)+hdrlen); 7932 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7933 sbsndptr_adv(sb, mb, len); 7934 m->m_len += len; 7935 } else { 7936 struct sockbuf *msb; 7937 7938 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7939 msb = NULL; 7940 else 7941 msb = sb; 7942 m->m_next = tcp_m_copym(mb, moff, &len, 7943 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); 7944 if (len <= (tp->t_maxseg - optlen)) { 7945 /* 7946 * Must have ran out of mbufs for the copy 7947 * shorten it to no longer need tso. Lets 7948 * not put on sendalot since we are low on 7949 * mbufs. 7950 */ 7951 tso = 0; 7952 } 7953 if (m->m_next == NULL) { 7954 SOCKBUF_UNLOCK(sb); 7955 (void)m_free(m); 7956 error = ENOBUFS; 7957 sack_rxmit = 0; 7958 goto out; 7959 } 7960 } 7961 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 7962 TCPSTAT_INC(tcps_sndprobe); 7963 #ifdef NETFLIX_STATS 7964 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7965 stats_voi_update_abs_u32(tp->t_stats, 7966 VOI_TCP_RETXPB, len); 7967 else 7968 stats_voi_update_abs_u64(tp->t_stats, 7969 VOI_TCP_TXPB, len); 7970 #endif 7971 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 7972 if (rsm && (rsm->r_flags & RACK_TLP)) { 7973 /* 7974 * TLP should not count in retran count, but 7975 * in its own bin 7976 */ 7977 counter_u64_add(rack_tlp_retran, 1); 7978 counter_u64_add(rack_tlp_retran_bytes, len); 7979 } else { 7980 tp->t_sndrexmitpack++; 7981 TCPSTAT_INC(tcps_sndrexmitpack); 7982 TCPSTAT_ADD(tcps_sndrexmitbyte, len); 7983 } 7984 #ifdef NETFLIX_STATS 7985 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 7986 len); 7987 #endif 7988 } else { 7989 TCPSTAT_INC(tcps_sndpack); 7990 TCPSTAT_ADD(tcps_sndbyte, len); 7991 #ifdef NETFLIX_STATS 7992 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 7993 len); 7994 #endif 7995 } 7996 /* 7997 * If we're sending everything we've got, set PUSH. (This 7998 * will keep happy those implementations which only give 7999 * data to the user when a buffer fills or a PUSH comes in.) 8000 */ 8001 if (sb_offset + len == sbused(sb) && 8002 sbused(sb) && 8003 !(flags & TH_SYN)) 8004 flags |= TH_PUSH; 8005 8006 /* 8007 * Are we doing hptsi, if so we must calculate the slot. We 8008 * only do hptsi in ESTABLISHED and with no RESET being 8009 * sent where we have data to send. 8010 */ 8011 if (((tp->t_state == TCPS_ESTABLISHED) || 8012 (tp->t_state == TCPS_CLOSE_WAIT) || 8013 ((tp->t_state == TCPS_FIN_WAIT_1) && 8014 ((tp->t_flags & TF_SENTFIN) == 0) && 8015 ((flags & TH_FIN) == 0))) && 8016 ((flags & TH_RST) == 0) && 8017 (rack->rc_always_pace)) { 8018 /* 8019 * We use the most optimistic possible cwnd/srtt for 8020 * sending calculations. This will make our 8021 * calculation anticipate getting more through 8022 * quicker then possible. But thats ok we don't want 8023 * the peer to have a gap in data sending. 
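 * The slot computed below is roughly
 *   tot_len_this_send / (cwnd / srtt)
 * i.e. how long this burst should take to drain at a rate of
 * one cwnd per SRTT, optionally trimmed by rc_pace_reduce and
 * floored by the minimum pace settings. As a purely
 * illustrative example: a cwnd of 100000 bytes and an srtt of
 * 100 ms give tr_perms of 1000 bytes/ms, so a 10000 byte send
 * would be paced over roughly 10 ms.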
8024 */ 8025 uint32_t srtt, cwnd, tr_perms = 0; 8026 8027 if (rack->r_ctl.rc_rack_min_rtt) 8028 srtt = rack->r_ctl.rc_rack_min_rtt; 8029 else 8030 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8031 if (rack->r_ctl.rc_rack_largest_cwnd) 8032 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8033 else 8034 cwnd = tp->snd_cwnd; 8035 tr_perms = cwnd / srtt; 8036 if (tr_perms == 0) { 8037 tr_perms = tp->t_maxseg; 8038 } 8039 tot_len_this_send += len; 8040 /* 8041 * Calculate how long this will take to drain, if 8042 * the calculation comes out to zero, thats ok we 8043 * will use send_a_lot to possibly spin around for 8044 * more increasing tot_len_this_send to the point 8045 * that its going to require a pace, or we hit the 8046 * cwnd. Which in that case we are just waiting for 8047 * a ACK. 8048 */ 8049 slot = tot_len_this_send / tr_perms; 8050 /* Now do we reduce the time so we don't run dry? */ 8051 if (slot && rack->rc_pace_reduce) { 8052 int32_t reduce; 8053 8054 reduce = (slot / rack->rc_pace_reduce); 8055 if (reduce < slot) { 8056 slot -= reduce; 8057 } else 8058 slot = 0; 8059 } 8060 if (rack->r_enforce_min_pace && 8061 (slot == 0) && 8062 (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { 8063 /* We are enforcing a minimum pace time of 1ms */ 8064 slot = rack->r_enforce_min_pace; 8065 } 8066 } 8067 SOCKBUF_UNLOCK(sb); 8068 } else { 8069 SOCKBUF_UNLOCK(sb); 8070 if (tp->t_flags & TF_ACKNOW) 8071 TCPSTAT_INC(tcps_sndacks); 8072 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 8073 TCPSTAT_INC(tcps_sndctrl); 8074 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 8075 TCPSTAT_INC(tcps_sndurg); 8076 else 8077 TCPSTAT_INC(tcps_sndwinup); 8078 8079 m = m_gethdr(M_NOWAIT, MT_DATA); 8080 if (m == NULL) { 8081 error = ENOBUFS; 8082 sack_rxmit = 0; 8083 goto out; 8084 } 8085 #ifdef INET6 8086 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 8087 MHLEN >= hdrlen) { 8088 M_ALIGN(m, hdrlen); 8089 } else 8090 #endif 8091 m->m_data += max_linkhdr; 8092 m->m_len = hdrlen; 8093 } 8094 SOCKBUF_UNLOCK_ASSERT(sb); 8095 m->m_pkthdr.rcvif = (struct ifnet *)0; 8096 #ifdef MAC 8097 mac_inpcb_create_mbuf(inp, m); 8098 #endif 8099 #ifdef INET6 8100 if (isipv6) { 8101 ip6 = mtod(m, struct ip6_hdr *); 8102 #ifdef NETFLIX_TCPOUDP 8103 if (tp->t_port) { 8104 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 8105 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8106 udp->uh_dport = tp->t_port; 8107 ulen = hdrlen + len - sizeof(struct ip6_hdr); 8108 udp->uh_ulen = htons(ulen); 8109 th = (struct tcphdr *)(udp + 1); 8110 } else 8111 #endif 8112 th = (struct tcphdr *)(ip6 + 1); 8113 tcpip_fillheaders(inp, ip6, th); 8114 } else 8115 #endif /* INET6 */ 8116 { 8117 ip = mtod(m, struct ip *); 8118 #ifdef TCPDEBUG 8119 ipov = (struct ipovly *)ip; 8120 #endif 8121 #ifdef NETFLIX_TCPOUDP 8122 if (tp->t_port) { 8123 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 8124 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8125 udp->uh_dport = tp->t_port; 8126 ulen = hdrlen + len - sizeof(struct ip); 8127 udp->uh_ulen = htons(ulen); 8128 th = (struct tcphdr *)(udp + 1); 8129 } else 8130 #endif 8131 th = (struct tcphdr *)(ip + 1); 8132 tcpip_fillheaders(inp, ip, th); 8133 } 8134 /* 8135 * Fill in fields, remembering maximum advertised window for use in 8136 * delaying messages about window sizes. If resending a FIN, be sure 8137 * not to use a new sequence number. 
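 * (That is what the snd_nxt-- just below does when the FIN has
 * already been sent and snd_nxt == snd_max.)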
8138 */ 8139 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 8140 tp->snd_nxt == tp->snd_max) 8141 tp->snd_nxt--; 8142 /* 8143 * If we are starting a connection, send ECN setup SYN packet. If we 8144 * are on a retransmit, we may resend those bits a number of times 8145 * as per RFC 3168. 8146 */ 8147 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 8148 if (tp->t_rxtshift >= 1) { 8149 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 8150 flags |= TH_ECE | TH_CWR; 8151 } else 8152 flags |= TH_ECE | TH_CWR; 8153 } 8154 if (tp->t_state == TCPS_ESTABLISHED && 8155 (tp->t_flags & TF_ECN_PERMIT)) { 8156 /* 8157 * If the peer has ECN, mark data packets with ECN capable 8158 * transmission (ECT). Ignore pure ack packets, 8159 * retransmissions and window probes. 8160 */ 8161 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 8162 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 8163 #ifdef INET6 8164 if (isipv6) 8165 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 8166 else 8167 #endif 8168 ip->ip_tos |= IPTOS_ECN_ECT0; 8169 TCPSTAT_INC(tcps_ecn_ect0); 8170 } 8171 /* 8172 * Reply with proper ECN notifications. 8173 */ 8174 if (tp->t_flags & TF_ECN_SND_CWR) { 8175 flags |= TH_CWR; 8176 tp->t_flags &= ~TF_ECN_SND_CWR; 8177 } 8178 if (tp->t_flags & TF_ECN_SND_ECE) 8179 flags |= TH_ECE; 8180 } 8181 /* 8182 * If we are doing retransmissions, then snd_nxt will not reflect 8183 * the first unsent octet. For ACK only packets, we do not want the 8184 * sequence number of the retransmitted packet, we want the sequence 8185 * number of the next unsent octet. So, if there is no data (and no 8186 * SYN or FIN), use snd_max instead of snd_nxt when filling in 8187 * ti_seq. But if we are in persist state, snd_max might reflect 8188 * one byte beyond the right edge of the window, so use snd_nxt in 8189 * that case, since we know we aren't doing a retransmission. 8190 * (retransmit and persist are mutually exclusive...) 8191 */ 8192 if (sack_rxmit == 0) { 8193 if (len || (flags & (TH_SYN | TH_FIN)) || 8194 rack->rc_in_persist) { 8195 th->th_seq = htonl(tp->snd_nxt); 8196 rack_seq = tp->snd_nxt; 8197 } else if (flags & TH_RST) { 8198 /* 8199 * For a Reset send the last cum ack in sequence 8200 * (this like any other choice may still generate a 8201 * challenge ack, if a ack-update packet is in 8202 * flight). 8203 */ 8204 th->th_seq = htonl(tp->snd_una); 8205 rack_seq = tp->snd_una; 8206 } else { 8207 th->th_seq = htonl(tp->snd_max); 8208 rack_seq = tp->snd_max; 8209 } 8210 } else { 8211 th->th_seq = htonl(rsm->r_start); 8212 rack_seq = rsm->r_start; 8213 } 8214 th->th_ack = htonl(tp->rcv_nxt); 8215 if (optlen) { 8216 bcopy(opt, th + 1, optlen); 8217 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 8218 } 8219 th->th_flags = flags; 8220 /* 8221 * Calculate receive window. Don't shrink window, but avoid silly 8222 * window syndrome. 8223 * If a RST segment is sent, advertise a window of zero. 8224 */ 8225 if (flags & TH_RST) { 8226 recwin = 0; 8227 } else { 8228 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 8229 recwin < (long)tp->t_maxseg) 8230 recwin = 0; 8231 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 8232 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 8233 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 8234 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 8235 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 8236 } 8237 8238 /* 8239 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 8240 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 8241 * handled in syncache. 
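 * So a SYN advertises at most TCP_MAXWIN unscaled, while every
 * other segment carries recwin shifted right by rcv_scale.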
8242 */ 8243 if (flags & TH_SYN) 8244 th->th_win = htons((u_short) 8245 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 8246 else 8247 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 8248 /* 8249 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 8250 * window. This may cause the remote transmitter to stall. This 8251 * flag tells soreceive() to disable delayed acknowledgements when 8252 * draining the buffer. This can occur if the receiver is 8253 * attempting to read more data than can be buffered prior to 8254 * transmitting on the connection. 8255 */ 8256 if (th->th_win == 0) { 8257 tp->t_sndzerowin++; 8258 tp->t_flags |= TF_RXWIN0SENT; 8259 } else 8260 tp->t_flags &= ~TF_RXWIN0SENT; 8261 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 8262 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 8263 th->th_flags |= TH_URG; 8264 } else 8265 /* 8266 * If no urgent pointer to send, then we pull the urgent 8267 * pointer to the left edge of the send window so that it 8268 * doesn't drift into the send window on sequence number 8269 * wraparound. 8270 */ 8271 tp->snd_up = tp->snd_una; /* drag it along */ 8272 8273 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 8274 if (to.to_flags & TOF_SIGNATURE) { 8275 /* 8276 * Calculate MD5 signature and put it into the place 8277 * determined before. 8278 * NOTE: since TCP options buffer doesn't point into 8279 * mbuf's data, calculate offset and use it. 8280 */ 8281 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 8282 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 8283 /* 8284 * Do not send segment if the calculation of MD5 8285 * digest has failed. 8286 */ 8287 goto out; 8288 } 8289 } 8290 #endif 8291 8292 /* 8293 * Put TCP length in extended header, and then checksum extended 8294 * header and data. 8295 */ 8296 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 8297 #ifdef INET6 8298 if (isipv6) { 8299 /* 8300 * ip6_plen is not need to be filled now, and will be filled 8301 * in ip6_output. 8302 */ 8303 if (tp->t_port) { 8304 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 8305 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8306 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 8307 th->th_sum = htons(0); 8308 } else { 8309 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 8310 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8311 th->th_sum = in6_cksum_pseudo(ip6, 8312 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 8313 0); 8314 } 8315 } 8316 #endif 8317 #if defined(INET6) && defined(INET) 8318 else 8319 #endif 8320 #ifdef INET 8321 { 8322 if (tp->t_port) { 8323 m->m_pkthdr.csum_flags = CSUM_UDP; 8324 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8325 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 8326 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 8327 th->th_sum = htons(0); 8328 } else { 8329 m->m_pkthdr.csum_flags = CSUM_TCP; 8330 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8331 th->th_sum = in_pseudo(ip->ip_src.s_addr, 8332 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 8333 IPPROTO_TCP + len + optlen)); 8334 } 8335 /* IP version must be set here for ipv4/ipv6 checking later */ 8336 KASSERT(ip->ip_v == IPVERSION, 8337 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 8338 } 8339 #endif 8340 8341 /* 8342 * Enable TSO and specify the size of the segments. The TCP pseudo 8343 * header checksum is always provided. XXX: Fixme: This is currently 8344 * not the case for IPv6. 
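 * tso_segsz below is set to the MSS less the TCP option
 * length, so the hardware never emits a segment larger than
 * what we would have sent ourselves.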
8345 */ 8346 if (tso) { 8347 KASSERT(len > tp->t_maxseg - optlen, 8348 ("%s: len <= tso_segsz", __func__)); 8349 m->m_pkthdr.csum_flags |= CSUM_TSO; 8350 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 8351 } 8352 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8353 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 8354 ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", 8355 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 8356 #else 8357 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 8358 ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", 8359 __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 8360 #endif 8361 8362 #ifdef TCP_HHOOK 8363 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 8364 hhook_run_tcp_est_out(tp, th, &to, len, tso); 8365 #endif 8366 8367 #ifdef TCPDEBUG 8368 /* 8369 * Trace. 8370 */ 8371 if (so->so_options & SO_DEBUG) { 8372 u_short save = 0; 8373 8374 #ifdef INET6 8375 if (!isipv6) 8376 #endif 8377 { 8378 save = ipov->ih_len; 8379 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 8380 * (th->th_off << 2) */ ); 8381 } 8382 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 8383 #ifdef INET6 8384 if (!isipv6) 8385 #endif 8386 ipov->ih_len = save; 8387 } 8388 #endif /* TCPDEBUG */ 8389 8390 /* We're getting ready to send; log now. */ 8391 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 8392 union tcp_log_stackspecific log; 8393 8394 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 8395 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 8396 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 8397 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 8398 if (rsm || sack_rxmit) { 8399 log.u_bbr.flex8 = 1; 8400 } else { 8401 log.u_bbr.flex8 = 0; 8402 } 8403 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 8404 len, &log, false, NULL, NULL, 0, NULL); 8405 } else 8406 lgb = NULL; 8407 8408 /* 8409 * Fill in IP length and desired time to live and send to IP level. 8410 * There should be a better way to handle ttl and tos; we could keep 8411 * them in the template, but need a way to checksum without them. 8412 */ 8413 /* 8414 * m->m_pkthdr.len should have been set before cksum calcuration, 8415 * because in6_cksum() need it. 8416 */ 8417 #ifdef INET6 8418 if (isipv6) { 8419 /* 8420 * we separately set hoplimit for every segment, since the 8421 * user might want to change the value via setsockopt. Also, 8422 * desired default hop limit might be changed via Neighbor 8423 * Discovery. 8424 */ 8425 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 8426 8427 /* 8428 * Set the packet size here for the benefit of DTrace 8429 * probes. ip6_output() will set it properly; it's supposed 8430 * to include the option header lengths as well. 8431 */ 8432 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 8433 8434 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 8435 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8436 else 8437 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8438 8439 if (tp->t_state == TCPS_SYN_SENT) 8440 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 8441 8442 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 8443 /* TODO: IPv6 IP6TOS_ECT bit on */ 8444 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 8445 &inp->inp_route6, 8446 ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 8447 NULL, NULL, inp); 8448 8449 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) 8450 mtu = inp->inp_route6.ro_rt->rt_mtu; 8451 } 8452 #endif /* INET6 */ 8453 #if defined(INET) && defined(INET6) 8454 else 8455 #endif 8456 #ifdef INET 8457 { 8458 ip->ip_len = htons(m->m_pkthdr.len); 8459 #ifdef INET6 8460 if (inp->inp_vflag & INP_IPV6PROTO) 8461 ip->ip_ttl = in6_selecthlim(inp, NULL); 8462 #endif /* INET6 */ 8463 /* 8464 * If we do path MTU discovery, then we set DF on every 8465 * packet. This might not be the best thing to do according 8466 * to RFC3390 Section 2. However the tcp hostcache migitates 8467 * the problem so it affects only the first tcp connection 8468 * with a host. 8469 * 8470 * NB: Don't set DF on small MTU/MSS to have a safe 8471 * fallback. 8472 */ 8473 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 8474 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8475 if (tp->t_port == 0 || len < V_tcp_minmss) { 8476 ip->ip_off |= htons(IP_DF); 8477 } 8478 } else { 8479 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8480 } 8481 8482 if (tp->t_state == TCPS_SYN_SENT) 8483 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 8484 8485 TCP_PROBE5(send, NULL, tp, ip, tp, th); 8486 8487 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 8488 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 8489 inp); 8490 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) 8491 mtu = inp->inp_route.ro_rt->rt_mtu; 8492 } 8493 #endif /* INET */ 8494 8495 out: 8496 if (lgb) { 8497 lgb->tlb_errno = error; 8498 lgb = NULL; 8499 } 8500 /* 8501 * In transmit state, time the transmission and arrange for the 8502 * retransmit. In persist state, just set snd_max. 8503 */ 8504 if (error == 0) { 8505 if (len == 0) 8506 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 8507 else if (len == 1) { 8508 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8509 } else if (len > 1) { 8510 int idx; 8511 8512 idx = (len / tp->t_maxseg) + 3; 8513 if (idx >= TCP_MSS_ACCT_ATIMER) 8514 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 8515 else 8516 counter_u64_add(rack_out_size[idx], 1); 8517 } 8518 } 8519 if (sub_from_prr && (error == 0)) { 8520 rack->r_ctl.rc_prr_sndcnt -= len; 8521 } 8522 sub_from_prr = 0; 8523 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 8524 pass, rsm); 8525 if ((tp->t_flags & TF_FORCEDATA) == 0 || 8526 (rack->rc_in_persist == 0)) { 8527 tcp_seq startseq = tp->snd_nxt; 8528 8529 /* 8530 * Advance snd_nxt over sequence space of this segment. 8531 */ 8532 if (error) 8533 /* We don't log or do anything with errors */ 8534 goto timer; 8535 8536 if (flags & (TH_SYN | TH_FIN)) { 8537 if (flags & TH_SYN) 8538 tp->snd_nxt++; 8539 if (flags & TH_FIN) { 8540 tp->snd_nxt++; 8541 tp->t_flags |= TF_SENTFIN; 8542 } 8543 } 8544 /* In the ENOBUFS case we do *not* update snd_max */ 8545 if (sack_rxmit) 8546 goto timer; 8547 8548 tp->snd_nxt += len; 8549 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 8550 if (tp->snd_una == tp->snd_max) { 8551 /* 8552 * Update the time we just added data since 8553 * none was outstanding. 8554 */ 8555 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8556 tp->t_acktime = ticks; 8557 } 8558 tp->snd_max = tp->snd_nxt; 8559 /* 8560 * Time this transmission if not a retransmission and 8561 * not currently timing anything. 8562 * This is only relevant in case of switching back to 8563 * the base stack. 
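 * (RACK takes its own RTT samples from the per-rsm send
 * timestamps; t_rtttime/t_rtseq are kept current here only for
 * the benefit of the default stack.)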
8564 */ 8565 if (tp->t_rtttime == 0) { 8566 tp->t_rtttime = ticks; 8567 tp->t_rtseq = startseq; 8568 TCPSTAT_INC(tcps_segstimed); 8569 } 8570 #ifdef NETFLIX_STATS 8571 if (!(tp->t_flags & TF_GPUTINPROG) && len) { 8572 tp->t_flags |= TF_GPUTINPROG; 8573 tp->gput_seq = startseq; 8574 tp->gput_ack = startseq + 8575 ulmin(sbavail(sb) - sb_offset, sendwin); 8576 tp->gput_ts = tcp_ts_getticks(); 8577 } 8578 #endif 8579 } 8580 /* 8581 * Set retransmit timer if not currently set, and not doing 8582 * a pure ack or a keep-alive probe. Initial value for 8583 * retransmit timer is smoothed round-trip time + 2 * 8584 * round-trip time variance. Initialize shift counter which 8585 * is used for backoff of retransmit time. 8586 */ 8587 timer: 8588 if ((tp->snd_wnd == 0) && 8589 TCPS_HAVEESTABLISHED(tp->t_state)) { 8590 /* 8591 * If the persists timer was set above (right before 8592 * the goto send), and still needs to be on. Lets 8593 * make sure all is canceled. If the persist timer 8594 * is not running, we want to get it up. 8595 */ 8596 if (rack->rc_in_persist == 0) { 8597 rack_enter_persist(tp, rack, cts); 8598 } 8599 } 8600 } else { 8601 /* 8602 * Persist case, update snd_max but since we are in persist 8603 * mode (no window) we do not update snd_nxt. 8604 */ 8605 int32_t xlen = len; 8606 8607 if (error) 8608 goto nomore; 8609 8610 if (flags & TH_SYN) 8611 ++xlen; 8612 if (flags & TH_FIN) { 8613 ++xlen; 8614 tp->t_flags |= TF_SENTFIN; 8615 } 8616 /* In the ENOBUFS case we do *not* update snd_max */ 8617 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 8618 if (tp->snd_una == tp->snd_max) { 8619 /* 8620 * Update the time we just added data since 8621 * none was outstanding. 8622 */ 8623 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8624 tp->t_acktime = ticks; 8625 } 8626 tp->snd_max = tp->snd_nxt + len; 8627 } 8628 } 8629 nomore: 8630 if (error) { 8631 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 8632 /* 8633 * Failures do not advance the seq counter above. For the 8634 * case of ENOBUFS we will fall out and retry in 1ms with 8635 * the hpts. Everything else will just have to retransmit 8636 * with the timer. 8637 * 8638 * In any case, we do not want to loop around for another 8639 * send without a good reason. 8640 */ 8641 sendalot = 0; 8642 switch (error) { 8643 case EPERM: 8644 tp->t_flags &= ~TF_FORCEDATA; 8645 tp->t_softerror = error; 8646 return (error); 8647 case ENOBUFS: 8648 if (slot == 0) { 8649 /* 8650 * Pace us right away to retry in a some 8651 * time 8652 */ 8653 slot = 1 + rack->rc_enobuf; 8654 if (rack->rc_enobuf < 255) 8655 rack->rc_enobuf++; 8656 if (slot > (rack->rc_rack_rtt / 2)) { 8657 slot = rack->rc_rack_rtt / 2; 8658 } 8659 if (slot < 10) 8660 slot = 10; 8661 } 8662 counter_u64_add(rack_saw_enobuf, 1); 8663 error = 0; 8664 goto enobufs; 8665 case EMSGSIZE: 8666 /* 8667 * For some reason the interface we used initially 8668 * to send segments changed to another or lowered 8669 * its MTU. If TSO was active we either got an 8670 * interface without TSO capabilits or TSO was 8671 * turned off. If we obtained mtu from ip_output() 8672 * then update it and try again. 
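 * If no new MTU was learned we instead arm a short hpts pace
 * (slot = 10) below and return the error.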
*/ 8674 if (tso) 8675 tp->t_flags &= ~TF_TSO; 8676 if (mtu != 0) { 8677 tcp_mss_update(tp, -1, mtu, NULL, NULL); 8678 goto again; 8679 } 8680 slot = 10; 8681 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8682 tp->t_flags &= ~TF_FORCEDATA; 8683 return (error); 8684 case ENETUNREACH: 8685 counter_u64_add(rack_saw_enetunreach, 1); 8686 case EHOSTDOWN: 8687 case EHOSTUNREACH: 8688 case ENETDOWN: 8689 if (TCPS_HAVERCVDSYN(tp->t_state)) { 8690 tp->t_softerror = error; 8691 } 8692 /* FALLTHROUGH */ 8693 default: 8694 slot = 10; 8695 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8696 tp->t_flags &= ~TF_FORCEDATA; 8697 return (error); 8698 } 8699 } else { 8700 rack->rc_enobuf = 0; 8701 } 8702 TCPSTAT_INC(tcps_sndtotal); 8703 8704 /* 8705 * Data sent (as far as we can tell). If this advertises a larger 8706 * window than any other segment, then remember the size of the 8707 * advertised window. Any pending ACK has now been sent. 8708 */ 8709 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 8710 tp->rcv_adv = tp->rcv_nxt + recwin; 8711 tp->last_ack_sent = tp->rcv_nxt; 8712 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 8713 enobufs: 8714 rack->r_tlp_running = 0; 8715 if ((flags & TH_RST) || (would_have_fin == 1)) { 8716 /* 8717 * We don't send again after a RST. We also do *not* send 8718 * again if we would have had a FIN, but now have 8719 * outstanding data. 8720 */ 8721 slot = 0; 8722 sendalot = 0; 8723 } 8724 if (slot) { 8725 /* set the rack tcb into the slot N */ 8726 counter_u64_add(rack_paced_segments, 1); 8727 } else if (sendalot) { 8728 if (len) 8729 counter_u64_add(rack_unpaced_segments, 1); 8730 sack_rxmit = 0; 8731 tp->t_flags &= ~TF_FORCEDATA; 8732 goto again; 8733 } else if (len) { 8734 counter_u64_add(rack_unpaced_segments, 1); 8735 } 8736 tp->t_flags &= ~TF_FORCEDATA; 8737 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 8738 return (error); 8739 } 8740 8741 /* 8742 * rack_ctloutput() must drop the inpcb lock before performing copyin on 8743 * socket option arguments. When it re-acquires the lock after the copy, it 8744 * has to revalidate that the connection is still valid for the socket 8745 * option.
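 * rack_set_sockopt() below follows that pattern: INP_WUNLOCK(),
 * sooptcopyin(), INP_WLOCK(), then a check for
 * INP_TIMEWAIT/INP_DROPPED before the value is applied.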
8746 */ 8747 static int 8748 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 8749 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8750 { 8751 int32_t error = 0, optval; 8752 8753 switch (sopt->sopt_name) { 8754 case TCP_RACK_PROP_RATE: 8755 case TCP_RACK_PROP: 8756 case TCP_RACK_TLP_REDUCE: 8757 case TCP_RACK_EARLY_RECOV: 8758 case TCP_RACK_PACE_ALWAYS: 8759 case TCP_DELACK: 8760 case TCP_RACK_PACE_REDUCE: 8761 case TCP_RACK_PACE_MAX_SEG: 8762 case TCP_RACK_PRR_SENDALOT: 8763 case TCP_RACK_MIN_TO: 8764 case TCP_RACK_EARLY_SEG: 8765 case TCP_RACK_REORD_THRESH: 8766 case TCP_RACK_REORD_FADE: 8767 case TCP_RACK_TLP_THRESH: 8768 case TCP_RACK_PKT_DELAY: 8769 case TCP_RACK_TLP_USE: 8770 case TCP_RACK_TLP_INC_VAR: 8771 case TCP_RACK_IDLE_REDUCE_HIGH: 8772 case TCP_RACK_MIN_PACE: 8773 case TCP_RACK_MIN_PACE_SEG: 8774 case TCP_BBR_RACK_RTT_USE: 8775 case TCP_DATA_AFTER_CLOSE: 8776 break; 8777 default: 8778 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8779 break; 8780 } 8781 INP_WUNLOCK(inp); 8782 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 8783 if (error) 8784 return (error); 8785 INP_WLOCK(inp); 8786 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 8787 INP_WUNLOCK(inp); 8788 return (ECONNRESET); 8789 } 8790 tp = intotcpcb(inp); 8791 rack = (struct tcp_rack *)tp->t_fb_ptr; 8792 switch (sopt->sopt_name) { 8793 case TCP_RACK_PROP_RATE: 8794 if ((optval <= 0) || (optval >= 100)) { 8795 error = EINVAL; 8796 break; 8797 } 8798 RACK_OPTS_INC(tcp_rack_prop_rate); 8799 rack->r_ctl.rc_prop_rate = optval; 8800 break; 8801 case TCP_RACK_TLP_USE: 8802 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 8803 error = EINVAL; 8804 break; 8805 } 8806 RACK_OPTS_INC(tcp_tlp_use); 8807 rack->rack_tlp_threshold_use = optval; 8808 break; 8809 case TCP_RACK_PROP: 8810 /* RACK proportional rate reduction (bool) */ 8811 RACK_OPTS_INC(tcp_rack_prop); 8812 rack->r_ctl.rc_prop_reduce = optval; 8813 break; 8814 case TCP_RACK_TLP_REDUCE: 8815 /* RACK TLP cwnd reduction (bool) */ 8816 RACK_OPTS_INC(tcp_rack_tlp_reduce); 8817 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 8818 break; 8819 case TCP_RACK_EARLY_RECOV: 8820 /* Should recovery happen early (bool) */ 8821 RACK_OPTS_INC(tcp_rack_early_recov); 8822 rack->r_ctl.rc_early_recovery = optval; 8823 break; 8824 case TCP_RACK_PACE_ALWAYS: 8825 /* Use the always pace method (bool) */ 8826 RACK_OPTS_INC(tcp_rack_pace_always); 8827 if (optval > 0) 8828 rack->rc_always_pace = 1; 8829 else 8830 rack->rc_always_pace = 0; 8831 break; 8832 case TCP_RACK_PACE_REDUCE: 8833 /* RACK Hptsi reduction factor (divisor) */ 8834 RACK_OPTS_INC(tcp_rack_pace_reduce); 8835 if (optval) 8836 /* Must be non-zero */ 8837 rack->rc_pace_reduce = optval; 8838 else 8839 error = EINVAL; 8840 break; 8841 case TCP_RACK_PACE_MAX_SEG: 8842 /* Max segments in a pace */ 8843 RACK_OPTS_INC(tcp_rack_max_seg); 8844 rack->rc_pace_max_segs = optval; 8845 break; 8846 case TCP_RACK_PRR_SENDALOT: 8847 /* Allow PRR to send more than one seg */ 8848 RACK_OPTS_INC(tcp_rack_prr_sendalot); 8849 rack->r_ctl.rc_prr_sendalot = optval; 8850 break; 8851 case TCP_RACK_MIN_TO: 8852 /* Minimum time between rack t-o's in ms */ 8853 RACK_OPTS_INC(tcp_rack_min_to); 8854 rack->r_ctl.rc_min_to = optval; 8855 break; 8856 case TCP_RACK_EARLY_SEG: 8857 /* If early recovery max segments */ 8858 RACK_OPTS_INC(tcp_rack_early_seg); 8859 rack->r_ctl.rc_early_recovery_segs = optval; 8860 break; 8861 case TCP_RACK_REORD_THRESH: 8862 /* RACK reorder threshold (shift amount) */ 8863 
RACK_OPTS_INC(tcp_rack_reord_thresh); 8864 if ((optval > 0) && (optval < 31)) 8865 rack->r_ctl.rc_reorder_shift = optval; 8866 else 8867 error = EINVAL; 8868 break; 8869 case TCP_RACK_REORD_FADE: 8870 /* Does reordering fade after ms time */ 8871 RACK_OPTS_INC(tcp_rack_reord_fade); 8872 rack->r_ctl.rc_reorder_fade = optval; 8873 break; 8874 case TCP_RACK_TLP_THRESH: 8875 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 8876 RACK_OPTS_INC(tcp_rack_tlp_thresh); 8877 if (optval) 8878 rack->r_ctl.rc_tlp_threshold = optval; 8879 else 8880 error = EINVAL; 8881 break; 8882 case TCP_RACK_PKT_DELAY: 8883 /* RACK added ms i.e. rack-rtt + reord + N */ 8884 RACK_OPTS_INC(tcp_rack_pkt_delay); 8885 rack->r_ctl.rc_pkt_delay = optval; 8886 break; 8887 case TCP_RACK_TLP_INC_VAR: 8888 /* Does TLP include rtt variance in t-o */ 8889 RACK_OPTS_INC(tcp_rack_tlp_inc_var); 8890 rack->r_ctl.rc_prr_inc_var = optval; 8891 break; 8892 case TCP_RACK_IDLE_REDUCE_HIGH: 8893 RACK_OPTS_INC(tcp_rack_idle_reduce_high); 8894 if (optval) 8895 rack->r_idle_reduce_largest = 1; 8896 else 8897 rack->r_idle_reduce_largest = 0; 8898 break; 8899 case TCP_DELACK: 8900 if (optval == 0) 8901 tp->t_delayed_ack = 0; 8902 else 8903 tp->t_delayed_ack = 1; 8904 if (tp->t_flags & TF_DELACK) { 8905 tp->t_flags &= ~TF_DELACK; 8906 tp->t_flags |= TF_ACKNOW; 8907 rack_output(tp); 8908 } 8909 break; 8910 case TCP_RACK_MIN_PACE: 8911 RACK_OPTS_INC(tcp_rack_min_pace); 8912 if (optval > 3) 8913 rack->r_enforce_min_pace = 3; 8914 else 8915 rack->r_enforce_min_pace = optval; 8916 break; 8917 case TCP_RACK_MIN_PACE_SEG: 8918 RACK_OPTS_INC(tcp_rack_min_pace_seg); 8919 if (optval >= 16) 8920 rack->r_min_pace_seg_thresh = 15; 8921 else 8922 rack->r_min_pace_seg_thresh = optval; 8923 break; 8924 case TCP_BBR_RACK_RTT_USE: 8925 if ((optval != USE_RTT_HIGH) && 8926 (optval != USE_RTT_LOW) && 8927 (optval != USE_RTT_AVG)) 8928 error = EINVAL; 8929 else 8930 rack->r_ctl.rc_rate_sample_method = optval; 8931 break; 8932 case TCP_DATA_AFTER_CLOSE: 8933 if (optval) 8934 rack->rc_allow_data_af_clo = 1; 8935 else 8936 rack->rc_allow_data_af_clo = 0; 8937 break; 8938 default: 8939 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8940 break; 8941 } 8942 #ifdef NETFLIX_STATS 8943 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 8944 #endif 8945 INP_WUNLOCK(inp); 8946 return (error); 8947 } 8948 8949 static int 8950 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 8951 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8952 { 8953 int32_t error, optval; 8954 8955 /* 8956 * Because all our options are either boolean or an int, we can just 8957 * pull everything into optval and then unlock and copy. If we ever 8958 * add a option that is not a int, then this will have quite an 8959 * impact to this routine. 
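 * (Hence the ordering at the bottom: copy the value into the
 * local optval, INP_WUNLOCK(), then sooptcopyout() with no
 * further reference to the connection.)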

static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	int32_t error, optval;

	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	switch (sopt->sopt_name) {
	case TCP_RACK_PROP_RATE:
		optval = rack->r_ctl.rc_prop_rate;
		break;
	case TCP_RACK_PROP:
		/* RACK proportional rate reduction (bool) */
		optval = rack->r_ctl.rc_prop_reduce;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_RACK_EARLY_RECOV:
		/* Should recovery happen early (bool) */
		optval = rack->r_ctl.rc_early_recovery;
		break;
	case TCP_RACK_PACE_REDUCE:
		/* RACK Hptsi reduction factor (divisor) */
		optval = rack->rc_pace_reduce;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_pace_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_EARLY_SEG:
		/* If early recovery max segments */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_TLP_INC_VAR:
		/* Does TLP include rtt variance in t-o */
		optval = rack->r_ctl.rc_prr_inc_var;
		break;
	case TCP_RACK_IDLE_REDUCE_HIGH:
		optval = rack->r_idle_reduce_largest;
		break;
	case TCP_RACK_MIN_PACE:
		optval = rack->r_enforce_min_pace;
		break;
	case TCP_RACK_MIN_PACE_SEG:
		optval = rack->r_min_pace_seg_thresh;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
	INP_WUNLOCK(inp);
	error = sooptcopyout(sopt, &optval, sizeof optval);
	return (error);
}
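
/*
 * rack_ctloutput() is the socket-option entry point wired into the
 * function block below via tfb_tcp_ctloutput.  It dispatches on the
 * option direction; the inpcb arrives write-locked and every return
 * path (directly, or inside the set/get helpers above) releases it.
 */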
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		/* Huh? */
		goto out;
	}
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(so, sopt, inp, tp, rack));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(so, sopt, inp, tp, rack));
	}
out:
	INP_WUNLOCK(inp);
	return (error);
}


struct tcp_function_block __tcp_rack = {
	.tfb_tcp_block_name = __XSTRING(STACKNAME),
	.tfb_tcp_output = rack_output,
	.tfb_tcp_do_segment = rack_do_segment,
	.tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
	.tfb_tcp_ctloutput = rack_ctloutput,
	.tfb_tcp_fb_init = rack_init,
	.tfb_tcp_fb_fini = rack_fini,
	.tfb_tcp_timer_stop_all = rack_stopall,
	.tfb_tcp_timer_activate = rack_timer_activate,
	.tfb_tcp_timer_active = rack_timer_active,
	.tfb_tcp_timer_stop = rack_timer_stop,
	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
	.tfb_tcp_handoff_ok = rack_handoff_ok
};

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
		    __XSTRING(STACKNAME),
		    CTLFLAG_RW, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};
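
/*
 * Module glue: the stack attaches at SI_SUB_PROTO_DOMAIN and declares a
 * dependency on the tcphpts (TCP high precision timer system) module,
 * which RACK relies on for its pacing and timer processing.
 */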
MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
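
/*
 * Illustrative sketch (not part of the kernel build): once the module is
 * loaded, the stack can be made the system default with
 * "sysctl net.inet.tcp.functions_default=rack", or selected per socket
 * with the TCP_FUNCTION_BLK socket option.  The hypothetical
 * use_rack_stack() helper below assumes struct tcp_function_set and
 * TCP_FUNCTION_BLK as exported by <netinet/tcp.h>, and that "rack"
 * matches the STACKNAME the module was built with; error handling is
 * trimmed for brevity.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	int
 *	use_rack_stack(int s)
 *	{
 *		struct tcp_function_set tfs;
 *
 *		memset(&tfs, 0, sizeof(tfs));
 *		strlcpy(tfs.function_set_name, "rack",
 *		    sizeof(tfs.function_set_name));
 *		// Hand the socket over to the rack function block.
 *		return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &tfs, sizeof(tfs)));
 *	}
 */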