1 /*- 2 * Copyright (c) 2016-2018 3 * Netflix Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_inet.h" 32 #include "opt_inet6.h" 33 #include "opt_ipsec.h" 34 #include "opt_tcpdebug.h" 35 36 #include <sys/param.h> 37 #include <sys/module.h> 38 #include <sys/kernel.h> 39 #ifdef TCP_HHOOK 40 #include <sys/hhook.h> 41 #endif 42 #include <sys/lock.h> 43 #include <sys/malloc.h> 44 #include <sys/lock.h> 45 #include <sys/mutex.h> 46 #include <sys/mbuf.h> 47 #include <sys/proc.h> /* for proc0 declaration */ 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/sysctl.h> 51 #include <sys/systm.h> 52 #ifdef NETFLIX_STATS 53 #include <sys/stats.h> 54 #endif 55 #include <sys/refcount.h> 56 #include <sys/queue.h> 57 #include <sys/smp.h> 58 #include <sys/kthread.h> 59 #include <sys/kern_prefetch.h> 60 61 #include <vm/uma.h> 62 63 #include <net/route.h> 64 #include <net/vnet.h> 65 66 #define TCPSTATES /* for logging */ 67 68 #include <netinet/in.h> 69 #include <netinet/in_kdtrace.h> 70 #include <netinet/in_pcb.h> 71 #include <netinet/ip.h> 72 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 73 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 74 #include <netinet/ip_var.h> 75 #include <netinet/ip6.h> 76 #include <netinet6/in6_pcb.h> 77 #include <netinet6/ip6_var.h> 78 #include <netinet/tcp.h> 79 #define TCPOUTFLAGS 80 #include <netinet/tcp_fsm.h> 81 #include <netinet/tcp_log_buf.h> 82 #include <netinet/tcp_seq.h> 83 #include <netinet/tcp_timer.h> 84 #include <netinet/tcp_var.h> 85 #include <netinet/tcp_hpts.h> 86 #include <netinet/tcpip.h> 87 #include <netinet/cc/cc.h> 88 #ifdef NETFLIX_CWV 89 #include <netinet/tcp_newcwv.h> 90 #endif 91 #include <netinet/tcp_fastopen.h> 92 #ifdef TCPDEBUG 93 #include <netinet/tcp_debug.h> 94 #endif /* TCPDEBUG */ 95 #ifdef TCP_OFFLOAD 96 #include <netinet/tcp_offload.h> 97 #endif 98 #ifdef INET6 99 #include <netinet6/tcp6_var.h> 100 #endif 101 102 #include <netipsec/ipsec_support.h> 103 104 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 105 #include <netipsec/ipsec.h> 106 #include <netipsec/ipsec6.h> 107 #endif /* IPSEC */ 108 109 #include <netinet/udp.h> 110 #include <netinet/udp_var.h> 111 #include <machine/in_cksum.h> 112 113 #ifdef MAC 114 #include <security/mac/mac_framework.h> 115 #endif 116 #include "sack_filter.h" 117 #include "tcp_rack.h" 118 #include "rack_bbr_common.h" 119 120 uma_zone_t rack_zone; 121 uma_zone_t rack_pcb_zone; 122 123 #ifndef TICKS2SBT 124 #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) 125 #endif 126 127 struct sysctl_ctx_list rack_sysctl_ctx; 128 struct sysctl_oid *rack_sysctl_root; 129 130 #define CUM_ACKED 1 131 #define SACKED 2 132 133 /* 134 * The RACK module incorporates a number of 135 * TCP ideas that have been put out into the IETF 136 * over the last few years: 137 * - Matt Mathis's Rate Halving which slowly drops 138 * the congestion window so that the ack clock can 139 * be maintained during a recovery. 140 * - Yuchung Cheng's RACK TCP (for which its named) that 141 * will stop us using the number of dup acks and instead 142 * use time as the gage of when we retransmit. 143 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft 144 * of Dukkipati et.al. 145 * RACK depends on SACK, so if an endpoint arrives that 146 * cannot do SACK the state machine below will shuttle the 147 * connection back to using the "default" TCP stack that is 148 * in FreeBSD. 149 * 150 * To implement RACK the original TCP stack was first decomposed 151 * into a functional state machine with individual states 152 * for each of the possible TCP connection states. The do_segement 153 * functions role in life is to mandate the connection supports SACK 154 * initially and then assure that the RACK state matches the conenction 155 * state before calling the states do_segment function. Each 156 * state is simplified due to the fact that the original do_segment 157 * has been decomposed and we *know* what state we are in (no 158 * switches on the state) and all tests for SACK are gone. This 159 * greatly simplifies what each state does. 160 * 161 * TCP output is also over-written with a new version since it 162 * must maintain the new rack scoreboard. 163 * 164 */ 165 static int32_t rack_precache = 1; 166 static int32_t rack_tlp_thresh = 1; 167 static int32_t rack_reorder_thresh = 2; 168 static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 169 * - 60 seconds */ 170 static int32_t rack_pkt_delay = 1; 171 static int32_t rack_inc_var = 0;/* For TLP */ 172 static int32_t rack_reduce_largest_on_idle = 0; 173 static int32_t rack_min_pace_time = 0; 174 static int32_t rack_min_pace_time_seg_req=6; 175 static int32_t rack_early_recovery = 1; 176 static int32_t rack_early_recovery_max_seg = 6; 177 static int32_t rack_send_a_lot_in_prr = 1; 178 static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ 179 static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */ 180 static int32_t rack_verbose_logging = 0; 181 static int32_t rack_ignore_data_after_close = 1; 182 /* 183 * Currently regular tcp has a rto_min of 30ms 184 * the backoff goes 12 times so that ends up 185 * being a total of 122.850 seconds before a 186 * connection is killed. 187 */ 188 static int32_t rack_tlp_min = 10; 189 static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ 190 static int32_t rack_rto_max = 30000; /* 30 seconds */ 191 static const int32_t rack_free_cache = 2; 192 static int32_t rack_hptsi_segments = 40; 193 static int32_t rack_rate_sample_method = USE_RTT_LOW; 194 static int32_t rack_pace_every_seg = 1; 195 static int32_t rack_delayed_ack_time = 200; /* 200ms */ 196 static int32_t rack_slot_reduction = 4; 197 static int32_t rack_lower_cwnd_at_tlp = 0; 198 static int32_t rack_use_proportional_reduce = 0; 199 static int32_t rack_proportional_rate = 10; 200 static int32_t rack_tlp_max_resend = 2; 201 static int32_t rack_limited_retran = 0; 202 static int32_t rack_always_send_oldest = 0; 203 static int32_t rack_sack_block_limit = 128; 204 static int32_t rack_use_sack_filter = 1; 205 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; 206 207 /* Rack specific counters */ 208 counter_u64_t rack_badfr; 209 counter_u64_t rack_badfr_bytes; 210 counter_u64_t rack_rtm_prr_retran; 211 counter_u64_t rack_rtm_prr_newdata; 212 counter_u64_t rack_timestamp_mismatch; 213 counter_u64_t rack_reorder_seen; 214 counter_u64_t rack_paced_segments; 215 counter_u64_t rack_unpaced_segments; 216 counter_u64_t rack_saw_enobuf; 217 counter_u64_t rack_saw_enetunreach; 218 219 /* Tail loss probe counters */ 220 counter_u64_t rack_tlp_tot; 221 counter_u64_t rack_tlp_newdata; 222 counter_u64_t rack_tlp_retran; 223 counter_u64_t rack_tlp_retran_bytes; 224 counter_u64_t rack_tlp_retran_fail; 225 counter_u64_t rack_to_tot; 226 counter_u64_t rack_to_arm_rack; 227 counter_u64_t rack_to_arm_tlp; 228 counter_u64_t rack_to_alloc; 229 counter_u64_t rack_to_alloc_hard; 230 counter_u64_t rack_to_alloc_emerg; 231 232 counter_u64_t rack_sack_proc_all; 233 counter_u64_t rack_sack_proc_short; 234 counter_u64_t rack_sack_proc_restart; 235 counter_u64_t rack_runt_sacks; 236 counter_u64_t rack_used_tlpmethod; 237 counter_u64_t rack_used_tlpmethod2; 238 counter_u64_t rack_enter_tlp_calc; 239 counter_u64_t rack_input_idle_reduces; 240 counter_u64_t rack_tlp_does_nada; 241 242 /* Temp CPU counters */ 243 counter_u64_t rack_find_high; 244 245 counter_u64_t rack_progress_drops; 246 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 247 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 248 249 static void 250 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 251 252 static int 253 rack_process_ack(struct mbuf *m, struct tcphdr *th, 254 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t * ti_locked, 255 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 256 static int 257 rack_process_data(struct mbuf *m, struct tcphdr *th, 258 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 259 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 260 static void 261 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 262 struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); 263 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 264 static struct rack_sendmap * 265 rack_check_recovery_mode(struct tcpcb *tp, 266 uint32_t tsused); 267 static void 268 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, 269 uint32_t type); 270 static void rack_counter_destroy(void); 271 static int 272 rack_ctloutput(struct socket *so, struct sockopt *sopt, 273 struct inpcb *inp, struct tcpcb *tp); 274 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 275 static void 276 rack_do_segment(struct mbuf *m, struct tcphdr *th, 277 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 278 uint8_t iptos, int32_t ti_locked); 279 static void rack_dtor(void *mem, int32_t size, void *arg); 280 static void 281 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 282 uint32_t t, uint32_t cts); 283 static struct rack_sendmap * 284 rack_find_high_nonack(struct tcp_rack *rack, 285 struct rack_sendmap *rsm); 286 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 287 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 288 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 289 static int 290 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 291 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 292 static int32_t rack_handoff_ok(struct tcpcb *tp); 293 static int32_t rack_init(struct tcpcb *tp); 294 static void rack_init_sysctls(void); 295 static void 296 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 297 struct tcphdr *th); 298 static void 299 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 300 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 301 uint8_t pass, struct rack_sendmap *hintrsm); 302 static void 303 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 304 struct rack_sendmap *rsm); 305 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); 306 static int32_t rack_output(struct tcpcb *tp); 307 static void 308 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, 309 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 310 uint8_t iptos, int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv); 311 312 static uint32_t 313 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 314 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 315 uint32_t cts); 316 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); 317 static void rack_remxt_tmr(struct tcpcb *tp); 318 static int 319 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 320 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 321 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 322 static int32_t rack_stopall(struct tcpcb *tp); 323 static void 324 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, 325 uint32_t delta); 326 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); 327 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 328 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); 329 static uint32_t 330 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 331 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); 332 static void 333 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 334 struct rack_sendmap *rsm, uint32_t ts); 335 static int 336 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 337 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); 338 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 339 static void 340 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, 341 struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val); 342 static int 343 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 344 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 345 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 346 static int 347 rack_do_closing(struct mbuf *m, struct tcphdr *th, 348 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 349 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 350 static void rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked); 351 static void 352 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, 353 struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val); 354 static void 355 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, 356 struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen); 357 static int 358 rack_do_established(struct mbuf *m, struct tcphdr *th, 359 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 360 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 361 static int 362 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 363 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 364 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt); 365 static int 366 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 367 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 368 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 369 static int 370 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 371 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 372 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 373 static int 374 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 375 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 376 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 377 static int 378 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 379 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 380 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 381 static int 382 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 383 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 384 int32_t tlen, int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 385 static int 386 rack_drop_checks(struct tcpopt *to, struct mbuf *m, 387 struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, 388 int32_t * drop_hdrlen, int32_t * ret_val); 389 static int 390 rack_process_rst(struct mbuf *m, struct tcphdr *th, 391 struct socket *so, struct tcpcb *tp, int32_t * ti_locked); 392 struct rack_sendmap * 393 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 394 uint32_t tsused); 395 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt); 396 static void 397 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); 398 399 static int 400 rack_ts_check(struct mbuf *m, struct tcphdr *th, 401 struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val); 402 403 int32_t rack_clear_counter=0; 404 405 406 static int 407 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 408 { 409 uint32_t stat; 410 int32_t error; 411 412 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 413 if (error || req->newptr == NULL) 414 return error; 415 416 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 417 if (error) 418 return (error); 419 if (stat == 1) { 420 #ifdef INVARIANTS 421 printf("Clearing RACK counters\n"); 422 #endif 423 counter_u64_zero(rack_badfr); 424 counter_u64_zero(rack_badfr_bytes); 425 counter_u64_zero(rack_rtm_prr_retran); 426 counter_u64_zero(rack_rtm_prr_newdata); 427 counter_u64_zero(rack_timestamp_mismatch); 428 counter_u64_zero(rack_reorder_seen); 429 counter_u64_zero(rack_tlp_tot); 430 counter_u64_zero(rack_tlp_newdata); 431 counter_u64_zero(rack_tlp_retran); 432 counter_u64_zero(rack_tlp_retran_bytes); 433 counter_u64_zero(rack_tlp_retran_fail); 434 counter_u64_zero(rack_to_tot); 435 counter_u64_zero(rack_to_arm_rack); 436 counter_u64_zero(rack_to_arm_tlp); 437 counter_u64_zero(rack_paced_segments); 438 counter_u64_zero(rack_unpaced_segments); 439 counter_u64_zero(rack_saw_enobuf); 440 counter_u64_zero(rack_saw_enetunreach); 441 counter_u64_zero(rack_to_alloc_hard); 442 counter_u64_zero(rack_to_alloc_emerg); 443 counter_u64_zero(rack_sack_proc_all); 444 counter_u64_zero(rack_sack_proc_short); 445 counter_u64_zero(rack_sack_proc_restart); 446 counter_u64_zero(rack_to_alloc); 447 counter_u64_zero(rack_find_high); 448 counter_u64_zero(rack_runt_sacks); 449 counter_u64_zero(rack_used_tlpmethod); 450 counter_u64_zero(rack_used_tlpmethod2); 451 counter_u64_zero(rack_enter_tlp_calc); 452 counter_u64_zero(rack_progress_drops); 453 counter_u64_zero(rack_tlp_does_nada); 454 } 455 rack_clear_counter = 0; 456 return (0); 457 } 458 459 460 461 static void 462 rack_init_sysctls() 463 { 464 SYSCTL_ADD_S32(&rack_sysctl_ctx, 465 SYSCTL_CHILDREN(rack_sysctl_root), 466 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 467 &rack_rate_sample_method , USE_RTT_LOW, 468 "What method should we use for rate sampling 0=high, 1=low "); 469 SYSCTL_ADD_S32(&rack_sysctl_ctx, 470 SYSCTL_CHILDREN(rack_sysctl_root), 471 OID_AUTO, "data_after_close", CTLFLAG_RW, 472 &rack_ignore_data_after_close, 0, 473 "Do we hold off sending a RST until all pending data is ack'd"); 474 SYSCTL_ADD_S32(&rack_sysctl_ctx, 475 SYSCTL_CHILDREN(rack_sysctl_root), 476 OID_AUTO, "tlpmethod", CTLFLAG_RW, 477 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 478 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 479 SYSCTL_ADD_S32(&rack_sysctl_ctx, 480 SYSCTL_CHILDREN(rack_sysctl_root), 481 OID_AUTO, "min_pace_time", CTLFLAG_RW, 482 &rack_min_pace_time, 0, 483 "Should we enforce a minimum pace time of 1ms"); 484 SYSCTL_ADD_S32(&rack_sysctl_ctx, 485 SYSCTL_CHILDREN(rack_sysctl_root), 486 OID_AUTO, "min_pace_segs", CTLFLAG_RW, 487 &rack_min_pace_time_seg_req, 6, 488 "How many segments have to be in the len to enforce min-pace-time"); 489 SYSCTL_ADD_S32(&rack_sysctl_ctx, 490 SYSCTL_CHILDREN(rack_sysctl_root), 491 OID_AUTO, "idle_reduce_high", CTLFLAG_RW, 492 &rack_reduce_largest_on_idle, 0, 493 "Should we reduce the largest cwnd seen to IW on idle reduction"); 494 SYSCTL_ADD_S32(&rack_sysctl_ctx, 495 SYSCTL_CHILDREN(rack_sysctl_root), 496 OID_AUTO, "bb_verbose", CTLFLAG_RW, 497 &rack_verbose_logging, 0, 498 "Should RACK black box logging be verbose"); 499 SYSCTL_ADD_S32(&rack_sysctl_ctx, 500 SYSCTL_CHILDREN(rack_sysctl_root), 501 OID_AUTO, "sackfiltering", CTLFLAG_RW, 502 &rack_use_sack_filter, 1, 503 "Do we use sack filtering?"); 504 SYSCTL_ADD_S32(&rack_sysctl_ctx, 505 SYSCTL_CHILDREN(rack_sysctl_root), 506 OID_AUTO, "delayed_ack", CTLFLAG_RW, 507 &rack_delayed_ack_time, 200, 508 "Delayed ack time (200ms)"); 509 SYSCTL_ADD_S32(&rack_sysctl_ctx, 510 SYSCTL_CHILDREN(rack_sysctl_root), 511 OID_AUTO, "tlpminto", CTLFLAG_RW, 512 &rack_tlp_min, 10, 513 "TLP minimum timeout per the specification (10ms)"); 514 SYSCTL_ADD_S32(&rack_sysctl_ctx, 515 SYSCTL_CHILDREN(rack_sysctl_root), 516 OID_AUTO, "precache", CTLFLAG_RW, 517 &rack_precache, 0, 518 "Where should we precache the mcopy (0 is not at all)"); 519 SYSCTL_ADD_S32(&rack_sysctl_ctx, 520 SYSCTL_CHILDREN(rack_sysctl_root), 521 OID_AUTO, "sblklimit", CTLFLAG_RW, 522 &rack_sack_block_limit, 128, 523 "When do we start paying attention to small sack blocks"); 524 SYSCTL_ADD_S32(&rack_sysctl_ctx, 525 SYSCTL_CHILDREN(rack_sysctl_root), 526 OID_AUTO, "send_oldest", CTLFLAG_RW, 527 &rack_always_send_oldest, 1, 528 "Should we always send the oldest TLP and RACK-TLP"); 529 SYSCTL_ADD_S32(&rack_sysctl_ctx, 530 SYSCTL_CHILDREN(rack_sysctl_root), 531 OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, 532 &rack_tlp_in_recovery, 1, 533 "Can we do a TLP during recovery?"); 534 SYSCTL_ADD_S32(&rack_sysctl_ctx, 535 SYSCTL_CHILDREN(rack_sysctl_root), 536 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 537 &rack_limited_retran, 0, 538 "How many times can a rack timeout drive out sends"); 539 SYSCTL_ADD_S32(&rack_sysctl_ctx, 540 SYSCTL_CHILDREN(rack_sysctl_root), 541 OID_AUTO, "minrto", CTLFLAG_RW, 542 &rack_rto_min, 0, 543 "Minimum RTO in ms -- set with caution below 1000 due to TLP"); 544 SYSCTL_ADD_S32(&rack_sysctl_ctx, 545 SYSCTL_CHILDREN(rack_sysctl_root), 546 OID_AUTO, "maxrto", CTLFLAG_RW, 547 &rack_rto_max, 0, 548 "Maxiumum RTO in ms -- should be at least as large as min_rto"); 549 SYSCTL_ADD_S32(&rack_sysctl_ctx, 550 SYSCTL_CHILDREN(rack_sysctl_root), 551 OID_AUTO, "tlp_retry", CTLFLAG_RW, 552 &rack_tlp_max_resend, 2, 553 "How many times does TLP retry a single segment or multiple with no ACK"); 554 SYSCTL_ADD_S32(&rack_sysctl_ctx, 555 SYSCTL_CHILDREN(rack_sysctl_root), 556 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, 557 &rack_use_proportional_reduce, 0, 558 "Should we proportionaly reduce cwnd based on the number of losses "); 559 SYSCTL_ADD_S32(&rack_sysctl_ctx, 560 SYSCTL_CHILDREN(rack_sysctl_root), 561 OID_AUTO, "recovery_prop", CTLFLAG_RW, 562 &rack_proportional_rate, 10, 563 "What percent reduction per loss"); 564 SYSCTL_ADD_S32(&rack_sysctl_ctx, 565 SYSCTL_CHILDREN(rack_sysctl_root), 566 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 567 &rack_lower_cwnd_at_tlp, 0, 568 "When a TLP completes a retran should we enter recovery?"); 569 SYSCTL_ADD_S32(&rack_sysctl_ctx, 570 SYSCTL_CHILDREN(rack_sysctl_root), 571 OID_AUTO, "hptsi_reduces", CTLFLAG_RW, 572 &rack_slot_reduction, 4, 573 "When setting a slot should we reduce by divisor"); 574 SYSCTL_ADD_S32(&rack_sysctl_ctx, 575 SYSCTL_CHILDREN(rack_sysctl_root), 576 OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, 577 &rack_pace_every_seg, 1, 578 "Should we pace out every segment hptsi"); 579 SYSCTL_ADD_S32(&rack_sysctl_ctx, 580 SYSCTL_CHILDREN(rack_sysctl_root), 581 OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, 582 &rack_hptsi_segments, 6, 583 "Should we pace out only a limited size of segments"); 584 SYSCTL_ADD_S32(&rack_sysctl_ctx, 585 SYSCTL_CHILDREN(rack_sysctl_root), 586 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 587 &rack_send_a_lot_in_prr, 1, 588 "Send a lot in prr"); 589 SYSCTL_ADD_S32(&rack_sysctl_ctx, 590 SYSCTL_CHILDREN(rack_sysctl_root), 591 OID_AUTO, "minto", CTLFLAG_RW, 592 &rack_min_to, 1, 593 "Minimum rack timeout in milliseconds"); 594 SYSCTL_ADD_S32(&rack_sysctl_ctx, 595 SYSCTL_CHILDREN(rack_sysctl_root), 596 OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, 597 &rack_early_recovery_max_seg, 6, 598 "Max segments in early recovery"); 599 SYSCTL_ADD_S32(&rack_sysctl_ctx, 600 SYSCTL_CHILDREN(rack_sysctl_root), 601 OID_AUTO, "earlyrecovery", CTLFLAG_RW, 602 &rack_early_recovery, 1, 603 "Do we do early recovery with rack"); 604 SYSCTL_ADD_S32(&rack_sysctl_ctx, 605 SYSCTL_CHILDREN(rack_sysctl_root), 606 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 607 &rack_reorder_thresh, 2, 608 "What factor for rack will be added when seeing reordering (shift right)"); 609 SYSCTL_ADD_S32(&rack_sysctl_ctx, 610 SYSCTL_CHILDREN(rack_sysctl_root), 611 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 612 &rack_tlp_thresh, 1, 613 "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 614 SYSCTL_ADD_S32(&rack_sysctl_ctx, 615 SYSCTL_CHILDREN(rack_sysctl_root), 616 OID_AUTO, "reorder_fade", CTLFLAG_RW, 617 &rack_reorder_fade, 0, 618 "Does reorder detection fade, if so how many ms (0 means never)"); 619 SYSCTL_ADD_S32(&rack_sysctl_ctx, 620 SYSCTL_CHILDREN(rack_sysctl_root), 621 OID_AUTO, "pktdelay", CTLFLAG_RW, 622 &rack_pkt_delay, 1, 623 "Extra RACK time (in ms) besides reordering thresh"); 624 SYSCTL_ADD_S32(&rack_sysctl_ctx, 625 SYSCTL_CHILDREN(rack_sysctl_root), 626 OID_AUTO, "inc_var", CTLFLAG_RW, 627 &rack_inc_var, 0, 628 "Should rack add to the TLP timer the variance in rtt calculation"); 629 rack_badfr = counter_u64_alloc(M_WAITOK); 630 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 631 SYSCTL_CHILDREN(rack_sysctl_root), 632 OID_AUTO, "badfr", CTLFLAG_RD, 633 &rack_badfr, "Total number of bad FRs"); 634 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 635 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 636 SYSCTL_CHILDREN(rack_sysctl_root), 637 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 638 &rack_badfr_bytes, "Total number of bad FRs"); 639 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 640 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 641 SYSCTL_CHILDREN(rack_sysctl_root), 642 OID_AUTO, "prrsndret", CTLFLAG_RD, 643 &rack_rtm_prr_retran, 644 "Total number of prr based retransmits"); 645 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 646 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 647 SYSCTL_CHILDREN(rack_sysctl_root), 648 OID_AUTO, "prrsndnew", CTLFLAG_RD, 649 &rack_rtm_prr_newdata, 650 "Total number of prr based new transmits"); 651 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 652 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 653 SYSCTL_CHILDREN(rack_sysctl_root), 654 OID_AUTO, "tsnf", CTLFLAG_RD, 655 &rack_timestamp_mismatch, 656 "Total number of timestamps that we could not find the reported ts"); 657 rack_find_high = counter_u64_alloc(M_WAITOK); 658 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 659 SYSCTL_CHILDREN(rack_sysctl_root), 660 OID_AUTO, "findhigh", CTLFLAG_RD, 661 &rack_find_high, 662 "Total number of FIN causing find-high"); 663 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 664 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 665 SYSCTL_CHILDREN(rack_sysctl_root), 666 OID_AUTO, "reordering", CTLFLAG_RD, 667 &rack_reorder_seen, 668 "Total number of times we added delay due to reordering"); 669 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 670 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 671 SYSCTL_CHILDREN(rack_sysctl_root), 672 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 673 &rack_tlp_tot, 674 "Total number of tail loss probe expirations"); 675 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 676 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 677 SYSCTL_CHILDREN(rack_sysctl_root), 678 OID_AUTO, "tlp_new", CTLFLAG_RD, 679 &rack_tlp_newdata, 680 "Total number of tail loss probe sending new data"); 681 682 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 683 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 684 SYSCTL_CHILDREN(rack_sysctl_root), 685 OID_AUTO, "tlp_retran", CTLFLAG_RD, 686 &rack_tlp_retran, 687 "Total number of tail loss probe sending retransmitted data"); 688 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 689 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 690 SYSCTL_CHILDREN(rack_sysctl_root), 691 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 692 &rack_tlp_retran_bytes, 693 "Total bytes of tail loss probe sending retransmitted data"); 694 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 695 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 696 SYSCTL_CHILDREN(rack_sysctl_root), 697 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 698 &rack_tlp_retran_fail, 699 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 700 rack_to_tot = counter_u64_alloc(M_WAITOK); 701 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 702 SYSCTL_CHILDREN(rack_sysctl_root), 703 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 704 &rack_to_tot, 705 "Total number of times the rack to expired?"); 706 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 707 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 708 SYSCTL_CHILDREN(rack_sysctl_root), 709 OID_AUTO, "arm_rack", CTLFLAG_RD, 710 &rack_to_arm_rack, 711 "Total number of times the rack timer armed?"); 712 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 713 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 714 SYSCTL_CHILDREN(rack_sysctl_root), 715 OID_AUTO, "arm_tlp", CTLFLAG_RD, 716 &rack_to_arm_tlp, 717 "Total number of times the tlp timer armed?"); 718 rack_paced_segments = counter_u64_alloc(M_WAITOK); 719 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 720 SYSCTL_CHILDREN(rack_sysctl_root), 721 OID_AUTO, "paced", CTLFLAG_RD, 722 &rack_paced_segments, 723 "Total number of times a segment send caused hptsi"); 724 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 725 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 726 SYSCTL_CHILDREN(rack_sysctl_root), 727 OID_AUTO, "unpaced", CTLFLAG_RD, 728 &rack_unpaced_segments, 729 "Total number of times a segment did not cause hptsi"); 730 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 731 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 732 SYSCTL_CHILDREN(rack_sysctl_root), 733 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 734 &rack_saw_enobuf, 735 "Total number of times a segment did not cause hptsi"); 736 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 737 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 738 SYSCTL_CHILDREN(rack_sysctl_root), 739 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 740 &rack_saw_enetunreach, 741 "Total number of times a segment did not cause hptsi"); 742 rack_to_alloc = counter_u64_alloc(M_WAITOK); 743 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 744 SYSCTL_CHILDREN(rack_sysctl_root), 745 OID_AUTO, "allocs", CTLFLAG_RD, 746 &rack_to_alloc, 747 "Total allocations of tracking structures"); 748 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 749 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 750 SYSCTL_CHILDREN(rack_sysctl_root), 751 OID_AUTO, "allochard", CTLFLAG_RD, 752 &rack_to_alloc_hard, 753 "Total allocations done with sleeping the hard way"); 754 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 755 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 756 SYSCTL_CHILDREN(rack_sysctl_root), 757 OID_AUTO, "allocemerg", CTLFLAG_RD, 758 &rack_to_alloc_emerg, 759 "Total alocations done from emergency cache"); 760 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 761 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 762 SYSCTL_CHILDREN(rack_sysctl_root), 763 OID_AUTO, "sack_long", CTLFLAG_RD, 764 &rack_sack_proc_all, 765 "Total times we had to walk whole list for sack processing"); 766 767 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 768 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 769 SYSCTL_CHILDREN(rack_sysctl_root), 770 OID_AUTO, "sack_restart", CTLFLAG_RD, 771 &rack_sack_proc_restart, 772 "Total times we had to walk whole list due to a restart"); 773 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 774 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 775 SYSCTL_CHILDREN(rack_sysctl_root), 776 OID_AUTO, "sack_short", CTLFLAG_RD, 777 &rack_sack_proc_short, 778 "Total times we took shortcut for sack processing"); 779 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 781 SYSCTL_CHILDREN(rack_sysctl_root), 782 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 783 &rack_enter_tlp_calc, 784 "Total times we called calc-tlp"); 785 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 787 SYSCTL_CHILDREN(rack_sysctl_root), 788 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 789 &rack_used_tlpmethod, 790 "Total number of runt sacks"); 791 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 793 SYSCTL_CHILDREN(rack_sysctl_root), 794 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 795 &rack_used_tlpmethod2, 796 "Total number of runt sacks 2"); 797 rack_runt_sacks = counter_u64_alloc(M_WAITOK); 798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 799 SYSCTL_CHILDREN(rack_sysctl_root), 800 OID_AUTO, "runtsacks", CTLFLAG_RD, 801 &rack_runt_sacks, 802 "Total number of runt sacks"); 803 rack_progress_drops = counter_u64_alloc(M_WAITOK); 804 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 805 SYSCTL_CHILDREN(rack_sysctl_root), 806 OID_AUTO, "prog_drops", CTLFLAG_RD, 807 &rack_progress_drops, 808 "Total number of progress drops"); 809 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 810 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 811 SYSCTL_CHILDREN(rack_sysctl_root), 812 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 813 &rack_input_idle_reduces, 814 "Total number of idle reductions on input"); 815 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 816 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 817 SYSCTL_CHILDREN(rack_sysctl_root), 818 OID_AUTO, "tlp_nada", CTLFLAG_RD, 819 &rack_tlp_does_nada, 820 "Total number of nada tlp calls"); 821 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 822 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 823 OID_AUTO, "outsize", CTLFLAG_RD, 824 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 825 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 826 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 827 OID_AUTO, "opts", CTLFLAG_RD, 828 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 829 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 830 SYSCTL_CHILDREN(rack_sysctl_root), 831 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 832 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 833 } 834 835 static inline int32_t 836 rack_progress_timeout_check(struct tcpcb *tp) 837 { 838 if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { 839 if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { 840 /* 841 * There is an assumption that the caller 842 * will drop the connection so we will 843 * increment the counters here. 844 */ 845 struct tcp_rack *rack; 846 rack = (struct tcp_rack *)tp->t_fb_ptr; 847 counter_u64_add(rack_progress_drops, 1); 848 #ifdef NETFLIX_STATS 849 TCPSTAT_INC(tcps_progdrops); 850 #endif 851 rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); 852 return (1); 853 } 854 } 855 return (0); 856 } 857 858 859 static void 860 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 861 { 862 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 863 union tcp_log_stackspecific log; 864 865 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 866 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 867 log.u_bbr.flex2 = to; 868 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 869 log.u_bbr.flex4 = slot; 870 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 871 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 872 log.u_bbr.flex8 = which; 873 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 874 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 875 TCP_LOG_EVENT(rack->rc_tp, NULL, 876 &rack->rc_inp->inp_socket->so_rcv, 877 &rack->rc_inp->inp_socket->so_snd, 878 BBR_LOG_TIMERSTAR, 0, 879 0, &log, false); 880 } 881 } 882 883 static void 884 rack_log_to_event(struct tcp_rack *rack, int32_t to_num) 885 { 886 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 887 union tcp_log_stackspecific log; 888 889 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 890 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 891 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 892 log.u_bbr.flex8 = to_num; 893 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 894 log.u_bbr.flex2 = rack->rc_rack_rtt; 895 TCP_LOG_EVENT(rack->rc_tp, NULL, 896 &rack->rc_inp->inp_socket->so_rcv, 897 &rack->rc_inp->inp_socket->so_snd, 898 BBR_LOG_RTO, 0, 899 0, &log, false); 900 } 901 } 902 903 static void 904 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t, 905 uint32_t o_srtt, uint32_t o_var) 906 { 907 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 908 union tcp_log_stackspecific log; 909 910 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 911 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 912 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 913 log.u_bbr.flex1 = t; 914 log.u_bbr.flex2 = o_srtt; 915 log.u_bbr.flex3 = o_var; 916 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 917 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 918 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 919 log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; 920 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 921 TCP_LOG_EVENT(tp, NULL, 922 &rack->rc_inp->inp_socket->so_rcv, 923 &rack->rc_inp->inp_socket->so_snd, 924 BBR_LOG_BBRRTT, 0, 925 0, &log, false); 926 } 927 } 928 929 static void 930 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 931 { 932 /* 933 * Log the rtt sample we are 934 * applying to the srtt algorithm in 935 * useconds. 936 */ 937 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 938 union tcp_log_stackspecific log; 939 struct timeval tv; 940 941 /* Convert our ms to a microsecond */ 942 log.u_bbr.flex1 = rtt * 1000; 943 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 944 TCP_LOG_EVENTP(rack->rc_tp, NULL, 945 &rack->rc_inp->inp_socket->so_rcv, 946 &rack->rc_inp->inp_socket->so_snd, 947 TCP_LOG_RTT, 0, 948 0, &log, false, &tv); 949 } 950 } 951 952 953 static inline void 954 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 955 { 956 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 957 union tcp_log_stackspecific log; 958 959 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 960 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 961 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 962 log.u_bbr.flex1 = line; 963 log.u_bbr.flex2 = tick; 964 log.u_bbr.flex3 = tp->t_maxunacktime; 965 log.u_bbr.flex4 = tp->t_acktime; 966 log.u_bbr.flex8 = event; 967 TCP_LOG_EVENT(tp, NULL, 968 &rack->rc_inp->inp_socket->so_rcv, 969 &rack->rc_inp->inp_socket->so_snd, 970 BBR_LOG_PROGRESS, 0, 971 0, &log, false); 972 } 973 } 974 975 static void 976 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts) 977 { 978 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 979 union tcp_log_stackspecific log; 980 981 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 982 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 983 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 984 log.u_bbr.flex1 = slot; 985 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 986 log.u_bbr.flex8 = rack->rc_in_persist; 987 TCP_LOG_EVENT(rack->rc_tp, NULL, 988 &rack->rc_inp->inp_socket->so_rcv, 989 &rack->rc_inp->inp_socket->so_snd, 990 BBR_LOG_BBRSND, 0, 991 0, &log, false); 992 } 993 } 994 995 static void 996 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 997 { 998 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 999 union tcp_log_stackspecific log; 1000 log.u_bbr.flex1 = did_out; 1001 log.u_bbr.flex2 = nxt_pkt; 1002 log.u_bbr.flex3 = way_out; 1003 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1004 log.u_bbr.flex7 = rack->r_wanted_output; 1005 log.u_bbr.flex8 = rack->rc_in_persist; 1006 TCP_LOG_EVENT(rack->rc_tp, NULL, 1007 &rack->rc_inp->inp_socket->so_rcv, 1008 &rack->rc_inp->inp_socket->so_snd, 1009 BBR_LOG_DOSEG_DONE, 0, 1010 0, &log, false); 1011 } 1012 } 1013 1014 1015 static void 1016 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) 1017 { 1018 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1019 union tcp_log_stackspecific log; 1020 1021 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1022 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1023 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1024 log.u_bbr.flex1 = slot; 1025 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 1026 log.u_bbr.flex7 = hpts_calling; 1027 log.u_bbr.flex8 = rack->rc_in_persist; 1028 TCP_LOG_EVENT(rack->rc_tp, NULL, 1029 &rack->rc_inp->inp_socket->so_rcv, 1030 &rack->rc_inp->inp_socket->so_snd, 1031 BBR_LOG_JUSTRET, 0, 1032 tlen, &log, false); 1033 } 1034 } 1035 1036 static void 1037 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line) 1038 { 1039 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1040 union tcp_log_stackspecific log; 1041 1042 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1043 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1044 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1045 log.u_bbr.flex1 = line; 1046 log.u_bbr.flex2 = 0; 1047 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1048 log.u_bbr.flex4 = 0; 1049 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1050 log.u_bbr.flex8 = hpts_removed; 1051 TCP_LOG_EVENT(rack->rc_tp, NULL, 1052 &rack->rc_inp->inp_socket->so_rcv, 1053 &rack->rc_inp->inp_socket->so_snd, 1054 BBR_LOG_TIMERCANC, 0, 1055 0, &log, false); 1056 } 1057 } 1058 1059 static void 1060 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 1061 { 1062 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1063 union tcp_log_stackspecific log; 1064 1065 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1066 log.u_bbr.flex1 = timers; 1067 log.u_bbr.flex2 = ret; 1068 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 1069 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1070 log.u_bbr.flex5 = cts; 1071 TCP_LOG_EVENT(rack->rc_tp, NULL, 1072 &rack->rc_inp->inp_socket->so_rcv, 1073 &rack->rc_inp->inp_socket->so_snd, 1074 BBR_LOG_TO_PROCESS, 0, 1075 0, &log, false); 1076 } 1077 } 1078 1079 static void 1080 rack_counter_destroy() 1081 { 1082 counter_u64_free(rack_badfr); 1083 counter_u64_free(rack_badfr_bytes); 1084 counter_u64_free(rack_rtm_prr_retran); 1085 counter_u64_free(rack_rtm_prr_newdata); 1086 counter_u64_free(rack_timestamp_mismatch); 1087 counter_u64_free(rack_reorder_seen); 1088 counter_u64_free(rack_tlp_tot); 1089 counter_u64_free(rack_tlp_newdata); 1090 counter_u64_free(rack_tlp_retran); 1091 counter_u64_free(rack_tlp_retran_bytes); 1092 counter_u64_free(rack_tlp_retran_fail); 1093 counter_u64_free(rack_to_tot); 1094 counter_u64_free(rack_to_arm_rack); 1095 counter_u64_free(rack_to_arm_tlp); 1096 counter_u64_free(rack_paced_segments); 1097 counter_u64_free(rack_unpaced_segments); 1098 counter_u64_free(rack_saw_enobuf); 1099 counter_u64_free(rack_saw_enetunreach); 1100 counter_u64_free(rack_to_alloc_hard); 1101 counter_u64_free(rack_to_alloc_emerg); 1102 counter_u64_free(rack_sack_proc_all); 1103 counter_u64_free(rack_sack_proc_short); 1104 counter_u64_free(rack_sack_proc_restart); 1105 counter_u64_free(rack_to_alloc); 1106 counter_u64_free(rack_find_high); 1107 counter_u64_free(rack_runt_sacks); 1108 counter_u64_free(rack_enter_tlp_calc); 1109 counter_u64_free(rack_used_tlpmethod); 1110 counter_u64_free(rack_used_tlpmethod2); 1111 counter_u64_free(rack_progress_drops); 1112 counter_u64_free(rack_input_idle_reduces); 1113 counter_u64_free(rack_tlp_does_nada); 1114 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 1115 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 1116 } 1117 1118 static struct rack_sendmap * 1119 rack_alloc(struct tcp_rack *rack) 1120 { 1121 struct rack_sendmap *rsm; 1122 1123 counter_u64_add(rack_to_alloc, 1); 1124 rack->r_ctl.rc_num_maps_alloced++; 1125 rsm = uma_zalloc(rack_zone, M_NOWAIT); 1126 if (rsm) { 1127 return (rsm); 1128 } 1129 if (rack->rc_free_cnt) { 1130 counter_u64_add(rack_to_alloc_emerg, 1); 1131 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 1132 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 1133 rack->rc_free_cnt--; 1134 return (rsm); 1135 } 1136 return (NULL); 1137 } 1138 1139 static void 1140 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 1141 { 1142 rack->r_ctl.rc_num_maps_alloced--; 1143 if (rack->r_ctl.rc_tlpsend == rsm) 1144 rack->r_ctl.rc_tlpsend = NULL; 1145 if (rack->r_ctl.rc_next == rsm) 1146 rack->r_ctl.rc_next = NULL; 1147 if (rack->r_ctl.rc_sacklast == rsm) 1148 rack->r_ctl.rc_sacklast = NULL; 1149 if (rack->rc_free_cnt < rack_free_cache) { 1150 memset(rsm, 0, sizeof(struct rack_sendmap)); 1151 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 1152 rack->rc_free_cnt++; 1153 return; 1154 } 1155 uma_zfree(rack_zone, rsm); 1156 } 1157 1158 /* 1159 * CC wrapper hook functions 1160 */ 1161 static void 1162 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 1163 uint16_t type, int32_t recovery) 1164 { 1165 #ifdef NETFLIX_STATS 1166 int32_t gput; 1167 #endif 1168 #ifdef NETFLIX_CWV 1169 u_long old_cwnd = tp->snd_cwnd; 1170 #endif 1171 1172 INP_WLOCK_ASSERT(tp->t_inpcb); 1173 tp->ccv->nsegs = nsegs; 1174 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 1175 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 1176 uint32_t max; 1177 1178 max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; 1179 if (tp->ccv->bytes_this_ack > max) { 1180 tp->ccv->bytes_this_ack = max; 1181 } 1182 } 1183 if (tp->snd_cwnd <= tp->snd_wnd) 1184 tp->ccv->flags |= CCF_CWND_LIMITED; 1185 else 1186 tp->ccv->flags &= ~CCF_CWND_LIMITED; 1187 1188 if (type == CC_ACK) { 1189 #ifdef NETFLIX_STATS 1190 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 1191 ((int32_t) tp->snd_cwnd) - tp->snd_wnd); 1192 if ((tp->t_flags & TF_GPUTINPROG) && 1193 SEQ_GEQ(th->th_ack, tp->gput_ack)) { 1194 gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / 1195 max(1, tcp_ts_getticks() - tp->gput_ts); 1196 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 1197 gput); 1198 /* 1199 * XXXLAS: This is a temporary hack, and should be 1200 * chained off VOI_TCP_GPUT when stats(9) grows an 1201 * API to deal with chained VOIs. 1202 */ 1203 if (tp->t_stats_gput_prev > 0) 1204 stats_voi_update_abs_s32(tp->t_stats, 1205 VOI_TCP_GPUT_ND, 1206 ((gput - tp->t_stats_gput_prev) * 100) / 1207 tp->t_stats_gput_prev); 1208 tp->t_flags &= ~TF_GPUTINPROG; 1209 tp->t_stats_gput_prev = gput; 1210 #ifdef NETFLIX_CWV 1211 if (tp->t_maxpeakrate) { 1212 /* 1213 * We update t_peakrate_thr. This gives us roughly 1214 * one update per round trip time. 1215 */ 1216 tcp_update_peakrate_thr(tp); 1217 } 1218 #endif 1219 } 1220 #endif 1221 if (tp->snd_cwnd > tp->snd_ssthresh) { 1222 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 1223 nsegs * V_tcp_abc_l_var * tp->t_maxseg); 1224 if (tp->t_bytes_acked >= tp->snd_cwnd) { 1225 tp->t_bytes_acked -= tp->snd_cwnd; 1226 tp->ccv->flags |= CCF_ABC_SENTAWND; 1227 } 1228 } else { 1229 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 1230 tp->t_bytes_acked = 0; 1231 } 1232 } 1233 if (CC_ALGO(tp)->ack_received != NULL) { 1234 /* XXXLAS: Find a way to live without this */ 1235 tp->ccv->curack = th->th_ack; 1236 CC_ALGO(tp)->ack_received(tp->ccv, type); 1237 } 1238 #ifdef NETFLIX_STATS 1239 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); 1240 #endif 1241 if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { 1242 rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; 1243 } 1244 #ifdef NETFLIX_CWV 1245 if (tp->cwv_enabled) { 1246 /* 1247 * Per RFC 7661: The behaviour in the non-validated phase is 1248 * specified as: o A sender determines whether to increase 1249 * the cwnd based upon whether it is cwnd-limited (see 1250 * Section 4.5.3): * A sender that is cwnd-limited MAY use 1251 * the standard TCP method to increase cwnd (i.e., the 1252 * standard method permits a TCP sender that fully utilises 1253 * the cwnd to increase the cwnd each time it receives an 1254 * ACK). * A sender that is not cwnd-limited MUST NOT 1255 * increase the cwnd when ACK packets are received in this 1256 * phase (i.e., needs to avoid growing the cwnd when it has 1257 * not recently sent using the current size of cwnd). 1258 */ 1259 if ((tp->snd_cwnd > old_cwnd) && 1260 (tp->cwv_cwnd_valid == 0) && 1261 (!(tp->ccv->flags & CCF_CWND_LIMITED))) { 1262 tp->snd_cwnd = old_cwnd; 1263 } 1264 /* Try to update pipeAck and NCWV state */ 1265 if (TCPS_HAVEESTABLISHED(tp->t_state) && 1266 !IN_RECOVERY(tp->t_flags)) { 1267 uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); 1268 1269 tcp_newcwv_update_pipeack(tp, data); 1270 } 1271 } 1272 /* we enforce max peak rate if it is set. */ 1273 if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { 1274 tp->snd_cwnd = tp->t_peakrate_thr; 1275 } 1276 #endif 1277 } 1278 1279 static void 1280 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 1281 { 1282 struct tcp_rack *rack; 1283 1284 rack = (struct tcp_rack *)tp->t_fb_ptr; 1285 INP_WLOCK_ASSERT(tp->t_inpcb); 1286 if (rack->r_ctl.rc_prr_sndcnt > 0) 1287 rack->r_wanted_output++; 1288 } 1289 1290 static void 1291 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 1292 { 1293 struct tcp_rack *rack; 1294 1295 INP_WLOCK_ASSERT(tp->t_inpcb); 1296 rack = (struct tcp_rack *)tp->t_fb_ptr; 1297 if (CC_ALGO(tp)->post_recovery != NULL) { 1298 tp->ccv->curack = th->th_ack; 1299 CC_ALGO(tp)->post_recovery(tp->ccv); 1300 } 1301 /* 1302 * Here we can in theory adjust cwnd to be based on the number of 1303 * losses in the window (rack->r_ctl.rc_loss_count). This is done 1304 * based on the rack_use_proportional flag. 1305 */ 1306 if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) { 1307 int32_t reduce; 1308 1309 reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate); 1310 if (reduce > 50) { 1311 reduce = 50; 1312 } 1313 tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100); 1314 } else { 1315 if (tp->snd_cwnd > tp->snd_ssthresh) { 1316 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 1317 tp->snd_cwnd = tp->snd_ssthresh; 1318 } 1319 } 1320 if (rack->r_ctl.rc_prr_sndcnt > 0) { 1321 /* Suck the next prr cnt back into cwnd */ 1322 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 1323 rack->r_ctl.rc_prr_sndcnt = 0; 1324 } 1325 EXIT_RECOVERY(tp->t_flags); 1326 1327 1328 #ifdef NETFLIX_CWV 1329 if (tp->cwv_enabled) { 1330 if ((tp->cwv_cwnd_valid == 0) && 1331 (tp->snd_cwv.in_recovery)) 1332 tcp_newcwv_end_recovery(tp); 1333 } 1334 #endif 1335 } 1336 1337 static void 1338 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 1339 { 1340 struct tcp_rack *rack; 1341 1342 INP_WLOCK_ASSERT(tp->t_inpcb); 1343 1344 rack = (struct tcp_rack *)tp->t_fb_ptr; 1345 switch (type) { 1346 case CC_NDUPACK: 1347 /* rack->r_ctl.rc_ssthresh_set = 1;*/ 1348 if (!IN_FASTRECOVERY(tp->t_flags)) { 1349 rack->r_ctl.rc_tlp_rtx_out = 0; 1350 rack->r_ctl.rc_prr_delivered = 0; 1351 rack->r_ctl.rc_prr_out = 0; 1352 rack->r_ctl.rc_loss_count = 0; 1353 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 1354 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 1355 tp->snd_recover = tp->snd_max; 1356 if (tp->t_flags & TF_ECN_PERMIT) 1357 tp->t_flags |= TF_ECN_SND_CWR; 1358 } 1359 break; 1360 case CC_ECN: 1361 if (!IN_CONGRECOVERY(tp->t_flags)) { 1362 TCPSTAT_INC(tcps_ecn_rcwnd); 1363 tp->snd_recover = tp->snd_max; 1364 if (tp->t_flags & TF_ECN_PERMIT) 1365 tp->t_flags |= TF_ECN_SND_CWR; 1366 } 1367 break; 1368 case CC_RTO: 1369 tp->t_dupacks = 0; 1370 tp->t_bytes_acked = 0; 1371 EXIT_RECOVERY(tp->t_flags); 1372 tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / 1373 tp->t_maxseg) * tp->t_maxseg; 1374 tp->snd_cwnd = tp->t_maxseg; 1375 break; 1376 case CC_RTO_ERR: 1377 TCPSTAT_INC(tcps_sndrexmitbad); 1378 /* RTO was unnecessary, so reset everything. */ 1379 tp->snd_cwnd = tp->snd_cwnd_prev; 1380 tp->snd_ssthresh = tp->snd_ssthresh_prev; 1381 tp->snd_recover = tp->snd_recover_prev; 1382 if (tp->t_flags & TF_WASFRECOVERY) 1383 ENTER_FASTRECOVERY(tp->t_flags); 1384 if (tp->t_flags & TF_WASCRECOVERY) 1385 ENTER_CONGRECOVERY(tp->t_flags); 1386 tp->snd_nxt = tp->snd_max; 1387 tp->t_badrxtwin = 0; 1388 break; 1389 } 1390 1391 if (CC_ALGO(tp)->cong_signal != NULL) { 1392 if (th != NULL) 1393 tp->ccv->curack = th->th_ack; 1394 CC_ALGO(tp)->cong_signal(tp->ccv, type); 1395 } 1396 #ifdef NETFLIX_CWV 1397 if (tp->cwv_enabled) { 1398 if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { 1399 tcp_newcwv_enter_recovery(tp); 1400 } 1401 if (type == CC_RTO) { 1402 tcp_newcwv_reset(tp); 1403 } 1404 } 1405 #endif 1406 } 1407 1408 1409 1410 static inline void 1411 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) 1412 { 1413 uint32_t i_cwnd; 1414 1415 INP_WLOCK_ASSERT(tp->t_inpcb); 1416 1417 #ifdef NETFLIX_STATS 1418 TCPSTAT_INC(tcps_idle_restarts); 1419 if (tp->t_state == TCPS_ESTABLISHED) 1420 TCPSTAT_INC(tcps_idle_estrestarts); 1421 #endif 1422 if (CC_ALGO(tp)->after_idle != NULL) 1423 CC_ALGO(tp)->after_idle(tp->ccv); 1424 1425 if (tp->snd_cwnd == 1) 1426 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 1427 else if (V_tcp_initcwnd_segments) 1428 i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), 1429 max(2 * tp->t_maxseg, V_tcp_initcwnd_segments * 1460)); 1430 else if (V_tcp_do_rfc3390) 1431 i_cwnd = min(4 * tp->t_maxseg, 1432 max(2 * tp->t_maxseg, 4380)); 1433 else { 1434 /* Per RFC5681 Section 3.1 */ 1435 if (tp->t_maxseg > 2190) 1436 i_cwnd = 2 * tp->t_maxseg; 1437 else if (tp->t_maxseg > 1095) 1438 i_cwnd = 3 * tp->t_maxseg; 1439 else 1440 i_cwnd = 4 * tp->t_maxseg; 1441 } 1442 if (reduce_largest) { 1443 /* 1444 * Do we reduce the largest cwnd to make 1445 * rack play nice on restart hptsi wise? 1446 */ 1447 if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) 1448 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; 1449 } 1450 /* 1451 * Being idle is no differnt than the initial window. If the cc 1452 * clamps it down below the initial window raise it to the initial 1453 * window. 1454 */ 1455 if (tp->snd_cwnd < i_cwnd) { 1456 tp->snd_cwnd = i_cwnd; 1457 } 1458 } 1459 1460 1461 /* 1462 * Indicate whether this ack should be delayed. We can delay the ack if 1463 * following conditions are met: 1464 * - There is no delayed ack timer in progress. 1465 * - Our last ack wasn't a 0-sized window. We never want to delay 1466 * the ack that opens up a 0-sized window. 1467 * - LRO wasn't used for this segment. We make sure by checking that the 1468 * segment size is not larger than the MSS. 1469 * - Delayed acks are enabled or this is a half-synchronized T/TCP 1470 * connection. 1471 */ 1472 #define DELAY_ACK(tp, tlen) \ 1473 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 1474 ((tp->t_flags & TF_DELACK) == 0) && \ 1475 (tlen <= tp->t_maxseg) && \ 1476 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 1477 1478 static inline void 1479 rack_calc_rwin(struct socket *so, struct tcpcb *tp) 1480 { 1481 int32_t win; 1482 1483 /* 1484 * Calculate amount of space in receive window, and then do TCP 1485 * input processing. Receive window is amount of space in rcv queue, 1486 * but not less than advertised window. 1487 */ 1488 win = sbspace(&so->so_rcv); 1489 if (win < 0) 1490 win = 0; 1491 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1492 } 1493 1494 static void 1495 rack_do_drop(struct mbuf *m, struct tcpcb *tp, int32_t * ti_locked) 1496 { 1497 if (*ti_locked == TI_RLOCKED) { 1498 INP_INFO_RUNLOCK(&V_tcbinfo); 1499 *ti_locked = TI_UNLOCKED; 1500 } 1501 /* 1502 * Drop space held by incoming segment and return. 1503 */ 1504 if (tp != NULL) 1505 INP_WUNLOCK(tp->t_inpcb); 1506 if (m) 1507 m_freem(m); 1508 } 1509 1510 static void 1511 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t rstreason, int32_t tlen) 1512 { 1513 if (*ti_locked == TI_RLOCKED) { 1514 INP_INFO_RUNLOCK(&V_tcbinfo); 1515 *ti_locked = TI_UNLOCKED; 1516 } 1517 if (tp != NULL) { 1518 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1519 INP_WUNLOCK(tp->t_inpcb); 1520 } else 1521 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1522 } 1523 1524 /* 1525 * The value in ret_val informs the caller 1526 * if we dropped the tcb (and lock) or not. 1527 * 1 = we dropped it, 0 = the TCB is still locked 1528 * and valid. 1529 */ 1530 static void 1531 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t * ti_locked, int32_t thflags, int32_t tlen, int32_t * ret_val) 1532 { 1533 /* 1534 * Generate an ACK dropping incoming segment if it occupies sequence 1535 * space, where the ACK reflects our state. 1536 * 1537 * We can now skip the test for the RST flag since all paths to this 1538 * code happen after packets containing RST have been dropped. 1539 * 1540 * In the SYN-RECEIVED state, don't send an ACK unless the segment 1541 * we received passes the SYN-RECEIVED ACK test. If it fails send a 1542 * RST. This breaks the loop in the "LAND" DoS attack, and also 1543 * prevents an ACK storm between two listening ports that have been 1544 * sent forged SYN segments, each with the source address of the 1545 * other. 1546 */ 1547 struct tcp_rack *rack; 1548 1549 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1550 (SEQ_GT(tp->snd_una, th->th_ack) || 1551 SEQ_GT(th->th_ack, tp->snd_max))) { 1552 *ret_val = 1; 1553 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 1554 return; 1555 } else 1556 *ret_val = 0; 1557 if (*ti_locked == TI_RLOCKED) { 1558 INP_INFO_RUNLOCK(&V_tcbinfo); 1559 *ti_locked = TI_UNLOCKED; 1560 } 1561 rack = (struct tcp_rack *)tp->t_fb_ptr; 1562 rack->r_wanted_output++; 1563 tp->t_flags |= TF_ACKNOW; 1564 if (m) 1565 m_freem(m); 1566 } 1567 1568 1569 static int 1570 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t * ti_locked) 1571 { 1572 /* 1573 * RFC5961 Section 3.2 1574 * 1575 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in 1576 * window, we send challenge ACK. 1577 * 1578 * Note: to take into account delayed ACKs, we should test against 1579 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case 1580 * of closed window, not covered by the RFC. 1581 */ 1582 int dropped = 0; 1583 1584 if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && 1585 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 1586 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 1587 1588 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1589 KASSERT(*ti_locked == TI_RLOCKED, 1590 ("%s: TH_RST ti_locked %d, th %p tp %p", 1591 __func__, *ti_locked, th, tp)); 1592 KASSERT(tp->t_state != TCPS_SYN_SENT, 1593 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 1594 __func__, th, tp)); 1595 1596 if (V_tcp_insecure_rst || 1597 (tp->last_ack_sent == th->th_seq) || 1598 (tp->rcv_nxt == th->th_seq) || 1599 ((tp->last_ack_sent - 1) == th->th_seq)) { 1600 TCPSTAT_INC(tcps_drops); 1601 /* Drop the connection. */ 1602 switch (tp->t_state) { 1603 case TCPS_SYN_RECEIVED: 1604 so->so_error = ECONNREFUSED; 1605 goto close; 1606 case TCPS_ESTABLISHED: 1607 case TCPS_FIN_WAIT_1: 1608 case TCPS_FIN_WAIT_2: 1609 case TCPS_CLOSE_WAIT: 1610 case TCPS_CLOSING: 1611 case TCPS_LAST_ACK: 1612 so->so_error = ECONNRESET; 1613 close: 1614 tcp_state_change(tp, TCPS_CLOSED); 1615 /* FALLTHROUGH */ 1616 default: 1617 tp = tcp_close(tp); 1618 } 1619 dropped = 1; 1620 rack_do_drop(m, tp, ti_locked); 1621 } else { 1622 TCPSTAT_INC(tcps_badrst); 1623 /* Send challenge ACK. */ 1624 tcp_respond(tp, mtod(m, void *), th, m, 1625 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 1626 tp->last_ack_sent = tp->rcv_nxt; 1627 } 1628 } else { 1629 m_freem(m); 1630 } 1631 return (dropped); 1632 } 1633 1634 /* 1635 * The value in ret_val informs the caller 1636 * if we dropped the tcb (and lock) or not. 1637 * 1 = we dropped it, 0 = the TCB is still locked 1638 * and valid. 1639 */ 1640 static void 1641 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t * ret_val) 1642 { 1643 KASSERT(*ti_locked == TI_RLOCKED, 1644 ("tcp_do_segment: TH_SYN ti_locked %d", *ti_locked)); 1645 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1646 1647 TCPSTAT_INC(tcps_badsyn); 1648 if (V_tcp_insecure_syn && 1649 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1650 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1651 tp = tcp_drop(tp, ECONNRESET); 1652 *ret_val = 1; 1653 rack_do_drop(m, tp, ti_locked); 1654 } else { 1655 /* Send challenge ACK. */ 1656 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 1657 tp->snd_nxt, TH_ACK); 1658 tp->last_ack_sent = tp->rcv_nxt; 1659 m = NULL; 1660 *ret_val = 0; 1661 rack_do_drop(m, NULL, ti_locked); 1662 } 1663 } 1664 1665 /* 1666 * rack_ts_check returns 1 for you should not proceed. It places 1667 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1668 * that the TCB is unlocked and probably dropped. The 0 indicates the 1669 * TCB is still valid and locked. 1670 */ 1671 static int 1672 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ti_locked, int32_t tlen, int32_t thflags, int32_t * ret_val) 1673 { 1674 1675 /* Check to see if ts_recent is over 24 days old. */ 1676 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 1677 /* 1678 * Invalidate ts_recent. If this segment updates ts_recent, 1679 * the age will be reset later and ts_recent will get a 1680 * valid value. If it does not, setting ts_recent to zero 1681 * will at least satisfy the requirement that zero be placed 1682 * in the timestamp echo reply when ts_recent isn't valid. 1683 * The age isn't reset until we get a valid ts_recent 1684 * because we don't want out-of-order segments to be dropped 1685 * when ts_recent is old. 1686 */ 1687 tp->ts_recent = 0; 1688 } else { 1689 TCPSTAT_INC(tcps_rcvduppack); 1690 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 1691 TCPSTAT_INC(tcps_pawsdrop); 1692 *ret_val = 0; 1693 if (tlen) { 1694 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1695 } else { 1696 rack_do_drop(m, NULL, ti_locked); 1697 } 1698 return (1); 1699 } 1700 return (0); 1701 } 1702 1703 /* 1704 * rack_drop_checks returns 1 for you should not proceed. It places 1705 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1706 * that the TCB is unlocked and probably dropped. The 0 indicates the 1707 * TCB is still valid and locked. 1708 */ 1709 static int 1710 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * ti_locked, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 1711 { 1712 int32_t todrop; 1713 int32_t thflags; 1714 int32_t tlen; 1715 1716 thflags = *thf; 1717 tlen = *tlenp; 1718 todrop = tp->rcv_nxt - th->th_seq; 1719 if (todrop > 0) { 1720 if (thflags & TH_SYN) { 1721 thflags &= ~TH_SYN; 1722 th->th_seq++; 1723 if (th->th_urp > 1) 1724 th->th_urp--; 1725 else 1726 thflags &= ~TH_URG; 1727 todrop--; 1728 } 1729 /* 1730 * Following if statement from Stevens, vol. 2, p. 960. 1731 */ 1732 if (todrop > tlen 1733 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1734 /* 1735 * Any valid FIN must be to the left of the window. 1736 * At this point the FIN must be a duplicate or out 1737 * of sequence; drop it. 1738 */ 1739 thflags &= ~TH_FIN; 1740 /* 1741 * Send an ACK to resynchronize and drop any data. 1742 * But keep on processing for RST or ACK. 1743 */ 1744 tp->t_flags |= TF_ACKNOW; 1745 todrop = tlen; 1746 TCPSTAT_INC(tcps_rcvduppack); 1747 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 1748 } else { 1749 TCPSTAT_INC(tcps_rcvpartduppack); 1750 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 1751 } 1752 *drop_hdrlen += todrop; /* drop from the top afterwards */ 1753 th->th_seq += todrop; 1754 tlen -= todrop; 1755 if (th->th_urp > todrop) 1756 th->th_urp -= todrop; 1757 else { 1758 thflags &= ~TH_URG; 1759 th->th_urp = 0; 1760 } 1761 } 1762 /* 1763 * If segment ends after window, drop trailing data (and PUSH and 1764 * FIN); if nothing left, just ACK. 1765 */ 1766 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 1767 if (todrop > 0) { 1768 TCPSTAT_INC(tcps_rcvpackafterwin); 1769 if (todrop >= tlen) { 1770 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 1771 /* 1772 * If window is closed can only take segments at 1773 * window edge, and have to drop data and PUSH from 1774 * incoming segments. Continue processing, but 1775 * remember to ack. Otherwise, drop segment and 1776 * ack. 1777 */ 1778 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 1779 tp->t_flags |= TF_ACKNOW; 1780 TCPSTAT_INC(tcps_rcvwinprobe); 1781 } else { 1782 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 1783 return (1); 1784 } 1785 } else 1786 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 1787 m_adj(m, -todrop); 1788 tlen -= todrop; 1789 thflags &= ~(TH_PUSH | TH_FIN); 1790 } 1791 *thf = thflags; 1792 *tlenp = tlen; 1793 return (0); 1794 } 1795 1796 static struct rack_sendmap * 1797 rack_find_lowest_rsm(struct tcp_rack *rack) 1798 { 1799 struct rack_sendmap *rsm; 1800 1801 /* 1802 * Walk the time-order transmitted list looking for an rsm that is 1803 * not acked. This will be the one that was sent the longest time 1804 * ago that is still outstanding. 1805 */ 1806 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 1807 if (rsm->r_flags & RACK_ACKED) { 1808 continue; 1809 } 1810 goto finish; 1811 } 1812 finish: 1813 return (rsm); 1814 } 1815 1816 static struct rack_sendmap * 1817 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 1818 { 1819 struct rack_sendmap *prsm; 1820 1821 /* 1822 * Walk the sequence order list backward until we hit and arrive at 1823 * the highest seq not acked. In theory when this is called it 1824 * should be the last segment (which it was not). 1825 */ 1826 counter_u64_add(rack_find_high, 1); 1827 prsm = rsm; 1828 TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) { 1829 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 1830 continue; 1831 } 1832 return (prsm); 1833 } 1834 return (NULL); 1835 } 1836 1837 1838 static uint32_t 1839 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 1840 { 1841 int32_t lro; 1842 uint32_t thresh; 1843 1844 /* 1845 * lro is the flag we use to determine if we have seen reordering. 1846 * If it gets set we have seen reordering. The reorder logic either 1847 * works in one of two ways: 1848 * 1849 * If reorder-fade is configured, then we track the last time we saw 1850 * re-ordering occur. If we reach the point where enough time as 1851 * passed we no longer consider reordering has occuring. 1852 * 1853 * Or if reorder-face is 0, then once we see reordering we consider 1854 * the connection to alway be subject to reordering and just set lro 1855 * to 1. 1856 * 1857 * In the end if lro is non-zero we add the extra time for 1858 * reordering in. 1859 */ 1860 if (srtt == 0) 1861 srtt = 1; 1862 if (rack->r_ctl.rc_reorder_ts) { 1863 if (rack->r_ctl.rc_reorder_fade) { 1864 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 1865 lro = cts - rack->r_ctl.rc_reorder_ts; 1866 if (lro == 0) { 1867 /* 1868 * No time as passed since the last 1869 * reorder, mark it as reordering. 1870 */ 1871 lro = 1; 1872 } 1873 } else { 1874 /* Negative time? */ 1875 lro = 0; 1876 } 1877 if (lro > rack->r_ctl.rc_reorder_fade) { 1878 /* Turn off reordering seen too */ 1879 rack->r_ctl.rc_reorder_ts = 0; 1880 lro = 0; 1881 } 1882 } else { 1883 /* Reodering does not fade */ 1884 lro = 1; 1885 } 1886 } else { 1887 lro = 0; 1888 } 1889 thresh = srtt + rack->r_ctl.rc_pkt_delay; 1890 if (lro) { 1891 /* It must be set, if not you get 1/4 rtt */ 1892 if (rack->r_ctl.rc_reorder_shift) 1893 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 1894 else 1895 thresh += (srtt >> 2); 1896 } else { 1897 thresh += 1; 1898 } 1899 /* We don't let the rack timeout be above a RTO */ 1900 1901 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { 1902 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); 1903 } 1904 /* And we don't want it above the RTO max either */ 1905 if (thresh > rack_rto_max) { 1906 thresh = rack_rto_max; 1907 } 1908 return (thresh); 1909 } 1910 1911 static uint32_t 1912 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 1913 struct rack_sendmap *rsm, uint32_t srtt) 1914 { 1915 struct rack_sendmap *prsm; 1916 uint32_t thresh, len; 1917 int maxseg; 1918 1919 if (srtt == 0) 1920 srtt = 1; 1921 if (rack->r_ctl.rc_tlp_threshold) 1922 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 1923 else 1924 thresh = (srtt * 2); 1925 1926 /* Get the previous sent packet, if any */ 1927 maxseg = tcp_maxseg(tp); 1928 counter_u64_add(rack_enter_tlp_calc, 1); 1929 len = rsm->r_end - rsm->r_start; 1930 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 1931 /* Exactly like the ID */ 1932 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) { 1933 uint32_t alt_thresh; 1934 /* 1935 * Compensate for delayed-ack with the d-ack time. 1936 */ 1937 counter_u64_add(rack_used_tlpmethod, 1); 1938 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1939 if (alt_thresh > thresh) 1940 thresh = alt_thresh; 1941 } 1942 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 1943 /* 2.1 behavior */ 1944 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 1945 if (prsm && (len <= maxseg)) { 1946 /* 1947 * Two packets outstanding, thresh should be (2*srtt) + 1948 * possible inter-packet delay (if any). 1949 */ 1950 uint32_t inter_gap = 0; 1951 int idx, nidx; 1952 1953 counter_u64_add(rack_used_tlpmethod, 1); 1954 idx = rsm->r_rtr_cnt - 1; 1955 nidx = prsm->r_rtr_cnt - 1; 1956 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 1957 /* Yes it was sent later (or at the same time) */ 1958 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 1959 } 1960 thresh += inter_gap; 1961 } else if (len <= maxseg) { 1962 /* 1963 * Possibly compensate for delayed-ack. 1964 */ 1965 uint32_t alt_thresh; 1966 1967 counter_u64_add(rack_used_tlpmethod2, 1); 1968 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1969 if (alt_thresh > thresh) 1970 thresh = alt_thresh; 1971 } 1972 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 1973 /* 2.2 behavior */ 1974 if (len <= maxseg) { 1975 uint32_t alt_thresh; 1976 /* 1977 * Compensate for delayed-ack with the d-ack time. 1978 */ 1979 counter_u64_add(rack_used_tlpmethod, 1); 1980 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 1981 if (alt_thresh > thresh) 1982 thresh = alt_thresh; 1983 } 1984 } 1985 /* Not above an RTO */ 1986 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 1987 thresh = TICKS_2_MSEC(tp->t_rxtcur); 1988 } 1989 /* Not above a RTO max */ 1990 if (thresh > rack_rto_max) { 1991 thresh = rack_rto_max; 1992 } 1993 /* Apply user supplied min TLP */ 1994 if (thresh < rack_tlp_min) { 1995 thresh = rack_tlp_min; 1996 } 1997 return (thresh); 1998 } 1999 2000 static struct rack_sendmap * 2001 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 2002 { 2003 /* 2004 * Check to see that we don't need to fall into recovery. We will 2005 * need to do so if our oldest transmit is past the time we should 2006 * have had an ack. 2007 */ 2008 struct tcp_rack *rack; 2009 struct rack_sendmap *rsm; 2010 int32_t idx; 2011 uint32_t srtt_cur, srtt, thresh; 2012 2013 rack = (struct tcp_rack *)tp->t_fb_ptr; 2014 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 2015 return (NULL); 2016 } 2017 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 2018 srtt = TICKS_2_MSEC(srtt_cur); 2019 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 2020 srtt = rack->rc_rack_rtt; 2021 2022 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2023 if (rsm == NULL) 2024 return (NULL); 2025 2026 if (rsm->r_flags & RACK_ACKED) { 2027 rsm = rack_find_lowest_rsm(rack); 2028 if (rsm == NULL) 2029 return (NULL); 2030 } 2031 idx = rsm->r_rtr_cnt - 1; 2032 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2033 if (tsused < rsm->r_tim_lastsent[idx]) { 2034 return (NULL); 2035 } 2036 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2037 return (NULL); 2038 } 2039 /* Ok if we reach here we are over-due */ 2040 rack->r_ctl.rc_rsm_start = rsm->r_start; 2041 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2042 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2043 rack_cong_signal(tp, NULL, CC_NDUPACK); 2044 return (rsm); 2045 } 2046 2047 static uint32_t 2048 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2049 { 2050 int32_t t; 2051 int32_t tt; 2052 uint32_t ret_val; 2053 2054 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2055 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2056 tcp_persmin, tcp_persmax); 2057 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2058 tp->t_rxtshift++; 2059 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2060 ret_val = (uint32_t)tt; 2061 return (ret_val); 2062 } 2063 2064 static uint32_t 2065 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2066 { 2067 /* 2068 * Start the FR timer, we do this based on getting the first one in 2069 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2070 * events we need to stop the running timer (if its running) before 2071 * starting the new one. 2072 */ 2073 uint32_t thresh, exp, to, srtt, time_since_sent; 2074 uint32_t srtt_cur; 2075 int32_t idx; 2076 int32_t is_tlp_timer = 0; 2077 struct rack_sendmap *rsm; 2078 2079 if (rack->t_timers_stopped) { 2080 /* All timers have been stopped none are to run */ 2081 return (0); 2082 } 2083 if (rack->rc_in_persist) { 2084 /* We can't start any timer in persists */ 2085 return (rack_get_persists_timer_val(tp, rack)); 2086 } 2087 if (tp->t_state < TCPS_ESTABLISHED) 2088 goto activate_rxt; 2089 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2090 if (rsm == NULL) { 2091 /* Nothing on the send map */ 2092 activate_rxt: 2093 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2094 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2095 to = TICKS_2_MSEC(tp->t_rxtcur); 2096 if (to == 0) 2097 to = 1; 2098 return (to); 2099 } 2100 return (0); 2101 } 2102 if (rsm->r_flags & RACK_ACKED) { 2103 rsm = rack_find_lowest_rsm(rack); 2104 if (rsm == NULL) { 2105 /* No lowest? */ 2106 goto activate_rxt; 2107 } 2108 } 2109 /* Convert from ms to usecs */ 2110 if (rsm->r_flags & RACK_SACK_PASSED) { 2111 if ((tp->t_flags & TF_SENTFIN) && 2112 ((tp->snd_max - tp->snd_una) == 1) && 2113 (rsm->r_flags & RACK_HAS_FIN)) { 2114 /* 2115 * We don't start a rack timer if all we have is a 2116 * FIN outstanding. 2117 */ 2118 goto activate_rxt; 2119 } 2120 if (tp->t_srtt) { 2121 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2122 srtt = TICKS_2_MSEC(srtt_cur); 2123 } else 2124 srtt = RACK_INITIAL_RTO; 2125 2126 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2127 idx = rsm->r_rtr_cnt - 1; 2128 exp = rsm->r_tim_lastsent[idx] + thresh; 2129 if (SEQ_GEQ(exp, cts)) { 2130 to = exp - cts; 2131 if (to < rack->r_ctl.rc_min_to) { 2132 to = rack->r_ctl.rc_min_to; 2133 } 2134 } else { 2135 to = rack->r_ctl.rc_min_to; 2136 } 2137 } else { 2138 /* Ok we need to do a TLP not RACK */ 2139 if ((rack->rc_tlp_in_progress != 0) || 2140 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2141 /* 2142 * The previous send was a TLP or a tlp_rtx is in 2143 * process. 2144 */ 2145 goto activate_rxt; 2146 } 2147 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2148 if (rsm == NULL) { 2149 /* We found no rsm to TLP with. */ 2150 goto activate_rxt; 2151 } 2152 if (rsm->r_flags & RACK_HAS_FIN) { 2153 /* If its a FIN we dont do TLP */ 2154 rsm = NULL; 2155 goto activate_rxt; 2156 } 2157 idx = rsm->r_rtr_cnt - 1; 2158 if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) 2159 time_since_sent = cts - rsm->r_tim_lastsent[idx]; 2160 else 2161 time_since_sent = 0; 2162 is_tlp_timer = 1; 2163 if (tp->t_srtt) { 2164 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2165 srtt = TICKS_2_MSEC(srtt_cur); 2166 } else 2167 srtt = RACK_INITIAL_RTO; 2168 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2169 if (thresh > time_since_sent) 2170 to = thresh - time_since_sent; 2171 else 2172 to = rack->r_ctl.rc_min_to; 2173 if (to > TCPTV_REXMTMAX) { 2174 /* 2175 * If the TLP time works out to larger than the max 2176 * RTO lets not do TLP.. just RTO. 2177 */ 2178 goto activate_rxt; 2179 } 2180 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2181 /* 2182 * The tail is no longer the last one I did a probe 2183 * on 2184 */ 2185 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2186 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2187 } 2188 } 2189 if (is_tlp_timer == 0) { 2190 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2191 } else { 2192 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2193 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2194 /* 2195 * We have exceeded how many times we can retran the 2196 * current TLP timer, switch to the RTO timer. 2197 */ 2198 goto activate_rxt; 2199 } else { 2200 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2201 } 2202 } 2203 if (to == 0) 2204 to = 1; 2205 return (to); 2206 } 2207 2208 static void 2209 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2210 { 2211 if (rack->rc_in_persist == 0) { 2212 if (((tp->t_flags & TF_SENTFIN) == 0) && 2213 (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) 2214 /* Must need to send more data to enter persist */ 2215 return; 2216 rack->r_ctl.rc_went_idle_time = cts; 2217 rack_timer_cancel(tp, rack, cts, __LINE__); 2218 tp->t_rxtshift = 0; 2219 rack->rc_in_persist = 1; 2220 } 2221 } 2222 2223 static void 2224 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2225 { 2226 if (rack->rc_inp->inp_in_hpts) { 2227 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2228 rack->r_ctl.rc_hpts_flags = 0; 2229 } 2230 rack->rc_in_persist = 0; 2231 rack->r_ctl.rc_went_idle_time = 0; 2232 tp->t_flags &= ~TF_FORCEDATA; 2233 tp->t_rxtshift = 0; 2234 } 2235 2236 static void 2237 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, 2238 int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) 2239 { 2240 struct inpcb *inp; 2241 uint32_t delayed_ack = 0; 2242 uint32_t hpts_timeout; 2243 uint8_t stopped; 2244 uint32_t left = 0; 2245 2246 inp = tp->t_inpcb; 2247 if (inp->inp_in_hpts) { 2248 /* A previous call is already set up */ 2249 return; 2250 } 2251 if (tp->t_state == TCPS_CLOSED) { 2252 return; 2253 } 2254 stopped = rack->rc_tmr_stopped; 2255 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2256 left = rack->r_ctl.rc_timer_exp - cts; 2257 } 2258 rack->r_ctl.rc_timer_exp = 0; 2259 if (rack->rc_inp->inp_in_hpts == 0) { 2260 rack->r_ctl.rc_hpts_flags = 0; 2261 } 2262 if (slot) { 2263 /* We are hptsi too */ 2264 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2265 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2266 /* 2267 * We are still left on the hpts when the to goes 2268 * it will be for output. 2269 */ 2270 if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) 2271 slot = cts - rack->r_ctl.rc_last_output_to; 2272 else 2273 slot = 1; 2274 } 2275 if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2276 /* No send window.. we must enter persist */ 2277 rack_enter_persist(tp, rack, cts); 2278 } else if ((frm_out_sbavail && 2279 (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && 2280 (tp->snd_wnd < tp->t_maxseg)) && 2281 TCPS_HAVEESTABLISHED(tp->t_state)) { 2282 /* 2283 * If we have no window or we can't send a segment (and have 2284 * data to send.. we cheat here and frm_out_sbavail is 2285 * passed in with the sbavail(sb) only from bbr_output) and 2286 * we are established, then we must enter persits (if not 2287 * already in persits). 2288 */ 2289 rack_enter_persist(tp, rack, cts); 2290 } 2291 hpts_timeout = rack_timer_start(tp, rack, cts); 2292 if (tp->t_flags & TF_DELACK) { 2293 delayed_ack = tcp_delacktime; 2294 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2295 } 2296 if (delayed_ack && ((hpts_timeout == 0) || 2297 (delayed_ack < hpts_timeout))) 2298 hpts_timeout = delayed_ack; 2299 else 2300 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2301 /* 2302 * If no timers are going to run and we will fall off the hptsi 2303 * wheel, we resort to a keep-alive timer if its configured. 2304 */ 2305 if ((hpts_timeout == 0) && 2306 (slot == 0)) { 2307 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2308 (tp->t_state <= TCPS_CLOSING)) { 2309 /* 2310 * Ok we have no timer (persists, rack, tlp, rxt or 2311 * del-ack), we don't have segments being paced. So 2312 * all that is left is the keepalive timer. 2313 */ 2314 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2315 /* Get the established keep-alive time */ 2316 hpts_timeout = TP_KEEPIDLE(tp); 2317 } else { 2318 /* Get the initial setup keep-alive time */ 2319 hpts_timeout = TP_KEEPINIT(tp); 2320 } 2321 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2322 } 2323 } 2324 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2325 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2326 /* 2327 * RACK, TLP, persists and RXT timers all are restartable 2328 * based on actions input .. i.e we received a packet (ack 2329 * or sack) and that changes things (rw, or snd_una etc). 2330 * Thus we can restart them with a new value. For 2331 * keep-alive, delayed_ack we keep track of what was left 2332 * and restart the timer with a smaller value. 2333 */ 2334 if (left < hpts_timeout) 2335 hpts_timeout = left; 2336 } 2337 if (hpts_timeout) { 2338 /* 2339 * Hack alert for now we can't time-out over 2,147,483 2340 * seconds (a bit more than 596 hours), which is probably ok 2341 * :). 2342 */ 2343 if (hpts_timeout > 0x7ffffffe) 2344 hpts_timeout = 0x7ffffffe; 2345 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2346 } 2347 if (slot) { 2348 rack->r_ctl.rc_last_output_to = cts + slot; 2349 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2350 if (rack->rc_inp->inp_in_hpts == 0) 2351 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2352 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2353 } else { 2354 /* 2355 * Arrange for the hpts to kick back in after the 2356 * t-o if the t-o does not cause a send. 2357 */ 2358 if (rack->rc_inp->inp_in_hpts == 0) 2359 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2360 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2361 } 2362 } else if (hpts_timeout) { 2363 if (rack->rc_inp->inp_in_hpts == 0) 2364 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2365 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2366 } else { 2367 /* No timer starting */ 2368 #ifdef INVARIANTS 2369 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2370 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2371 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2372 } 2373 #endif 2374 } 2375 rack->rc_tmr_stopped = 0; 2376 if (slot) 2377 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2378 } 2379 2380 /* 2381 * RACK Timer, here we simply do logging and house keeping. 2382 * the normal rack_output() function will call the 2383 * appropriate thing to check if we need to do a RACK retransmit. 2384 * We return 1, saying don't proceed with rack_output only 2385 * when all timers have been stopped (destroyed PCB?). 2386 */ 2387 static int 2388 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2389 { 2390 /* 2391 * This timer simply provides an internal trigger to send out data. 2392 * The check_recovery_mode call will see if there are needed 2393 * retransmissions, if so we will enter fast-recovery. The output 2394 * call may or may not do the same thing depending on sysctl 2395 * settings. 2396 */ 2397 struct rack_sendmap *rsm; 2398 int32_t recovery; 2399 2400 if (tp->t_timers->tt_flags & TT_STOPPED) { 2401 return (1); 2402 } 2403 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2404 /* Its not time yet */ 2405 return (0); 2406 } 2407 rack_log_to_event(rack, RACK_TO_FRM_RACK); 2408 recovery = IN_RECOVERY(tp->t_flags); 2409 counter_u64_add(rack_to_tot, 1); 2410 if (rack->r_state && (rack->r_state != tp->t_state)) 2411 rack_set_state(tp, rack); 2412 rsm = rack_check_recovery_mode(tp, cts); 2413 if (rsm) { 2414 uint32_t rtt; 2415 2416 rtt = rack->rc_rack_rtt; 2417 if (rtt == 0) 2418 rtt = 1; 2419 if ((recovery == 0) && 2420 (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { 2421 /* 2422 * The rack-timeout that enter's us into recovery 2423 * will force out one MSS and set us up so that we 2424 * can do one more send in 2*rtt (transitioning the 2425 * rack timeout into a rack-tlp). 2426 */ 2427 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2428 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && 2429 ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { 2430 /* 2431 * When a rack timer goes, we have to send at 2432 * least one segment. They will be paced a min of 1ms 2433 * apart via the next rack timer (or further 2434 * if the rack timer dictates it). 2435 */ 2436 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2437 } 2438 } else { 2439 /* This is a case that should happen rarely if ever */ 2440 counter_u64_add(rack_tlp_does_nada, 1); 2441 #ifdef TCP_BLACKBOX 2442 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2443 #endif 2444 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2445 } 2446 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2447 return (0); 2448 } 2449 2450 /* 2451 * TLP Timer, here we simply setup what segment we want to 2452 * have the TLP expire on, the normal rack_output() will then 2453 * send it out. 2454 * 2455 * We return 1, saying don't proceed with rack_output only 2456 * when all timers have been stopped (destroyed PCB?). 2457 */ 2458 static int 2459 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2460 { 2461 /* 2462 * Tail Loss Probe. 2463 */ 2464 struct rack_sendmap *rsm = NULL; 2465 struct socket *so; 2466 uint32_t amm, old_prr_snd = 0; 2467 uint32_t out, avail; 2468 2469 if (tp->t_timers->tt_flags & TT_STOPPED) { 2470 return (1); 2471 } 2472 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2473 /* Its not time yet */ 2474 return (0); 2475 } 2476 if (rack_progress_timeout_check(tp)) { 2477 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2478 return (1); 2479 } 2480 /* 2481 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2482 * need to figure out how to force a full MSS segment out. 2483 */ 2484 rack_log_to_event(rack, RACK_TO_FRM_TLP); 2485 counter_u64_add(rack_tlp_tot, 1); 2486 if (rack->r_state && (rack->r_state != tp->t_state)) 2487 rack_set_state(tp, rack); 2488 so = tp->t_inpcb->inp_socket; 2489 avail = sbavail(&so->so_snd); 2490 out = tp->snd_max - tp->snd_una; 2491 rack->rc_timer_up = 1; 2492 /* 2493 * If we are in recovery we can jazz out a segment if new data is 2494 * present simply by setting rc_prr_sndcnt to a segment. 2495 */ 2496 if ((avail > out) && 2497 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2498 /* New data is available */ 2499 amm = avail - out; 2500 if (amm > tp->t_maxseg) { 2501 amm = tp->t_maxseg; 2502 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 2503 /* not enough to fill a MTU and no-delay is off */ 2504 goto need_retran; 2505 } 2506 if (IN_RECOVERY(tp->t_flags)) { 2507 /* Unlikely */ 2508 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2509 if (out + amm <= tp->snd_wnd) 2510 rack->r_ctl.rc_prr_sndcnt = amm; 2511 else 2512 goto need_retran; 2513 } else { 2514 /* Set the send-new override */ 2515 if (out + amm <= tp->snd_wnd) 2516 rack->r_ctl.rc_tlp_new_data = amm; 2517 else 2518 goto need_retran; 2519 } 2520 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2521 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2522 rack->r_ctl.rc_tlpsend = NULL; 2523 counter_u64_add(rack_tlp_newdata, 1); 2524 goto send; 2525 } 2526 need_retran: 2527 /* 2528 * Ok we need to arrange the last un-acked segment to be re-sent, or 2529 * optionally the first un-acked segment. 2530 */ 2531 if (rack_always_send_oldest) 2532 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2533 else { 2534 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 2535 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2536 rsm = rack_find_high_nonack(rack, rsm); 2537 } 2538 } 2539 if (rsm == NULL) { 2540 counter_u64_add(rack_tlp_does_nada, 1); 2541 #ifdef TCP_BLACKBOX 2542 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2543 #endif 2544 goto out; 2545 } 2546 if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { 2547 /* 2548 * We need to split this the last segment in two. 2549 */ 2550 int32_t idx; 2551 struct rack_sendmap *nrsm; 2552 2553 nrsm = rack_alloc(rack); 2554 if (nrsm == NULL) { 2555 /* 2556 * No memory to split, we will just exit and punt 2557 * off to the RXT timer. 2558 */ 2559 counter_u64_add(rack_tlp_does_nada, 1); 2560 goto out; 2561 } 2562 nrsm->r_start = (rsm->r_end - tp->t_maxseg); 2563 nrsm->r_end = rsm->r_end; 2564 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2565 nrsm->r_flags = rsm->r_flags; 2566 nrsm->r_sndcnt = rsm->r_sndcnt; 2567 nrsm->r_rtr_bytes = 0; 2568 rsm->r_end = nrsm->r_start; 2569 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2570 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2571 } 2572 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 2573 if (rsm->r_in_tmap) { 2574 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2575 nrsm->r_in_tmap = 1; 2576 } 2577 rsm->r_flags &= (~RACK_HAS_FIN); 2578 rsm = nrsm; 2579 } 2580 rack->r_ctl.rc_tlpsend = rsm; 2581 rack->r_ctl.rc_tlp_rtx_out = 1; 2582 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2583 rack->r_ctl.rc_tlp_seg_send_cnt++; 2584 tp->t_rxtshift++; 2585 } else { 2586 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2587 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2588 } 2589 send: 2590 rack->r_ctl.rc_tlp_send_cnt++; 2591 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2592 /* 2593 * Can't [re]/transmit a segment we have not heard from the 2594 * peer in max times. We need the retransmit timer to take 2595 * over. 2596 */ 2597 restore: 2598 rack->r_ctl.rc_tlpsend = NULL; 2599 if (rsm) 2600 rsm->r_flags &= ~RACK_TLP; 2601 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2602 counter_u64_add(rack_tlp_retran_fail, 1); 2603 goto out; 2604 } else if (rsm) { 2605 rsm->r_flags |= RACK_TLP; 2606 } 2607 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2608 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2609 /* 2610 * We don't want to send a single segment more than the max 2611 * either. 2612 */ 2613 goto restore; 2614 } 2615 rack->r_timer_override = 1; 2616 rack->r_tlp_running = 1; 2617 rack->rc_tlp_in_progress = 1; 2618 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2619 return (0); 2620 out: 2621 rack->rc_timer_up = 0; 2622 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2623 return (0); 2624 } 2625 2626 /* 2627 * Delayed ack Timer, here we simply need to setup the 2628 * ACK_NOW flag and remove the DELACK flag. From there 2629 * the output routine will send the ack out. 2630 * 2631 * We only return 1, saying don't proceed, if all timers 2632 * are stopped (destroyed PCB?). 2633 */ 2634 static int 2635 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2636 { 2637 if (tp->t_timers->tt_flags & TT_STOPPED) { 2638 return (1); 2639 } 2640 rack_log_to_event(rack, RACK_TO_FRM_DELACK); 2641 tp->t_flags &= ~TF_DELACK; 2642 tp->t_flags |= TF_ACKNOW; 2643 TCPSTAT_INC(tcps_delack); 2644 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2645 return (0); 2646 } 2647 2648 /* 2649 * Persists timer, here we simply need to setup the 2650 * FORCE-DATA flag the output routine will send 2651 * the one byte send. 2652 * 2653 * We only return 1, saying don't proceed, if all timers 2654 * are stopped (destroyed PCB?). 2655 */ 2656 static int 2657 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2658 { 2659 struct inpcb *inp; 2660 int32_t retval = 0; 2661 2662 inp = tp->t_inpcb; 2663 2664 if (tp->t_timers->tt_flags & TT_STOPPED) { 2665 return (1); 2666 } 2667 if (rack->rc_in_persist == 0) 2668 return (0); 2669 if (rack_progress_timeout_check(tp)) { 2670 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2671 return (1); 2672 } 2673 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2674 /* 2675 * Persistence timer into zero window. Force a byte to be output, if 2676 * possible. 2677 */ 2678 TCPSTAT_INC(tcps_persisttimeo); 2679 /* 2680 * Hack: if the peer is dead/unreachable, we do not time out if the 2681 * window is closed. After a full backoff, drop the connection if 2682 * the idle time (no responses to probes) reaches the maximum 2683 * backoff that we would use if retransmitting. 2684 */ 2685 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2686 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2687 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2688 TCPSTAT_INC(tcps_persistdrop); 2689 retval = 1; 2690 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2691 goto out; 2692 } 2693 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2694 tp->snd_una == tp->snd_max) 2695 rack_exit_persist(tp, rack); 2696 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2697 /* 2698 * If the user has closed the socket then drop a persisting 2699 * connection after a much reduced timeout. 2700 */ 2701 if (tp->t_state > TCPS_CLOSE_WAIT && 2702 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2703 retval = 1; 2704 TCPSTAT_INC(tcps_persistdrop); 2705 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2706 goto out; 2707 } 2708 tp->t_flags |= TF_FORCEDATA; 2709 out: 2710 rack_log_to_event(rack, RACK_TO_FRM_PERSIST); 2711 return (retval); 2712 } 2713 2714 /* 2715 * If a keepalive goes off, we had no other timers 2716 * happening. We always return 1 here since this 2717 * routine either drops the connection or sends 2718 * out a segment with respond. 2719 */ 2720 static int 2721 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2722 { 2723 struct tcptemp *t_template; 2724 struct inpcb *inp; 2725 2726 if (tp->t_timers->tt_flags & TT_STOPPED) { 2727 return (1); 2728 } 2729 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 2730 inp = tp->t_inpcb; 2731 rack_log_to_event(rack, RACK_TO_FRM_KEEP); 2732 /* 2733 * Keep-alive timer went off; send something or drop connection if 2734 * idle for too long. 2735 */ 2736 TCPSTAT_INC(tcps_keeptimeo); 2737 if (tp->t_state < TCPS_ESTABLISHED) 2738 goto dropit; 2739 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2740 tp->t_state <= TCPS_CLOSING) { 2741 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 2742 goto dropit; 2743 /* 2744 * Send a packet designed to force a response if the peer is 2745 * up and reachable: either an ACK if the connection is 2746 * still alive, or an RST if the peer has closed the 2747 * connection due to timeout or reboot. Using sequence 2748 * number tp->snd_una-1 causes the transmitted zero-length 2749 * segment to lie outside the receive window; by the 2750 * protocol spec, this requires the correspondent TCP to 2751 * respond. 2752 */ 2753 TCPSTAT_INC(tcps_keepprobe); 2754 t_template = tcpip_maketemplate(inp); 2755 if (t_template) { 2756 tcp_respond(tp, t_template->tt_ipgen, 2757 &t_template->tt_t, (struct mbuf *)NULL, 2758 tp->rcv_nxt, tp->snd_una - 1, 0); 2759 free(t_template, M_TEMP); 2760 } 2761 } 2762 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 2763 return (1); 2764 dropit: 2765 TCPSTAT_INC(tcps_keepdrops); 2766 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2767 return (1); 2768 } 2769 2770 /* 2771 * Retransmit helper function, clear up all the ack 2772 * flags and take care of important book keeping. 2773 */ 2774 static void 2775 rack_remxt_tmr(struct tcpcb *tp) 2776 { 2777 /* 2778 * The retransmit timer went off, all sack'd blocks must be 2779 * un-acked. 2780 */ 2781 struct rack_sendmap *rsm, *trsm = NULL; 2782 struct tcp_rack *rack; 2783 int32_t cnt = 0; 2784 2785 rack = (struct tcp_rack *)tp->t_fb_ptr; 2786 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 2787 rack_log_to_event(rack, RACK_TO_FRM_TMR); 2788 if (rack->r_state && (rack->r_state != tp->t_state)) 2789 rack_set_state(tp, rack); 2790 /* 2791 * Ideally we would like to be able to 2792 * mark SACK-PASS on anything not acked here. 2793 * However, if we do that we would burst out 2794 * all that data 1ms apart. This would be unwise, 2795 * so for now we will just let the normal rxt timer 2796 * and tlp timer take care of it. 2797 */ 2798 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 2799 if (rsm->r_flags & RACK_ACKED) { 2800 cnt++; 2801 rsm->r_sndcnt = 0; 2802 if (rsm->r_in_tmap == 0) { 2803 /* We must re-add it back to the tlist */ 2804 if (trsm == NULL) { 2805 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 2806 } else { 2807 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 2808 } 2809 rsm->r_in_tmap = 1; 2810 trsm = rsm; 2811 } 2812 } 2813 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 2814 } 2815 /* Clear the count (we just un-acked them) */ 2816 rack->r_ctl.rc_sacked = 0; 2817 /* Clear the tlp rtx mark */ 2818 rack->r_ctl.rc_tlp_rtx_out = 0; 2819 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2820 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); 2821 /* Setup so we send one segment */ 2822 if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) 2823 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2824 rack->r_timer_override = 1; 2825 } 2826 2827 /* 2828 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 2829 * we will setup to retransmit the lowest seq number outstanding. 2830 */ 2831 static int 2832 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2833 { 2834 int32_t rexmt; 2835 struct inpcb *inp; 2836 int32_t retval = 0; 2837 2838 inp = tp->t_inpcb; 2839 if (tp->t_timers->tt_flags & TT_STOPPED) { 2840 return (1); 2841 } 2842 if (rack_progress_timeout_check(tp)) { 2843 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2844 return (1); 2845 } 2846 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 2847 if (TCPS_HAVEESTABLISHED(tp->t_state) && 2848 (tp->snd_una == tp->snd_max)) { 2849 /* Nothing outstanding .. nothing to do */ 2850 return (0); 2851 } 2852 /* 2853 * Retransmission timer went off. Message has not been acked within 2854 * retransmit interval. Back off to a longer retransmit interval 2855 * and retransmit one segment. 2856 */ 2857 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 2858 tp->t_rxtshift = TCP_MAXRXTSHIFT; 2859 TCPSTAT_INC(tcps_timeoutdrop); 2860 retval = 1; 2861 tcp_set_inp_to_drop(rack->rc_inp, 2862 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 2863 goto out; 2864 } 2865 rack_remxt_tmr(tp); 2866 if (tp->t_state == TCPS_SYN_SENT) { 2867 /* 2868 * If the SYN was retransmitted, indicate CWND to be limited 2869 * to 1 segment in cc_conn_init(). 2870 */ 2871 tp->snd_cwnd = 1; 2872 } else if (tp->t_rxtshift == 1) { 2873 /* 2874 * first retransmit; record ssthresh and cwnd so they can be 2875 * recovered if this turns out to be a "bad" retransmit. A 2876 * retransmit is considered "bad" if an ACK for this segment 2877 * is received within RTT/2 interval; the assumption here is 2878 * that the ACK was already in flight. See "On Estimating 2879 * End-to-End Network Path Properties" by Allman and Paxson 2880 * for more details. 2881 */ 2882 tp->snd_cwnd_prev = tp->snd_cwnd; 2883 tp->snd_ssthresh_prev = tp->snd_ssthresh; 2884 tp->snd_recover_prev = tp->snd_recover; 2885 if (IN_FASTRECOVERY(tp->t_flags)) 2886 tp->t_flags |= TF_WASFRECOVERY; 2887 else 2888 tp->t_flags &= ~TF_WASFRECOVERY; 2889 if (IN_CONGRECOVERY(tp->t_flags)) 2890 tp->t_flags |= TF_WASCRECOVERY; 2891 else 2892 tp->t_flags &= ~TF_WASCRECOVERY; 2893 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 2894 tp->t_flags |= TF_PREVVALID; 2895 } else 2896 tp->t_flags &= ~TF_PREVVALID; 2897 TCPSTAT_INC(tcps_rexmttimeo); 2898 if ((tp->t_state == TCPS_SYN_SENT) || 2899 (tp->t_state == TCPS_SYN_RECEIVED)) 2900 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); 2901 else 2902 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 2903 TCPT_RANGESET(tp->t_rxtcur, rexmt, 2904 max(MSEC_2_TICKS(rack_rto_min), rexmt), 2905 MSEC_2_TICKS(rack_rto_max)); 2906 /* 2907 * We enter the path for PLMTUD if connection is established or, if 2908 * connection is FIN_WAIT_1 status, reason for the last is that if 2909 * amount of data we send is very small, we could send it in couple 2910 * of packets and process straight to FIN. In that case we won't 2911 * catch ESTABLISHED state. 2912 */ 2913 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 2914 || (tp->t_state == TCPS_FIN_WAIT_1))) { 2915 #ifdef INET6 2916 int32_t isipv6; 2917 #endif 2918 2919 /* 2920 * Idea here is that at each stage of mtu probe (usually, 2921 * 1448 -> 1188 -> 524) should be given 2 chances to recover 2922 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 2923 * should take care of that. 2924 */ 2925 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 2926 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 2927 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 2928 tp->t_rxtshift % 2 == 0)) { 2929 /* 2930 * Enter Path MTU Black-hole Detection mechanism: - 2931 * Disable Path MTU Discovery (IP "DF" bit). - 2932 * Reduce MTU to lower value than what we negotiated 2933 * with peer. 2934 */ 2935 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 2936 /* Record that we may have found a black hole. */ 2937 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 2938 /* Keep track of previous MSS. */ 2939 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 2940 } 2941 2942 /* 2943 * Reduce the MSS to blackhole value or to the 2944 * default in an attempt to retransmit. 2945 */ 2946 #ifdef INET6 2947 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 2948 if (isipv6 && 2949 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 2950 /* Use the sysctl tuneable blackhole MSS. */ 2951 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 2952 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2953 } else if (isipv6) { 2954 /* Use the default MSS. */ 2955 tp->t_maxseg = V_tcp_v6mssdflt; 2956 /* 2957 * Disable Path MTU Discovery when we switch 2958 * to minmss. 2959 */ 2960 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2961 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2962 } 2963 #endif 2964 #if defined(INET6) && defined(INET) 2965 else 2966 #endif 2967 #ifdef INET 2968 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 2969 /* Use the sysctl tuneable blackhole MSS. */ 2970 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 2971 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 2972 } else { 2973 /* Use the default MSS. */ 2974 tp->t_maxseg = V_tcp_mssdflt; 2975 /* 2976 * Disable Path MTU Discovery when we switch 2977 * to minmss. 2978 */ 2979 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 2980 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 2981 } 2982 #endif 2983 } else { 2984 /* 2985 * If further retransmissions are still unsuccessful 2986 * with a lowered MTU, maybe this isn't a blackhole 2987 * and we restore the previous MSS and blackhole 2988 * detection flags. The limit '6' is determined by 2989 * giving each probe stage (1448, 1188, 524) 2 2990 * chances to recover. 2991 */ 2992 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 2993 (tp->t_rxtshift >= 6)) { 2994 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 2995 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 2996 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 2997 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 2998 } 2999 } 3000 } 3001 /* 3002 * Disable RFC1323 and SACK if we haven't got any response to our 3003 * third SYN to work-around some broken terminal servers (most of 3004 * which have hopefully been retired) that have bad VJ header 3005 * compression code which trashes TCP segments containing 3006 * unknown-to-them TCP options. 3007 */ 3008 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 3009 (tp->t_rxtshift == 3)) 3010 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 3011 /* 3012 * If we backed off this far, our srtt estimate is probably bogus. 3013 * Clobber it so we'll take the next rtt measurement as our srtt; 3014 * move the current srtt into rttvar to keep the current retransmit 3015 * times until then. 3016 */ 3017 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3018 #ifdef INET6 3019 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3020 in6_losing(tp->t_inpcb); 3021 else 3022 #endif 3023 in_losing(tp->t_inpcb); 3024 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3025 tp->t_srtt = 0; 3026 } 3027 if (rack_use_sack_filter) 3028 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3029 tp->snd_recover = tp->snd_max; 3030 tp->t_flags |= TF_ACKNOW; 3031 tp->t_rtttime = 0; 3032 rack_cong_signal(tp, NULL, CC_RTO); 3033 out: 3034 return (retval); 3035 } 3036 3037 static int 3038 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3039 { 3040 int32_t ret = 0; 3041 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3042 3043 if (timers == 0) { 3044 return (0); 3045 } 3046 if (tp->t_state == TCPS_LISTEN) { 3047 /* no timers on listen sockets */ 3048 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3049 return (0); 3050 return (1); 3051 } 3052 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3053 uint32_t left; 3054 3055 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3056 ret = -1; 3057 rack_log_to_processing(rack, cts, ret, 0); 3058 return (0); 3059 } 3060 if (hpts_calling == 0) { 3061 ret = -2; 3062 rack_log_to_processing(rack, cts, ret, 0); 3063 return (0); 3064 } 3065 /* 3066 * Ok our timer went off early and we are not paced false 3067 * alarm, go back to sleep. 3068 */ 3069 ret = -3; 3070 left = rack->r_ctl.rc_timer_exp - cts; 3071 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3072 rack_log_to_processing(rack, cts, ret, left); 3073 rack->rc_last_pto_set = 0; 3074 return (1); 3075 } 3076 rack->rc_tmr_stopped = 0; 3077 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3078 if (timers & PACE_TMR_DELACK) { 3079 ret = rack_timeout_delack(tp, rack, cts); 3080 } else if (timers & PACE_TMR_RACK) { 3081 ret = rack_timeout_rack(tp, rack, cts); 3082 } else if (timers & PACE_TMR_TLP) { 3083 ret = rack_timeout_tlp(tp, rack, cts); 3084 } else if (timers & PACE_TMR_RXT) { 3085 ret = rack_timeout_rxt(tp, rack, cts); 3086 } else if (timers & PACE_TMR_PERSIT) { 3087 ret = rack_timeout_persist(tp, rack, cts); 3088 } else if (timers & PACE_TMR_KEEP) { 3089 ret = rack_timeout_keepalive(tp, rack, cts); 3090 } 3091 rack_log_to_processing(rack, cts, ret, timers); 3092 return (ret); 3093 } 3094 3095 static void 3096 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3097 { 3098 uint8_t hpts_removed = 0; 3099 3100 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3101 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3102 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3103 hpts_removed = 1; 3104 } 3105 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3106 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3107 if (rack->rc_inp->inp_in_hpts && 3108 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3109 /* 3110 * Canceling timer's when we have no output being 3111 * paced. We also must remove ourselves from the 3112 * hpts. 3113 */ 3114 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3115 hpts_removed = 1; 3116 } 3117 rack_log_to_cancel(rack, hpts_removed, line); 3118 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3119 } 3120 } 3121 3122 static void 3123 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3124 { 3125 return; 3126 } 3127 3128 static int 3129 rack_stopall(struct tcpcb *tp) 3130 { 3131 struct tcp_rack *rack; 3132 rack = (struct tcp_rack *)tp->t_fb_ptr; 3133 rack->t_timers_stopped = 1; 3134 return (0); 3135 } 3136 3137 static void 3138 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3139 { 3140 return; 3141 } 3142 3143 static int 3144 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3145 { 3146 return (0); 3147 } 3148 3149 static void 3150 rack_stop_all_timers(struct tcpcb *tp) 3151 { 3152 struct tcp_rack *rack; 3153 3154 /* 3155 * Assure no timers are running. 3156 */ 3157 if (tcp_timer_active(tp, TT_PERSIST)) { 3158 /* We enter in persists, set the flag appropriately */ 3159 rack = (struct tcp_rack *)tp->t_fb_ptr; 3160 rack->rc_in_persist = 1; 3161 } 3162 tcp_timer_suspend(tp, TT_PERSIST); 3163 tcp_timer_suspend(tp, TT_REXMT); 3164 tcp_timer_suspend(tp, TT_KEEP); 3165 tcp_timer_suspend(tp, TT_DELACK); 3166 } 3167 3168 static void 3169 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3170 struct rack_sendmap *rsm, uint32_t ts) 3171 { 3172 int32_t idx; 3173 3174 rsm->r_rtr_cnt++; 3175 rsm->r_sndcnt++; 3176 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3177 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3178 rsm->r_flags |= RACK_OVERMAX; 3179 } 3180 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3181 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3182 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3183 } 3184 idx = rsm->r_rtr_cnt - 1; 3185 rsm->r_tim_lastsent[idx] = ts; 3186 if (rsm->r_flags & RACK_ACKED) { 3187 /* Problably MTU discovery messing with us */ 3188 rsm->r_flags &= ~RACK_ACKED; 3189 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3190 } 3191 if (rsm->r_in_tmap) { 3192 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3193 } 3194 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3195 rsm->r_in_tmap = 1; 3196 if (rsm->r_flags & RACK_SACK_PASSED) { 3197 /* We have retransmitted due to the SACK pass */ 3198 rsm->r_flags &= ~RACK_SACK_PASSED; 3199 rsm->r_flags |= RACK_WAS_SACKPASS; 3200 } 3201 /* Update memory for next rtr */ 3202 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3203 } 3204 3205 3206 static uint32_t 3207 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3208 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) 3209 { 3210 /* 3211 * We (re-)transmitted starting at rsm->r_start for some length 3212 * (possibly less than r_end. 3213 */ 3214 struct rack_sendmap *nrsm; 3215 uint32_t c_end; 3216 int32_t len; 3217 int32_t idx; 3218 3219 len = *lenp; 3220 c_end = rsm->r_start + len; 3221 if (SEQ_GEQ(c_end, rsm->r_end)) { 3222 /* 3223 * We retransmitted the whole piece or more than the whole 3224 * slopping into the next rsm. 3225 */ 3226 rack_update_rsm(tp, rack, rsm, ts); 3227 if (c_end == rsm->r_end) { 3228 *lenp = 0; 3229 return (0); 3230 } else { 3231 int32_t act_len; 3232 3233 /* Hangs over the end return whats left */ 3234 act_len = rsm->r_end - rsm->r_start; 3235 *lenp = (len - act_len); 3236 return (rsm->r_end); 3237 } 3238 /* We don't get out of this block. */ 3239 } 3240 /* 3241 * Here we retransmitted less than the whole thing which means we 3242 * have to split this into what was transmitted and what was not. 3243 */ 3244 nrsm = rack_alloc(rack); 3245 if (nrsm == NULL) { 3246 /* 3247 * We can't get memory, so lets not proceed. 3248 */ 3249 *lenp = 0; 3250 return (0); 3251 } 3252 /* 3253 * So here we are going to take the original rsm and make it what we 3254 * retransmitted. nrsm will be the tail portion we did not 3255 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3256 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3257 * 1, 6 and the new piece will be 6, 11. 3258 */ 3259 nrsm->r_start = c_end; 3260 nrsm->r_end = rsm->r_end; 3261 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3262 nrsm->r_flags = rsm->r_flags; 3263 nrsm->r_sndcnt = rsm->r_sndcnt; 3264 nrsm->r_rtr_bytes = 0; 3265 rsm->r_end = c_end; 3266 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3267 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3268 } 3269 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3270 if (rsm->r_in_tmap) { 3271 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3272 nrsm->r_in_tmap = 1; 3273 } 3274 rsm->r_flags &= (~RACK_HAS_FIN); 3275 rack_update_rsm(tp, rack, rsm, ts); 3276 *lenp = 0; 3277 return (0); 3278 } 3279 3280 3281 static void 3282 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3283 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3284 uint8_t pass, struct rack_sendmap *hintrsm) 3285 { 3286 struct tcp_rack *rack; 3287 struct rack_sendmap *rsm, *nrsm; 3288 register uint32_t snd_max, snd_una; 3289 int32_t idx; 3290 3291 /* 3292 * Add to the RACK log of packets in flight or retransmitted. If 3293 * there is a TS option we will use the TS echoed, if not we will 3294 * grab a TS. 3295 * 3296 * Retransmissions will increment the count and move the ts to its 3297 * proper place. Note that if options do not include TS's then we 3298 * won't be able to effectively use the ACK for an RTT on a retran. 3299 * 3300 * Notes about r_start and r_end. Lets consider a send starting at 3301 * sequence 1 for 10 bytes. In such an example the r_start would be 3302 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3303 * This means that r_end is actually the first sequence for the next 3304 * slot (11). 3305 * 3306 */ 3307 /* 3308 * If err is set what do we do XXXrrs? should we not add the thing? 3309 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3310 * i.e. proceed with add ** do this for now. 3311 */ 3312 INP_WLOCK_ASSERT(tp->t_inpcb); 3313 if (err) 3314 /* 3315 * We don't log errors -- we could but snd_max does not 3316 * advance in this case either. 3317 */ 3318 return; 3319 3320 if (th_flags & TH_RST) { 3321 /* 3322 * We don't log resets and we return immediately from 3323 * sending 3324 */ 3325 return; 3326 } 3327 rack = (struct tcp_rack *)tp->t_fb_ptr; 3328 snd_una = tp->snd_una; 3329 if (SEQ_LEQ((seq_out + len), snd_una)) { 3330 /* Are sending an old segment to induce an ack (keep-alive)? */ 3331 return; 3332 } 3333 if (SEQ_LT(seq_out, snd_una)) { 3334 /* huh? should we panic? */ 3335 uint32_t end; 3336 3337 end = seq_out + len; 3338 seq_out = snd_una; 3339 len = end - seq_out; 3340 } 3341 snd_max = tp->snd_max; 3342 if (th_flags & (TH_SYN | TH_FIN)) { 3343 /* 3344 * The call to rack_log_output is made before bumping 3345 * snd_max. This means we can record one extra byte on a SYN 3346 * or FIN if seq_out is adding more on and a FIN is present 3347 * (and we are not resending). 3348 */ 3349 if (th_flags & TH_SYN) 3350 len++; 3351 if (th_flags & TH_FIN) 3352 len++; 3353 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3354 /* 3355 * The add/update as not been done for the FIN/SYN 3356 * yet. 3357 */ 3358 snd_max = tp->snd_nxt; 3359 } 3360 } 3361 if (len == 0) { 3362 /* We don't log zero window probes */ 3363 return; 3364 } 3365 rack->r_ctl.rc_time_last_sent = ts; 3366 if (IN_RECOVERY(tp->t_flags)) { 3367 rack->r_ctl.rc_prr_out += len; 3368 } 3369 /* First question is it a retransmission? */ 3370 if (seq_out == snd_max) { 3371 again: 3372 rsm = rack_alloc(rack); 3373 if (rsm == NULL) { 3374 /* 3375 * Hmm out of memory and the tcb got destroyed while 3376 * we tried to wait. 3377 */ 3378 #ifdef INVARIANTS 3379 panic("Out of memory when we should not be rack:%p", rack); 3380 #endif 3381 return; 3382 } 3383 if (th_flags & TH_FIN) { 3384 rsm->r_flags = RACK_HAS_FIN; 3385 } else { 3386 rsm->r_flags = 0; 3387 } 3388 rsm->r_tim_lastsent[0] = ts; 3389 rsm->r_rtr_cnt = 1; 3390 rsm->r_rtr_bytes = 0; 3391 if (th_flags & TH_SYN) { 3392 /* The data space is one beyond snd_una */ 3393 rsm->r_start = seq_out + 1; 3394 rsm->r_end = rsm->r_start + (len - 1); 3395 } else { 3396 /* Normal case */ 3397 rsm->r_start = seq_out; 3398 rsm->r_end = rsm->r_start + len; 3399 } 3400 rsm->r_sndcnt = 0; 3401 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 3402 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3403 rsm->r_in_tmap = 1; 3404 return; 3405 } 3406 /* 3407 * If we reach here its a retransmission and we need to find it. 3408 */ 3409 more: 3410 if (hintrsm && (hintrsm->r_start == seq_out)) { 3411 rsm = hintrsm; 3412 hintrsm = NULL; 3413 } else if (rack->r_ctl.rc_next) { 3414 /* We have a hint from a previous run */ 3415 rsm = rack->r_ctl.rc_next; 3416 } else { 3417 /* No hints sorry */ 3418 rsm = NULL; 3419 } 3420 if ((rsm) && (rsm->r_start == seq_out)) { 3421 /* 3422 * We used rc_next or hintrsm to retransmit, hopefully the 3423 * likely case. 3424 */ 3425 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3426 if (len == 0) { 3427 return; 3428 } else { 3429 goto more; 3430 } 3431 } 3432 /* Ok it was not the last pointer go through it the hard way. */ 3433 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3434 if (rsm->r_start == seq_out) { 3435 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3436 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3437 if (len == 0) { 3438 return; 3439 } else { 3440 continue; 3441 } 3442 } 3443 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3444 /* Transmitted within this piece */ 3445 /* 3446 * Ok we must split off the front and then let the 3447 * update do the rest 3448 */ 3449 nrsm = rack_alloc(rack); 3450 if (nrsm == NULL) { 3451 #ifdef INVARIANTS 3452 panic("Ran out of memory that was preallocated? rack:%p", rack); 3453 #endif 3454 rack_update_rsm(tp, rack, rsm, ts); 3455 return; 3456 } 3457 /* 3458 * copy rsm to nrsm and then trim the front of rsm 3459 * to not include this part. 3460 */ 3461 nrsm->r_start = seq_out; 3462 nrsm->r_end = rsm->r_end; 3463 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3464 nrsm->r_flags = rsm->r_flags; 3465 nrsm->r_sndcnt = rsm->r_sndcnt; 3466 nrsm->r_rtr_bytes = 0; 3467 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3468 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3469 } 3470 rsm->r_end = nrsm->r_start; 3471 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3472 if (rsm->r_in_tmap) { 3473 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3474 nrsm->r_in_tmap = 1; 3475 } 3476 rsm->r_flags &= (~RACK_HAS_FIN); 3477 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3478 if (len == 0) { 3479 return; 3480 } 3481 } 3482 } 3483 /* 3484 * Hmm not found in map did they retransmit both old and on into the 3485 * new? 3486 */ 3487 if (seq_out == tp->snd_max) { 3488 goto again; 3489 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3490 #ifdef INVARIANTS 3491 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3492 seq_out, len, tp->snd_una, tp->snd_max); 3493 printf("Starting Dump of all rack entries\n"); 3494 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3495 printf("rsm:%p start:%u end:%u\n", 3496 rsm, rsm->r_start, rsm->r_end); 3497 } 3498 printf("Dump complete\n"); 3499 panic("seq_out not found rack:%p tp:%p", 3500 rack, tp); 3501 #endif 3502 } else { 3503 #ifdef INVARIANTS 3504 /* 3505 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3506 * flag) 3507 */ 3508 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3509 seq_out, len, tp->snd_max, tp); 3510 #endif 3511 } 3512 } 3513 3514 /* 3515 * Record one of the RTT updates from an ack into 3516 * our sample structure. 3517 */ 3518 static void 3519 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3520 { 3521 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3522 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3523 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3524 } 3525 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3526 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3527 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3528 } 3529 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3530 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3531 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3532 } 3533 3534 /* 3535 * Collect new round-trip time estimate 3536 * and update averages and current timeout. 3537 */ 3538 static void 3539 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3540 { 3541 int32_t delta; 3542 uint32_t o_srtt, o_var; 3543 int32_t rtt; 3544 3545 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3546 /* No valid sample */ 3547 return; 3548 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3549 /* We are to use the lowest RTT seen in a single ack */ 3550 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3551 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3552 /* We are to use the highest RTT seen in a single ack */ 3553 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3554 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3555 /* We are to use the average RTT seen in a single ack */ 3556 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3557 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3558 } else { 3559 #ifdef INVARIANTS 3560 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3561 #endif 3562 return; 3563 } 3564 if (rtt == 0) 3565 rtt = 1; 3566 rack_log_rtt_sample(rack, rtt); 3567 o_srtt = tp->t_srtt; 3568 o_var = tp->t_rttvar; 3569 rack = (struct tcp_rack *)tp->t_fb_ptr; 3570 if (tp->t_srtt != 0) { 3571 /* 3572 * srtt is stored as fixed point with 5 bits after the 3573 * binary point (i.e., scaled by 8). The following magic is 3574 * equivalent to the smoothing algorithm in rfc793 with an 3575 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3576 * Adjust rtt to origin 0. 3577 */ 3578 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3579 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3580 3581 tp->t_srtt += delta; 3582 if (tp->t_srtt <= 0) 3583 tp->t_srtt = 1; 3584 3585 /* 3586 * We accumulate a smoothed rtt variance (actually, a 3587 * smoothed mean difference), then set the retransmit timer 3588 * to smoothed rtt + 4 times the smoothed variance. rttvar 3589 * is stored as fixed point with 4 bits after the binary 3590 * point (scaled by 16). The following is equivalent to 3591 * rfc793 smoothing with an alpha of .75 (rttvar = 3592 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3593 * wired-in beta. 3594 */ 3595 if (delta < 0) 3596 delta = -delta; 3597 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3598 tp->t_rttvar += delta; 3599 if (tp->t_rttvar <= 0) 3600 tp->t_rttvar = 1; 3601 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3602 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3603 } else { 3604 /* 3605 * No rtt measurement yet - use the unsmoothed rtt. Set the 3606 * variance to half the rtt (so our first retransmit happens 3607 * at 3*rtt). 3608 */ 3609 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3610 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3611 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3612 } 3613 TCPSTAT_INC(tcps_rttupdated); 3614 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3615 tp->t_rttupdated++; 3616 #ifdef NETFLIX_STATS 3617 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3618 #endif 3619 tp->t_rxtshift = 0; 3620 3621 /* 3622 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3623 * way we do the smoothing, srtt and rttvar will each average +1/2 3624 * tick of bias. When we compute the retransmit timer, we want 1/2 3625 * tick of rounding and 1 extra tick because of +-1/2 tick 3626 * uncertainty in the firing of the timer. The bias will give us 3627 * exactly the 1.5 tick we need. But, because the bias is 3628 * statistical, we have to test that we don't drop below the minimum 3629 * feasible timer (which is 2 ticks). 3630 */ 3631 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3632 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3633 tp->t_softerror = 0; 3634 } 3635 3636 static void 3637 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3638 uint32_t t, uint32_t cts) 3639 { 3640 /* 3641 * For this RSM, we acknowledged the data from a previous 3642 * transmission, not the last one we made. This means we did a false 3643 * retransmit. 3644 */ 3645 struct tcp_rack *rack; 3646 3647 if (rsm->r_flags & RACK_HAS_FIN) { 3648 /* 3649 * The sending of the FIN often is multiple sent when we 3650 * have everything outstanding ack'd. We ignore this case 3651 * since its over now. 3652 */ 3653 return; 3654 } 3655 if (rsm->r_flags & RACK_TLP) { 3656 /* 3657 * We expect TLP's to have this occur. 3658 */ 3659 return; 3660 } 3661 rack = (struct tcp_rack *)tp->t_fb_ptr; 3662 /* should we undo cc changes and exit recovery? */ 3663 if (IN_RECOVERY(tp->t_flags)) { 3664 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3665 /* 3666 * Undo what we ratched down and exit recovery if 3667 * possible 3668 */ 3669 EXIT_RECOVERY(tp->t_flags); 3670 tp->snd_recover = tp->snd_una; 3671 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3672 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3673 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3674 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3675 } 3676 } 3677 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3678 /* 3679 * We retransmitted based on a sack and the earlier 3680 * retransmission ack'd it - re-ordering is occuring. 3681 */ 3682 counter_u64_add(rack_reorder_seen, 1); 3683 rack->r_ctl.rc_reorder_ts = cts; 3684 } 3685 counter_u64_add(rack_badfr, 1); 3686 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3687 } 3688 3689 3690 static int 3691 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3692 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3693 { 3694 int32_t i; 3695 uint32_t t; 3696 3697 if (rsm->r_flags & RACK_ACKED) 3698 /* Already done */ 3699 return (0); 3700 3701 3702 if ((rsm->r_rtr_cnt == 1) || 3703 ((ack_type == CUM_ACKED) && 3704 (to->to_flags & TOF_TS) && 3705 (to->to_tsecr) && 3706 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 3707 ) { 3708 /* 3709 * We will only find a matching timestamp if its cum-acked. 3710 * But if its only one retransmission its for-sure matching 3711 * :-) 3712 */ 3713 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3714 if ((int)t <= 0) 3715 t = 1; 3716 if (!tp->t_rttlow || tp->t_rttlow > t) 3717 tp->t_rttlow = t; 3718 if (!rack->r_ctl.rc_rack_min_rtt || 3719 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3720 rack->r_ctl.rc_rack_min_rtt = t; 3721 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3722 rack->r_ctl.rc_rack_min_rtt = 1; 3723 } 3724 } 3725 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); 3726 if ((rsm->r_flags & RACK_TLP) && 3727 (!IN_RECOVERY(tp->t_flags))) { 3728 /* Segment was a TLP and our retrans matched */ 3729 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 3730 rack->r_ctl.rc_rsm_start = tp->snd_max; 3731 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 3732 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 3733 rack_cong_signal(tp, NULL, CC_NDUPACK); 3734 /* 3735 * When we enter recovery we need to assure 3736 * we send one packet. 3737 */ 3738 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 3739 } else 3740 rack->r_ctl.rc_tlp_rtx_out = 0; 3741 } 3742 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3743 /* New more recent rack_tmit_time */ 3744 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3745 rack->rc_rack_rtt = t; 3746 } 3747 return (1); 3748 } 3749 /* 3750 * We clear the soft/rxtshift since we got an ack. 3751 * There is no assurance we will call the commit() function 3752 * so we need to clear these to avoid incorrect handling. 3753 */ 3754 tp->t_rxtshift = 0; 3755 tp->t_softerror = 0; 3756 if ((to->to_flags & TOF_TS) && 3757 (ack_type == CUM_ACKED) && 3758 (to->to_tsecr) && 3759 ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { 3760 /* 3761 * Now which timestamp does it match? In this block the ACK 3762 * must be coming from a previous transmission. 3763 */ 3764 for (i = 0; i < rsm->r_rtr_cnt; i++) { 3765 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 3766 t = cts - rsm->r_tim_lastsent[i]; 3767 if ((int)t <= 0) 3768 t = 1; 3769 if ((i + 1) < rsm->r_rtr_cnt) { 3770 /* Likely */ 3771 rack_earlier_retran(tp, rsm, t, cts); 3772 } 3773 if (!tp->t_rttlow || tp->t_rttlow > t) 3774 tp->t_rttlow = t; 3775 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3776 rack->r_ctl.rc_rack_min_rtt = t; 3777 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3778 rack->r_ctl.rc_rack_min_rtt = 1; 3779 } 3780 } 3781 /* 3782 * Note the following calls to 3783 * tcp_rack_xmit_timer() are being commented 3784 * out for now. They give us no more accuracy 3785 * and often lead to a wrong choice. We have 3786 * enough samples that have not been 3787 * retransmitted. I leave the commented out 3788 * code in here in case in the future we 3789 * decide to add it back (though I can't forsee 3790 * doing that). That way we will easily see 3791 * where they need to be placed. 3792 */ 3793 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 3794 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3795 /* New more recent rack_tmit_time */ 3796 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3797 rack->rc_rack_rtt = t; 3798 } 3799 return (1); 3800 } 3801 } 3802 goto ts_not_found; 3803 } else { 3804 /* 3805 * Ok its a SACK block that we retransmitted. or a windows 3806 * machine without timestamps. We can tell nothing from the 3807 * time-stamp since its not there or the time the peer last 3808 * recieved a segment that moved forward its cum-ack point. 3809 */ 3810 ts_not_found: 3811 i = rsm->r_rtr_cnt - 1; 3812 t = cts - rsm->r_tim_lastsent[i]; 3813 if ((int)t <= 0) 3814 t = 1; 3815 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3816 /* 3817 * We retransmitted and the ack came back in less 3818 * than the smallest rtt we have observed. We most 3819 * likey did an improper retransmit as outlined in 3820 * 4.2 Step 3 point 2 in the rack-draft. 3821 */ 3822 i = rsm->r_rtr_cnt - 2; 3823 t = cts - rsm->r_tim_lastsent[i]; 3824 rack_earlier_retran(tp, rsm, t, cts); 3825 } else if (rack->r_ctl.rc_rack_min_rtt) { 3826 /* 3827 * We retransmitted it and the retransmit did the 3828 * job. 3829 */ 3830 if (!rack->r_ctl.rc_rack_min_rtt || 3831 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3832 rack->r_ctl.rc_rack_min_rtt = t; 3833 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3834 rack->r_ctl.rc_rack_min_rtt = 1; 3835 } 3836 } 3837 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 3838 /* New more recent rack_tmit_time */ 3839 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 3840 rack->rc_rack_rtt = t; 3841 } 3842 return (1); 3843 } 3844 } 3845 return (0); 3846 } 3847 3848 /* 3849 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 3850 */ 3851 static void 3852 rack_log_sack_passed(struct tcpcb *tp, 3853 struct tcp_rack *rack, struct rack_sendmap *rsm) 3854 { 3855 struct rack_sendmap *nrsm; 3856 uint32_t ts; 3857 int32_t idx; 3858 3859 idx = rsm->r_rtr_cnt - 1; 3860 ts = rsm->r_tim_lastsent[idx]; 3861 nrsm = rsm; 3862 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 3863 rack_head, r_tnext) { 3864 if (nrsm == rsm) { 3865 /* Skip orginal segment he is acked */ 3866 continue; 3867 } 3868 if (nrsm->r_flags & RACK_ACKED) { 3869 /* Skip ack'd segments */ 3870 continue; 3871 } 3872 idx = nrsm->r_rtr_cnt - 1; 3873 if (ts == nrsm->r_tim_lastsent[idx]) { 3874 /* 3875 * For this case lets use seq no, if we sent in a 3876 * big block (TSO) we would have a bunch of segments 3877 * sent at the same time. 3878 * 3879 * We would only get a report if its SEQ is earlier. 3880 * If we have done multiple retransmits the times 3881 * would not be equal. 3882 */ 3883 if (SEQ_LT(nrsm->r_start, rsm->r_start)) { 3884 nrsm->r_flags |= RACK_SACK_PASSED; 3885 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3886 } 3887 } else { 3888 /* 3889 * Here they were sent at different times, not a big 3890 * block. Since we transmitted this one later and 3891 * see it sack'd then this must also be missing (or 3892 * we would have gotten a sack block for it) 3893 */ 3894 nrsm->r_flags |= RACK_SACK_PASSED; 3895 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3896 } 3897 } 3898 } 3899 3900 static uint32_t 3901 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 3902 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) 3903 { 3904 int32_t idx; 3905 int32_t times = 0; 3906 uint32_t start, end, changed = 0; 3907 struct rack_sendmap *rsm, *nrsm; 3908 int32_t used_ref = 1; 3909 3910 start = sack->start; 3911 end = sack->end; 3912 rsm = *prsm; 3913 if (rsm && SEQ_LT(start, rsm->r_start)) { 3914 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { 3915 if (SEQ_GEQ(start, rsm->r_start) && 3916 SEQ_LT(start, rsm->r_end)) { 3917 goto do_rest_ofb; 3918 } 3919 } 3920 } 3921 if (rsm == NULL) { 3922 start_at_beginning: 3923 rsm = NULL; 3924 used_ref = 0; 3925 } 3926 /* First lets locate the block where this guy is */ 3927 TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { 3928 if (SEQ_GEQ(start, rsm->r_start) && 3929 SEQ_LT(start, rsm->r_end)) { 3930 break; 3931 } 3932 } 3933 do_rest_ofb: 3934 if (rsm == NULL) { 3935 /* 3936 * This happens when we get duplicate sack blocks with the 3937 * same end. For example SACK 4: 100 SACK 3: 100 The sort 3938 * will not change there location so we would just start at 3939 * the end of the first one and get lost. 3940 */ 3941 if (tp->t_flags & TF_SENTFIN) { 3942 /* 3943 * Check to see if we have not logged the FIN that 3944 * went out. 3945 */ 3946 nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 3947 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 3948 /* 3949 * Ok we did not get the FIN logged. 3950 */ 3951 nrsm->r_end++; 3952 rsm = nrsm; 3953 goto do_rest_ofb; 3954 } 3955 } 3956 if (times == 1) { 3957 #ifdef INVARIANTS 3958 panic("tp:%p rack:%p sack:%p to:%p prsm:%p", 3959 tp, rack, sack, to, prsm); 3960 #else 3961 goto out; 3962 #endif 3963 } 3964 times++; 3965 counter_u64_add(rack_sack_proc_restart, 1); 3966 goto start_at_beginning; 3967 } 3968 /* Ok we have an ACK for some piece of rsm */ 3969 if (rsm->r_start != start) { 3970 /* 3971 * Need to split this in two pieces the before and after. 3972 */ 3973 nrsm = rack_alloc(rack); 3974 if (nrsm == NULL) { 3975 /* 3976 * failed XXXrrs what can we do but loose the sack 3977 * info? 3978 */ 3979 goto out; 3980 } 3981 nrsm->r_start = start; 3982 nrsm->r_rtr_bytes = 0; 3983 nrsm->r_end = rsm->r_end; 3984 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3985 nrsm->r_flags = rsm->r_flags; 3986 nrsm->r_sndcnt = rsm->r_sndcnt; 3987 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3988 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3989 } 3990 rsm->r_end = nrsm->r_start; 3991 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3992 if (rsm->r_in_tmap) { 3993 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3994 nrsm->r_in_tmap = 1; 3995 } 3996 rsm->r_flags &= (~RACK_HAS_FIN); 3997 rsm = nrsm; 3998 } 3999 if (SEQ_GEQ(end, rsm->r_end)) { 4000 /* 4001 * The end of this block is either beyond this guy or right 4002 * at this guy. 4003 */ 4004 4005 if ((rsm->r_flags & RACK_ACKED) == 0) { 4006 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4007 changed += (rsm->r_end - rsm->r_start); 4008 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4009 rack_log_sack_passed(tp, rack, rsm); 4010 /* Is Reordering occuring? */ 4011 if (rsm->r_flags & RACK_SACK_PASSED) { 4012 counter_u64_add(rack_reorder_seen, 1); 4013 rack->r_ctl.rc_reorder_ts = cts; 4014 } 4015 rsm->r_flags |= RACK_ACKED; 4016 rsm->r_flags &= ~RACK_TLP; 4017 if (rsm->r_in_tmap) { 4018 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4019 rsm->r_in_tmap = 0; 4020 } 4021 } 4022 if (end == rsm->r_end) { 4023 /* This block only - done */ 4024 goto out; 4025 } 4026 /* There is more not coverend by this rsm move on */ 4027 start = rsm->r_end; 4028 nrsm = TAILQ_NEXT(rsm, r_next); 4029 rsm = nrsm; 4030 times = 0; 4031 goto do_rest_ofb; 4032 } 4033 /* Ok we need to split off this one at the tail */ 4034 nrsm = rack_alloc(rack); 4035 if (nrsm == NULL) { 4036 /* failed rrs what can we do but loose the sack info? */ 4037 goto out; 4038 } 4039 /* Clone it */ 4040 nrsm->r_start = end; 4041 nrsm->r_end = rsm->r_end; 4042 nrsm->r_rtr_bytes = 0; 4043 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4044 nrsm->r_flags = rsm->r_flags; 4045 nrsm->r_sndcnt = rsm->r_sndcnt; 4046 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4047 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4048 } 4049 /* The sack block does not cover this guy fully */ 4050 rsm->r_flags &= (~RACK_HAS_FIN); 4051 rsm->r_end = end; 4052 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4053 if (rsm->r_in_tmap) { 4054 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4055 nrsm->r_in_tmap = 1; 4056 } 4057 if (rsm->r_flags & RACK_ACKED) { 4058 /* Been here done that */ 4059 goto out; 4060 } 4061 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4062 changed += (rsm->r_end - rsm->r_start); 4063 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4064 rack_log_sack_passed(tp, rack, rsm); 4065 /* Is Reordering occuring? */ 4066 if (rsm->r_flags & RACK_SACK_PASSED) { 4067 counter_u64_add(rack_reorder_seen, 1); 4068 rack->r_ctl.rc_reorder_ts = cts; 4069 } 4070 rsm->r_flags |= RACK_ACKED; 4071 rsm->r_flags &= ~RACK_TLP; 4072 if (rsm->r_in_tmap) { 4073 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4074 rsm->r_in_tmap = 0; 4075 } 4076 out: 4077 if (used_ref == 0) { 4078 counter_u64_add(rack_sack_proc_all, 1); 4079 } else { 4080 counter_u64_add(rack_sack_proc_short, 1); 4081 } 4082 /* Save off where we last were */ 4083 if (rsm) 4084 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 4085 else 4086 rack->r_ctl.rc_sacklast = NULL; 4087 *prsm = rsm; 4088 return (changed); 4089 } 4090 4091 static void inline 4092 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4093 { 4094 struct rack_sendmap *tmap; 4095 4096 tmap = NULL; 4097 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4098 /* Its no longer sacked, mark it so */ 4099 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4100 #ifdef INVARIANTS 4101 if (rsm->r_in_tmap) { 4102 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4103 rack, rsm, rsm->r_flags); 4104 } 4105 #endif 4106 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4107 /* Rebuild it into our tmap */ 4108 if (tmap == NULL) { 4109 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4110 tmap = rsm; 4111 } else { 4112 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4113 tmap = rsm; 4114 } 4115 tmap->r_in_tmap = 1; 4116 rsm = TAILQ_NEXT(rsm, r_next); 4117 } 4118 /* 4119 * Now lets possibly clear the sack filter so we start 4120 * recognizing sacks that cover this area. 4121 */ 4122 if (rack_use_sack_filter) 4123 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4124 4125 } 4126 4127 static void 4128 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4129 { 4130 uint32_t changed, last_seq, entered_recovery = 0; 4131 struct tcp_rack *rack; 4132 struct rack_sendmap *rsm; 4133 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4134 register uint32_t th_ack; 4135 int32_t i, j, k, num_sack_blks = 0; 4136 uint32_t cts, acked, ack_point, sack_changed = 0; 4137 4138 INP_WLOCK_ASSERT(tp->t_inpcb); 4139 if (th->th_flags & TH_RST) { 4140 /* We don't log resets */ 4141 return; 4142 } 4143 rack = (struct tcp_rack *)tp->t_fb_ptr; 4144 cts = tcp_ts_getticks(); 4145 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4146 changed = 0; 4147 th_ack = th->th_ack; 4148 4149 if (SEQ_GT(th_ack, tp->snd_una)) { 4150 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4151 tp->t_acktime = ticks; 4152 } 4153 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4154 changed = th_ack - rsm->r_start; 4155 if (changed) { 4156 /* 4157 * The ACK point is advancing to th_ack, we must drop off 4158 * the packets in the rack log and calculate any eligble 4159 * RTT's. 4160 */ 4161 rack->r_wanted_output++; 4162 more: 4163 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4164 if (rsm == NULL) { 4165 if ((th_ack - 1) == tp->iss) { 4166 /* 4167 * For the SYN incoming case we will not 4168 * have called tcp_output for the sending of 4169 * the SYN, so there will be no map. All 4170 * other cases should probably be a panic. 4171 */ 4172 goto proc_sack; 4173 } 4174 if (tp->t_flags & TF_SENTFIN) { 4175 /* if we send a FIN we will not hav a map */ 4176 goto proc_sack; 4177 } 4178 #ifdef INVARIANTS 4179 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4180 tp, 4181 th, tp->t_state, rack, 4182 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4183 #endif 4184 goto proc_sack; 4185 } 4186 if (SEQ_LT(th_ack, rsm->r_start)) { 4187 /* Huh map is missing this */ 4188 #ifdef INVARIANTS 4189 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 4190 rsm->r_start, 4191 th_ack, tp->t_state, rack->r_state); 4192 #endif 4193 goto proc_sack; 4194 } 4195 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4196 /* Now do we consume the whole thing? */ 4197 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4198 /* Its all consumed. */ 4199 uint32_t left; 4200 4201 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4202 rsm->r_rtr_bytes = 0; 4203 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 4204 if (rsm->r_in_tmap) { 4205 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4206 rsm->r_in_tmap = 0; 4207 } 4208 if (rack->r_ctl.rc_next == rsm) { 4209 /* scoot along the marker */ 4210 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); 4211 } 4212 if (rsm->r_flags & RACK_ACKED) { 4213 /* 4214 * It was acked on the scoreboard -- remove 4215 * it from total 4216 */ 4217 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4218 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4219 /* 4220 * There are acked segments ACKED on the 4221 * scoreboard further up. We are seeing 4222 * reordering. 4223 */ 4224 counter_u64_add(rack_reorder_seen, 1); 4225 rsm->r_flags |= RACK_ACKED; 4226 rack->r_ctl.rc_reorder_ts = cts; 4227 } 4228 left = th_ack - rsm->r_end; 4229 if (rsm->r_rtr_cnt > 1) { 4230 /* 4231 * Technically we should make r_rtr_cnt be 4232 * monotonicly increasing and just mod it to 4233 * the timestamp it is replacing.. that way 4234 * we would have the last 3 retransmits. Now 4235 * rc_loss_count will be wrong if we 4236 * retransmit something more than 2 times in 4237 * recovery :( 4238 */ 4239 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4240 } 4241 /* Free back to zone */ 4242 rack_free(rack, rsm); 4243 if (left) { 4244 goto more; 4245 } 4246 goto proc_sack; 4247 } 4248 if (rsm->r_flags & RACK_ACKED) { 4249 /* 4250 * It was acked on the scoreboard -- remove it from 4251 * total for the part being cum-acked. 4252 */ 4253 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4254 } 4255 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4256 rsm->r_rtr_bytes = 0; 4257 rsm->r_start = th_ack; 4258 } 4259 proc_sack: 4260 /* Check for reneging */ 4261 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4262 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4263 /* 4264 * The peer has moved snd_una up to 4265 * the edge of this send, i.e. one 4266 * that it had previously acked. The only 4267 * way that can be true if the peer threw 4268 * away data (space issues) that it had 4269 * previously sacked (else it would have 4270 * given us snd_una up to (rsm->r_end). 4271 * We need to undo the acked markings here. 4272 * 4273 * Note we have to look to make sure th_ack is 4274 * our rsm->r_start in case we get an old ack 4275 * where th_ack is behind snd_una. 4276 */ 4277 rack_peer_reneges(rack, rsm, th->th_ack); 4278 } 4279 if ((to->to_flags & TOF_SACK) == 0) { 4280 /* We are done nothing left to log */ 4281 goto out; 4282 } 4283 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4284 if (rsm) { 4285 last_seq = rsm->r_end; 4286 } else { 4287 last_seq = tp->snd_max; 4288 } 4289 /* Sack block processing */ 4290 if (SEQ_GT(th_ack, tp->snd_una)) 4291 ack_point = th_ack; 4292 else 4293 ack_point = tp->snd_una; 4294 for (i = 0; i < to->to_nsacks; i++) { 4295 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4296 &sack, sizeof(sack)); 4297 sack.start = ntohl(sack.start); 4298 sack.end = ntohl(sack.end); 4299 if (SEQ_GT(sack.end, sack.start) && 4300 SEQ_GT(sack.start, ack_point) && 4301 SEQ_LT(sack.start, tp->snd_max) && 4302 SEQ_GT(sack.end, ack_point) && 4303 SEQ_LEQ(sack.end, tp->snd_max)) { 4304 if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && 4305 (SEQ_LT(sack.end, last_seq)) && 4306 ((sack.end - sack.start) < (tp->t_maxseg / 8))) { 4307 /* 4308 * Not the last piece and its smaller than 4309 * 1/8th of a MSS. We ignore this. 4310 */ 4311 counter_u64_add(rack_runt_sacks, 1); 4312 continue; 4313 } 4314 sack_blocks[num_sack_blks] = sack; 4315 num_sack_blks++; 4316 #ifdef NETFLIX_STATS 4317 } else if (SEQ_LEQ(sack.start, th_ack) && 4318 SEQ_LEQ(sack.end, th_ack)) { 4319 /* 4320 * Its a D-SACK block. 4321 */ 4322 tcp_record_dsack(sack.start, sack.end); 4323 #endif 4324 } 4325 4326 } 4327 if (num_sack_blks == 0) 4328 goto out; 4329 /* 4330 * Sort the SACK blocks so we can update the rack scoreboard with 4331 * just one pass. 4332 */ 4333 if (rack_use_sack_filter) { 4334 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); 4335 } 4336 if (num_sack_blks < 2) { 4337 goto do_sack_work; 4338 } 4339 /* Sort the sacks */ 4340 for (i = 0; i < num_sack_blks; i++) { 4341 for (j = i + 1; j < num_sack_blks; j++) { 4342 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4343 sack = sack_blocks[i]; 4344 sack_blocks[i] = sack_blocks[j]; 4345 sack_blocks[j] = sack; 4346 } 4347 } 4348 } 4349 /* 4350 * Now are any of the sack block ends the same (yes some 4351 * implememtations send these)? 4352 */ 4353 again: 4354 if (num_sack_blks > 1) { 4355 for (i = 0; i < num_sack_blks; i++) { 4356 for (j = i + 1; j < num_sack_blks; j++) { 4357 if (sack_blocks[i].end == sack_blocks[j].end) { 4358 /* 4359 * Ok these two have the same end we 4360 * want the smallest end and then 4361 * throw away the larger and start 4362 * again. 4363 */ 4364 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4365 /* 4366 * The second block covers 4367 * more area use that 4368 */ 4369 sack_blocks[i].start = sack_blocks[j].start; 4370 } 4371 /* 4372 * Now collapse out the dup-sack and 4373 * lower the count 4374 */ 4375 for (k = (j + 1); k < num_sack_blks; k++) { 4376 sack_blocks[j].start = sack_blocks[k].start; 4377 sack_blocks[j].end = sack_blocks[k].end; 4378 j++; 4379 } 4380 num_sack_blks--; 4381 goto again; 4382 } 4383 } 4384 } 4385 } 4386 do_sack_work: 4387 rsm = rack->r_ctl.rc_sacklast; 4388 for (i = 0; i < num_sack_blks; i++) { 4389 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); 4390 if (acked) { 4391 rack->r_wanted_output++; 4392 changed += acked; 4393 sack_changed += acked; 4394 } 4395 } 4396 out: 4397 if (changed) { 4398 /* Something changed cancel the rack timer */ 4399 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4400 } 4401 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 4402 /* 4403 * Ok we have a high probability that we need to go in to 4404 * recovery since we have data sack'd 4405 */ 4406 struct rack_sendmap *rsm; 4407 uint32_t tsused; 4408 4409 tsused = tcp_ts_getticks(); 4410 rsm = tcp_rack_output(tp, rack, tsused); 4411 if (rsm) { 4412 /* Enter recovery */ 4413 rack->r_ctl.rc_rsm_start = rsm->r_start; 4414 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4415 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4416 entered_recovery = 1; 4417 rack_cong_signal(tp, NULL, CC_NDUPACK); 4418 /* 4419 * When we enter recovery we need to assure we send 4420 * one packet. 4421 */ 4422 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 4423 rack->r_timer_override = 1; 4424 } 4425 } 4426 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 4427 /* Deal with changed an PRR here (in recovery only) */ 4428 uint32_t pipe, snd_una; 4429 4430 rack->r_ctl.rc_prr_delivered += changed; 4431 /* Compute prr_sndcnt */ 4432 if (SEQ_GT(tp->snd_una, th_ack)) { 4433 snd_una = tp->snd_una; 4434 } else { 4435 snd_una = th_ack; 4436 } 4437 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 4438 if (pipe > tp->snd_ssthresh) { 4439 long sndcnt; 4440 4441 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 4442 if (rack->r_ctl.rc_prr_recovery_fs > 0) 4443 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 4444 else { 4445 rack->r_ctl.rc_prr_sndcnt = 0; 4446 sndcnt = 0; 4447 } 4448 sndcnt++; 4449 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 4450 sndcnt -= rack->r_ctl.rc_prr_out; 4451 else 4452 sndcnt = 0; 4453 rack->r_ctl.rc_prr_sndcnt = sndcnt; 4454 } else { 4455 uint32_t limit; 4456 4457 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 4458 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 4459 else 4460 limit = 0; 4461 if (changed > limit) 4462 limit = changed; 4463 limit += tp->t_maxseg; 4464 if (tp->snd_ssthresh > pipe) { 4465 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 4466 } else { 4467 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 4468 } 4469 } 4470 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { 4471 rack->r_timer_override = 1; 4472 } 4473 } 4474 } 4475 4476 /* 4477 * Return value of 1, we do not need to call rack_process_data(). 4478 * return value of 0, rack_process_data can be called. 4479 * For ret_val if its 0 the TCP is locked, if its non-zero 4480 * its unlocked and probably unsafe to touch the TCB. 4481 */ 4482 static int 4483 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 4484 struct tcpcb *tp, struct tcpopt *to, 4485 int32_t * ti_locked, uint32_t tiwin, int32_t tlen, 4486 int32_t * ofia, int32_t thflags, int32_t * ret_val) 4487 { 4488 int32_t ourfinisacked = 0; 4489 int32_t nsegs, acked_amount; 4490 int32_t acked; 4491 struct mbuf *mfree; 4492 struct tcp_rack *rack; 4493 int32_t recovery = 0; 4494 4495 rack = (struct tcp_rack *)tp->t_fb_ptr; 4496 if (SEQ_GT(th->th_ack, tp->snd_max)) { 4497 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, ret_val); 4498 return (1); 4499 } 4500 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 4501 rack_log_ack(tp, to, th); 4502 } 4503 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 4504 /* 4505 * Old ack, behind (or duplicate to) the last one rcv'd 4506 * Note: Should mark reordering is occuring! We should also 4507 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 4508 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 4509 * retran and> ack 3 4510 */ 4511 return (0); 4512 } 4513 /* 4514 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 4515 * something we sent. 4516 */ 4517 if (tp->t_flags & TF_NEEDSYN) { 4518 /* 4519 * T/TCP: Connection was half-synchronized, and our SYN has 4520 * been ACK'd (so connection is now fully synchronized). Go 4521 * to non-starred state, increment snd_una for ACK of SYN, 4522 * and check if we can do window scaling. 4523 */ 4524 tp->t_flags &= ~TF_NEEDSYN; 4525 tp->snd_una++; 4526 /* Do window scaling? */ 4527 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 4528 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 4529 tp->rcv_scale = tp->request_r_scale; 4530 /* Send window already scaled. */ 4531 } 4532 } 4533 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4534 INP_WLOCK_ASSERT(tp->t_inpcb); 4535 4536 acked = BYTES_THIS_ACK(tp, th); 4537 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 4538 TCPSTAT_ADD(tcps_rcvackbyte, acked); 4539 4540 /* 4541 * If we just performed our first retransmit, and the ACK arrives 4542 * within our recovery window, then it was a mistake to do the 4543 * retransmit in the first place. Recover our original cwnd and 4544 * ssthresh, and proceed to transmit where we left off. 4545 */ 4546 if (tp->t_flags & TF_PREVVALID) { 4547 tp->t_flags &= ~TF_PREVVALID; 4548 if (tp->t_rxtshift == 1 && 4549 (int)(ticks - tp->t_badrxtwin) < 0) 4550 rack_cong_signal(tp, th, CC_RTO_ERR); 4551 } 4552 /* 4553 * If we have a timestamp reply, update smoothed round trip time. If 4554 * no timestamp is present but transmit timer is running and timed 4555 * sequence number was acked, update smoothed round trip time. Since 4556 * we now have an rtt measurement, cancel the timer backoff (cf., 4557 * Phil Karn's retransmit alg.). Recompute the initial retransmit 4558 * timer. 4559 * 4560 * Some boxes send broken timestamp replies during the SYN+ACK 4561 * phase, ignore timestamps of 0 or we could calculate a huge RTT 4562 * and blow up the retransmit timer. 4563 */ 4564 /* 4565 * If all outstanding data is acked, stop retransmit timer and 4566 * remember to restart (more output or persist). If there is more 4567 * data to be acked, restart retransmit timer, using current 4568 * (possibly backed-off) value. 4569 */ 4570 if (th->th_ack == tp->snd_max) { 4571 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4572 rack->r_wanted_output++; 4573 } 4574 /* 4575 * If no data (only SYN) was ACK'd, skip rest of ACK processing. 4576 */ 4577 if (acked == 0) { 4578 if (ofia) 4579 *ofia = ourfinisacked; 4580 return (0); 4581 } 4582 if (rack->r_ctl.rc_early_recovery) { 4583 if (IN_FASTRECOVERY(tp->t_flags)) { 4584 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4585 tcp_rack_partialack(tp, th); 4586 } else { 4587 rack_post_recovery(tp, th); 4588 recovery = 1; 4589 } 4590 } 4591 } 4592 /* 4593 * Let the congestion control algorithm update congestion control 4594 * related information. This typically means increasing the 4595 * congestion window. 4596 */ 4597 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 4598 SOCKBUF_LOCK(&so->so_snd); 4599 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 4600 tp->snd_wnd -= acked_amount; 4601 mfree = sbcut_locked(&so->so_snd, acked_amount); 4602 if ((sbused(&so->so_snd) == 0) && 4603 (acked > acked_amount) && 4604 (tp->t_state >= TCPS_FIN_WAIT_1)) { 4605 ourfinisacked = 1; 4606 } 4607 /* NB: sowwakeup_locked() does an implicit unlock. */ 4608 sowwakeup_locked(so); 4609 m_freem(mfree); 4610 if (rack->r_ctl.rc_early_recovery == 0) { 4611 if (IN_FASTRECOVERY(tp->t_flags)) { 4612 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 4613 tcp_rack_partialack(tp, th); 4614 } else { 4615 rack_post_recovery(tp, th); 4616 } 4617 } 4618 } 4619 tp->snd_una = th->th_ack; 4620 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 4621 tp->snd_recover = tp->snd_una; 4622 4623 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 4624 tp->snd_nxt = tp->snd_una; 4625 } 4626 if (tp->snd_una == tp->snd_max) { 4627 /* Nothing left outstanding */ 4628 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 4629 tp->t_acktime = 0; 4630 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4631 /* Set need output so persist might get set */ 4632 rack->r_wanted_output++; 4633 if (rack_use_sack_filter) 4634 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 4635 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 4636 (sbavail(&so->so_snd) == 0) && 4637 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 4638 /* 4639 * The socket was gone and the 4640 * peer sent data, time to 4641 * reset him. 4642 */ 4643 *ret_val = 1; 4644 tp = tcp_close(tp); 4645 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, tlen); 4646 return (1); 4647 } 4648 } 4649 if (ofia) 4650 *ofia = ourfinisacked; 4651 return (0); 4652 } 4653 4654 4655 /* 4656 * Return value of 1, the TCB is unlocked and most 4657 * likely gone, return value of 0, the TCP is still 4658 * locked. 4659 */ 4660 static int 4661 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 4662 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 4663 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 4664 { 4665 /* 4666 * Update window information. Don't look at window if no ACK: TAC's 4667 * send garbage on first SYN. 4668 */ 4669 int32_t nsegs; 4670 int32_t tfo_syn; 4671 struct tcp_rack *rack; 4672 4673 rack = (struct tcp_rack *)tp->t_fb_ptr; 4674 INP_WLOCK_ASSERT(tp->t_inpcb); 4675 4676 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4677 if ((thflags & TH_ACK) && 4678 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4679 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4680 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4681 /* keep track of pure window updates */ 4682 if (tlen == 0 && 4683 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4684 TCPSTAT_INC(tcps_rcvwinupd); 4685 tp->snd_wnd = tiwin; 4686 tp->snd_wl1 = th->th_seq; 4687 tp->snd_wl2 = th->th_ack; 4688 if (tp->snd_wnd > tp->max_sndwnd) 4689 tp->max_sndwnd = tp->snd_wnd; 4690 rack->r_wanted_output++; 4691 } else if (thflags & TH_ACK) { 4692 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 4693 tp->snd_wnd = tiwin; 4694 tp->snd_wl1 = th->th_seq; 4695 tp->snd_wl2 = th->th_ack; 4696 } 4697 } 4698 /* Was persist timer active and now we have window space? */ 4699 if ((rack->rc_in_persist != 0) && tp->snd_wnd) { 4700 rack_exit_persist(tp, rack); 4701 tp->snd_nxt = tp->snd_max; 4702 /* Make sure we output to start the timer */ 4703 rack->r_wanted_output++; 4704 } 4705 /* 4706 * Process segments with URG. 4707 */ 4708 if ((thflags & TH_URG) && th->th_urp && 4709 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4710 /* 4711 * This is a kludge, but if we receive and accept random 4712 * urgent pointers, we'll crash in soreceive. It's hard to 4713 * imagine someone actually wanting to send this much urgent 4714 * data. 4715 */ 4716 SOCKBUF_LOCK(&so->so_rcv); 4717 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 4718 th->th_urp = 0; /* XXX */ 4719 thflags &= ~TH_URG; /* XXX */ 4720 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 4721 goto dodata; /* XXX */ 4722 } 4723 /* 4724 * If this segment advances the known urgent pointer, then 4725 * mark the data stream. This should not happen in 4726 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 4727 * FIN has been received from the remote side. In these 4728 * states we ignore the URG. 4729 * 4730 * According to RFC961 (Assigned Protocols), the urgent 4731 * pointer points to the last octet of urgent data. We 4732 * continue, however, to consider it to indicate the first 4733 * octet of data past the urgent section as the original 4734 * spec states (in one of two places). 4735 */ 4736 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 4737 tp->rcv_up = th->th_seq + th->th_urp; 4738 so->so_oobmark = sbavail(&so->so_rcv) + 4739 (tp->rcv_up - tp->rcv_nxt) - 1; 4740 if (so->so_oobmark == 0) 4741 so->so_rcv.sb_state |= SBS_RCVATMARK; 4742 sohasoutofband(so); 4743 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4744 } 4745 SOCKBUF_UNLOCK(&so->so_rcv); 4746 /* 4747 * Remove out of band data so doesn't get presented to user. 4748 * This can happen independent of advancing the URG pointer, 4749 * but if two URG's are pending at once, some out-of-band 4750 * data may creep in... ick. 4751 */ 4752 if (th->th_urp <= (uint32_t) tlen && 4753 !(so->so_options & SO_OOBINLINE)) { 4754 /* hdr drop is delayed */ 4755 tcp_pulloutofband(so, th, m, drop_hdrlen); 4756 } 4757 } else { 4758 /* 4759 * If no out of band data is expected, pull receive urgent 4760 * pointer along with the receive window. 4761 */ 4762 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4763 tp->rcv_up = tp->rcv_nxt; 4764 } 4765 dodata: /* XXX */ 4766 INP_WLOCK_ASSERT(tp->t_inpcb); 4767 4768 /* 4769 * Process the segment text, merging it into the TCP sequencing 4770 * queue, and arranging for acknowledgment of receipt if necessary. 4771 * This process logically involves adjusting tp->rcv_wnd as data is 4772 * presented to the user (this happens in tcp_usrreq.c, case 4773 * PRU_RCVD). If a FIN has already been received on this connection 4774 * then we just ignore the text. 4775 */ 4776 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 4777 IS_FASTOPEN(tp->t_flags)); 4778 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 4779 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4780 tcp_seq save_start = th->th_seq; 4781 4782 m_adj(m, drop_hdrlen); /* delayed header drop */ 4783 /* 4784 * Insert segment which includes th into TCP reassembly 4785 * queue with control block tp. Set thflags to whether 4786 * reassembly now includes a segment with FIN. This handles 4787 * the common case inline (segment is the next to be 4788 * received on an established connection, and the queue is 4789 * empty), avoiding linkage into and removal from the queue 4790 * and repetition of various conversions. Set DELACK for 4791 * segments received in order, but ack immediately when 4792 * segments are out of order (so fast retransmit can work). 4793 */ 4794 if (th->th_seq == tp->rcv_nxt && 4795 LIST_EMPTY(&tp->t_segq) && 4796 (TCPS_HAVEESTABLISHED(tp->t_state) || 4797 tfo_syn)) { 4798 if (DELAY_ACK(tp, tlen) || tfo_syn) { 4799 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4800 tp->t_flags |= TF_DELACK; 4801 } else { 4802 rack->r_wanted_output++; 4803 tp->t_flags |= TF_ACKNOW; 4804 } 4805 tp->rcv_nxt += tlen; 4806 thflags = th->th_flags & TH_FIN; 4807 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4808 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4809 SOCKBUF_LOCK(&so->so_rcv); 4810 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4811 m_freem(m); 4812 else 4813 sbappendstream_locked(&so->so_rcv, m, 0); 4814 /* NB: sorwakeup_locked() does an implicit unlock. */ 4815 sorwakeup_locked(so); 4816 } else { 4817 /* 4818 * XXX: Due to the header drop above "th" is 4819 * theoretically invalid by now. Fortunately 4820 * m_adj() doesn't actually frees any mbufs when 4821 * trimming from the head. 4822 */ 4823 thflags = tcp_reass(tp, th, &tlen, m); 4824 tp->t_flags |= TF_ACKNOW; 4825 } 4826 if (tlen > 0) 4827 tcp_update_sack_list(tp, save_start, save_start + tlen); 4828 } else { 4829 m_freem(m); 4830 thflags &= ~TH_FIN; 4831 } 4832 4833 /* 4834 * If FIN is received ACK the FIN and let the user know that the 4835 * connection is closing. 4836 */ 4837 if (thflags & TH_FIN) { 4838 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4839 socantrcvmore(so); 4840 /* 4841 * If connection is half-synchronized (ie NEEDSYN 4842 * flag on) then delay ACK, so it may be piggybacked 4843 * when SYN is sent. Otherwise, since we received a 4844 * FIN then no more input can be expected, send ACK 4845 * now. 4846 */ 4847 if (tp->t_flags & TF_NEEDSYN) { 4848 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4849 tp->t_flags |= TF_DELACK; 4850 } else { 4851 tp->t_flags |= TF_ACKNOW; 4852 } 4853 tp->rcv_nxt++; 4854 } 4855 switch (tp->t_state) { 4856 4857 /* 4858 * In SYN_RECEIVED and ESTABLISHED STATES enter the 4859 * CLOSE_WAIT state. 4860 */ 4861 case TCPS_SYN_RECEIVED: 4862 tp->t_starttime = ticks; 4863 /* FALLTHROUGH */ 4864 case TCPS_ESTABLISHED: 4865 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4866 tcp_state_change(tp, TCPS_CLOSE_WAIT); 4867 break; 4868 4869 /* 4870 * If still in FIN_WAIT_1 STATE FIN has not been 4871 * acked so enter the CLOSING state. 4872 */ 4873 case TCPS_FIN_WAIT_1: 4874 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4875 tcp_state_change(tp, TCPS_CLOSING); 4876 break; 4877 4878 /* 4879 * In FIN_WAIT_2 state enter the TIME_WAIT state, 4880 * starting the time-wait timer, turning off the 4881 * other standard timers. 4882 */ 4883 case TCPS_FIN_WAIT_2: 4884 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4885 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 4886 KASSERT(*ti_locked == TI_RLOCKED, ("%s: dodata " 4887 "TCP_FIN_WAIT_2 ti_locked: %d", __func__, 4888 *ti_locked)); 4889 tcp_twstart(tp); 4890 *ti_locked = TI_UNLOCKED; 4891 INP_INFO_RUNLOCK(&V_tcbinfo); 4892 return (1); 4893 } 4894 } 4895 if (*ti_locked == TI_RLOCKED) { 4896 INP_INFO_RUNLOCK(&V_tcbinfo); 4897 *ti_locked = TI_UNLOCKED; 4898 } 4899 /* 4900 * Return any desired output. 4901 */ 4902 if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 4903 rack->r_wanted_output++; 4904 } 4905 KASSERT(*ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", 4906 __func__, *ti_locked)); 4907 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 4908 INP_WLOCK_ASSERT(tp->t_inpcb); 4909 return (0); 4910 } 4911 4912 /* 4913 * Here nothing is really faster, its just that we 4914 * have broken out the fast-data path also just like 4915 * the fast-ack. 4916 */ 4917 static int 4918 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 4919 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 4920 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt) 4921 { 4922 int32_t nsegs; 4923 int32_t newsize = 0; /* automatic sockbuf scaling */ 4924 struct tcp_rack *rack; 4925 #ifdef TCPDEBUG 4926 /* 4927 * The size of tcp_saveipgen must be the size of the max ip header, 4928 * now IPv6. 4929 */ 4930 u_char tcp_saveipgen[IP6_HDR_LEN]; 4931 struct tcphdr tcp_savetcp; 4932 short ostate = 0; 4933 4934 #endif 4935 /* 4936 * If last ACK falls within this segment's sequence numbers, record 4937 * the timestamp. NOTE that the test is modified according to the 4938 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 4939 */ 4940 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 4941 return (0); 4942 } 4943 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 4944 return (0); 4945 } 4946 if (tiwin && tiwin != tp->snd_wnd) { 4947 return (0); 4948 } 4949 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 4950 return (0); 4951 } 4952 if (__predict_false((to->to_flags & TOF_TS) && 4953 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 4954 return (0); 4955 } 4956 if (__predict_false((th->th_ack != tp->snd_una))) { 4957 return (0); 4958 } 4959 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 4960 return (0); 4961 } 4962 if ((to->to_flags & TOF_TS) != 0 && 4963 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 4964 tp->ts_recent_age = tcp_ts_getticks(); 4965 tp->ts_recent = to->to_tsval; 4966 } 4967 rack = (struct tcp_rack *)tp->t_fb_ptr; 4968 /* 4969 * This is a pure, in-sequence data packet with nothing on the 4970 * reassembly queue and we have enough buffer space to take it. 4971 */ 4972 if (*ti_locked == TI_RLOCKED) { 4973 INP_INFO_RUNLOCK(&V_tcbinfo); 4974 *ti_locked = TI_UNLOCKED; 4975 } 4976 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4977 4978 4979 /* Clean receiver SACK report if present */ 4980 if (tp->rcv_numsacks) 4981 tcp_clean_sackreport(tp); 4982 TCPSTAT_INC(tcps_preddat); 4983 tp->rcv_nxt += tlen; 4984 /* 4985 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 4986 */ 4987 tp->snd_wl1 = th->th_seq; 4988 /* 4989 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 4990 */ 4991 tp->rcv_up = tp->rcv_nxt; 4992 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4993 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4994 #ifdef TCPDEBUG 4995 if (so->so_options & SO_DEBUG) 4996 tcp_trace(TA_INPUT, ostate, tp, 4997 (void *)tcp_saveipgen, &tcp_savetcp, 0); 4998 #endif 4999 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 5000 5001 /* Add data to socket buffer. */ 5002 SOCKBUF_LOCK(&so->so_rcv); 5003 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5004 m_freem(m); 5005 } else { 5006 /* 5007 * Set new socket buffer size. Give up when limit is 5008 * reached. 5009 */ 5010 if (newsize) 5011 if (!sbreserve_locked(&so->so_rcv, 5012 newsize, so, NULL)) 5013 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 5014 m_adj(m, drop_hdrlen); /* delayed header drop */ 5015 sbappendstream_locked(&so->so_rcv, m, 0); 5016 rack_calc_rwin(so, tp); 5017 } 5018 /* NB: sorwakeup_locked() does an implicit unlock. */ 5019 sorwakeup_locked(so); 5020 if (DELAY_ACK(tp, tlen)) { 5021 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5022 tp->t_flags |= TF_DELACK; 5023 } else { 5024 tp->t_flags |= TF_ACKNOW; 5025 rack->r_wanted_output++; 5026 } 5027 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 5028 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5029 return (1); 5030 } 5031 5032 /* 5033 * This subfunction is used to try to highly optimize the 5034 * fast path. We again allow window updates that are 5035 * in sequence to remain in the fast-path. We also add 5036 * in the __predict's to attempt to help the compiler. 5037 * Note that if we return a 0, then we can *not* process 5038 * it and the caller should push the packet into the 5039 * slow-path. 5040 */ 5041 static int 5042 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5043 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5044 int32_t * ti_locked, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 5045 { 5046 int32_t acked; 5047 int32_t nsegs; 5048 5049 #ifdef TCPDEBUG 5050 /* 5051 * The size of tcp_saveipgen must be the size of the max ip header, 5052 * now IPv6. 5053 */ 5054 u_char tcp_saveipgen[IP6_HDR_LEN]; 5055 struct tcphdr tcp_savetcp; 5056 short ostate = 0; 5057 5058 #endif 5059 struct tcp_rack *rack; 5060 5061 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5062 /* Old ack, behind (or duplicate to) the last one rcv'd */ 5063 return (0); 5064 } 5065 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 5066 /* Above what we have sent? */ 5067 return (0); 5068 } 5069 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5070 /* We are retransmitting */ 5071 return (0); 5072 } 5073 if (__predict_false(tiwin == 0)) { 5074 /* zero window */ 5075 return (0); 5076 } 5077 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 5078 /* We need a SYN or a FIN, unlikely.. */ 5079 return (0); 5080 } 5081 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 5082 /* Timestamp is behind .. old ack with seq wrap? */ 5083 return (0); 5084 } 5085 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 5086 /* Still recovering */ 5087 return (0); 5088 } 5089 rack = (struct tcp_rack *)tp->t_fb_ptr; 5090 if (rack->r_ctl.rc_sacked) { 5091 /* We have sack holes on our scoreboard */ 5092 return (0); 5093 } 5094 /* Ok if we reach here, we can process a fast-ack */ 5095 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5096 rack_log_ack(tp, to, th); 5097 /* Did the window get updated? */ 5098 if (tiwin != tp->snd_wnd) { 5099 tp->snd_wnd = tiwin; 5100 tp->snd_wl1 = th->th_seq; 5101 if (tp->snd_wnd > tp->max_sndwnd) 5102 tp->max_sndwnd = tp->snd_wnd; 5103 } 5104 if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { 5105 rack_exit_persist(tp, rack); 5106 } 5107 /* 5108 * If last ACK falls within this segment's sequence numbers, record 5109 * the timestamp. NOTE that the test is modified according to the 5110 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 5111 */ 5112 if ((to->to_flags & TOF_TS) != 0 && 5113 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5114 tp->ts_recent_age = tcp_ts_getticks(); 5115 tp->ts_recent = to->to_tsval; 5116 } 5117 /* 5118 * This is a pure ack for outstanding data. 5119 */ 5120 if (*ti_locked == TI_RLOCKED) { 5121 INP_INFO_RUNLOCK(&V_tcbinfo); 5122 *ti_locked = TI_UNLOCKED; 5123 } 5124 TCPSTAT_INC(tcps_predack); 5125 5126 /* 5127 * "bad retransmit" recovery. 5128 */ 5129 if (tp->t_flags & TF_PREVVALID) { 5130 tp->t_flags &= ~TF_PREVVALID; 5131 if (tp->t_rxtshift == 1 && 5132 (int)(ticks - tp->t_badrxtwin) < 0) 5133 rack_cong_signal(tp, th, CC_RTO_ERR); 5134 } 5135 /* 5136 * Recalculate the transmit timer / rtt. 5137 * 5138 * Some boxes send broken timestamp replies during the SYN+ACK 5139 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5140 * and blow up the retransmit timer. 5141 */ 5142 acked = BYTES_THIS_ACK(tp, th); 5143 5144 #ifdef TCP_HHOOK 5145 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 5146 hhook_run_tcp_est_in(tp, th, to); 5147 #endif 5148 5149 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5150 TCPSTAT_ADD(tcps_rcvackbyte, acked); 5151 sbdrop(&so->so_snd, acked); 5152 /* 5153 * Let the congestion control algorithm update congestion control 5154 * related information. This typically means increasing the 5155 * congestion window. 5156 */ 5157 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 5158 5159 tp->snd_una = th->th_ack; 5160 /* 5161 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 5162 */ 5163 tp->snd_wl2 = th->th_ack; 5164 tp->t_dupacks = 0; 5165 m_freem(m); 5166 /* ND6_HINT(tp); *//* Some progress has been made. */ 5167 5168 /* 5169 * If all outstanding data are acked, stop retransmit timer, 5170 * otherwise restart timer using current (possibly backed-off) 5171 * value. If process is waiting for space, wakeup/selwakeup/signal. 5172 * If data are ready to send, let tcp_output decide between more 5173 * output or persist. 5174 */ 5175 #ifdef TCPDEBUG 5176 if (so->so_options & SO_DEBUG) 5177 tcp_trace(TA_INPUT, ostate, tp, 5178 (void *)tcp_saveipgen, 5179 &tcp_savetcp, 0); 5180 #endif 5181 if (tp->snd_una == tp->snd_max) { 5182 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5183 tp->t_acktime = 0; 5184 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5185 } 5186 /* Wake up the socket if we have room to write more */ 5187 sowwakeup(so); 5188 if (sbavail(&so->so_snd)) { 5189 rack->r_wanted_output++; 5190 } 5191 return (1); 5192 } 5193 5194 /* 5195 * Return value of 1, the TCB is unlocked and most 5196 * likely gone, return value of 0, the TCP is still 5197 * locked. 5198 */ 5199 static int 5200 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 5201 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5202 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5203 { 5204 int32_t ret_val = 0; 5205 int32_t todrop; 5206 int32_t ourfinisacked = 0; 5207 5208 rack_calc_rwin(so, tp); 5209 /* 5210 * If the state is SYN_SENT: if seg contains an ACK, but not for our 5211 * SYN, drop the input. if seg contains a RST, then drop the 5212 * connection. if seg does not contain SYN, then drop it. Otherwise 5213 * this is an acceptable SYN segment initialize tp->rcv_nxt and 5214 * tp->irs if seg contains ack then advance tp->snd_una if seg 5215 * contains an ECE and ECN support is enabled, the stream is ECN 5216 * capable. if SYN has been acked change to ESTABLISHED else 5217 * SYN_RCVD state arrange for segment to be acked (eventually) 5218 * continue processing rest of data/controls, beginning with URG 5219 */ 5220 if ((thflags & TH_ACK) && 5221 (SEQ_LEQ(th->th_ack, tp->iss) || 5222 SEQ_GT(th->th_ack, tp->snd_max))) { 5223 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5224 return (1); 5225 } 5226 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 5227 TCP_PROBE5(connect__refused, NULL, tp, 5228 mtod(m, const char *), tp, th); 5229 tp = tcp_drop(tp, ECONNREFUSED); 5230 rack_do_drop(m, tp, ti_locked); 5231 return (1); 5232 } 5233 if (thflags & TH_RST) { 5234 rack_do_drop(m, tp, ti_locked); 5235 return (1); 5236 } 5237 if (!(thflags & TH_SYN)) { 5238 rack_do_drop(m, tp, ti_locked); 5239 return (1); 5240 } 5241 tp->irs = th->th_seq; 5242 tcp_rcvseqinit(tp); 5243 if (thflags & TH_ACK) { 5244 int tfo_partial = 0; 5245 5246 TCPSTAT_INC(tcps_connects); 5247 soisconnected(so); 5248 #ifdef MAC 5249 mac_socketpeer_set_from_mbuf(m, so); 5250 #endif 5251 /* Do window scaling on this connection? */ 5252 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5253 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5254 tp->rcv_scale = tp->request_r_scale; 5255 } 5256 tp->rcv_adv += min(tp->rcv_wnd, 5257 TCP_MAXWIN << tp->rcv_scale); 5258 /* 5259 * If not all the data that was sent in the TFO SYN 5260 * has been acked, resend the remainder right away. 5261 */ 5262 if (IS_FASTOPEN(tp->t_flags) && 5263 (tp->snd_una != tp->snd_max)) { 5264 tp->snd_nxt = th->th_ack; 5265 tfo_partial = 1; 5266 } 5267 /* 5268 * If there's data, delay ACK; if there's also a FIN ACKNOW 5269 * will be turned on later. 5270 */ 5271 if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { 5272 rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, 5273 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); 5274 tp->t_flags |= TF_DELACK; 5275 } else { 5276 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 5277 tp->t_flags |= TF_ACKNOW; 5278 } 5279 5280 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 5281 tp->t_flags |= TF_ECN_PERMIT; 5282 TCPSTAT_INC(tcps_ecn_shs); 5283 } 5284 if (SEQ_GT(th->th_ack, tp->snd_una)) { 5285 /* 5286 * We advance snd_una for the 5287 * fast open case. If th_ack is 5288 * acknowledging data beyond 5289 * snd_una we can't just call 5290 * ack-processing since the 5291 * data stream in our send-map 5292 * will start at snd_una + 1 (one 5293 * beyond the SYN). If its just 5294 * equal we don't need to do that 5295 * and there is no send_map. 5296 */ 5297 tp->snd_una++; 5298 } 5299 /* 5300 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 5301 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 5302 */ 5303 tp->t_starttime = ticks; 5304 if (tp->t_flags & TF_NEEDFIN) { 5305 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5306 tp->t_flags &= ~TF_NEEDFIN; 5307 thflags &= ~TH_SYN; 5308 } else { 5309 tcp_state_change(tp, TCPS_ESTABLISHED); 5310 TCP_PROBE5(connect__established, NULL, tp, 5311 mtod(m, const char *), tp, th); 5312 cc_conn_init(tp); 5313 } 5314 } else { 5315 /* 5316 * Received initial SYN in SYN-SENT[*] state => simultaneous 5317 * open. If segment contains CC option and there is a 5318 * cached CC, apply TAO test. If it succeeds, connection is * 5319 * half-synchronized. Otherwise, do 3-way handshake: 5320 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 5321 * there was no CC option, clear cached CC value. 5322 */ 5323 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 5324 tcp_state_change(tp, TCPS_SYN_RECEIVED); 5325 } 5326 KASSERT(*ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " 5327 "ti_locked %d", __func__, *ti_locked)); 5328 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5329 INP_WLOCK_ASSERT(tp->t_inpcb); 5330 /* 5331 * Advance th->th_seq to correspond to first data byte. If data, 5332 * trim to stay within window, dropping FIN if necessary. 5333 */ 5334 th->th_seq++; 5335 if (tlen > tp->rcv_wnd) { 5336 todrop = tlen - tp->rcv_wnd; 5337 m_adj(m, -todrop); 5338 tlen = tp->rcv_wnd; 5339 thflags &= ~TH_FIN; 5340 TCPSTAT_INC(tcps_rcvpackafterwin); 5341 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 5342 } 5343 tp->snd_wl1 = th->th_seq - 1; 5344 tp->rcv_up = th->th_seq; 5345 /* 5346 * Client side of transaction: already sent SYN and data. If the 5347 * remote host used T/TCP to validate the SYN, our data will be 5348 * ACK'd; if so, enter normal data segment processing in the middle 5349 * of step 5, ack processing. Otherwise, goto step 6. 5350 */ 5351 if (thflags & TH_ACK) { 5352 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 5353 return (ret_val); 5354 /* We may have changed to FIN_WAIT_1 above */ 5355 if (tp->t_state == TCPS_FIN_WAIT_1) { 5356 /* 5357 * In FIN_WAIT_1 STATE in addition to the processing 5358 * for the ESTABLISHED state if our FIN is now 5359 * acknowledged then enter FIN_WAIT_2. 5360 */ 5361 if (ourfinisacked) { 5362 /* 5363 * If we can't receive any more data, then 5364 * closing user can proceed. Starting the 5365 * timer is contrary to the specification, 5366 * but if we don't get a FIN we'll hang 5367 * forever. 5368 * 5369 * XXXjl: we should release the tp also, and 5370 * use a compressed state. 5371 */ 5372 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5373 soisdisconnected(so); 5374 tcp_timer_activate(tp, TT_2MSL, 5375 (tcp_fast_finwait2_recycle ? 5376 tcp_finwait2_timeout : 5377 TP_MAXIDLE(tp))); 5378 } 5379 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5380 } 5381 } 5382 } 5383 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5384 ti_locked, tiwin, thflags, nxt_pkt)); 5385 } 5386 5387 /* 5388 * Return value of 1, the TCB is unlocked and most 5389 * likely gone, return value of 0, the TCP is still 5390 * locked. 5391 */ 5392 static int 5393 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 5394 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5395 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5396 { 5397 int32_t ret_val = 0; 5398 int32_t ourfinisacked = 0; 5399 5400 rack_calc_rwin(so, tp); 5401 5402 if ((thflags & TH_ACK) && 5403 (SEQ_LEQ(th->th_ack, tp->snd_una) || 5404 SEQ_GT(th->th_ack, tp->snd_max))) { 5405 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5406 return (1); 5407 } 5408 if (IS_FASTOPEN(tp->t_flags)) { 5409 /* 5410 * When a TFO connection is in SYN_RECEIVED, the 5411 * only valid packets are the initial SYN, a 5412 * retransmit/copy of the initial SYN (possibly with 5413 * a subset of the original data), a valid ACK, a 5414 * FIN, or a RST. 5415 */ 5416 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 5417 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5418 return (1); 5419 } else if (thflags & TH_SYN) { 5420 /* non-initial SYN is ignored */ 5421 struct tcp_rack *rack; 5422 5423 rack = (struct tcp_rack *)tp->t_fb_ptr; 5424 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 5425 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 5426 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 5427 rack_do_drop(m, NULL, ti_locked); 5428 return (0); 5429 } 5430 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 5431 rack_do_drop(m, NULL, ti_locked); 5432 return (0); 5433 } 5434 } 5435 if (thflags & TH_RST) 5436 return (rack_process_rst(m, th, so, tp, ti_locked)); 5437 /* 5438 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5439 * synchronized state. 5440 */ 5441 if (thflags & TH_SYN) { 5442 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5443 return (ret_val); 5444 } 5445 /* 5446 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5447 * it's less than ts_recent, drop it. 5448 */ 5449 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5450 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5451 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5452 return (ret_val); 5453 } 5454 /* 5455 * In the SYN-RECEIVED state, validate that the packet belongs to 5456 * this connection before trimming the data to fit the receive 5457 * window. Check the sequence number versus IRS since we know the 5458 * sequence numbers haven't wrapped. This is a partial fix for the 5459 * "LAND" DoS attack. 5460 */ 5461 if (SEQ_LT(th->th_seq, tp->irs)) { 5462 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5463 return (1); 5464 } 5465 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5466 return (ret_val); 5467 } 5468 /* 5469 * If last ACK falls within this segment's sequence numbers, record 5470 * its timestamp. NOTE: 1) That the test incorporates suggestions 5471 * from the latest proposal of the tcplw@cray.com list (Braden 5472 * 1993/04/26). 2) That updating only on newer timestamps interferes 5473 * with our earlier PAWS tests, so this check should be solely 5474 * predicated on the sequence space of this segment. 3) That we 5475 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5476 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5477 * SEG.Len, This modified check allows us to overcome RFC1323's 5478 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5479 * p.869. In such cases, we can still calculate the RTT correctly 5480 * when RCV.NXT == Last.ACK.Sent. 5481 */ 5482 if ((to->to_flags & TOF_TS) != 0 && 5483 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5484 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5485 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5486 tp->ts_recent_age = tcp_ts_getticks(); 5487 tp->ts_recent = to->to_tsval; 5488 } 5489 /* 5490 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5491 * is on (half-synchronized state), then queue data for later 5492 * processing; else drop segment and return. 5493 */ 5494 if ((thflags & TH_ACK) == 0) { 5495 if (IS_FASTOPEN(tp->t_flags)) { 5496 tp->snd_wnd = tiwin; 5497 cc_conn_init(tp); 5498 } 5499 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5500 ti_locked, tiwin, thflags, nxt_pkt)); 5501 } 5502 TCPSTAT_INC(tcps_connects); 5503 soisconnected(so); 5504 /* Do window scaling? */ 5505 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5506 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5507 tp->rcv_scale = tp->request_r_scale; 5508 tp->snd_wnd = tiwin; 5509 } 5510 /* 5511 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 5512 * FIN-WAIT-1 5513 */ 5514 tp->t_starttime = ticks; 5515 if (tp->t_flags & TF_NEEDFIN) { 5516 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5517 tp->t_flags &= ~TF_NEEDFIN; 5518 } else { 5519 tcp_state_change(tp, TCPS_ESTABLISHED); 5520 TCP_PROBE5(accept__established, NULL, tp, 5521 mtod(m, const char *), tp, th); 5522 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 5523 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 5524 tp->t_tfo_pending = NULL; 5525 5526 /* 5527 * Account for the ACK of our SYN prior to regular 5528 * ACK processing below. 5529 */ 5530 tp->snd_una++; 5531 } 5532 /* 5533 * TFO connections call cc_conn_init() during SYN 5534 * processing. Calling it again here for such connections 5535 * is not harmless as it would undo the snd_cwnd reduction 5536 * that occurs when a TFO SYN|ACK is retransmitted. 5537 */ 5538 if (!IS_FASTOPEN(tp->t_flags)) 5539 cc_conn_init(tp); 5540 } 5541 /* 5542 * If segment contains data or ACK, will call tcp_reass() later; if 5543 * not, do so now to pass queued data to user. 5544 */ 5545 if (tlen == 0 && (thflags & TH_FIN) == 0) 5546 (void)tcp_reass(tp, (struct tcphdr *)0, 0, 5547 (struct mbuf *)0); 5548 tp->snd_wl1 = th->th_seq - 1; 5549 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5550 return (ret_val); 5551 } 5552 if (tp->t_state == TCPS_FIN_WAIT_1) { 5553 /* We could have went to FIN_WAIT_1 (or EST) above */ 5554 /* 5555 * In FIN_WAIT_1 STATE in addition to the processing for the 5556 * ESTABLISHED state if our FIN is now acknowledged then 5557 * enter FIN_WAIT_2. 5558 */ 5559 if (ourfinisacked) { 5560 /* 5561 * If we can't receive any more data, then closing 5562 * user can proceed. Starting the timer is contrary 5563 * to the specification, but if we don't get a FIN 5564 * we'll hang forever. 5565 * 5566 * XXXjl: we should release the tp also, and use a 5567 * compressed state. 5568 */ 5569 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5570 soisdisconnected(so); 5571 tcp_timer_activate(tp, TT_2MSL, 5572 (tcp_fast_finwait2_recycle ? 5573 tcp_finwait2_timeout : 5574 TP_MAXIDLE(tp))); 5575 } 5576 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5577 } 5578 } 5579 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5580 ti_locked, tiwin, thflags, nxt_pkt)); 5581 } 5582 5583 /* 5584 * Return value of 1, the TCB is unlocked and most 5585 * likely gone, return value of 0, the TCP is still 5586 * locked. 5587 */ 5588 static int 5589 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 5590 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5591 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5592 { 5593 int32_t ret_val = 0; 5594 5595 /* 5596 * Header prediction: check for the two common cases of a 5597 * uni-directional data xfer. If the packet has no control flags, 5598 * is in-sequence, the window didn't change and we're not 5599 * retransmitting, it's a candidate. If the length is zero and the 5600 * ack moved forward, we're the sender side of the xfer. Just free 5601 * the data acked & wake any higher level process that was blocked 5602 * waiting for space. If the length is non-zero and the ack didn't 5603 * move, we're the receiver side. If we're getting packets in-order 5604 * (the reassembly queue is empty), add the data toc The socket 5605 * buffer and note that we need a delayed ack. Make sure that the 5606 * hidden state-flags are also off. Since we check for 5607 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 5608 */ 5609 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 5610 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 5611 __predict_true(LIST_EMPTY(&tp->t_segq)) && 5612 __predict_true(th->th_seq == tp->rcv_nxt)) { 5613 struct tcp_rack *rack; 5614 5615 rack = (struct tcp_rack *)tp->t_fb_ptr; 5616 if (tlen == 0) { 5617 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 5618 ti_locked, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 5619 return (0); 5620 } 5621 } else { 5622 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 5623 ti_locked, tiwin, nxt_pkt)) { 5624 return (0); 5625 } 5626 } 5627 } 5628 rack_calc_rwin(so, tp); 5629 5630 if (thflags & TH_RST) 5631 return (rack_process_rst(m, th, so, tp, ti_locked)); 5632 5633 /* 5634 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5635 * synchronized state. 5636 */ 5637 if (thflags & TH_SYN) { 5638 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5639 return (ret_val); 5640 } 5641 /* 5642 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5643 * it's less than ts_recent, drop it. 5644 */ 5645 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5646 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5647 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5648 return (ret_val); 5649 } 5650 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5651 return (ret_val); 5652 } 5653 /* 5654 * If last ACK falls within this segment's sequence numbers, record 5655 * its timestamp. NOTE: 1) That the test incorporates suggestions 5656 * from the latest proposal of the tcplw@cray.com list (Braden 5657 * 1993/04/26). 2) That updating only on newer timestamps interferes 5658 * with our earlier PAWS tests, so this check should be solely 5659 * predicated on the sequence space of this segment. 3) That we 5660 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5661 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5662 * SEG.Len, This modified check allows us to overcome RFC1323's 5663 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5664 * p.869. In such cases, we can still calculate the RTT correctly 5665 * when RCV.NXT == Last.ACK.Sent. 5666 */ 5667 if ((to->to_flags & TOF_TS) != 0 && 5668 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5669 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5670 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5671 tp->ts_recent_age = tcp_ts_getticks(); 5672 tp->ts_recent = to->to_tsval; 5673 } 5674 /* 5675 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5676 * is on (half-synchronized state), then queue data for later 5677 * processing; else drop segment and return. 5678 */ 5679 if ((thflags & TH_ACK) == 0) { 5680 if (tp->t_flags & TF_NEEDSYN) { 5681 5682 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5683 ti_locked, tiwin, thflags, nxt_pkt)); 5684 5685 } else if (tp->t_flags & TF_ACKNOW) { 5686 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5687 return (ret_val); 5688 } else { 5689 rack_do_drop(m, NULL, ti_locked); 5690 return (0); 5691 } 5692 } 5693 /* 5694 * Ack processing. 5695 */ 5696 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5697 return (ret_val); 5698 } 5699 if (sbavail(&so->so_snd)) { 5700 if (rack_progress_timeout_check(tp)) { 5701 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5702 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5703 return (1); 5704 } 5705 } 5706 /* State changes only happen in rack_process_data() */ 5707 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5708 ti_locked, tiwin, thflags, nxt_pkt)); 5709 } 5710 5711 /* 5712 * Return value of 1, the TCB is unlocked and most 5713 * likely gone, return value of 0, the TCP is still 5714 * locked. 5715 */ 5716 static int 5717 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 5718 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5719 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5720 { 5721 int32_t ret_val = 0; 5722 5723 rack_calc_rwin(so, tp); 5724 if (thflags & TH_RST) 5725 return (rack_process_rst(m, th, so, tp, ti_locked)); 5726 /* 5727 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5728 * synchronized state. 5729 */ 5730 if (thflags & TH_SYN) { 5731 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5732 return (ret_val); 5733 } 5734 /* 5735 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5736 * it's less than ts_recent, drop it. 5737 */ 5738 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5739 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5740 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5741 return (ret_val); 5742 } 5743 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5744 return (ret_val); 5745 } 5746 /* 5747 * If last ACK falls within this segment's sequence numbers, record 5748 * its timestamp. NOTE: 1) That the test incorporates suggestions 5749 * from the latest proposal of the tcplw@cray.com list (Braden 5750 * 1993/04/26). 2) That updating only on newer timestamps interferes 5751 * with our earlier PAWS tests, so this check should be solely 5752 * predicated on the sequence space of this segment. 3) That we 5753 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5754 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5755 * SEG.Len, This modified check allows us to overcome RFC1323's 5756 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5757 * p.869. In such cases, we can still calculate the RTT correctly 5758 * when RCV.NXT == Last.ACK.Sent. 5759 */ 5760 if ((to->to_flags & TOF_TS) != 0 && 5761 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5762 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5763 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5764 tp->ts_recent_age = tcp_ts_getticks(); 5765 tp->ts_recent = to->to_tsval; 5766 } 5767 /* 5768 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5769 * is on (half-synchronized state), then queue data for later 5770 * processing; else drop segment and return. 5771 */ 5772 if ((thflags & TH_ACK) == 0) { 5773 if (tp->t_flags & TF_NEEDSYN) { 5774 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5775 ti_locked, tiwin, thflags, nxt_pkt)); 5776 5777 } else if (tp->t_flags & TF_ACKNOW) { 5778 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5779 return (ret_val); 5780 } else { 5781 rack_do_drop(m, NULL, ti_locked); 5782 return (0); 5783 } 5784 } 5785 /* 5786 * Ack processing. 5787 */ 5788 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, NULL, thflags, &ret_val)) { 5789 return (ret_val); 5790 } 5791 if (sbavail(&so->so_snd)) { 5792 if (rack_progress_timeout_check(tp)) { 5793 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5794 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5795 return (1); 5796 } 5797 } 5798 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5799 ti_locked, tiwin, thflags, nxt_pkt)); 5800 } 5801 5802 static int 5803 rack_check_data_after_close(struct mbuf *m, 5804 struct tcpcb *tp, int32_t *ti_locked, int32_t *tlen, struct tcphdr *th, struct socket *so) 5805 { 5806 struct tcp_rack *rack; 5807 5808 KASSERT(*ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " 5809 "CLOSE_WAIT && tlen ti_locked %d", __func__, *ti_locked)); 5810 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5811 rack = (struct tcp_rack *)tp->t_fb_ptr; 5812 if (rack->rc_allow_data_af_clo == 0) { 5813 close_now: 5814 tp = tcp_close(tp); 5815 TCPSTAT_INC(tcps_rcvafterclose); 5816 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_UNLIMITED, (*tlen)); 5817 return (1); 5818 } 5819 if (sbavail(&so->so_snd) == 0) 5820 goto close_now; 5821 /* Ok we allow data that is ignored and a followup reset */ 5822 tp->rcv_nxt = th->th_seq + *tlen; 5823 tp->t_flags2 |= TF2_DROP_AF_DATA; 5824 rack->r_wanted_output = 1; 5825 *tlen = 0; 5826 return (0); 5827 } 5828 5829 /* 5830 * Return value of 1, the TCB is unlocked and most 5831 * likely gone, return value of 0, the TCP is still 5832 * locked. 5833 */ 5834 static int 5835 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 5836 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5837 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5838 { 5839 int32_t ret_val = 0; 5840 int32_t ourfinisacked = 0; 5841 5842 rack_calc_rwin(so, tp); 5843 5844 if (thflags & TH_RST) 5845 return (rack_process_rst(m, th, so, tp, ti_locked)); 5846 /* 5847 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5848 * synchronized state. 5849 */ 5850 if (thflags & TH_SYN) { 5851 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5852 return (ret_val); 5853 } 5854 /* 5855 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5856 * it's less than ts_recent, drop it. 5857 */ 5858 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5859 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5860 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5861 return (ret_val); 5862 } 5863 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5864 return (ret_val); 5865 } 5866 /* 5867 * If new data are received on a connection after the user processes 5868 * are gone, then RST the other end. 5869 */ 5870 if ((so->so_state & SS_NOFDREF) && tlen) { 5871 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5872 return (1); 5873 } 5874 /* 5875 * If last ACK falls within this segment's sequence numbers, record 5876 * its timestamp. NOTE: 1) That the test incorporates suggestions 5877 * from the latest proposal of the tcplw@cray.com list (Braden 5878 * 1993/04/26). 2) That updating only on newer timestamps interferes 5879 * with our earlier PAWS tests, so this check should be solely 5880 * predicated on the sequence space of this segment. 3) That we 5881 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5882 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5883 * SEG.Len, This modified check allows us to overcome RFC1323's 5884 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5885 * p.869. In such cases, we can still calculate the RTT correctly 5886 * when RCV.NXT == Last.ACK.Sent. 5887 */ 5888 if ((to->to_flags & TOF_TS) != 0 && 5889 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5890 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5891 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5892 tp->ts_recent_age = tcp_ts_getticks(); 5893 tp->ts_recent = to->to_tsval; 5894 } 5895 /* 5896 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5897 * is on (half-synchronized state), then queue data for later 5898 * processing; else drop segment and return. 5899 */ 5900 if ((thflags & TH_ACK) == 0) { 5901 if (tp->t_flags & TF_NEEDSYN) { 5902 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5903 ti_locked, tiwin, thflags, nxt_pkt)); 5904 } else if (tp->t_flags & TF_ACKNOW) { 5905 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 5906 return (ret_val); 5907 } else { 5908 rack_do_drop(m, NULL, ti_locked); 5909 return (0); 5910 } 5911 } 5912 /* 5913 * Ack processing. 5914 */ 5915 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5916 return (ret_val); 5917 } 5918 if (ourfinisacked) { 5919 /* 5920 * If we can't receive any more data, then closing user can 5921 * proceed. Starting the timer is contrary to the 5922 * specification, but if we don't get a FIN we'll hang 5923 * forever. 5924 * 5925 * XXXjl: we should release the tp also, and use a 5926 * compressed state. 5927 */ 5928 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5929 soisdisconnected(so); 5930 tcp_timer_activate(tp, TT_2MSL, 5931 (tcp_fast_finwait2_recycle ? 5932 tcp_finwait2_timeout : 5933 TP_MAXIDLE(tp))); 5934 } 5935 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5936 } 5937 if (sbavail(&so->so_snd)) { 5938 if (rack_progress_timeout_check(tp)) { 5939 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5940 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 5941 return (1); 5942 } 5943 } 5944 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5945 ti_locked, tiwin, thflags, nxt_pkt)); 5946 } 5947 5948 /* 5949 * Return value of 1, the TCB is unlocked and most 5950 * likely gone, return value of 0, the TCP is still 5951 * locked. 5952 */ 5953 static int 5954 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 5955 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5956 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5957 { 5958 int32_t ret_val = 0; 5959 int32_t ourfinisacked = 0; 5960 5961 rack_calc_rwin(so, tp); 5962 5963 if (thflags & TH_RST) 5964 return (rack_process_rst(m, th, so, tp, ti_locked)); 5965 /* 5966 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5967 * synchronized state. 5968 */ 5969 if (thflags & TH_SYN) { 5970 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 5971 return (ret_val); 5972 } 5973 /* 5974 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5975 * it's less than ts_recent, drop it. 5976 */ 5977 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5978 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5979 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 5980 return (ret_val); 5981 } 5982 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 5983 return (ret_val); 5984 } 5985 /* 5986 * If new data are received on a connection after the user processes 5987 * are gone, then RST the other end. 5988 */ 5989 if ((so->so_state & SS_NOFDREF) && tlen) { 5990 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 5991 return (1); 5992 } 5993 /* 5994 * If last ACK falls within this segment's sequence numbers, record 5995 * its timestamp. NOTE: 1) That the test incorporates suggestions 5996 * from the latest proposal of the tcplw@cray.com list (Braden 5997 * 1993/04/26). 2) That updating only on newer timestamps interferes 5998 * with our earlier PAWS tests, so this check should be solely 5999 * predicated on the sequence space of this segment. 3) That we 6000 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6001 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6002 * SEG.Len, This modified check allows us to overcome RFC1323's 6003 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6004 * p.869. In such cases, we can still calculate the RTT correctly 6005 * when RCV.NXT == Last.ACK.Sent. 6006 */ 6007 if ((to->to_flags & TOF_TS) != 0 && 6008 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6009 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6010 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6011 tp->ts_recent_age = tcp_ts_getticks(); 6012 tp->ts_recent = to->to_tsval; 6013 } 6014 /* 6015 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6016 * is on (half-synchronized state), then queue data for later 6017 * processing; else drop segment and return. 6018 */ 6019 if ((thflags & TH_ACK) == 0) { 6020 if (tp->t_flags & TF_NEEDSYN) { 6021 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6022 ti_locked, tiwin, thflags, nxt_pkt)); 6023 } else if (tp->t_flags & TF_ACKNOW) { 6024 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6025 return (ret_val); 6026 } else { 6027 rack_do_drop(m, NULL, ti_locked); 6028 return (0); 6029 } 6030 } 6031 /* 6032 * Ack processing. 6033 */ 6034 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6035 return (ret_val); 6036 } 6037 if (ourfinisacked) { 6038 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6039 tcp_twstart(tp); 6040 INP_INFO_RUNLOCK(&V_tcbinfo); 6041 *ti_locked = TI_UNLOCKED; 6042 m_freem(m); 6043 return (1); 6044 } 6045 if (sbavail(&so->so_snd)) { 6046 if (rack_progress_timeout_check(tp)) { 6047 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6048 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6049 return (1); 6050 } 6051 } 6052 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6053 ti_locked, tiwin, thflags, nxt_pkt)); 6054 } 6055 6056 /* 6057 * Return value of 1, the TCB is unlocked and most 6058 * likely gone, return value of 0, the TCP is still 6059 * locked. 6060 */ 6061 static int 6062 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6063 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6064 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6065 { 6066 int32_t ret_val = 0; 6067 int32_t ourfinisacked = 0; 6068 6069 rack_calc_rwin(so, tp); 6070 6071 if (thflags & TH_RST) 6072 return (rack_process_rst(m, th, so, tp, ti_locked)); 6073 /* 6074 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6075 * synchronized state. 6076 */ 6077 if (thflags & TH_SYN) { 6078 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6079 return (ret_val); 6080 } 6081 /* 6082 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6083 * it's less than ts_recent, drop it. 6084 */ 6085 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6086 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6087 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6088 return (ret_val); 6089 } 6090 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6091 return (ret_val); 6092 } 6093 /* 6094 * If new data are received on a connection after the user processes 6095 * are gone, then RST the other end. 6096 */ 6097 if ((so->so_state & SS_NOFDREF) && tlen) { 6098 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6099 return (1); 6100 } 6101 /* 6102 * If last ACK falls within this segment's sequence numbers, record 6103 * its timestamp. NOTE: 1) That the test incorporates suggestions 6104 * from the latest proposal of the tcplw@cray.com list (Braden 6105 * 1993/04/26). 2) That updating only on newer timestamps interferes 6106 * with our earlier PAWS tests, so this check should be solely 6107 * predicated on the sequence space of this segment. 3) That we 6108 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6109 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6110 * SEG.Len, This modified check allows us to overcome RFC1323's 6111 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6112 * p.869. In such cases, we can still calculate the RTT correctly 6113 * when RCV.NXT == Last.ACK.Sent. 6114 */ 6115 if ((to->to_flags & TOF_TS) != 0 && 6116 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6117 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6118 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6119 tp->ts_recent_age = tcp_ts_getticks(); 6120 tp->ts_recent = to->to_tsval; 6121 } 6122 /* 6123 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6124 * is on (half-synchronized state), then queue data for later 6125 * processing; else drop segment and return. 6126 */ 6127 if ((thflags & TH_ACK) == 0) { 6128 if (tp->t_flags & TF_NEEDSYN) { 6129 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6130 ti_locked, tiwin, thflags, nxt_pkt)); 6131 } else if (tp->t_flags & TF_ACKNOW) { 6132 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6133 return (ret_val); 6134 } else { 6135 rack_do_drop(m, NULL, ti_locked); 6136 return (0); 6137 } 6138 } 6139 /* 6140 * case TCPS_LAST_ACK: Ack processing. 6141 */ 6142 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6143 return (ret_val); 6144 } 6145 if (ourfinisacked) { 6146 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6147 tp = tcp_close(tp); 6148 rack_do_drop(m, tp, ti_locked); 6149 return (1); 6150 } 6151 if (sbavail(&so->so_snd)) { 6152 if (rack_progress_timeout_check(tp)) { 6153 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6154 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6155 return (1); 6156 } 6157 } 6158 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6159 ti_locked, tiwin, thflags, nxt_pkt)); 6160 } 6161 6162 6163 /* 6164 * Return value of 1, the TCB is unlocked and most 6165 * likely gone, return value of 0, the TCP is still 6166 * locked. 6167 */ 6168 static int 6169 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 6170 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6171 int32_t * ti_locked, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6172 { 6173 int32_t ret_val = 0; 6174 int32_t ourfinisacked = 0; 6175 6176 rack_calc_rwin(so, tp); 6177 6178 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 6179 if (thflags & TH_RST) 6180 return (rack_process_rst(m, th, so, tp, ti_locked)); 6181 /* 6182 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6183 * synchronized state. 6184 */ 6185 if (thflags & TH_SYN) { 6186 rack_challenge_ack(m, th, tp, ti_locked, &ret_val); 6187 return (ret_val); 6188 } 6189 /* 6190 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6191 * it's less than ts_recent, drop it. 6192 */ 6193 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6194 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6195 if (rack_ts_check(m, th, tp, ti_locked, tlen, thflags, &ret_val)) 6196 return (ret_val); 6197 } 6198 if (rack_drop_checks(to, m, th, tp, &tlen, ti_locked, &thflags, &drop_hdrlen, &ret_val)) { 6199 return (ret_val); 6200 } 6201 /* 6202 * If new data are received on a connection after the user processes 6203 * are gone, then RST the other end. 6204 */ 6205 if ((so->so_state & SS_NOFDREF) && 6206 tlen) { 6207 if (rack_check_data_after_close(m, tp, ti_locked, &tlen, th, so)) 6208 return (1); 6209 } 6210 /* 6211 * If last ACK falls within this segment's sequence numbers, record 6212 * its timestamp. NOTE: 1) That the test incorporates suggestions 6213 * from the latest proposal of the tcplw@cray.com list (Braden 6214 * 1993/04/26). 2) That updating only on newer timestamps interferes 6215 * with our earlier PAWS tests, so this check should be solely 6216 * predicated on the sequence space of this segment. 3) That we 6217 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6218 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6219 * SEG.Len, This modified check allows us to overcome RFC1323's 6220 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6221 * p.869. In such cases, we can still calculate the RTT correctly 6222 * when RCV.NXT == Last.ACK.Sent. 6223 */ 6224 if ((to->to_flags & TOF_TS) != 0 && 6225 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6226 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6227 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6228 tp->ts_recent_age = tcp_ts_getticks(); 6229 tp->ts_recent = to->to_tsval; 6230 } 6231 /* 6232 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6233 * is on (half-synchronized state), then queue data for later 6234 * processing; else drop segment and return. 6235 */ 6236 if ((thflags & TH_ACK) == 0) { 6237 if (tp->t_flags & TF_NEEDSYN) { 6238 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6239 ti_locked, tiwin, thflags, nxt_pkt)); 6240 } else if (tp->t_flags & TF_ACKNOW) { 6241 rack_do_dropafterack(m, tp, th, ti_locked, thflags, tlen, &ret_val); 6242 return (ret_val); 6243 } else { 6244 rack_do_drop(m, NULL, ti_locked); 6245 return (0); 6246 } 6247 } 6248 /* 6249 * Ack processing. 6250 */ 6251 if (rack_process_ack(m, th, so, tp, to, ti_locked, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6252 return (ret_val); 6253 } 6254 if (sbavail(&so->so_snd)) { 6255 if (rack_progress_timeout_check(tp)) { 6256 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6257 rack_do_dropwithreset(m, tp, th, ti_locked, BANDLIM_RST_OPENPORT, tlen); 6258 return (1); 6259 } 6260 } 6261 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6262 ti_locked, tiwin, thflags, nxt_pkt)); 6263 } 6264 6265 6266 static void inline 6267 rack_clear_rate_sample(struct tcp_rack *rack) 6268 { 6269 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 6270 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 6271 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 6272 } 6273 6274 static int 6275 rack_init(struct tcpcb *tp) 6276 { 6277 struct tcp_rack *rack = NULL; 6278 6279 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 6280 if (tp->t_fb_ptr == NULL) { 6281 /* 6282 * We need to allocate memory but cant. The INP and INP_INFO 6283 * locks and they are recusive (happens during setup. So a 6284 * scheme to drop the locks fails :( 6285 * 6286 */ 6287 return (ENOMEM); 6288 } 6289 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 6290 6291 rack = (struct tcp_rack *)tp->t_fb_ptr; 6292 TAILQ_INIT(&rack->r_ctl.rc_map); 6293 TAILQ_INIT(&rack->r_ctl.rc_free); 6294 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6295 rack->rc_tp = tp; 6296 if (tp->t_inpcb) { 6297 rack->rc_inp = tp->t_inpcb; 6298 } 6299 /* Probably not needed but lets be sure */ 6300 rack_clear_rate_sample(rack); 6301 rack->r_cpu = 0; 6302 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 6303 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 6304 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 6305 rack->rc_pace_reduce = rack_slot_reduction; 6306 if (V_tcp_delack_enabled) 6307 tp->t_delayed_ack = 1; 6308 else 6309 tp->t_delayed_ack = 0; 6310 rack->rc_pace_max_segs = rack_hptsi_segments; 6311 rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; 6312 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 6313 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 6314 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 6315 rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; 6316 rack->r_enforce_min_pace = rack_min_pace_time; 6317 rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; 6318 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 6319 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 6320 rack->r_ctl.rc_early_recovery = rack_early_recovery; 6321 rack->rc_always_pace = rack_pace_every_seg; 6322 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 6323 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 6324 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 6325 rack->r_ctl.rc_min_to = rack_min_to; 6326 rack->r_ctl.rc_prr_inc_var = rack_inc_var; 6327 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6328 if (tp->snd_una != tp->snd_max) { 6329 /* Create a send map for the current outstanding data */ 6330 struct rack_sendmap *rsm; 6331 6332 rsm = rack_alloc(rack); 6333 if (rsm == NULL) { 6334 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6335 tp->t_fb_ptr = NULL; 6336 return (ENOMEM); 6337 } 6338 rsm->r_flags = RACK_OVERMAX; 6339 rsm->r_tim_lastsent[0] = tcp_ts_getticks(); 6340 rsm->r_rtr_cnt = 1; 6341 rsm->r_rtr_bytes = 0; 6342 rsm->r_start = tp->snd_una; 6343 rsm->r_end = tp->snd_max; 6344 rsm->r_sndcnt = 0; 6345 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 6346 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6347 rsm->r_in_tmap = 1; 6348 } 6349 return (0); 6350 } 6351 6352 static int 6353 rack_handoff_ok(struct tcpcb *tp) 6354 { 6355 if ((tp->t_state == TCPS_CLOSED) || 6356 (tp->t_state == TCPS_LISTEN)) { 6357 /* Sure no problem though it may not stick */ 6358 return (0); 6359 } 6360 if ((tp->t_state == TCPS_SYN_SENT) || 6361 (tp->t_state == TCPS_SYN_RECEIVED)) { 6362 /* 6363 * We really don't know you have to get to ESTAB or beyond 6364 * to tell. 6365 */ 6366 return (EAGAIN); 6367 } 6368 if (tp->t_flags & TF_SACK_PERMIT) { 6369 return (0); 6370 } 6371 /* 6372 * If we reach here we don't do SACK on this connection so we can 6373 * never do rack. 6374 */ 6375 return (EINVAL); 6376 } 6377 6378 static void 6379 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 6380 { 6381 if (tp->t_fb_ptr) { 6382 struct tcp_rack *rack; 6383 struct rack_sendmap *rsm; 6384 6385 rack = (struct tcp_rack *)tp->t_fb_ptr; 6386 #ifdef TCP_BLACKBOX 6387 tcp_log_flowend(tp); 6388 #endif 6389 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6390 while (rsm) { 6391 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 6392 uma_zfree(rack_zone, rsm); 6393 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6394 } 6395 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6396 while (rsm) { 6397 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 6398 uma_zfree(rack_zone, rsm); 6399 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6400 } 6401 rack->rc_free_cnt = 0; 6402 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6403 tp->t_fb_ptr = NULL; 6404 } 6405 } 6406 6407 static void 6408 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 6409 { 6410 switch (tp->t_state) { 6411 case TCPS_SYN_SENT: 6412 rack->r_state = TCPS_SYN_SENT; 6413 rack->r_substate = rack_do_syn_sent; 6414 break; 6415 case TCPS_SYN_RECEIVED: 6416 rack->r_state = TCPS_SYN_RECEIVED; 6417 rack->r_substate = rack_do_syn_recv; 6418 break; 6419 case TCPS_ESTABLISHED: 6420 rack->r_state = TCPS_ESTABLISHED; 6421 rack->r_substate = rack_do_established; 6422 break; 6423 case TCPS_CLOSE_WAIT: 6424 rack->r_state = TCPS_CLOSE_WAIT; 6425 rack->r_substate = rack_do_close_wait; 6426 break; 6427 case TCPS_FIN_WAIT_1: 6428 rack->r_state = TCPS_FIN_WAIT_1; 6429 rack->r_substate = rack_do_fin_wait_1; 6430 break; 6431 case TCPS_CLOSING: 6432 rack->r_state = TCPS_CLOSING; 6433 rack->r_substate = rack_do_closing; 6434 break; 6435 case TCPS_LAST_ACK: 6436 rack->r_state = TCPS_LAST_ACK; 6437 rack->r_substate = rack_do_lastack; 6438 break; 6439 case TCPS_FIN_WAIT_2: 6440 rack->r_state = TCPS_FIN_WAIT_2; 6441 rack->r_substate = rack_do_fin_wait_2; 6442 break; 6443 case TCPS_LISTEN: 6444 case TCPS_CLOSED: 6445 case TCPS_TIME_WAIT: 6446 default: 6447 #ifdef INVARIANTS 6448 panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); 6449 #endif 6450 break; 6451 }; 6452 } 6453 6454 6455 static void 6456 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 6457 { 6458 /* 6459 * We received an ack, and then did not 6460 * call send or were bounced out due to the 6461 * hpts was running. Now a timer is up as well, is 6462 * it the right timer? 6463 */ 6464 struct rack_sendmap *rsm; 6465 int tmr_up; 6466 6467 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 6468 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 6469 return; 6470 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6471 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 6472 (tmr_up == PACE_TMR_RXT)) { 6473 /* Should be an RXT */ 6474 return; 6475 } 6476 if (rsm == NULL) { 6477 /* Nothing outstanding? */ 6478 if (tp->t_flags & TF_DELACK) { 6479 if (tmr_up == PACE_TMR_DELACK) 6480 /* We are supposed to have delayed ack up and we do */ 6481 return; 6482 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 6483 /* 6484 * if we hit enobufs then we would expect the possiblity 6485 * of nothing outstanding and the RXT up (and the hptsi timer). 6486 */ 6487 return; 6488 } else if (((tcp_always_keepalive || 6489 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6490 (tp->t_state <= TCPS_CLOSING)) && 6491 (tmr_up == PACE_TMR_KEEP) && 6492 (tp->snd_max == tp->snd_una)) { 6493 /* We should have keep alive up and we do */ 6494 return; 6495 } 6496 } 6497 if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { 6498 if ((tp->t_flags & TF_SENTFIN) && 6499 ((tp->snd_max - tp->snd_una) == 1) && 6500 (rsm->r_flags & RACK_HAS_FIN)) { 6501 /* needs to be a RXT */ 6502 if (tmr_up == PACE_TMR_RXT) 6503 return; 6504 } else if (tmr_up == PACE_TMR_RACK) 6505 return; 6506 } else if (SEQ_GT(tp->snd_max,tp->snd_una) && 6507 ((tmr_up == PACE_TMR_TLP) || 6508 (tmr_up == PACE_TMR_RXT))) { 6509 /* 6510 * Either a TLP or RXT is fine if no sack-passed 6511 * is in place and data is outstanding. 6512 */ 6513 return; 6514 } else if (tmr_up == PACE_TMR_DELACK) { 6515 /* 6516 * If the delayed ack was going to go off 6517 * before the rtx/tlp/rack timer were going to 6518 * expire, then that would be the timer in control. 6519 * Note we don't check the time here trusting the 6520 * code is correct. 6521 */ 6522 return; 6523 } 6524 /* 6525 * Ok the timer originally started is not what we want now. 6526 * We will force the hpts to be stopped if any, and restart 6527 * with the slot set to what was in the saved slot. 6528 */ 6529 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6530 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6531 } 6532 6533 static void 6534 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6535 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6536 int32_t ti_locked, int32_t nxt_pkt, struct timeval *tv) 6537 { 6538 int32_t thflags, retval, did_out = 0; 6539 int32_t way_out = 0; 6540 uint32_t cts; 6541 uint32_t tiwin; 6542 struct tcpopt to; 6543 struct tcp_rack *rack; 6544 struct rack_sendmap *rsm; 6545 int32_t prev_state = 0; 6546 6547 cts = tcp_tv_to_mssectick(tv); 6548 rack = (struct tcp_rack *)tp->t_fb_ptr; 6549 6550 kern_prefetch(rack, &prev_state); 6551 prev_state = 0; 6552 thflags = th->th_flags; 6553 /* 6554 * If this is either a state-changing packet or current state isn't 6555 * established, we require a read lock on tcbinfo. Otherwise, we 6556 * allow the tcbinfo to be in either locked or unlocked, as the 6557 * caller may have unnecessarily acquired a lock due to a race. 6558 */ 6559 if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || 6560 tp->t_state != TCPS_ESTABLISHED) { 6561 KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " 6562 "SYN/FIN/RST/!EST", __func__, ti_locked)); 6563 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6564 } else { 6565 #ifdef INVARIANTS 6566 if (ti_locked == TI_RLOCKED) { 6567 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6568 } else { 6569 KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " 6570 "ti_locked: %d", __func__, ti_locked)); 6571 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 6572 } 6573 #endif 6574 } 6575 INP_WLOCK_ASSERT(tp->t_inpcb); 6576 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 6577 __func__)); 6578 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 6579 __func__)); 6580 { 6581 union tcp_log_stackspecific log; 6582 6583 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6584 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 6585 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 6586 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 6587 tlen, &log, true); 6588 } 6589 /* 6590 * Segment received on connection. Reset idle time and keep-alive 6591 * timer. XXX: This should be done after segment validation to 6592 * ignore broken/spoofed segs. 6593 */ 6594 if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { 6595 #ifdef NETFLIX_CWV 6596 if ((tp->cwv_enabled) && 6597 ((tp->cwv_cwnd_valid == 0) && 6598 TCPS_HAVEESTABLISHED(tp->t_state) && 6599 (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { 6600 tcp_newcwv_nvp_closedown(tp); 6601 } else 6602 #endif 6603 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 6604 counter_u64_add(rack_input_idle_reduces, 1); 6605 rack_cc_after_idle(tp, 6606 (rack->r_idle_reduce_largest ? 1 :0)); 6607 } 6608 } 6609 rack->r_ctl.rc_rcvtime = cts; 6610 tp->t_rcvtime = ticks; 6611 6612 #ifdef NETFLIX_CWV 6613 if (tp->cwv_enabled) { 6614 if ((tp->cwv_cwnd_valid == 0) && 6615 TCPS_HAVEESTABLISHED(tp->t_state) && 6616 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 6617 tcp_newcwv_nvp_closedown(tp); 6618 } 6619 #endif 6620 /* 6621 * Unscale the window into a 32-bit value. For the SYN_SENT state 6622 * the scale is zero. 6623 */ 6624 tiwin = th->th_win << tp->snd_scale; 6625 #ifdef NETFLIX_STATS 6626 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 6627 #endif 6628 /* 6629 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 6630 * this to occur after we've validated the segment. 6631 */ 6632 if (tp->t_flags & TF_ECN_PERMIT) { 6633 if (thflags & TH_CWR) 6634 tp->t_flags &= ~TF_ECN_SND_ECE; 6635 switch (iptos & IPTOS_ECN_MASK) { 6636 case IPTOS_ECN_CE: 6637 tp->t_flags |= TF_ECN_SND_ECE; 6638 TCPSTAT_INC(tcps_ecn_ce); 6639 break; 6640 case IPTOS_ECN_ECT0: 6641 TCPSTAT_INC(tcps_ecn_ect0); 6642 break; 6643 case IPTOS_ECN_ECT1: 6644 TCPSTAT_INC(tcps_ecn_ect1); 6645 break; 6646 } 6647 /* Congestion experienced. */ 6648 if (thflags & TH_ECE) { 6649 rack_cong_signal(tp, th, CC_ECN); 6650 } 6651 } 6652 /* 6653 * Parse options on any incoming segment. 6654 */ 6655 tcp_dooptions(&to, (u_char *)(th + 1), 6656 (th->th_off << 2) - sizeof(struct tcphdr), 6657 (thflags & TH_SYN) ? TO_SYN : 0); 6658 6659 /* 6660 * If echoed timestamp is later than the current time, fall back to 6661 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 6662 * were used when this connection was established. 6663 */ 6664 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 6665 to.to_tsecr -= tp->ts_offset; 6666 if (TSTMP_GT(to.to_tsecr, cts)) 6667 to.to_tsecr = 0; 6668 } 6669 /* 6670 * If its the first time in we need to take care of options and 6671 * verify we can do SACK for rack! 6672 */ 6673 if (rack->r_state == 0) { 6674 /* Should be init'd by rack_init() */ 6675 KASSERT(rack->rc_inp != NULL, 6676 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 6677 if (rack->rc_inp == NULL) { 6678 rack->rc_inp = tp->t_inpcb; 6679 } 6680 6681 /* 6682 * Process options only when we get SYN/ACK back. The SYN 6683 * case for incoming connections is handled in tcp_syncache. 6684 * According to RFC1323 the window field in a SYN (i.e., a 6685 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 6686 * this is traditional behavior, may need to be cleaned up. 6687 */ 6688 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 6689 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 6690 if ((to.to_flags & TOF_SCALE) && 6691 (tp->t_flags & TF_REQ_SCALE)) { 6692 tp->t_flags |= TF_RCVD_SCALE; 6693 tp->snd_scale = to.to_wscale; 6694 } 6695 /* 6696 * Initial send window. It will be updated with the 6697 * next incoming segment to the scaled value. 6698 */ 6699 tp->snd_wnd = th->th_win; 6700 if (to.to_flags & TOF_TS) { 6701 tp->t_flags |= TF_RCVD_TSTMP; 6702 tp->ts_recent = to.to_tsval; 6703 tp->ts_recent_age = cts; 6704 } 6705 if (to.to_flags & TOF_MSS) 6706 tcp_mss(tp, to.to_mss); 6707 if ((tp->t_flags & TF_SACK_PERMIT) && 6708 (to.to_flags & TOF_SACKPERM) == 0) 6709 tp->t_flags &= ~TF_SACK_PERMIT; 6710 } 6711 /* 6712 * At this point we are at the initial call. Here we decide 6713 * if we are doing RACK or not. We do this by seeing if 6714 * TF_SACK_PERMIT is set, if not rack is *not* possible and 6715 * we switch to the default code. 6716 */ 6717 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 6718 tcp_switch_back_to_default(tp); 6719 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 6720 tlen, iptos, ti_locked); 6721 return; 6722 } 6723 /* Set the flag */ 6724 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 6725 tcp_set_hpts(tp->t_inpcb); 6726 rack_stop_all_timers(tp); 6727 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 6728 } 6729 /* 6730 * This is the one exception case where we set the rack state 6731 * always. All other times (timers etc) we must have a rack-state 6732 * set (so we assure we have done the checks above for SACK). 6733 */ 6734 if (rack->r_state != tp->t_state) 6735 rack_set_state(tp, rack); 6736 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) 6737 kern_prefetch(rsm, &prev_state); 6738 prev_state = rack->r_state; 6739 rack->r_ctl.rc_tlp_send_cnt = 0; 6740 rack_clear_rate_sample(rack); 6741 retval = (*rack->r_substate) (m, th, so, 6742 tp, &to, drop_hdrlen, 6743 tlen, &ti_locked, tiwin, thflags, nxt_pkt); 6744 #ifdef INVARIANTS 6745 if ((retval == 0) && 6746 (tp->t_inpcb == NULL)) { 6747 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 6748 retval, tp, prev_state); 6749 } 6750 #endif 6751 if (ti_locked != TI_UNLOCKED) { 6752 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6753 INP_INFO_RUNLOCK(&V_tcbinfo); 6754 ti_locked = TI_UNLOCKED; 6755 } 6756 if (retval == 0) { 6757 /* 6758 * If retval is 1 the tcb is unlocked and most likely the tp 6759 * is gone. 6760 */ 6761 INP_WLOCK_ASSERT(tp->t_inpcb); 6762 tcp_rack_xmit_timer_commit(rack, tp); 6763 if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && 6764 (rack->rc_in_persist == 0)){ 6765 /* 6766 * The peer shrunk its window on us to the point 6767 * where we have sent too much. The only thing 6768 * we can do here is stop any timers and 6769 * enter persist. We most likely lost the last 6770 * bytes we sent but oh well, we will have to 6771 * retransmit them after the peer is caught up. 6772 */ 6773 if (rack->rc_inp->inp_in_hpts) 6774 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6775 rack_timer_cancel(tp, rack, cts, __LINE__); 6776 rack_enter_persist(tp, rack, cts); 6777 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6778 way_out = 3; 6779 goto done_with_input; 6780 } 6781 if (nxt_pkt == 0) { 6782 if (rack->r_wanted_output != 0) { 6783 did_out = 1; 6784 (void)tp->t_fb->tfb_tcp_output(tp); 6785 } 6786 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 6787 } 6788 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 6789 (SEQ_GT(tp->snd_max, tp->snd_una) || 6790 (tp->t_flags & TF_DELACK) || 6791 ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6792 (tp->t_state <= TCPS_CLOSING)))) { 6793 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 6794 if ((tp->snd_max == tp->snd_una) && 6795 ((tp->t_flags & TF_DELACK) == 0) && 6796 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 6797 /* keep alive not needed if we are hptsi output yet */ 6798 ; 6799 } else { 6800 if (rack->rc_inp->inp_in_hpts) 6801 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6802 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6803 } 6804 way_out = 1; 6805 } else { 6806 /* Do we have the correct timer running? */ 6807 rack_timer_audit(tp, rack, &so->so_snd); 6808 way_out = 2; 6809 } 6810 done_with_input: 6811 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 6812 if (did_out) 6813 rack->r_wanted_output = 0; 6814 #ifdef INVARIANTS 6815 if (tp->t_inpcb == NULL) { 6816 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 6817 did_out, 6818 retval, tp, prev_state); 6819 } 6820 #endif 6821 INP_WUNLOCK(tp->t_inpcb); 6822 } 6823 } 6824 6825 void 6826 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6827 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6828 int32_t ti_locked) 6829 { 6830 struct timeval tv; 6831 #ifdef RSS 6832 struct tcp_function_block *tfb; 6833 struct tcp_rack *rack; 6834 struct inpcb *inp; 6835 6836 rack = (struct tcp_rack *)tp->t_fb_ptr; 6837 if (rack->r_state == 0) { 6838 /* 6839 * Initial input (ACK to SYN-ACK etc)lets go ahead and get 6840 * it processed 6841 */ 6842 INP_INFO_RLOCK(); 6843 ti_locked = TI_RLOCKED; 6844 tcp_get_usecs(&tv); 6845 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6846 tlen, iptos, ti_locked, 0, &tv); 6847 return; 6848 } 6849 if (ti_locked == TI_RLOCKED) 6850 INP_INFO_RUNLOCK(&V_tcbinfo); 6851 tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos, (uint8_t) ti_locked); 6852 INP_WUNLOCK(tp->t_inpcb); 6853 #else 6854 tcp_get_usecs(&tv); 6855 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6856 tlen, iptos, ti_locked, 0, &tv); 6857 #endif 6858 } 6859 6860 struct rack_sendmap * 6861 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 6862 { 6863 struct rack_sendmap *rsm = NULL; 6864 int32_t idx; 6865 uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; 6866 6867 /* Return the next guy to be re-transmitted */ 6868 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 6869 return (NULL); 6870 } 6871 if (tp->t_flags & TF_SENTFIN) { 6872 /* retran the end FIN? */ 6873 return (NULL); 6874 } 6875 /* ok lets look at this one */ 6876 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6877 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 6878 goto check_it; 6879 } 6880 rsm = rack_find_lowest_rsm(rack); 6881 if (rsm == NULL) { 6882 return (NULL); 6883 } 6884 check_it: 6885 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 6886 srtt = TICKS_2_MSEC(srtt_cur); 6887 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 6888 srtt = rack->rc_rack_rtt; 6889 if (rsm->r_flags & RACK_ACKED) { 6890 return (NULL); 6891 } 6892 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 6893 /* Its not yet ready */ 6894 return (NULL); 6895 } 6896 idx = rsm->r_rtr_cnt - 1; 6897 ts_low = rsm->r_tim_lastsent[idx]; 6898 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6899 if (tsused <= ts_low) { 6900 return (NULL); 6901 } 6902 if ((tsused - ts_low) >= thresh) { 6903 return (rsm); 6904 } 6905 return (NULL); 6906 } 6907 6908 static int 6909 rack_output(struct tcpcb *tp) 6910 { 6911 struct socket *so; 6912 uint32_t recwin, sendwin; 6913 uint32_t sb_offset; 6914 int32_t len, flags, error = 0; 6915 struct mbuf *m; 6916 struct mbuf *mb; 6917 uint32_t if_hw_tsomaxsegcount = 0; 6918 uint32_t if_hw_tsomaxsegsize; 6919 long tot_len_this_send = 0; 6920 struct ip *ip = NULL; 6921 #ifdef TCPDEBUG 6922 struct ipovly *ipov = NULL; 6923 #endif 6924 struct udphdr *udp = NULL; 6925 struct tcp_rack *rack; 6926 struct tcphdr *th; 6927 uint8_t pass = 0; 6928 uint8_t wanted_cookie = 0; 6929 u_char opt[TCP_MAXOLEN]; 6930 unsigned ipoptlen, optlen, hdrlen, ulen=0; 6931 uint32_t rack_seq; 6932 6933 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 6934 unsigned ipsec_optlen = 0; 6935 6936 #endif 6937 int32_t idle, sendalot; 6938 int32_t sub_from_prr = 0; 6939 volatile int32_t sack_rxmit; 6940 struct rack_sendmap *rsm = NULL; 6941 int32_t tso, mtu, would_have_fin = 0; 6942 struct tcpopt to; 6943 int32_t slot = 0; 6944 uint32_t cts; 6945 uint8_t hpts_calling, doing_tlp = 0; 6946 int32_t do_a_prefetch; 6947 int32_t prefetch_rsm = 0; 6948 int32_t prefetch_so_done = 0; 6949 struct tcp_log_buffer *lgb = NULL; 6950 struct inpcb *inp; 6951 struct sockbuf *sb; 6952 #ifdef INET6 6953 struct ip6_hdr *ip6 = NULL; 6954 int32_t isipv6; 6955 #endif 6956 /* setup and take the cache hits here */ 6957 rack = (struct tcp_rack *)tp->t_fb_ptr; 6958 inp = rack->rc_inp; 6959 so = inp->inp_socket; 6960 sb = &so->so_snd; 6961 kern_prefetch(sb, &do_a_prefetch); 6962 do_a_prefetch = 1; 6963 6964 INP_WLOCK_ASSERT(inp); 6965 #ifdef TCP_OFFLOAD 6966 if (tp->t_flags & TF_TOE) 6967 return (tcp_offload_output(tp)); 6968 #endif 6969 6970 /* 6971 * For TFO connections in SYN_RECEIVED, only allow the initial 6972 * SYN|ACK and those sent by the retransmit timer. 6973 */ 6974 if (IS_FASTOPEN(tp->t_flags) && 6975 (tp->t_state == TCPS_SYN_RECEIVED) && 6976 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 6977 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 6978 return (0); 6979 #ifdef INET6 6980 if (rack->r_state) { 6981 /* Use the cache line loaded if possible */ 6982 isipv6 = rack->r_is_v6; 6983 } else { 6984 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 6985 } 6986 #endif 6987 cts = tcp_ts_getticks(); 6988 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 6989 inp->inp_in_hpts) { 6990 /* 6991 * We are on the hpts for some timer but not hptsi output. 6992 * Remove from the hpts unconditionally. 6993 */ 6994 rack_timer_cancel(tp, rack, cts, __LINE__); 6995 } 6996 /* Mark that we have called rack_output(). */ 6997 if ((rack->r_timer_override) || 6998 (tp->t_flags & TF_FORCEDATA) || 6999 (tp->t_state < TCPS_ESTABLISHED)) { 7000 if (tp->t_inpcb->inp_in_hpts) 7001 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 7002 } else if (tp->t_inpcb->inp_in_hpts) { 7003 /* 7004 * On the hpts you can't pass even if ACKNOW is on, we will 7005 * when the hpts fires. 7006 */ 7007 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 7008 return (0); 7009 } 7010 hpts_calling = inp->inp_hpts_calls; 7011 inp->inp_hpts_calls = 0; 7012 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7013 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 7014 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 7015 return (0); 7016 } 7017 } 7018 rack->r_wanted_output = 0; 7019 rack->r_timer_override = 0; 7020 /* 7021 * Determine length of data that should be transmitted, and flags 7022 * that will be used. If there is some data or critical controls 7023 * (SYN, RST) to send, then transmit; otherwise, investigate 7024 * further. 7025 */ 7026 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 7027 #ifdef NETFLIX_CWV 7028 if (tp->cwv_enabled) { 7029 if ((tp->cwv_cwnd_valid == 0) && 7030 TCPS_HAVEESTABLISHED(tp->t_state) && 7031 (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) 7032 tcp_newcwv_nvp_closedown(tp); 7033 } else 7034 #endif 7035 if (tp->t_idle_reduce) { 7036 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 7037 rack_cc_after_idle(tp, 7038 (rack->r_idle_reduce_largest ? 1 :0)); 7039 } 7040 tp->t_flags &= ~TF_LASTIDLE; 7041 if (idle) { 7042 if (tp->t_flags & TF_MORETOCOME) { 7043 tp->t_flags |= TF_LASTIDLE; 7044 idle = 0; 7045 } 7046 } 7047 again: 7048 /* 7049 * If we've recently taken a timeout, snd_max will be greater than 7050 * snd_nxt. There may be SACK information that allows us to avoid 7051 * resending already delivered data. Adjust snd_nxt accordingly. 7052 */ 7053 sendalot = 0; 7054 cts = tcp_ts_getticks(); 7055 tso = 0; 7056 mtu = 0; 7057 sb_offset = tp->snd_max - tp->snd_una; 7058 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 7059 7060 flags = tcp_outflags[tp->t_state]; 7061 /* 7062 * Send any SACK-generated retransmissions. If we're explicitly 7063 * trying to send out new data (when sendalot is 1), bypass this 7064 * function. If we retransmit in fast recovery mode, decrement 7065 * snd_cwnd, since we're replacing a (future) new transmission with 7066 * a retransmission now, and we previously incremented snd_cwnd in 7067 * tcp_input(). 7068 */ 7069 /* 7070 * Still in sack recovery , reset rxmit flag to zero. 7071 */ 7072 while (rack->rc_free_cnt < rack_free_cache) { 7073 rsm = rack_alloc(rack); 7074 if (rsm == NULL) { 7075 if (inp->inp_hpts_calls) 7076 /* Retry in a ms */ 7077 slot = 1; 7078 goto just_return_nolock; 7079 } 7080 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 7081 rack->rc_free_cnt++; 7082 rsm = NULL; 7083 } 7084 if (inp->inp_hpts_calls) 7085 inp->inp_hpts_calls = 0; 7086 sack_rxmit = 0; 7087 len = 0; 7088 rsm = NULL; 7089 if (flags & TH_RST) { 7090 SOCKBUF_LOCK(sb); 7091 goto send; 7092 } 7093 if (rack->r_ctl.rc_tlpsend) { 7094 /* Tail loss probe */ 7095 long cwin; 7096 long tlen; 7097 7098 doing_tlp = 1; 7099 rsm = rack->r_ctl.rc_tlpsend; 7100 rack->r_ctl.rc_tlpsend = NULL; 7101 sack_rxmit = 1; 7102 tlen = rsm->r_end - rsm->r_start; 7103 if (tlen > tp->t_maxseg) 7104 tlen = tp->t_maxseg; 7105 #ifdef INVARIANTS 7106 if (SEQ_GT(tp->snd_una, rsm->r_start)) { 7107 panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", 7108 tp, rack, tp->snd_una, rsm, rsm->r_start); 7109 } 7110 #endif 7111 sb_offset = rsm->r_start - tp->snd_una; 7112 cwin = min(tp->snd_wnd, tlen); 7113 len = cwin; 7114 } else if (rack->r_ctl.rc_resend) { 7115 /* Retransmit timer */ 7116 rsm = rack->r_ctl.rc_resend; 7117 rack->r_ctl.rc_resend = NULL; 7118 len = rsm->r_end - rsm->r_start; 7119 sack_rxmit = 1; 7120 sendalot = 0; 7121 sb_offset = rsm->r_start - tp->snd_una; 7122 if (len >= tp->t_maxseg) { 7123 len = tp->t_maxseg; 7124 } 7125 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7126 __func__, sb_offset)); 7127 } else if ((rack->rc_in_persist == 0) && 7128 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 7129 long tlen; 7130 7131 if ((!IN_RECOVERY(tp->t_flags)) && 7132 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 7133 /* Enter recovery if not induced by a time-out */ 7134 rack->r_ctl.rc_rsm_start = rsm->r_start; 7135 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7136 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7137 rack_cong_signal(tp, NULL, CC_NDUPACK); 7138 /* 7139 * When we enter recovery we need to assure we send 7140 * one packet. 7141 */ 7142 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 7143 } 7144 #ifdef INVARIANTS 7145 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 7146 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 7147 tp, rack, rsm, rsm->r_start, tp->snd_una); 7148 } 7149 #endif 7150 tlen = rsm->r_end - rsm->r_start; 7151 sb_offset = rsm->r_start - tp->snd_una; 7152 if (tlen > rack->r_ctl.rc_prr_sndcnt) { 7153 len = rack->r_ctl.rc_prr_sndcnt; 7154 } else { 7155 len = tlen; 7156 } 7157 if (len >= tp->t_maxseg) { 7158 sendalot = 1; 7159 len = tp->t_maxseg; 7160 } else { 7161 sendalot = 0; 7162 if ((rack->rc_timer_up == 0) && 7163 (len < tlen)) { 7164 /* 7165 * If its not a timer don't send a partial 7166 * segment. 7167 */ 7168 len = 0; 7169 goto just_return_nolock; 7170 } 7171 } 7172 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7173 __func__, sb_offset)); 7174 if (len > 0) { 7175 sub_from_prr = 1; 7176 sack_rxmit = 1; 7177 TCPSTAT_INC(tcps_sack_rexmits); 7178 TCPSTAT_ADD(tcps_sack_rexmit_bytes, 7179 min(len, tp->t_maxseg)); 7180 counter_u64_add(rack_rtm_prr_retran, 1); 7181 } 7182 } 7183 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 7184 /* we are retransmitting the fin */ 7185 len--; 7186 if (len) { 7187 /* 7188 * When retransmitting data do *not* include the 7189 * FIN. This could happen from a TLP probe. 7190 */ 7191 flags &= ~TH_FIN; 7192 } 7193 } 7194 #ifdef INVARIANTS 7195 /* For debugging */ 7196 rack->r_ctl.rc_rsm_at_retran = rsm; 7197 #endif 7198 /* 7199 * Get standard flags, and add SYN or FIN if requested by 'hidden' 7200 * state flags. 7201 */ 7202 if (tp->t_flags & TF_NEEDFIN) 7203 flags |= TH_FIN; 7204 if (tp->t_flags & TF_NEEDSYN) 7205 flags |= TH_SYN; 7206 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 7207 void *end_rsm; 7208 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 7209 if (end_rsm) 7210 kern_prefetch(end_rsm, &prefetch_rsm); 7211 prefetch_rsm = 1; 7212 } 7213 SOCKBUF_LOCK(sb); 7214 /* 7215 * If in persist timeout with window of 0, send 1 byte. Otherwise, 7216 * if window is small but nonzero and time TF_SENTFIN expired, we 7217 * will send what we can and go to transmit state. 7218 */ 7219 if (tp->t_flags & TF_FORCEDATA) { 7220 if (sendwin == 0) { 7221 /* 7222 * If we still have some data to send, then clear 7223 * the FIN bit. Usually this would happen below 7224 * when it realizes that we aren't sending all the 7225 * data. However, if we have exactly 1 byte of 7226 * unsent data, then it won't clear the FIN bit 7227 * below, and if we are in persist state, we wind up 7228 * sending the packet without recording that we sent 7229 * the FIN bit. 7230 * 7231 * We can't just blindly clear the FIN bit, because 7232 * if we don't have any more data to send then the 7233 * probe will be the FIN itself. 7234 */ 7235 if (sb_offset < sbused(sb)) 7236 flags &= ~TH_FIN; 7237 sendwin = 1; 7238 } else { 7239 if (rack->rc_in_persist) 7240 rack_exit_persist(tp, rack); 7241 /* 7242 * If we are dropping persist mode then we need to 7243 * correct snd_nxt/snd_max and off. 7244 */ 7245 tp->snd_nxt = tp->snd_max; 7246 sb_offset = tp->snd_nxt - tp->snd_una; 7247 } 7248 } 7249 /* 7250 * If snd_nxt == snd_max and we have transmitted a FIN, the 7251 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 7252 * negative length. This can also occur when TCP opens up its 7253 * congestion window while receiving additional duplicate acks after 7254 * fast-retransmit because TCP will reset snd_nxt to snd_max after 7255 * the fast-retransmit. 7256 * 7257 * In the normal retransmit-FIN-only case, however, snd_nxt will be 7258 * set to snd_una, the sb_offset will be 0, and the length may wind 7259 * up 0. 7260 * 7261 * If sack_rxmit is true we are retransmitting from the scoreboard 7262 * in which case len is already set. 7263 */ 7264 if (sack_rxmit == 0) { 7265 uint32_t avail; 7266 7267 avail = sbavail(sb); 7268 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 7269 sb_offset = tp->snd_nxt - tp->snd_una; 7270 else 7271 sb_offset = 0; 7272 if (IN_RECOVERY(tp->t_flags) == 0) { 7273 if (rack->r_ctl.rc_tlp_new_data) { 7274 /* TLP is forcing out new data */ 7275 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 7276 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 7277 } 7278 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 7279 len = tp->snd_wnd; 7280 else 7281 len = rack->r_ctl.rc_tlp_new_data; 7282 rack->r_ctl.rc_tlp_new_data = 0; 7283 doing_tlp = 1; 7284 } else { 7285 if (sendwin > avail) { 7286 /* use the available */ 7287 if (avail > sb_offset) { 7288 len = (int32_t)(avail - sb_offset); 7289 } else { 7290 len = 0; 7291 } 7292 } else { 7293 if (sendwin > sb_offset) { 7294 len = (int32_t)(sendwin - sb_offset); 7295 } else { 7296 len = 0; 7297 } 7298 } 7299 } 7300 } else { 7301 uint32_t outstanding; 7302 7303 /* 7304 * We are inside of a SACK recovery episode and are 7305 * sending new data, having retransmitted all the 7306 * data possible so far in the scoreboard. 7307 */ 7308 outstanding = tp->snd_max - tp->snd_una; 7309 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) 7310 len = 0; 7311 else if (avail > sb_offset) 7312 len = avail - sb_offset; 7313 else 7314 len = 0; 7315 if (len > 0) { 7316 if (len > rack->r_ctl.rc_prr_sndcnt) 7317 len = rack->r_ctl.rc_prr_sndcnt; 7318 7319 if (len > 0) { 7320 sub_from_prr = 1; 7321 counter_u64_add(rack_rtm_prr_newdata, 1); 7322 } 7323 } 7324 if (len > tp->t_maxseg) { 7325 /* 7326 * We should never send more than a MSS when 7327 * retransmitting or sending new data in prr 7328 * mode unless the override flag is on. Most 7329 * likely the PRR algorithm is not going to 7330 * let us send a lot as well :-) 7331 */ 7332 if (rack->r_ctl.rc_prr_sendalot == 0) 7333 len = tp->t_maxseg; 7334 } else if (len < tp->t_maxseg) { 7335 /* 7336 * Do we send any? The idea here is if the 7337 * send empty's the socket buffer we want to 7338 * do it. However if not then lets just wait 7339 * for our prr_sndcnt to get bigger. 7340 */ 7341 long leftinsb; 7342 7343 leftinsb = sbavail(sb) - sb_offset; 7344 if (leftinsb > len) { 7345 /* This send does not empty the sb */ 7346 len = 0; 7347 } 7348 } 7349 } 7350 } 7351 if (prefetch_so_done == 0) { 7352 kern_prefetch(so, &prefetch_so_done); 7353 prefetch_so_done = 1; 7354 } 7355 /* 7356 * Lop off SYN bit if it has already been sent. However, if this is 7357 * SYN-SENT state and if segment contains data and if we don't know 7358 * that foreign host supports TAO, suppress sending segment. 7359 */ 7360 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 7361 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 7362 if (tp->t_state != TCPS_SYN_RECEIVED) 7363 flags &= ~TH_SYN; 7364 /* 7365 * When sending additional segments following a TFO SYN|ACK, 7366 * do not include the SYN bit. 7367 */ 7368 if (IS_FASTOPEN(tp->t_flags) && 7369 (tp->t_state == TCPS_SYN_RECEIVED)) 7370 flags &= ~TH_SYN; 7371 sb_offset--, len++; 7372 } 7373 /* 7374 * Be careful not to send data and/or FIN on SYN segments. This 7375 * measure is needed to prevent interoperability problems with not 7376 * fully conformant TCP implementations. 7377 */ 7378 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 7379 len = 0; 7380 flags &= ~TH_FIN; 7381 } 7382 /* 7383 * On TFO sockets, ensure no data is sent in the following cases: 7384 * 7385 * - When retransmitting SYN|ACK on a passively-created socket 7386 * 7387 * - When retransmitting SYN on an actively created socket 7388 * 7389 * - When sending a zero-length cookie (cookie request) on an 7390 * actively created socket 7391 * 7392 * - When the socket is in the CLOSED state (RST is being sent) 7393 */ 7394 if (IS_FASTOPEN(tp->t_flags) && 7395 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 7396 ((tp->t_state == TCPS_SYN_SENT) && 7397 (tp->t_tfo_client_cookie_len == 0)) || 7398 (flags & TH_RST))) 7399 len = 0; 7400 /* Without fast-open there should never be data sent on a SYN */ 7401 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) 7402 len = 0; 7403 if (len <= 0) { 7404 /* 7405 * If FIN has been sent but not acked, but we haven't been 7406 * called to retransmit, len will be < 0. Otherwise, window 7407 * shrank after we sent into it. If window shrank to 0, 7408 * cancel pending retransmit, pull snd_nxt back to (closed) 7409 * window, and set the persist timer if it isn't already 7410 * going. If the window didn't close completely, just wait 7411 * for an ACK. 7412 * 7413 * We also do a general check here to ensure that we will 7414 * set the persist timer when we have data to send, but a 7415 * 0-byte window. This makes sure the persist timer is set 7416 * even if the packet hits one of the "goto send" lines 7417 * below. 7418 */ 7419 len = 0; 7420 if ((tp->snd_wnd == 0) && 7421 (TCPS_HAVEESTABLISHED(tp->t_state)) && 7422 (sb_offset < (int)sbavail(sb))) { 7423 tp->snd_nxt = tp->snd_una; 7424 rack_enter_persist(tp, rack, cts); 7425 } 7426 } 7427 /* len will be >= 0 after this point. */ 7428 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7429 tcp_sndbuf_autoscale(tp, so, sendwin); 7430 /* 7431 * Decide if we can use TCP Segmentation Offloading (if supported by 7432 * hardware). 7433 * 7434 * TSO may only be used if we are in a pure bulk sending state. The 7435 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 7436 * options prevent using TSO. With TSO the TCP header is the same 7437 * (except for the sequence number) for all generated packets. This 7438 * makes it impossible to transmit any options which vary per 7439 * generated segment or packet. 7440 * 7441 * IPv4 handling has a clear separation of ip options and ip header 7442 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 7443 * the right thing below to provide length of just ip options and thus 7444 * checking for ipoptlen is enough to decide if ip options are present. 7445 */ 7446 7447 #ifdef INET6 7448 if (isipv6) 7449 ipoptlen = ip6_optlen(tp->t_inpcb); 7450 else 7451 #endif 7452 if (tp->t_inpcb->inp_options) 7453 ipoptlen = tp->t_inpcb->inp_options->m_len - 7454 offsetof(struct ipoption, ipopt_list); 7455 else 7456 ipoptlen = 0; 7457 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7458 /* 7459 * Pre-calculate here as we save another lookup into the darknesses 7460 * of IPsec that way and can actually decide if TSO is ok. 7461 */ 7462 #ifdef INET6 7463 if (isipv6 && IPSEC_ENABLED(ipv6)) 7464 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 7465 #ifdef INET 7466 else 7467 #endif 7468 #endif /* INET6 */ 7469 #ifdef INET 7470 if (IPSEC_ENABLED(ipv4)) 7471 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 7472 #endif /* INET */ 7473 #endif 7474 7475 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7476 ipoptlen += ipsec_optlen; 7477 #endif 7478 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 7479 (tp->t_port == 0) && 7480 ((tp->t_flags & TF_SIGNATURE) == 0) && 7481 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 7482 ipoptlen == 0) 7483 tso = 1; 7484 { 7485 uint32_t outstanding; 7486 7487 outstanding = tp->snd_max - tp->snd_una; 7488 if (tp->t_flags & TF_SENTFIN) { 7489 /* 7490 * If we sent a fin, snd_max is 1 higher than 7491 * snd_una 7492 */ 7493 outstanding--; 7494 } 7495 if (outstanding > 0) { 7496 /* 7497 * This is sub-optimal. We only send a stand alone 7498 * FIN on its own segment. 7499 */ 7500 if (flags & TH_FIN) { 7501 flags &= ~TH_FIN; 7502 would_have_fin = 1; 7503 } 7504 } else if (sack_rxmit) { 7505 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 7506 flags &= ~TH_FIN; 7507 } else { 7508 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 7509 sbused(sb))) 7510 flags &= ~TH_FIN; 7511 } 7512 } 7513 recwin = sbspace(&so->so_rcv); 7514 7515 /* 7516 * Sender silly window avoidance. We transmit under the following 7517 * conditions when len is non-zero: 7518 * 7519 * - We have a full segment (or more with TSO) - This is the last 7520 * buffer in a write()/send() and we are either idle or running 7521 * NODELAY - we've timed out (e.g. persist timer) - we have more 7522 * then 1/2 the maximum send window's worth of data (receiver may be 7523 * limited the window size) - we need to retransmit 7524 */ 7525 if (len) { 7526 if (len >= tp->t_maxseg) { 7527 pass = 1; 7528 goto send; 7529 } 7530 /* 7531 * NOTE! on localhost connections an 'ack' from the remote 7532 * end may occur synchronously with the output and cause us 7533 * to flush a buffer queued with moretocome. XXX 7534 * 7535 */ 7536 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 7537 (idle || (tp->t_flags & TF_NODELAY)) && 7538 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 7539 (tp->t_flags & TF_NOPUSH) == 0) { 7540 pass = 2; 7541 goto send; 7542 } 7543 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ 7544 pass = 3; 7545 goto send; 7546 } 7547 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 7548 goto send; 7549 } 7550 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 7551 pass = 4; 7552 goto send; 7553 } 7554 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 7555 pass = 5; 7556 goto send; 7557 } 7558 if (sack_rxmit) { 7559 pass = 6; 7560 goto send; 7561 } 7562 } 7563 /* 7564 * Sending of standalone window updates. 7565 * 7566 * Window updates are important when we close our window due to a 7567 * full socket buffer and are opening it again after the application 7568 * reads data from it. Once the window has opened again and the 7569 * remote end starts to send again the ACK clock takes over and 7570 * provides the most current window information. 7571 * 7572 * We must avoid the silly window syndrome whereas every read from 7573 * the receive buffer, no matter how small, causes a window update 7574 * to be sent. We also should avoid sending a flurry of window 7575 * updates when the socket buffer had queued a lot of data and the 7576 * application is doing small reads. 7577 * 7578 * Prevent a flurry of pointless window updates by only sending an 7579 * update when we can increase the advertized window by more than 7580 * 1/4th of the socket buffer capacity. When the buffer is getting 7581 * full or is very small be more aggressive and send an update 7582 * whenever we can increase by two mss sized segments. In all other 7583 * situations the ACK's to new incoming data will carry further 7584 * window increases. 7585 * 7586 * Don't send an independent window update if a delayed ACK is 7587 * pending (it will get piggy-backed on it) or the remote side 7588 * already has done a half-close and won't send more data. Skip 7589 * this if the connection is in T/TCP half-open state. 7590 */ 7591 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 7592 !(tp->t_flags & TF_DELACK) && 7593 !TCPS_HAVERCVDFIN(tp->t_state)) { 7594 /* 7595 * "adv" is the amount we could increase the window, taking 7596 * into account that we are limited by TCP_MAXWIN << 7597 * tp->rcv_scale. 7598 */ 7599 int32_t adv; 7600 int oldwin; 7601 7602 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 7603 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 7604 oldwin = (tp->rcv_adv - tp->rcv_nxt); 7605 adv -= oldwin; 7606 } else 7607 oldwin = 0; 7608 7609 /* 7610 * If the new window size ends up being the same as the old 7611 * size when it is scaled, then don't force a window update. 7612 */ 7613 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 7614 goto dontupdate; 7615 7616 if (adv >= (int32_t)(2 * tp->t_maxseg) && 7617 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 7618 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 7619 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { 7620 pass = 7; 7621 goto send; 7622 } 7623 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 7624 goto send; 7625 } 7626 dontupdate: 7627 7628 /* 7629 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 7630 * is also a catch-all for the retransmit timer timeout case. 7631 */ 7632 if (tp->t_flags & TF_ACKNOW) { 7633 pass = 8; 7634 goto send; 7635 } 7636 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 7637 pass = 9; 7638 goto send; 7639 } 7640 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 7641 pass = 10; 7642 goto send; 7643 } 7644 /* 7645 * If our state indicates that FIN should be sent and we have not 7646 * yet done so, then we need to send. 7647 */ 7648 if (flags & TH_FIN) { 7649 if ((tp->t_flags & TF_SENTFIN) || 7650 (((tp->t_flags & TF_SENTFIN) == 0) && 7651 (tp->snd_nxt == tp->snd_una))) { 7652 pass = 11; 7653 goto send; 7654 } 7655 } 7656 /* 7657 * No reason to send a segment, just return. 7658 */ 7659 just_return: 7660 SOCKBUF_UNLOCK(sb); 7661 just_return_nolock: 7662 if (tot_len_this_send == 0) 7663 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 7664 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 7665 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 7666 tp->t_flags &= ~TF_FORCEDATA; 7667 return (0); 7668 7669 send: 7670 if (doing_tlp == 0) { 7671 /* 7672 * Data not a TLP, and its not the rxt firing. If it is the 7673 * rxt firing, we want to leave the tlp_in_progress flag on 7674 * so we don't send another TLP. It has to be a rack timer 7675 * or normal send (response to acked data) to clear the tlp 7676 * in progress flag. 7677 */ 7678 rack->rc_tlp_in_progress = 0; 7679 } 7680 SOCKBUF_LOCK_ASSERT(sb); 7681 if (len > 0) { 7682 if (len >= tp->t_maxseg) 7683 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 7684 else 7685 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 7686 } 7687 /* 7688 * Before ESTABLISHED, force sending of initial options unless TCP 7689 * set not to do any options. NOTE: we assume that the IP/TCP header 7690 * plus TCP options always fit in a single mbuf, leaving room for a 7691 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 7692 * + optlen <= MCLBYTES 7693 */ 7694 optlen = 0; 7695 #ifdef INET6 7696 if (isipv6) 7697 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 7698 else 7699 #endif 7700 hdrlen = sizeof(struct tcpiphdr); 7701 7702 /* 7703 * Compute options for segment. We only have to care about SYN and 7704 * established connection segments. Options for SYN-ACK segments 7705 * are handled in TCP syncache. 7706 */ 7707 to.to_flags = 0; 7708 if ((tp->t_flags & TF_NOOPT) == 0) { 7709 /* Maximum segment size. */ 7710 if (flags & TH_SYN) { 7711 tp->snd_nxt = tp->iss; 7712 to.to_mss = tcp_mssopt(&inp->inp_inc); 7713 #ifdef NETFLIX_TCPOUDP 7714 if (tp->t_port) 7715 to.to_mss -= V_tcp_udp_tunneling_overhead; 7716 #endif 7717 to.to_flags |= TOF_MSS; 7718 7719 /* 7720 * On SYN or SYN|ACK transmits on TFO connections, 7721 * only include the TFO option if it is not a 7722 * retransmit, as the presence of the TFO option may 7723 * have caused the original SYN or SYN|ACK to have 7724 * been dropped by a middlebox. 7725 */ 7726 if (IS_FASTOPEN(tp->t_flags) && 7727 (tp->t_rxtshift == 0)) { 7728 if (tp->t_state == TCPS_SYN_RECEIVED) { 7729 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 7730 to.to_tfo_cookie = 7731 (u_int8_t *)&tp->t_tfo_cookie.server; 7732 to.to_flags |= TOF_FASTOPEN; 7733 wanted_cookie = 1; 7734 } else if (tp->t_state == TCPS_SYN_SENT) { 7735 to.to_tfo_len = 7736 tp->t_tfo_client_cookie_len; 7737 to.to_tfo_cookie = 7738 tp->t_tfo_cookie.client; 7739 to.to_flags |= TOF_FASTOPEN; 7740 wanted_cookie = 1; 7741 /* 7742 * If we wind up having more data to 7743 * send with the SYN than can fit in 7744 * one segment, don't send any more 7745 * until the SYN|ACK comes back from 7746 * the other end. 7747 */ 7748 sendalot = 0; 7749 } 7750 } 7751 } 7752 /* Window scaling. */ 7753 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 7754 to.to_wscale = tp->request_r_scale; 7755 to.to_flags |= TOF_SCALE; 7756 } 7757 /* Timestamps. */ 7758 if ((tp->t_flags & TF_RCVD_TSTMP) || 7759 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 7760 to.to_tsval = cts + tp->ts_offset; 7761 to.to_tsecr = tp->ts_recent; 7762 to.to_flags |= TOF_TS; 7763 } 7764 /* Set receive buffer autosizing timestamp. */ 7765 if (tp->rfbuf_ts == 0 && 7766 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 7767 tp->rfbuf_ts = tcp_ts_getticks(); 7768 /* Selective ACK's. */ 7769 if (flags & TH_SYN) 7770 to.to_flags |= TOF_SACKPERM; 7771 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 7772 tp->rcv_numsacks > 0) { 7773 to.to_flags |= TOF_SACK; 7774 to.to_nsacks = tp->rcv_numsacks; 7775 to.to_sacks = (u_char *)tp->sackblks; 7776 } 7777 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 7778 /* TCP-MD5 (RFC2385). */ 7779 if (tp->t_flags & TF_SIGNATURE) 7780 to.to_flags |= TOF_SIGNATURE; 7781 #endif /* TCP_SIGNATURE */ 7782 7783 /* Processing the options. */ 7784 hdrlen += optlen = tcp_addoptions(&to, opt); 7785 /* 7786 * If we wanted a TFO option to be added, but it was unable 7787 * to fit, ensure no data is sent. 7788 */ 7789 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 7790 !(to.to_flags & TOF_FASTOPEN)) 7791 len = 0; 7792 } 7793 #ifdef NETFLIX_TCPOUDP 7794 if (tp->t_port) { 7795 if (V_tcp_udp_tunneling_port == 0) { 7796 /* The port was removed?? */ 7797 SOCKBUF_UNLOCK(&so->so_snd); 7798 return (EHOSTUNREACH); 7799 } 7800 hdrlen += sizeof(struct udphdr); 7801 } 7802 #endif 7803 ipoptlen = 0; 7804 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7805 ipoptlen += ipsec_optlen; 7806 #endif 7807 7808 /* 7809 * Adjust data length if insertion of options will bump the packet 7810 * length beyond the t_maxseg length. Clear the FIN bit because we 7811 * cut off the tail of the segment. 7812 */ 7813 if (len + optlen + ipoptlen > tp->t_maxseg) { 7814 if (flags & TH_FIN) { 7815 would_have_fin = 1; 7816 flags &= ~TH_FIN; 7817 } 7818 if (tso) { 7819 uint32_t if_hw_tsomax; 7820 uint32_t moff; 7821 int32_t max_len; 7822 7823 /* extract TSO information */ 7824 if_hw_tsomax = tp->t_tsomax; 7825 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 7826 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 7827 KASSERT(ipoptlen == 0, 7828 ("%s: TSO can't do IP options", __func__)); 7829 7830 /* 7831 * Check if we should limit by maximum payload 7832 * length: 7833 */ 7834 if (if_hw_tsomax != 0) { 7835 /* compute maximum TSO length */ 7836 max_len = (if_hw_tsomax - hdrlen - 7837 max_linkhdr); 7838 if (max_len <= 0) { 7839 len = 0; 7840 } else if (len > max_len) { 7841 sendalot = 1; 7842 len = max_len; 7843 } 7844 } 7845 /* 7846 * Prevent the last segment from being fractional 7847 * unless the send sockbuf can be emptied: 7848 */ 7849 max_len = (tp->t_maxseg - optlen); 7850 if ((sb_offset + len) < sbavail(sb)) { 7851 moff = len % (u_int)max_len; 7852 if (moff != 0) { 7853 len -= moff; 7854 sendalot = 1; 7855 } 7856 } 7857 /* 7858 * In case there are too many small fragments don't 7859 * use TSO: 7860 */ 7861 if (len <= max_len) { 7862 len = max_len; 7863 sendalot = 1; 7864 tso = 0; 7865 } 7866 /* 7867 * Send the FIN in a separate segment after the bulk 7868 * sending is done. We don't trust the TSO 7869 * implementations to clear the FIN flag on all but 7870 * the last segment. 7871 */ 7872 if (tp->t_flags & TF_NEEDFIN) 7873 sendalot = 1; 7874 7875 } else { 7876 len = tp->t_maxseg - optlen - ipoptlen; 7877 sendalot = 1; 7878 } 7879 } else 7880 tso = 0; 7881 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 7882 ("%s: len > IP_MAXPACKET", __func__)); 7883 #ifdef DIAGNOSTIC 7884 #ifdef INET6 7885 if (max_linkhdr + hdrlen > MCLBYTES) 7886 #else 7887 if (max_linkhdr + hdrlen > MHLEN) 7888 #endif 7889 panic("tcphdr too big"); 7890 #endif 7891 7892 /* 7893 * This KASSERT is here to catch edge cases at a well defined place. 7894 * Before, those had triggered (random) panic conditions further 7895 * down. 7896 */ 7897 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7898 if ((len == 0) && 7899 (flags & TH_FIN) && 7900 (sbused(sb))) { 7901 /* 7902 * We have outstanding data, don't send a fin by itself!. 7903 */ 7904 goto just_return; 7905 } 7906 /* 7907 * Grab a header mbuf, attaching a copy of data to be transmitted, 7908 * and initialize the header from the template for sends on this 7909 * connection. 7910 */ 7911 if (len) { 7912 uint32_t max_val; 7913 uint32_t moff; 7914 7915 if (rack->rc_pace_max_segs) 7916 max_val = rack->rc_pace_max_segs * tp->t_maxseg; 7917 else 7918 max_val = len; 7919 /* 7920 * We allow a limit on sending with hptsi. 7921 */ 7922 if (len > max_val) { 7923 len = max_val; 7924 } 7925 #ifdef INET6 7926 if (MHLEN < hdrlen + max_linkhdr) 7927 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 7928 else 7929 #endif 7930 m = m_gethdr(M_NOWAIT, MT_DATA); 7931 7932 if (m == NULL) { 7933 SOCKBUF_UNLOCK(sb); 7934 error = ENOBUFS; 7935 sack_rxmit = 0; 7936 goto out; 7937 } 7938 m->m_data += max_linkhdr; 7939 m->m_len = hdrlen; 7940 7941 /* 7942 * Start the m_copy functions from the closest mbuf to the 7943 * sb_offset in the socket buffer chain. 7944 */ 7945 mb = sbsndptr_noadv(sb, sb_offset, &moff); 7946 if (len <= MHLEN - hdrlen - max_linkhdr) { 7947 m_copydata(mb, moff, (int)len, 7948 mtod(m, caddr_t)+hdrlen); 7949 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7950 sbsndptr_adv(sb, mb, len); 7951 m->m_len += len; 7952 } else { 7953 struct sockbuf *msb; 7954 7955 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7956 msb = NULL; 7957 else 7958 msb = sb; 7959 m->m_next = tcp_m_copym(mb, moff, &len, 7960 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); 7961 if (len <= (tp->t_maxseg - optlen)) { 7962 /* 7963 * Must have ran out of mbufs for the copy 7964 * shorten it to no longer need tso. Lets 7965 * not put on sendalot since we are low on 7966 * mbufs. 7967 */ 7968 tso = 0; 7969 } 7970 if (m->m_next == NULL) { 7971 SOCKBUF_UNLOCK(sb); 7972 (void)m_free(m); 7973 error = ENOBUFS; 7974 sack_rxmit = 0; 7975 goto out; 7976 } 7977 } 7978 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 7979 TCPSTAT_INC(tcps_sndprobe); 7980 #ifdef NETFLIX_STATS 7981 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7982 stats_voi_update_abs_u32(tp->t_stats, 7983 VOI_TCP_RETXPB, len); 7984 else 7985 stats_voi_update_abs_u64(tp->t_stats, 7986 VOI_TCP_TXPB, len); 7987 #endif 7988 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 7989 if (rsm && (rsm->r_flags & RACK_TLP)) { 7990 /* 7991 * TLP should not count in retran count, but 7992 * in its own bin 7993 */ 7994 counter_u64_add(rack_tlp_retran, 1); 7995 counter_u64_add(rack_tlp_retran_bytes, len); 7996 } else { 7997 tp->t_sndrexmitpack++; 7998 TCPSTAT_INC(tcps_sndrexmitpack); 7999 TCPSTAT_ADD(tcps_sndrexmitbyte, len); 8000 } 8001 #ifdef NETFLIX_STATS 8002 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 8003 len); 8004 #endif 8005 } else { 8006 TCPSTAT_INC(tcps_sndpack); 8007 TCPSTAT_ADD(tcps_sndbyte, len); 8008 #ifdef NETFLIX_STATS 8009 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 8010 len); 8011 #endif 8012 } 8013 /* 8014 * If we're sending everything we've got, set PUSH. (This 8015 * will keep happy those implementations which only give 8016 * data to the user when a buffer fills or a PUSH comes in.) 8017 */ 8018 if (sb_offset + len == sbused(sb) && 8019 sbused(sb) && 8020 !(flags & TH_SYN)) 8021 flags |= TH_PUSH; 8022 8023 /* 8024 * Are we doing hptsi, if so we must calculate the slot. We 8025 * only do hptsi in ESTABLISHED and with no RESET being 8026 * sent where we have data to send. 8027 */ 8028 if (((tp->t_state == TCPS_ESTABLISHED) || 8029 (tp->t_state == TCPS_CLOSE_WAIT) || 8030 ((tp->t_state == TCPS_FIN_WAIT_1) && 8031 ((tp->t_flags & TF_SENTFIN) == 0) && 8032 ((flags & TH_FIN) == 0))) && 8033 ((flags & TH_RST) == 0) && 8034 (rack->rc_always_pace)) { 8035 /* 8036 * We use the most optimistic possible cwnd/srtt for 8037 * sending calculations. This will make our 8038 * calculation anticipate getting more through 8039 * quicker then possible. But thats ok we don't want 8040 * the peer to have a gap in data sending. 8041 */ 8042 uint32_t srtt, cwnd, tr_perms = 0; 8043 8044 if (rack->r_ctl.rc_rack_min_rtt) 8045 srtt = rack->r_ctl.rc_rack_min_rtt; 8046 else 8047 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8048 if (rack->r_ctl.rc_rack_largest_cwnd) 8049 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8050 else 8051 cwnd = tp->snd_cwnd; 8052 tr_perms = cwnd / srtt; 8053 if (tr_perms == 0) { 8054 tr_perms = tp->t_maxseg; 8055 } 8056 tot_len_this_send += len; 8057 /* 8058 * Calculate how long this will take to drain, if 8059 * the calculation comes out to zero, thats ok we 8060 * will use send_a_lot to possibly spin around for 8061 * more increasing tot_len_this_send to the point 8062 * that its going to require a pace, or we hit the 8063 * cwnd. Which in that case we are just waiting for 8064 * a ACK. 8065 */ 8066 slot = tot_len_this_send / tr_perms; 8067 /* Now do we reduce the time so we don't run dry? */ 8068 if (slot && rack->rc_pace_reduce) { 8069 int32_t reduce; 8070 8071 reduce = (slot / rack->rc_pace_reduce); 8072 if (reduce < slot) { 8073 slot -= reduce; 8074 } else 8075 slot = 0; 8076 } 8077 if (rack->r_enforce_min_pace && 8078 (slot == 0) && 8079 (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { 8080 /* We are enforcing a minimum pace time of 1ms */ 8081 slot = rack->r_enforce_min_pace; 8082 } 8083 } 8084 SOCKBUF_UNLOCK(sb); 8085 } else { 8086 SOCKBUF_UNLOCK(sb); 8087 if (tp->t_flags & TF_ACKNOW) 8088 TCPSTAT_INC(tcps_sndacks); 8089 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 8090 TCPSTAT_INC(tcps_sndctrl); 8091 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 8092 TCPSTAT_INC(tcps_sndurg); 8093 else 8094 TCPSTAT_INC(tcps_sndwinup); 8095 8096 m = m_gethdr(M_NOWAIT, MT_DATA); 8097 if (m == NULL) { 8098 error = ENOBUFS; 8099 sack_rxmit = 0; 8100 goto out; 8101 } 8102 #ifdef INET6 8103 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 8104 MHLEN >= hdrlen) { 8105 M_ALIGN(m, hdrlen); 8106 } else 8107 #endif 8108 m->m_data += max_linkhdr; 8109 m->m_len = hdrlen; 8110 } 8111 SOCKBUF_UNLOCK_ASSERT(sb); 8112 m->m_pkthdr.rcvif = (struct ifnet *)0; 8113 #ifdef MAC 8114 mac_inpcb_create_mbuf(inp, m); 8115 #endif 8116 #ifdef INET6 8117 if (isipv6) { 8118 ip6 = mtod(m, struct ip6_hdr *); 8119 #ifdef NETFLIX_TCPOUDP 8120 if (tp->t_port) { 8121 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 8122 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8123 udp->uh_dport = tp->t_port; 8124 ulen = hdrlen + len - sizeof(struct ip6_hdr); 8125 udp->uh_ulen = htons(ulen); 8126 th = (struct tcphdr *)(udp + 1); 8127 } else 8128 #endif 8129 th = (struct tcphdr *)(ip6 + 1); 8130 tcpip_fillheaders(inp, ip6, th); 8131 } else 8132 #endif /* INET6 */ 8133 { 8134 ip = mtod(m, struct ip *); 8135 #ifdef TCPDEBUG 8136 ipov = (struct ipovly *)ip; 8137 #endif 8138 #ifdef NETFLIX_TCPOUDP 8139 if (tp->t_port) { 8140 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 8141 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8142 udp->uh_dport = tp->t_port; 8143 ulen = hdrlen + len - sizeof(struct ip); 8144 udp->uh_ulen = htons(ulen); 8145 th = (struct tcphdr *)(udp + 1); 8146 } else 8147 #endif 8148 th = (struct tcphdr *)(ip + 1); 8149 tcpip_fillheaders(inp, ip, th); 8150 } 8151 /* 8152 * Fill in fields, remembering maximum advertised window for use in 8153 * delaying messages about window sizes. If resending a FIN, be sure 8154 * not to use a new sequence number. 8155 */ 8156 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 8157 tp->snd_nxt == tp->snd_max) 8158 tp->snd_nxt--; 8159 /* 8160 * If we are starting a connection, send ECN setup SYN packet. If we 8161 * are on a retransmit, we may resend those bits a number of times 8162 * as per RFC 3168. 8163 */ 8164 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 8165 if (tp->t_rxtshift >= 1) { 8166 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 8167 flags |= TH_ECE | TH_CWR; 8168 } else 8169 flags |= TH_ECE | TH_CWR; 8170 } 8171 if (tp->t_state == TCPS_ESTABLISHED && 8172 (tp->t_flags & TF_ECN_PERMIT)) { 8173 /* 8174 * If the peer has ECN, mark data packets with ECN capable 8175 * transmission (ECT). Ignore pure ack packets, 8176 * retransmissions and window probes. 8177 */ 8178 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 8179 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 8180 #ifdef INET6 8181 if (isipv6) 8182 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 8183 else 8184 #endif 8185 ip->ip_tos |= IPTOS_ECN_ECT0; 8186 TCPSTAT_INC(tcps_ecn_ect0); 8187 } 8188 /* 8189 * Reply with proper ECN notifications. 8190 */ 8191 if (tp->t_flags & TF_ECN_SND_CWR) { 8192 flags |= TH_CWR; 8193 tp->t_flags &= ~TF_ECN_SND_CWR; 8194 } 8195 if (tp->t_flags & TF_ECN_SND_ECE) 8196 flags |= TH_ECE; 8197 } 8198 /* 8199 * If we are doing retransmissions, then snd_nxt will not reflect 8200 * the first unsent octet. For ACK only packets, we do not want the 8201 * sequence number of the retransmitted packet, we want the sequence 8202 * number of the next unsent octet. So, if there is no data (and no 8203 * SYN or FIN), use snd_max instead of snd_nxt when filling in 8204 * ti_seq. But if we are in persist state, snd_max might reflect 8205 * one byte beyond the right edge of the window, so use snd_nxt in 8206 * that case, since we know we aren't doing a retransmission. 8207 * (retransmit and persist are mutually exclusive...) 8208 */ 8209 if (sack_rxmit == 0) { 8210 if (len || (flags & (TH_SYN | TH_FIN)) || 8211 rack->rc_in_persist) { 8212 th->th_seq = htonl(tp->snd_nxt); 8213 rack_seq = tp->snd_nxt; 8214 } else if (flags & TH_RST) { 8215 /* 8216 * For a Reset send the last cum ack in sequence 8217 * (this like any other choice may still generate a 8218 * challenge ack, if a ack-update packet is in 8219 * flight). 8220 */ 8221 th->th_seq = htonl(tp->snd_una); 8222 rack_seq = tp->snd_una; 8223 } else { 8224 th->th_seq = htonl(tp->snd_max); 8225 rack_seq = tp->snd_max; 8226 } 8227 } else { 8228 th->th_seq = htonl(rsm->r_start); 8229 rack_seq = rsm->r_start; 8230 } 8231 th->th_ack = htonl(tp->rcv_nxt); 8232 if (optlen) { 8233 bcopy(opt, th + 1, optlen); 8234 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 8235 } 8236 th->th_flags = flags; 8237 /* 8238 * Calculate receive window. Don't shrink window, but avoid silly 8239 * window syndrome. 8240 */ 8241 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 8242 recwin < (long)tp->t_maxseg) 8243 recwin = 0; 8244 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 8245 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 8246 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 8247 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 8248 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 8249 8250 /* 8251 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 8252 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 8253 * handled in syncache. 8254 */ 8255 if (flags & TH_SYN) 8256 th->th_win = htons((u_short) 8257 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 8258 else 8259 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 8260 /* 8261 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 8262 * window. This may cause the remote transmitter to stall. This 8263 * flag tells soreceive() to disable delayed acknowledgements when 8264 * draining the buffer. This can occur if the receiver is 8265 * attempting to read more data than can be buffered prior to 8266 * transmitting on the connection. 8267 */ 8268 if (th->th_win == 0) { 8269 tp->t_sndzerowin++; 8270 tp->t_flags |= TF_RXWIN0SENT; 8271 } else 8272 tp->t_flags &= ~TF_RXWIN0SENT; 8273 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 8274 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 8275 th->th_flags |= TH_URG; 8276 } else 8277 /* 8278 * If no urgent pointer to send, then we pull the urgent 8279 * pointer to the left edge of the send window so that it 8280 * doesn't drift into the send window on sequence number 8281 * wraparound. 8282 */ 8283 tp->snd_up = tp->snd_una; /* drag it along */ 8284 8285 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 8286 if (to.to_flags & TOF_SIGNATURE) { 8287 /* 8288 * Calculate MD5 signature and put it into the place 8289 * determined before. 8290 * NOTE: since TCP options buffer doesn't point into 8291 * mbuf's data, calculate offset and use it. 8292 */ 8293 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 8294 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 8295 /* 8296 * Do not send segment if the calculation of MD5 8297 * digest has failed. 8298 */ 8299 goto out; 8300 } 8301 } 8302 #endif 8303 8304 /* 8305 * Put TCP length in extended header, and then checksum extended 8306 * header and data. 8307 */ 8308 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 8309 #ifdef INET6 8310 if (isipv6) { 8311 /* 8312 * ip6_plen is not need to be filled now, and will be filled 8313 * in ip6_output. 8314 */ 8315 if (tp->t_port) { 8316 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 8317 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8318 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 8319 th->th_sum = htons(0); 8320 } else { 8321 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 8322 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8323 th->th_sum = in6_cksum_pseudo(ip6, 8324 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 8325 0); 8326 } 8327 } 8328 #endif 8329 #if defined(INET6) && defined(INET) 8330 else 8331 #endif 8332 #ifdef INET 8333 { 8334 if (tp->t_port) { 8335 m->m_pkthdr.csum_flags = CSUM_UDP; 8336 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8337 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 8338 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 8339 th->th_sum = htons(0); 8340 } else { 8341 m->m_pkthdr.csum_flags = CSUM_TCP; 8342 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8343 th->th_sum = in_pseudo(ip->ip_src.s_addr, 8344 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 8345 IPPROTO_TCP + len + optlen)); 8346 } 8347 /* IP version must be set here for ipv4/ipv6 checking later */ 8348 KASSERT(ip->ip_v == IPVERSION, 8349 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 8350 } 8351 #endif 8352 8353 /* 8354 * Enable TSO and specify the size of the segments. The TCP pseudo 8355 * header checksum is always provided. XXX: Fixme: This is currently 8356 * not the case for IPv6. 8357 */ 8358 if (tso) { 8359 KASSERT(len > tp->t_maxseg - optlen, 8360 ("%s: len <= tso_segsz", __func__)); 8361 m->m_pkthdr.csum_flags |= CSUM_TSO; 8362 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 8363 } 8364 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8365 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 8366 ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", 8367 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 8368 #else 8369 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 8370 ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", 8371 __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 8372 #endif 8373 8374 #ifdef TCP_HHOOK 8375 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 8376 hhook_run_tcp_est_out(tp, th, &to, len, tso); 8377 #endif 8378 8379 #ifdef TCPDEBUG 8380 /* 8381 * Trace. 8382 */ 8383 if (so->so_options & SO_DEBUG) { 8384 u_short save = 0; 8385 8386 #ifdef INET6 8387 if (!isipv6) 8388 #endif 8389 { 8390 save = ipov->ih_len; 8391 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 8392 * (th->th_off << 2) */ ); 8393 } 8394 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 8395 #ifdef INET6 8396 if (!isipv6) 8397 #endif 8398 ipov->ih_len = save; 8399 } 8400 #endif /* TCPDEBUG */ 8401 8402 /* We're getting ready to send; log now. */ 8403 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 8404 union tcp_log_stackspecific log; 8405 8406 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 8407 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 8408 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 8409 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 8410 if (rsm || sack_rxmit) { 8411 log.u_bbr.flex8 = 1; 8412 } else { 8413 log.u_bbr.flex8 = 0; 8414 } 8415 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 8416 len, &log, false, NULL, NULL, 0, NULL); 8417 } else 8418 lgb = NULL; 8419 8420 /* 8421 * Fill in IP length and desired time to live and send to IP level. 8422 * There should be a better way to handle ttl and tos; we could keep 8423 * them in the template, but need a way to checksum without them. 8424 */ 8425 /* 8426 * m->m_pkthdr.len should have been set before cksum calcuration, 8427 * because in6_cksum() need it. 8428 */ 8429 #ifdef INET6 8430 if (isipv6) { 8431 /* 8432 * we separately set hoplimit for every segment, since the 8433 * user might want to change the value via setsockopt. Also, 8434 * desired default hop limit might be changed via Neighbor 8435 * Discovery. 8436 */ 8437 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 8438 8439 /* 8440 * Set the packet size here for the benefit of DTrace 8441 * probes. ip6_output() will set it properly; it's supposed 8442 * to include the option header lengths as well. 8443 */ 8444 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 8445 8446 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 8447 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8448 else 8449 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8450 8451 if (tp->t_state == TCPS_SYN_SENT) 8452 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 8453 8454 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 8455 /* TODO: IPv6 IP6TOS_ECT bit on */ 8456 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 8457 &inp->inp_route6, 8458 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 8459 NULL, NULL, inp); 8460 8461 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) 8462 mtu = inp->inp_route6.ro_rt->rt_mtu; 8463 } 8464 #endif /* INET6 */ 8465 #if defined(INET) && defined(INET6) 8466 else 8467 #endif 8468 #ifdef INET 8469 { 8470 ip->ip_len = htons(m->m_pkthdr.len); 8471 #ifdef INET6 8472 if (inp->inp_vflag & INP_IPV6PROTO) 8473 ip->ip_ttl = in6_selecthlim(inp, NULL); 8474 #endif /* INET6 */ 8475 /* 8476 * If we do path MTU discovery, then we set DF on every 8477 * packet. This might not be the best thing to do according 8478 * to RFC3390 Section 2. However the tcp hostcache migitates 8479 * the problem so it affects only the first tcp connection 8480 * with a host. 8481 * 8482 * NB: Don't set DF on small MTU/MSS to have a safe 8483 * fallback. 8484 */ 8485 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 8486 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8487 if (tp->t_port == 0 || len < V_tcp_minmss) { 8488 ip->ip_off |= htons(IP_DF); 8489 } 8490 } else { 8491 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8492 } 8493 8494 if (tp->t_state == TCPS_SYN_SENT) 8495 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 8496 8497 TCP_PROBE5(send, NULL, tp, ip, tp, th); 8498 8499 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 8500 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, 8501 inp); 8502 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) 8503 mtu = inp->inp_route.ro_rt->rt_mtu; 8504 } 8505 #endif /* INET */ 8506 8507 out: 8508 if (lgb) { 8509 lgb->tlb_errno = error; 8510 lgb = NULL; 8511 } 8512 /* 8513 * In transmit state, time the transmission and arrange for the 8514 * retransmit. In persist state, just set snd_max. 8515 */ 8516 if (error == 0) { 8517 if (len == 0) 8518 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 8519 else if (len == 1) { 8520 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8521 } else if (len > 1) { 8522 int idx; 8523 8524 idx = (len / tp->t_maxseg) + 3; 8525 if (idx >= TCP_MSS_ACCT_ATIMER) 8526 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 8527 else 8528 counter_u64_add(rack_out_size[idx], 1); 8529 } 8530 } 8531 if (sub_from_prr && (error == 0)) { 8532 rack->r_ctl.rc_prr_sndcnt -= len; 8533 } 8534 sub_from_prr = 0; 8535 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 8536 pass, rsm); 8537 if ((tp->t_flags & TF_FORCEDATA) == 0 || 8538 (rack->rc_in_persist == 0)) { 8539 #ifdef NETFLIX_STATS 8540 tcp_seq startseq = tp->snd_nxt; 8541 #endif 8542 8543 /* 8544 * Advance snd_nxt over sequence space of this segment. 8545 */ 8546 if (error) 8547 /* We don't log or do anything with errors */ 8548 goto timer; 8549 8550 if (flags & (TH_SYN | TH_FIN)) { 8551 if (flags & TH_SYN) 8552 tp->snd_nxt++; 8553 if (flags & TH_FIN) { 8554 tp->snd_nxt++; 8555 tp->t_flags |= TF_SENTFIN; 8556 } 8557 } 8558 /* In the ENOBUFS case we do *not* update snd_max */ 8559 if (sack_rxmit) 8560 goto timer; 8561 8562 tp->snd_nxt += len; 8563 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 8564 if (tp->snd_una == tp->snd_max) { 8565 /* 8566 * Update the time we just added data since 8567 * none was outstanding. 8568 */ 8569 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8570 tp->t_acktime = ticks; 8571 } 8572 tp->snd_max = tp->snd_nxt; 8573 #ifdef NETFLIX_STATS 8574 if (!(tp->t_flags & TF_GPUTINPROG) && len) { 8575 tp->t_flags |= TF_GPUTINPROG; 8576 tp->gput_seq = startseq; 8577 tp->gput_ack = startseq + 8578 ulmin(sbavail(sb) - sb_offset, sendwin); 8579 tp->gput_ts = tcp_ts_getticks(); 8580 } 8581 #endif 8582 } 8583 /* 8584 * Set retransmit timer if not currently set, and not doing 8585 * a pure ack or a keep-alive probe. Initial value for 8586 * retransmit timer is smoothed round-trip time + 2 * 8587 * round-trip time variance. Initialize shift counter which 8588 * is used for backoff of retransmit time. 8589 */ 8590 timer: 8591 if ((tp->snd_wnd == 0) && 8592 TCPS_HAVEESTABLISHED(tp->t_state)) { 8593 /* 8594 * If the persists timer was set above (right before 8595 * the goto send), and still needs to be on. Lets 8596 * make sure all is canceled. If the persist timer 8597 * is not running, we want to get it up. 8598 */ 8599 if (rack->rc_in_persist == 0) { 8600 rack_enter_persist(tp, rack, cts); 8601 } 8602 } 8603 } else { 8604 /* 8605 * Persist case, update snd_max but since we are in persist 8606 * mode (no window) we do not update snd_nxt. 8607 */ 8608 int32_t xlen = len; 8609 8610 if (error) 8611 goto nomore; 8612 8613 if (flags & TH_SYN) 8614 ++xlen; 8615 if (flags & TH_FIN) { 8616 ++xlen; 8617 tp->t_flags |= TF_SENTFIN; 8618 } 8619 /* In the ENOBUFS case we do *not* update snd_max */ 8620 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 8621 if (tp->snd_una == tp->snd_max) { 8622 /* 8623 * Update the time we just added data since 8624 * none was outstanding. 8625 */ 8626 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8627 tp->t_acktime = ticks; 8628 } 8629 tp->snd_max = tp->snd_nxt + len; 8630 } 8631 } 8632 nomore: 8633 if (error) { 8634 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 8635 /* 8636 * Failures do not advance the seq counter above. For the 8637 * case of ENOBUFS we will fall out and retry in 1ms with 8638 * the hpts. Everything else will just have to retransmit 8639 * with the timer. 8640 * 8641 * In any case, we do not want to loop around for another 8642 * send without a good reason. 8643 */ 8644 sendalot = 0; 8645 switch (error) { 8646 case EPERM: 8647 tp->t_flags &= ~TF_FORCEDATA; 8648 tp->t_softerror = error; 8649 return (error); 8650 case ENOBUFS: 8651 if (slot == 0) { 8652 /* 8653 * Pace us right away to retry in a some 8654 * time 8655 */ 8656 slot = 1 + rack->rc_enobuf; 8657 if (rack->rc_enobuf < 255) 8658 rack->rc_enobuf++; 8659 if (slot > (rack->rc_rack_rtt / 2)) { 8660 slot = rack->rc_rack_rtt / 2; 8661 } 8662 if (slot < 10) 8663 slot = 10; 8664 } 8665 counter_u64_add(rack_saw_enobuf, 1); 8666 error = 0; 8667 goto enobufs; 8668 case EMSGSIZE: 8669 /* 8670 * For some reason the interface we used initially 8671 * to send segments changed to another or lowered 8672 * its MTU. If TSO was active we either got an 8673 * interface without TSO capabilits or TSO was 8674 * turned off. If we obtained mtu from ip_output() 8675 * then update it and try again. 8676 */ 8677 if (tso) 8678 tp->t_flags &= ~TF_TSO; 8679 if (mtu != 0) { 8680 tcp_mss_update(tp, -1, mtu, NULL, NULL); 8681 goto again; 8682 } 8683 slot = 10; 8684 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8685 tp->t_flags &= ~TF_FORCEDATA; 8686 return (error); 8687 case ENETUNREACH: 8688 counter_u64_add(rack_saw_enetunreach, 1); 8689 case EHOSTDOWN: 8690 case EHOSTUNREACH: 8691 case ENETDOWN: 8692 if (TCPS_HAVERCVDSYN(tp->t_state)) { 8693 tp->t_softerror = error; 8694 } 8695 /* FALLTHROUGH */ 8696 default: 8697 slot = 10; 8698 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8699 tp->t_flags &= ~TF_FORCEDATA; 8700 return (error); 8701 } 8702 } else { 8703 rack->rc_enobuf = 0; 8704 } 8705 TCPSTAT_INC(tcps_sndtotal); 8706 8707 /* 8708 * Data sent (as far as we can tell). If this advertises a larger 8709 * window than any other segment, then remember the size of the 8710 * advertised window. Any pending ACK has now been sent. 8711 */ 8712 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 8713 tp->rcv_adv = tp->rcv_nxt + recwin; 8714 tp->last_ack_sent = tp->rcv_nxt; 8715 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 8716 enobufs: 8717 rack->r_tlp_running = 0; 8718 if ((flags & TH_RST) || (would_have_fin == 1)) { 8719 /* 8720 * We don't send again after a RST. We also do *not* send 8721 * again if we would have had a find, but now have 8722 * outstanding data. 8723 */ 8724 slot = 0; 8725 sendalot = 0; 8726 } 8727 if (slot) { 8728 /* set the rack tcb into the slot N */ 8729 counter_u64_add(rack_paced_segments, 1); 8730 } else if (sendalot) { 8731 if (len) 8732 counter_u64_add(rack_unpaced_segments, 1); 8733 sack_rxmit = 0; 8734 tp->t_flags &= ~TF_FORCEDATA; 8735 goto again; 8736 } else if (len) { 8737 counter_u64_add(rack_unpaced_segments, 1); 8738 } 8739 tp->t_flags &= ~TF_FORCEDATA; 8740 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 8741 return (error); 8742 } 8743 8744 /* 8745 * rack_ctloutput() must drop the inpcb lock before performing copyin on 8746 * socket option arguments. When it re-acquires the lock after the copy, it 8747 * has to revalidate that the connection is still valid for the socket 8748 * option. 8749 */ 8750 static int 8751 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 8752 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8753 { 8754 int32_t error = 0, optval; 8755 8756 switch (sopt->sopt_name) { 8757 case TCP_RACK_PROP_RATE: 8758 case TCP_RACK_PROP: 8759 case TCP_RACK_TLP_REDUCE: 8760 case TCP_RACK_EARLY_RECOV: 8761 case TCP_RACK_PACE_ALWAYS: 8762 case TCP_DELACK: 8763 case TCP_RACK_PACE_REDUCE: 8764 case TCP_RACK_PACE_MAX_SEG: 8765 case TCP_RACK_PRR_SENDALOT: 8766 case TCP_RACK_MIN_TO: 8767 case TCP_RACK_EARLY_SEG: 8768 case TCP_RACK_REORD_THRESH: 8769 case TCP_RACK_REORD_FADE: 8770 case TCP_RACK_TLP_THRESH: 8771 case TCP_RACK_PKT_DELAY: 8772 case TCP_RACK_TLP_USE: 8773 case TCP_RACK_TLP_INC_VAR: 8774 case TCP_RACK_IDLE_REDUCE_HIGH: 8775 case TCP_RACK_MIN_PACE: 8776 case TCP_RACK_MIN_PACE_SEG: 8777 case TCP_BBR_RACK_RTT_USE: 8778 case TCP_DATA_AFTER_CLOSE: 8779 break; 8780 default: 8781 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8782 break; 8783 } 8784 INP_WUNLOCK(inp); 8785 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 8786 if (error) 8787 return (error); 8788 INP_WLOCK(inp); 8789 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 8790 INP_WUNLOCK(inp); 8791 return (ECONNRESET); 8792 } 8793 tp = intotcpcb(inp); 8794 rack = (struct tcp_rack *)tp->t_fb_ptr; 8795 switch (sopt->sopt_name) { 8796 case TCP_RACK_PROP_RATE: 8797 if ((optval <= 0) || (optval >= 100)) { 8798 error = EINVAL; 8799 break; 8800 } 8801 RACK_OPTS_INC(tcp_rack_prop_rate); 8802 rack->r_ctl.rc_prop_rate = optval; 8803 break; 8804 case TCP_RACK_TLP_USE: 8805 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 8806 error = EINVAL; 8807 break; 8808 } 8809 RACK_OPTS_INC(tcp_tlp_use); 8810 rack->rack_tlp_threshold_use = optval; 8811 break; 8812 case TCP_RACK_PROP: 8813 /* RACK proportional rate reduction (bool) */ 8814 RACK_OPTS_INC(tcp_rack_prop); 8815 rack->r_ctl.rc_prop_reduce = optval; 8816 break; 8817 case TCP_RACK_TLP_REDUCE: 8818 /* RACK TLP cwnd reduction (bool) */ 8819 RACK_OPTS_INC(tcp_rack_tlp_reduce); 8820 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 8821 break; 8822 case TCP_RACK_EARLY_RECOV: 8823 /* Should recovery happen early (bool) */ 8824 RACK_OPTS_INC(tcp_rack_early_recov); 8825 rack->r_ctl.rc_early_recovery = optval; 8826 break; 8827 case TCP_RACK_PACE_ALWAYS: 8828 /* Use the always pace method (bool) */ 8829 RACK_OPTS_INC(tcp_rack_pace_always); 8830 if (optval > 0) 8831 rack->rc_always_pace = 1; 8832 else 8833 rack->rc_always_pace = 0; 8834 break; 8835 case TCP_RACK_PACE_REDUCE: 8836 /* RACK Hptsi reduction factor (divisor) */ 8837 RACK_OPTS_INC(tcp_rack_pace_reduce); 8838 if (optval) 8839 /* Must be non-zero */ 8840 rack->rc_pace_reduce = optval; 8841 else 8842 error = EINVAL; 8843 break; 8844 case TCP_RACK_PACE_MAX_SEG: 8845 /* Max segments in a pace */ 8846 RACK_OPTS_INC(tcp_rack_max_seg); 8847 rack->rc_pace_max_segs = optval; 8848 break; 8849 case TCP_RACK_PRR_SENDALOT: 8850 /* Allow PRR to send more than one seg */ 8851 RACK_OPTS_INC(tcp_rack_prr_sendalot); 8852 rack->r_ctl.rc_prr_sendalot = optval; 8853 break; 8854 case TCP_RACK_MIN_TO: 8855 /* Minimum time between rack t-o's in ms */ 8856 RACK_OPTS_INC(tcp_rack_min_to); 8857 rack->r_ctl.rc_min_to = optval; 8858 break; 8859 case TCP_RACK_EARLY_SEG: 8860 /* If early recovery max segments */ 8861 RACK_OPTS_INC(tcp_rack_early_seg); 8862 rack->r_ctl.rc_early_recovery_segs = optval; 8863 break; 8864 case TCP_RACK_REORD_THRESH: 8865 /* RACK reorder threshold (shift amount) */ 8866 RACK_OPTS_INC(tcp_rack_reord_thresh); 8867 if ((optval > 0) && (optval < 31)) 8868 rack->r_ctl.rc_reorder_shift = optval; 8869 else 8870 error = EINVAL; 8871 break; 8872 case TCP_RACK_REORD_FADE: 8873 /* Does reordering fade after ms time */ 8874 RACK_OPTS_INC(tcp_rack_reord_fade); 8875 rack->r_ctl.rc_reorder_fade = optval; 8876 break; 8877 case TCP_RACK_TLP_THRESH: 8878 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 8879 RACK_OPTS_INC(tcp_rack_tlp_thresh); 8880 if (optval) 8881 rack->r_ctl.rc_tlp_threshold = optval; 8882 else 8883 error = EINVAL; 8884 break; 8885 case TCP_RACK_PKT_DELAY: 8886 /* RACK added ms i.e. rack-rtt + reord + N */ 8887 RACK_OPTS_INC(tcp_rack_pkt_delay); 8888 rack->r_ctl.rc_pkt_delay = optval; 8889 break; 8890 case TCP_RACK_TLP_INC_VAR: 8891 /* Does TLP include rtt variance in t-o */ 8892 RACK_OPTS_INC(tcp_rack_tlp_inc_var); 8893 rack->r_ctl.rc_prr_inc_var = optval; 8894 break; 8895 case TCP_RACK_IDLE_REDUCE_HIGH: 8896 RACK_OPTS_INC(tcp_rack_idle_reduce_high); 8897 if (optval) 8898 rack->r_idle_reduce_largest = 1; 8899 else 8900 rack->r_idle_reduce_largest = 0; 8901 break; 8902 case TCP_DELACK: 8903 if (optval == 0) 8904 tp->t_delayed_ack = 0; 8905 else 8906 tp->t_delayed_ack = 1; 8907 if (tp->t_flags & TF_DELACK) { 8908 tp->t_flags &= ~TF_DELACK; 8909 tp->t_flags |= TF_ACKNOW; 8910 rack_output(tp); 8911 } 8912 break; 8913 case TCP_RACK_MIN_PACE: 8914 RACK_OPTS_INC(tcp_rack_min_pace); 8915 if (optval > 3) 8916 rack->r_enforce_min_pace = 3; 8917 else 8918 rack->r_enforce_min_pace = optval; 8919 break; 8920 case TCP_RACK_MIN_PACE_SEG: 8921 RACK_OPTS_INC(tcp_rack_min_pace_seg); 8922 if (optval >= 16) 8923 rack->r_min_pace_seg_thresh = 15; 8924 else 8925 rack->r_min_pace_seg_thresh = optval; 8926 break; 8927 case TCP_BBR_RACK_RTT_USE: 8928 if ((optval != USE_RTT_HIGH) && 8929 (optval != USE_RTT_LOW) && 8930 (optval != USE_RTT_AVG)) 8931 error = EINVAL; 8932 else 8933 rack->r_ctl.rc_rate_sample_method = optval; 8934 break; 8935 case TCP_DATA_AFTER_CLOSE: 8936 if (optval) 8937 rack->rc_allow_data_af_clo = 1; 8938 else 8939 rack->rc_allow_data_af_clo = 0; 8940 break; 8941 default: 8942 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8943 break; 8944 } 8945 #ifdef NETFLIX_STATS 8946 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 8947 #endif 8948 INP_WUNLOCK(inp); 8949 return (error); 8950 } 8951 8952 static int 8953 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 8954 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8955 { 8956 int32_t error, optval; 8957 8958 /* 8959 * Because all our options are either boolean or an int, we can just 8960 * pull everything into optval and then unlock and copy. If we ever 8961 * add a option that is not a int, then this will have quite an 8962 * impact to this routine. 8963 */ 8964 switch (sopt->sopt_name) { 8965 case TCP_RACK_PROP_RATE: 8966 optval = rack->r_ctl.rc_prop_rate; 8967 break; 8968 case TCP_RACK_PROP: 8969 /* RACK proportional rate reduction (bool) */ 8970 optval = rack->r_ctl.rc_prop_reduce; 8971 break; 8972 case TCP_RACK_TLP_REDUCE: 8973 /* RACK TLP cwnd reduction (bool) */ 8974 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 8975 break; 8976 case TCP_RACK_EARLY_RECOV: 8977 /* Should recovery happen early (bool) */ 8978 optval = rack->r_ctl.rc_early_recovery; 8979 break; 8980 case TCP_RACK_PACE_REDUCE: 8981 /* RACK Hptsi reduction factor (divisor) */ 8982 optval = rack->rc_pace_reduce; 8983 break; 8984 case TCP_RACK_PACE_MAX_SEG: 8985 /* Max segments in a pace */ 8986 optval = rack->rc_pace_max_segs; 8987 break; 8988 case TCP_RACK_PACE_ALWAYS: 8989 /* Use the always pace method */ 8990 optval = rack->rc_always_pace; 8991 break; 8992 case TCP_RACK_PRR_SENDALOT: 8993 /* Allow PRR to send more than one seg */ 8994 optval = rack->r_ctl.rc_prr_sendalot; 8995 break; 8996 case TCP_RACK_MIN_TO: 8997 /* Minimum time between rack t-o's in ms */ 8998 optval = rack->r_ctl.rc_min_to; 8999 break; 9000 case TCP_RACK_EARLY_SEG: 9001 /* If early recovery max segments */ 9002 optval = rack->r_ctl.rc_early_recovery_segs; 9003 break; 9004 case TCP_RACK_REORD_THRESH: 9005 /* RACK reorder threshold (shift amount) */ 9006 optval = rack->r_ctl.rc_reorder_shift; 9007 break; 9008 case TCP_RACK_REORD_FADE: 9009 /* Does reordering fade after ms time */ 9010 optval = rack->r_ctl.rc_reorder_fade; 9011 break; 9012 case TCP_RACK_TLP_THRESH: 9013 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 9014 optval = rack->r_ctl.rc_tlp_threshold; 9015 break; 9016 case TCP_RACK_PKT_DELAY: 9017 /* RACK added ms i.e. rack-rtt + reord + N */ 9018 optval = rack->r_ctl.rc_pkt_delay; 9019 break; 9020 case TCP_RACK_TLP_USE: 9021 optval = rack->rack_tlp_threshold_use; 9022 break; 9023 case TCP_RACK_TLP_INC_VAR: 9024 /* Does TLP include rtt variance in t-o */ 9025 optval = rack->r_ctl.rc_prr_inc_var; 9026 break; 9027 case TCP_RACK_IDLE_REDUCE_HIGH: 9028 optval = rack->r_idle_reduce_largest; 9029 break; 9030 case TCP_RACK_MIN_PACE: 9031 optval = rack->r_enforce_min_pace; 9032 break; 9033 case TCP_RACK_MIN_PACE_SEG: 9034 optval = rack->r_min_pace_seg_thresh; 9035 break; 9036 case TCP_BBR_RACK_RTT_USE: 9037 optval = rack->r_ctl.rc_rate_sample_method; 9038 break; 9039 case TCP_DELACK: 9040 optval = tp->t_delayed_ack; 9041 break; 9042 case TCP_DATA_AFTER_CLOSE: 9043 optval = rack->rc_allow_data_af_clo; 9044 break; 9045 default: 9046 return (tcp_default_ctloutput(so, sopt, inp, tp)); 9047 break; 9048 } 9049 INP_WUNLOCK(inp); 9050 error = sooptcopyout(sopt, &optval, sizeof optval); 9051 return (error); 9052 } 9053 9054 static int 9055 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 9056 { 9057 int32_t error = EINVAL; 9058 struct tcp_rack *rack; 9059 9060 rack = (struct tcp_rack *)tp->t_fb_ptr; 9061 if (rack == NULL) { 9062 /* Huh? */ 9063 goto out; 9064 } 9065 if (sopt->sopt_dir == SOPT_SET) { 9066 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 9067 } else if (sopt->sopt_dir == SOPT_GET) { 9068 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 9069 } 9070 out: 9071 INP_WUNLOCK(inp); 9072 return (error); 9073 } 9074 9075 9076 struct tcp_function_block __tcp_rack = { 9077 .tfb_tcp_block_name = __XSTRING(STACKNAME), 9078 .tfb_tcp_output = rack_output, 9079 .tfb_tcp_do_segment = rack_do_segment, 9080 .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, 9081 .tfb_tcp_ctloutput = rack_ctloutput, 9082 .tfb_tcp_fb_init = rack_init, 9083 .tfb_tcp_fb_fini = rack_fini, 9084 .tfb_tcp_timer_stop_all = rack_stopall, 9085 .tfb_tcp_timer_activate = rack_timer_activate, 9086 .tfb_tcp_timer_active = rack_timer_active, 9087 .tfb_tcp_timer_stop = rack_timer_stop, 9088 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 9089 .tfb_tcp_handoff_ok = rack_handoff_ok 9090 }; 9091 9092 static const char *rack_stack_names[] = { 9093 __XSTRING(STACKNAME), 9094 #ifdef STACKALIAS 9095 __XSTRING(STACKALIAS), 9096 #endif 9097 }; 9098 9099 static int 9100 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 9101 { 9102 memset(mem, 0, size); 9103 return (0); 9104 } 9105 9106 static void 9107 rack_dtor(void *mem, int32_t size, void *arg) 9108 { 9109 9110 } 9111 9112 static bool rack_mod_inited = false; 9113 9114 static int 9115 tcp_addrack(module_t mod, int32_t type, void *data) 9116 { 9117 int32_t err = 0; 9118 int num_stacks; 9119 9120 switch (type) { 9121 case MOD_LOAD: 9122 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 9123 sizeof(struct rack_sendmap), 9124 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 9125 9126 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 9127 sizeof(struct tcp_rack), 9128 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 9129 9130 sysctl_ctx_init(&rack_sysctl_ctx); 9131 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 9132 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 9133 OID_AUTO, 9134 __XSTRING(STACKNAME), 9135 CTLFLAG_RW, 0, 9136 ""); 9137 if (rack_sysctl_root == NULL) { 9138 printf("Failed to add sysctl node\n"); 9139 err = EFAULT; 9140 goto free_uma; 9141 } 9142 rack_init_sysctls(); 9143 num_stacks = nitems(rack_stack_names); 9144 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 9145 rack_stack_names, &num_stacks); 9146 if (err) { 9147 printf("Failed to register %s stack name for " 9148 "%s module\n", rack_stack_names[num_stacks], 9149 __XSTRING(MODNAME)); 9150 sysctl_ctx_free(&rack_sysctl_ctx); 9151 free_uma: 9152 uma_zdestroy(rack_zone); 9153 uma_zdestroy(rack_pcb_zone); 9154 rack_counter_destroy(); 9155 printf("Failed to register rack module -- err:%d\n", err); 9156 return (err); 9157 } 9158 rack_mod_inited = true; 9159 break; 9160 case MOD_QUIESCE: 9161 err = deregister_tcp_functions(&__tcp_rack, true, false); 9162 break; 9163 case MOD_UNLOAD: 9164 err = deregister_tcp_functions(&__tcp_rack, false, true); 9165 if (err == EBUSY) 9166 break; 9167 if (rack_mod_inited) { 9168 uma_zdestroy(rack_zone); 9169 uma_zdestroy(rack_pcb_zone); 9170 sysctl_ctx_free(&rack_sysctl_ctx); 9171 rack_counter_destroy(); 9172 rack_mod_inited = false; 9173 } 9174 err = 0; 9175 break; 9176 default: 9177 return (EOPNOTSUPP); 9178 } 9179 return (err); 9180 } 9181 9182 static moduledata_t tcp_rack = { 9183 .name = __XSTRING(MODNAME), 9184 .evhand = tcp_addrack, 9185 .priv = 0 9186 }; 9187 9188 MODULE_VERSION(MODNAME, 1); 9189 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 9190 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 9191