/*-
 * Copyright (c) 2016-2019
 * Netflix Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#ifdef NETFLIX_STATS
#include <sys/qmath.h>
#endif
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tree.h>
#ifdef NETFLIX_STATS
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#define TCPOUTFLAGS
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC 4737 and the Tail-Loss Probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_precache = 1;
static int32_t rack_tlp_thresh = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
static int32_t rack_pkt_delay = 1;
static int32_t rack_inc_var = 0;	/* For TLP */
static int32_t rack_reduce_largest_on_idle = 0;
static int32_t rack_min_pace_time = 0;
static int32_t rack_min_pace_time_seg_req = 6;
static int32_t rack_early_recovery = 1;
static int32_t rack_early_recovery_max_seg = 6;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_map_entries_limit = 1024;
static int32_t rack_map_split_limit = 256;

/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 30000;	/* 30 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 1;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_sack_block_limit = 128;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_runt_sacks;
counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_tlp_does_nada;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

/*
 * This was originally defined in tcp_timer.c, but is now reproduced here given
 * the unification of the SYN and non-SYN retransmit timer exponents combined
 * with wanting to retain previous behaviour for previously deployed stack
 * versions.
 */
int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
static int32_t rack_output(struct tcpcb *tp);
static void
rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static void
rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t * ret_val);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void rack_do_drop(struct mbuf *m, struct tcpcb *tp);
static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
    struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
    struct tcphdr *th, int32_t rstreason, int32_t tlen);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static int
rack_drop_checks(struct tcpopt *to, struct mbuf *m,
    struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
    int32_t * drop_hdrlen, int32_t * ret_val);
static int
rack_process_rst(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

static int
rack_ts_check(struct mbuf *m, struct tcphdr *th,
    struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);

int32_t rack_clear_counter = 0;


static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return (error);

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		counter_u64_zero(rack_find_high);
		counter_u64_zero(rack_runt_sacks);
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
	}
	rack_clear_counter = 0;
	return (0);
}


static void
rack_init_sysctls(void)
{
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "map_limit", CTLFLAG_RW,
	    &rack_map_entries_limit, 1024,
	    "Is there a limit on how big the sendmap can grow?");

	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "map_splitlimit", CTLFLAG_RW,
	    &rack_map_split_limit, 256,
	    "Is there a limit on how much splitting a peer can do?");

	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "data_after_close", CTLFLAG_RW,
	    &rack_ignore_data_after_close, 0,
	    "Do we hold off sending a RST until all pending data is ack'd");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "min_pace_time", CTLFLAG_RW,
	    &rack_min_pace_time, 0,
	    "Should we enforce a minimum pace time of 1ms");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "min_pace_segs", CTLFLAG_RW,
	    &rack_min_pace_time_seg_req, 6,
	    "How many segments have to be in the len to enforce min-pace-time");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
	    &rack_reduce_largest_on_idle, 0,
	    "Should we reduce the largest cwnd seen to IW on idle reduction");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
	    &rack_verbose_logging, 0,
	    "Should RACK black box logging be verbose");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sackfiltering", CTLFLAG_RW,
	    &rack_use_sack_filter, 1,
	    "Do we use sack filtering?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
	    &rack_delayed_ack_time, 200,
	    "Delayed ack time (200ms)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlpminto", CTLFLAG_RW,
	    &rack_tlp_min, 10,
	    "TLP minimum timeout per the specification (10ms)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "precache", CTLFLAG_RW,
	    &rack_precache, 0,
	    "Where should we precache the mcopy (0 is not at all)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sblklimit", CTLFLAG_RW,
	    &rack_sack_block_limit, 128,
	    "When do we start paying attention to small sack blocks");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "send_oldest", CTLFLAG_RW,
	    &rack_always_send_oldest, 1,
	    "Should we always send the oldest TLP and RACK-TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
	    &rack_tlp_in_recovery, 1,
	    "Can we do a TLP during recovery?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
	    &rack_limited_retran, 0,
	    "How many times can a rack timeout drive out sends");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "minrto", CTLFLAG_RW,
	    &rack_rto_min, 0,
	    "Minimum RTO in ms -- set with caution below 1000 due to TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "maxrto", CTLFLAG_RW,
	    &rack_rto_max, 0,
	    "Maximum RTO in ms -- should be at least as large as min_rto");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retry", CTLFLAG_RW,
	    &rack_tlp_max_resend, 2,
	    "How many times does TLP retry a single segment or multiple with no ACK");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
	    &rack_use_proportional_reduce, 0,
	    "Should we proportionally reduce cwnd based on the number of losses");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "recovery_prop", CTLFLAG_RW,
	    &rack_proportional_rate, 10,
	    "What percent reduction per loss");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
	    &rack_lower_cwnd_at_tlp, 0,
	    "When a TLP completes a retran should we enter recovery?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
	    &rack_slot_reduction, 4,
	    "When setting a slot should we reduce by divisor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
	    &rack_pace_every_seg, 1,
	    "Should we pace out every segment hptsi");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
	    &rack_hptsi_segments, 6,
	    "Should we pace out only a limited size of segments");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prr_sendalot", CTLFLAG_RW,
	    &rack_send_a_lot_in_prr, 1,
	    "Send a lot in prr");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "minto", CTLFLAG_RW,
	    &rack_min_to, 1,
	    "Minimum rack timeout in milliseconds");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
	    &rack_early_recovery_max_seg, 6,
	    "Max segments in early recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "earlyrecovery", CTLFLAG_RW,
	    &rack_early_recovery, 1,
	    "Do we do early recovery with rack");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
	    &rack_reorder_thresh, 2,
	    "What factor for rack will be added when seeing reordering (shift right)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
	    &rack_tlp_thresh, 1,
	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
	    &rack_reorder_fade, 0,
	    "Does reorder detection fade, if so how many ms (0 means never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "pktdelay", CTLFLAG_RW,
	    &rack_pkt_delay, 1,
	    "Extra RACK time (in ms) besides reordering thresh");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "inc_var", CTLFLAG_RW,
	    &rack_inc_var, 0,
	    "Should rack add to the TLP timer the variance in rtt calculation");
	rack_badfr = counter_u64_alloc(M_WAITOK);
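	/*
	 * Each of the RACK statistics below is a counter(9) counter_u64,
	 * allocated here and exported read-only under the rack sysctl root.
	 */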
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "badfr", CTLFLAG_RD,
	    &rack_badfr, "Total number of bad FRs");
	rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "badfr_bytes", CTLFLAG_RD,
	    &rack_badfr_bytes, "Total bytes of bad FRs");
	rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prrsndret", CTLFLAG_RD,
	    &rack_rtm_prr_retran,
	    "Total number of prr based retransmits");
	rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prrsndnew", CTLFLAG_RD,
	    &rack_rtm_prr_newdata,
	    "Total number of prr based new transmits");
	rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tsnf", CTLFLAG_RD,
	    &rack_timestamp_mismatch,
	    "Total number of times we could not find the reported timestamp");
	rack_find_high = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "findhigh", CTLFLAG_RD,
	    &rack_find_high,
	    "Total number of FINs causing find-high");
	rack_reorder_seen = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "reordering", CTLFLAG_RD,
	    &rack_reorder_seen,
	    "Total number of times we added delay due to reordering");
	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
	    &rack_tlp_tot,
	    "Total number of tail loss probe expirations");
	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_new", CTLFLAG_RD,
	    &rack_tlp_newdata,
	    "Total number of tail loss probes sending new data");

	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
	    &rack_tlp_retran,
	    "Total number of tail loss probes sending retransmitted data");
	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
	    &rack_tlp_retran_bytes,
	    "Total bytes of tail loss probes sending retransmitted data");
	rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
	    &rack_tlp_retran_fail,
	    "Total number of tail loss probes sending retransmitted data that failed (wait for t3)");
	rack_to_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
	    &rack_to_tot,
	    "Total number of times the rack timeout expired");
	rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "arm_rack", CTLFLAG_RD,
	    &rack_to_arm_rack,
	    "Total number of times the rack timer was armed");
	rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "arm_tlp", CTLFLAG_RD,
	    &rack_to_arm_tlp,
	    "Total number of times the tlp timer was armed");
	rack_paced_segments = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "paced", CTLFLAG_RD,
	    &rack_paced_segments,
	    "Total number of times a segment send caused hptsi");
	rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "unpaced", CTLFLAG_RD,
	    &rack_unpaced_segments,
	    "Total number of times a segment did not cause hptsi");
	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
	    &rack_saw_enobuf,
	    "Total number of times a send returned ENOBUFS");
	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
	    &rack_saw_enetunreach,
	    "Total number of times a send returned ENETUNREACH");
	rack_to_alloc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allocs", CTLFLAG_RD,
	    &rack_to_alloc,
	    "Total allocations of tracking structures");
	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allochard", CTLFLAG_RD,
	    &rack_to_alloc_hard,
	    "Total allocations done with sleeping the hard way");
	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "allocemerg", CTLFLAG_RD,
	    &rack_to_alloc_emerg,
	    "Total allocations done from emergency cache");
	rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "alloc_limited", CTLFLAG_RD,
	    &rack_to_alloc_limited,
	    "Total allocations dropped due to limit");
	rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
	    &rack_alloc_limited_conns,
	    "Connections with allocations dropped due to limit");
	rack_split_limited = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "split_limited", CTLFLAG_RD,
	    &rack_split_limited,
	    "Split allocations dropped due to limit");
	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_long", CTLFLAG_RD,
	    &rack_sack_proc_all,
	    "Total times we had to walk whole list for sack processing");

	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_restart", CTLFLAG_RD,
	    &rack_sack_proc_restart,
	    "Total times we had to walk whole list due to a restart");
	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "sack_short", CTLFLAG_RD,
	    &rack_sack_proc_short,
	    "Total times we took shortcut for sack processing");
	rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
	    &rack_enter_tlp_calc,
	    "Total times we called calc-tlp");
	rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
	    &rack_used_tlpmethod,
	    "Total number of times we used TLP method 1");
	rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
	    &rack_used_tlpmethod2,
	    "Total number of times we used TLP method 2");
	rack_runt_sacks = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "runtsacks", CTLFLAG_RD,
	    &rack_runt_sacks,
	    "Total number of runt sacks");
	rack_progress_drops = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "prog_drops", CTLFLAG_RD,
	    &rack_progress_drops,
	    "Total number of progress drops");
	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
	    &rack_input_idle_reduces,
	    "Total number of idle reductions on input");
	rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "tlp_nada", CTLFLAG_RD,
	    &rack_tlp_does_nada,
	    "Total number of nada tlp calls");
	COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "outsize", CTLFLAG_RD,
	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "opts", CTLFLAG_RD,
	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
	SYSCTL_ADD_PROC(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
}

static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
#ifdef NETFLIX_PROGRESS
	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
			/*
			 * There is an assumption that the caller
			 * will drop the connection so we will
			 * increment the counters here.
			 */
			struct tcp_rack *rack;
			rack = (struct tcp_rack *)tp->t_fb_ptr;
			counter_u64_add(rack_progress_drops, 1);
			TCPSTAT_INC(tcps_progdrops);
			rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
			return (1);
		}
	}
#endif
	return (0);
}


static void
rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
		log.u_bbr.flex2 = to;
		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex4 = slot;
		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
		log.u_bbr.flex8 = which;
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TIMERSTAR, 0,
		    0, &log, false);
	}
}

static void
rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex8 = to_num;
		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
		log.u_bbr.flex2 = rack->rc_rack_rtt;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_RTO, 0,
		    0, &log, false);
	}
}

static void
rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
    uint32_t o_srtt, uint32_t o_var)
{
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = t;
		log.u_bbr.flex2 = o_srtt;
		log.u_bbr.flex3 = o_var;
		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
		log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
		TCP_LOG_EVENT(tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_BBRRTT, 0,
		    0, &log, false);
	}
}

static void
rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
{
	/*
	 * Log the rtt sample we are
	 * applying to the srtt algorithm in
	 * useconds.
	 */
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		/* Convert our ms to a microsecond */
		log.u_bbr.flex1 = rtt * 1000;
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    TCP_LOG_RTT, 0,
		    0, &log, false, &tv);
	}
}


static inline void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
{
	if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = line;
		log.u_bbr.flex2 = tick;
		log.u_bbr.flex3 = tp->t_maxunacktime;
		log.u_bbr.flex4 = tp->t_acktime;
		log.u_bbr.flex8 = event;
		TCP_LOG_EVENT(tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_PROGRESS, 0,
		    0, &log, false);
	}
}

static void
rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = slot;
		log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
		log.u_bbr.flex8 = rack->rc_in_persist;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_BBRSND, 0,
		    0, &log, false);
	}
}

static void
rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = did_out;
		log.u_bbr.flex2 = nxt_pkt;
		log.u_bbr.flex3 = way_out;
		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex7 = rack->r_wanted_output;
		log.u_bbr.flex8 = rack->rc_in_persist;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_DOSEG_DONE, 0,
		    0, &log, false);
	}
}


static void
rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = slot;
		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex7 = hpts_calling;
		log.u_bbr.flex8 = rack->rc_in_persist;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_JUSTRET, 0,
		    tlen, &log, false);
	}
}

static void
rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

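		/*
		 * Black-box record of the cancel: flex1 carries the caller's
		 * line number, flex8 whether the timer was actually removed
		 * from the hpts wheel.
		 */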
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
		log.u_bbr.flex1 = line;
		log.u_bbr.flex2 = 0;
		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex4 = 0;
		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
		log.u_bbr.flex8 = hpts_removed;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TIMERCANC, 0,
		    0, &log, false);
	}
}

static void
rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
{
	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.flex1 = timers;
		log.u_bbr.flex2 = ret;
		log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
		log.u_bbr.flex5 = cts;
		TCP_LOG_EVENT(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_TO_PROCESS, 0,
		    0, &log, false);
	}
}

static void
rack_counter_destroy(void)
{
	counter_u64_free(rack_badfr);
	counter_u64_free(rack_badfr_bytes);
	counter_u64_free(rack_rtm_prr_retran);
	counter_u64_free(rack_rtm_prr_newdata);
	counter_u64_free(rack_timestamp_mismatch);
	counter_u64_free(rack_reorder_seen);
	counter_u64_free(rack_tlp_tot);
	counter_u64_free(rack_tlp_newdata);
	counter_u64_free(rack_tlp_retran);
	counter_u64_free(rack_tlp_retran_bytes);
	counter_u64_free(rack_tlp_retran_fail);
	counter_u64_free(rack_to_tot);
	counter_u64_free(rack_to_arm_rack);
	counter_u64_free(rack_to_arm_tlp);
	counter_u64_free(rack_paced_segments);
	counter_u64_free(rack_unpaced_segments);
	counter_u64_free(rack_saw_enobuf);
	counter_u64_free(rack_saw_enetunreach);
	counter_u64_free(rack_to_alloc_hard);
	counter_u64_free(rack_to_alloc_emerg);
	counter_u64_free(rack_sack_proc_all);
	counter_u64_free(rack_sack_proc_short);
	counter_u64_free(rack_sack_proc_restart);
	counter_u64_free(rack_to_alloc);
	counter_u64_free(rack_to_alloc_limited);
	counter_u64_free(rack_split_limited);
	counter_u64_free(rack_find_high);
	counter_u64_free(rack_runt_sacks);
	counter_u64_free(rack_enter_tlp_calc);
	counter_u64_free(rack_used_tlpmethod);
	counter_u64_free(rack_used_tlpmethod2);
	counter_u64_free(rack_progress_drops);
	counter_u64_free(rack_input_idle_reduces);
	counter_u64_free(rack_tlp_does_nada);
	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
}

static struct rack_sendmap *
rack_alloc(struct tcp_rack *rack)
{
	struct rack_sendmap *rsm;

	rsm = uma_zalloc(rack_zone, M_NOWAIT);
	if (rsm) {
		rack->r_ctl.rc_num_maps_alloced++;
		counter_u64_add(rack_to_alloc, 1);
		return (rsm);
	}
	if (rack->rc_free_cnt) {
		counter_u64_add(rack_to_alloc_emerg, 1);
		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
		rack->rc_free_cnt--;
		return (rsm);
	}
	return (NULL);
}

static struct rack_sendmap *
rack_alloc_full_limit(struct tcp_rack *rack)
{
	if ((rack_map_entries_limit > 0) &&
	    (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
		counter_u64_add(rack_to_alloc_limited, 1);
		if (!rack->alloc_limit_reported) {
			rack->alloc_limit_reported = 1;
			counter_u64_add(rack_alloc_limited_conns, 1);
		}
		return (NULL);
	}
	return (rack_alloc(rack));
}

/* wrapper to allocate a sendmap entry, subject to a specific limit */
static struct rack_sendmap *
rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
{
	struct rack_sendmap *rsm;

	if (limit_type) {
		/* currently there is only one limit type */
		if (rack_map_split_limit > 0 &&
		    rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
			counter_u64_add(rack_split_limited, 1);
			if (!rack->alloc_limit_reported) {
				rack->alloc_limit_reported = 1;
				counter_u64_add(rack_alloc_limited_conns, 1);
			}
			return (NULL);
		}
	}

	/* allocate and mark in the limit type, if set */
	rsm = rack_alloc(rack);
	if (rsm != NULL && limit_type) {
		rsm->r_limit_type = limit_type;
		rack->r_ctl.rc_num_split_allocs++;
	}
	return (rsm);
}

static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
	if (rsm->r_limit_type) {
		/* currently there is only one limit type */
		rack->r_ctl.rc_num_split_allocs--;
	}
	if (rack->r_ctl.rc_tlpsend == rsm)
		rack->r_ctl.rc_tlpsend = NULL;
	if (rack->r_ctl.rc_next == rsm)
		rack->r_ctl.rc_next = NULL;
	if (rack->r_ctl.rc_sacklast == rsm)
		rack->r_ctl.rc_sacklast = NULL;
	if (rack->rc_free_cnt < rack_free_cache) {
		memset(rsm, 0, sizeof(struct rack_sendmap));
		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
		rsm->r_limit_type = 0;
		rack->rc_free_cnt++;
		return;
	}
	rack->r_ctl.rc_num_maps_alloced--;
	uma_zfree(rack_zone, rsm);
}

/*
 * CC wrapper hook functions
 */
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
    uint16_t type, int32_t recovery)
{
#ifdef NETFLIX_STATS
	int32_t gput;
#endif

	INP_WLOCK_ASSERT(tp->t_inpcb);

	tp->ccv->nsegs = nsegs;
	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
	if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
		uint32_t max;

		max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
		if (tp->ccv->bytes_this_ack > max) {
			tp->ccv->bytes_this_ack = max;
		}
	}
	if (tp->snd_cwnd <= tp->snd_wnd)
		tp->ccv->flags |= CCF_CWND_LIMITED;
	else
		tp->ccv->flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef NETFLIX_STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an
			 * API to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
			if (tp->t_maxpeakrate) {
				/*
				 * We update t_peakrate_thr. This gives us roughly
				 * one update per round trip time.
				 */
				tcp_update_peakrate_thr(tp);
			}
		}
#endif
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
			    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->ccv->flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}
	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->ack_received(tp->ccv, type);
	}
#ifdef NETFLIX_STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
	if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
		rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
	}
	/* we enforce max peak rate if it is set. */
	if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
		tp->snd_cwnd = tp->t_peakrate_thr;
	}
}

static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
{
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	INP_WLOCK_ASSERT(tp->t_inpcb);
	if (rack->r_ctl.rc_prr_sndcnt > 0)
		rack->r_wanted_output++;
}

static void
rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	struct tcp_rack *rack;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (CC_ALGO(tp)->post_recovery != NULL) {
		tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(tp->ccv);
	}
	/*
	 * Here we can in theory adjust cwnd to be based on the number of
	 * losses in the window (rack->r_ctl.rc_loss_count). This is done
	 * based on the rack_use_proportional flag.
	 */
	if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
		int32_t reduce;

		reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
		if (reduce > 50) {
			reduce = 50;
		}
		tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
	} else {
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			/* Drop us down to the ssthresh (1/2 cwnd at loss) */
			tp->snd_cwnd = tp->snd_ssthresh;
		}
	}
	if (rack->r_ctl.rc_prr_sndcnt > 0) {
		/* Suck the next prr cnt back into cwnd */
		tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
		rack->r_ctl.rc_prr_sndcnt = 0;
	}
	tp->snd_recover = tp->snd_una;
	EXIT_RECOVERY(tp->t_flags);
}

static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	struct tcp_rack *rack;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	switch (type) {
	case CC_NDUPACK:
		/* rack->r_ctl.rc_ssthresh_set = 1; */
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			rack->r_ctl.rc_tlp_rtx_out = 0;
			rack->r_ctl.rc_prr_delivered = 0;
			rack->r_ctl.rc_prr_out = 0;
			rack->r_ctl.rc_loss_count = 0;
			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags)) {
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags & TF_ECN_PERMIT)
				tp->t_flags |= TF_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
		    tp->t_maxseg) * tp->t_maxseg;
		tp->snd_cwnd = tp->t_maxseg;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_badrxtwin = 0;
		break;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->ccv->curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(tp->ccv, type);
	}
}



static inline void
rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
{
	uint32_t i_cwnd;

	INP_WLOCK_ASSERT(tp->t_inpcb);

#ifdef NETFLIX_STATS
	TCPSTAT_INC(tcps_idle_restarts);
	if (tp->t_state == TCPS_ESTABLISHED)
		TCPSTAT_INC(tcps_idle_estrestarts);
#endif
	if (CC_ALGO(tp)->after_idle != NULL)
		CC_ALGO(tp)->after_idle(tp->ccv);

	if (V_tcp_initcwnd_segments)
		i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
		    max(2 * tp->t_maxseg, 14600));
	else if (V_tcp_do_rfc3390)
		i_cwnd = min(4 * tp->t_maxseg,
		    max(2 * tp->t_maxseg, 4380));
	else {
		/* Per RFC5681 Section 3.1 */
		if (tp->t_maxseg > 2190)
			i_cwnd = 2 * tp->t_maxseg;
		else if (tp->t_maxseg > 1095)
			i_cwnd = 3 * tp->t_maxseg;
		else
			i_cwnd = 4 * tp->t_maxseg;
	}
	if (reduce_largest) {
		/*
		 * Do we reduce the largest cwnd to make
		 * rack play nice on restart hptsi wise?
		 */
		if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd)
			((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
	}
	/*
	 * Being idle is no different than the initial window. If the cc
	 * clamps it down below the initial window raise it to the initial
	 * window.
	 */
	if (tp->snd_cwnd < i_cwnd) {
		tp->snd_cwnd = i_cwnd;
	}
}


/*
 * Indicate whether this ack should be delayed. We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window. We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment. We make sure by checking that the
 *	  segment size is not larger than the MSS.
 *	- Delayed acks are enabled or this is a half-synchronized T/TCP
 *	  connection.
 */
#define DELAY_ACK(tp, tlen)			 \
	(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
	((tp->t_flags & TF_DELACK) == 0) &&	 \
	(tlen <= tp->t_maxseg) &&		 \
	(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))

static inline void
rack_calc_rwin(struct socket *so, struct tcpcb *tp)
{
	int32_t win;

	/*
	 * Calculate amount of space in receive window, and then do TCP
	 * input processing. Receive window is amount of space in rcv queue,
	 * but not less than advertised window.
	 */
	win = sbspace(&so->so_rcv);
	if (win < 0)
		win = 0;
	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
}

static void
rack_do_drop(struct mbuf *m, struct tcpcb *tp)
{
	/*
	 * Drop space held by incoming segment and return.
1550 */ 1551 if (tp != NULL) 1552 INP_WUNLOCK(tp->t_inpcb); 1553 if (m) 1554 m_freem(m); 1555 } 1556 1557 static void 1558 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) 1559 { 1560 if (tp != NULL) { 1561 tcp_dropwithreset(m, th, tp, tlen, rstreason); 1562 INP_WUNLOCK(tp->t_inpcb); 1563 } else 1564 tcp_dropwithreset(m, th, NULL, tlen, rstreason); 1565 } 1566 1567 /* 1568 * The value in ret_val informs the caller 1569 * if we dropped the tcb (and lock) or not. 1570 * 1 = we dropped it, 0 = the TCB is still locked 1571 * and valid. 1572 */ 1573 static void 1574 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) 1575 { 1576 /* 1577 * Generate an ACK dropping incoming segment if it occupies sequence 1578 * space, where the ACK reflects our state. 1579 * 1580 * We can now skip the test for the RST flag since all paths to this 1581 * code happen after packets containing RST have been dropped. 1582 * 1583 * In the SYN-RECEIVED state, don't send an ACK unless the segment 1584 * we received passes the SYN-RECEIVED ACK test. If it fails send a 1585 * RST. This breaks the loop in the "LAND" DoS attack, and also 1586 * prevents an ACK storm between two listening ports that have been 1587 * sent forged SYN segments, each with the source address of the 1588 * other. 1589 */ 1590 struct tcp_rack *rack; 1591 1592 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 1593 (SEQ_GT(tp->snd_una, th->th_ack) || 1594 SEQ_GT(th->th_ack, tp->snd_max))) { 1595 *ret_val = 1; 1596 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 1597 return; 1598 } else 1599 *ret_val = 0; 1600 rack = (struct tcp_rack *)tp->t_fb_ptr; 1601 rack->r_wanted_output++; 1602 tp->t_flags |= TF_ACKNOW; 1603 if (m) 1604 m_freem(m); 1605 } 1606 1607 1608 static int 1609 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) 1610 { 1611 /* 1612 * RFC5961 Section 3.2 1613 * 1614 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in 1615 * window, we send challenge ACK. 1616 * 1617 * Note: to take into account delayed ACKs, we should test against 1618 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case 1619 * of closed window, not covered by the RFC. 1620 */ 1621 int dropped = 0; 1622 1623 if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && 1624 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 1625 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 1626 1627 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1628 KASSERT(tp->t_state != TCPS_SYN_SENT, 1629 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 1630 __func__, th, tp)); 1631 1632 if (V_tcp_insecure_rst || 1633 (tp->last_ack_sent == th->th_seq) || 1634 (tp->rcv_nxt == th->th_seq) || 1635 ((tp->last_ack_sent - 1) == th->th_seq)) { 1636 TCPSTAT_INC(tcps_drops); 1637 /* Drop the connection. */ 1638 switch (tp->t_state) { 1639 case TCPS_SYN_RECEIVED: 1640 so->so_error = ECONNREFUSED; 1641 goto close; 1642 case TCPS_ESTABLISHED: 1643 case TCPS_FIN_WAIT_1: 1644 case TCPS_FIN_WAIT_2: 1645 case TCPS_CLOSE_WAIT: 1646 case TCPS_CLOSING: 1647 case TCPS_LAST_ACK: 1648 so->so_error = ECONNRESET; 1649 close: 1650 tcp_state_change(tp, TCPS_CLOSED); 1651 /* FALLTHROUGH */ 1652 default: 1653 tp = tcp_close(tp); 1654 } 1655 dropped = 1; 1656 rack_do_drop(m, tp); 1657 } else { 1658 TCPSTAT_INC(tcps_badrst); 1659 /* Send challenge ACK. 
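* (Worked example with hypothetical values, not from the original
* comment: assume V_tcp_insecure_rst is 0, last_ack_sent = 1000,
* rcv_nxt = 1000 and rcv_wnd = 5840, so the window test above spans
* [999, 6840).
*
*   RST seq = 1000 -> exact match, the connection was torn down above
*   RST seq = 3000 -> in window but not an exact match, so we only
*                     answer with the challenge ACK sent here
*   RST seq = 9000 -> outside the window, silently freed in the outer
*                     else branch below.)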
*/ 1660 tcp_respond(tp, mtod(m, void *), th, m, 1661 tp->rcv_nxt, tp->snd_nxt, TH_ACK); 1662 tp->last_ack_sent = tp->rcv_nxt; 1663 } 1664 } else { 1665 m_freem(m); 1666 } 1667 return (dropped); 1668 } 1669 1670 /* 1671 * The value in ret_val informs the caller 1672 * if we dropped the tcb (and lock) or not. 1673 * 1 = we dropped it, 0 = the TCB is still locked 1674 * and valid. 1675 */ 1676 static void 1677 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) 1678 { 1679 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 1680 1681 TCPSTAT_INC(tcps_badsyn); 1682 if (V_tcp_insecure_syn && 1683 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 1684 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 1685 tp = tcp_drop(tp, ECONNRESET); 1686 *ret_val = 1; 1687 rack_do_drop(m, tp); 1688 } else { 1689 /* Send challenge ACK. */ 1690 tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, 1691 tp->snd_nxt, TH_ACK); 1692 tp->last_ack_sent = tp->rcv_nxt; 1693 m = NULL; 1694 *ret_val = 0; 1695 rack_do_drop(m, NULL); 1696 } 1697 } 1698 1699 /* 1700 * rack_ts_check returns 1 for you should not proceed. It places 1701 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1702 * that the TCB is unlocked and probably dropped. The 0 indicates the 1703 * TCB is still valid and locked. 1704 */ 1705 static int 1706 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) 1707 { 1708 1709 /* Check to see if ts_recent is over 24 days old. */ 1710 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 1711 /* 1712 * Invalidate ts_recent. If this segment updates ts_recent, 1713 * the age will be reset later and ts_recent will get a 1714 * valid value. If it does not, setting ts_recent to zero 1715 * will at least satisfy the requirement that zero be placed 1716 * in the timestamp echo reply when ts_recent isn't valid. 1717 * The age isn't reset until we get a valid ts_recent 1718 * because we don't want out-of-order segments to be dropped 1719 * when ts_recent is old. 1720 */ 1721 tp->ts_recent = 0; 1722 } else { 1723 TCPSTAT_INC(tcps_rcvduppack); 1724 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 1725 TCPSTAT_INC(tcps_pawsdrop); 1726 *ret_val = 0; 1727 if (tlen) { 1728 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 1729 } else { 1730 rack_do_drop(m, NULL); 1731 } 1732 return (1); 1733 } 1734 return (0); 1735 } 1736 1737 /* 1738 * rack_drop_checks returns 1 for you should not proceed. It places 1739 * in ret_val what should be returned 1/0 by the caller. The 1 indicates 1740 * that the TCB is unlocked and probably dropped. The 0 indicates the 1741 * TCB is still valid and locked. 1742 */ 1743 static int 1744 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) 1745 { 1746 int32_t todrop; 1747 int32_t thflags; 1748 int32_t tlen; 1749 1750 thflags = *thf; 1751 tlen = *tlenp; 1752 todrop = tp->rcv_nxt - th->th_seq; 1753 if (todrop > 0) { 1754 if (thflags & TH_SYN) { 1755 thflags &= ~TH_SYN; 1756 th->th_seq++; 1757 if (th->th_urp > 1) 1758 th->th_urp--; 1759 else 1760 thflags &= ~TH_URG; 1761 todrop--; 1762 } 1763 /* 1764 * Following if statement from Stevens, vol. 2, p. 960. 1765 */ 1766 if (todrop > tlen 1767 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 1768 /* 1769 * Any valid FIN must be to the left of the window. 1770 * At this point the FIN must be a duplicate or out 1771 * of sequence; drop it. 
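* (Worked example with hypothetical numbers, not part of the original
* comment: rcv_nxt = 5000 and a segment arrives with th_seq = 4000, so
* todrop = 1000.
*
*   tlen =  800 -> todrop > tlen, the whole segment is old data; we
*                  land here, count a duplicate and just force an ACK
*   tlen = 1500 -> only the first 1000 bytes repeat; the else branch
*                  below counts a partial duplicate, and the common
*                  code then advances th_seq to 5000, trims tlen to
*                  500 and grows drop_hdrlen by 1000.)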
1772 */
1773 thflags &= ~TH_FIN;
1774 /*
1775 * Send an ACK to resynchronize and drop any data.
1776 * But keep on processing for RST or ACK.
1777 */
1778 tp->t_flags |= TF_ACKNOW;
1779 todrop = tlen;
1780 TCPSTAT_INC(tcps_rcvduppack);
1781 TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1782 } else {
1783 TCPSTAT_INC(tcps_rcvpartduppack);
1784 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1785 }
1786 *drop_hdrlen += todrop; /* drop from the top afterwards */
1787 th->th_seq += todrop;
1788 tlen -= todrop;
1789 if (th->th_urp > todrop)
1790 th->th_urp -= todrop;
1791 else {
1792 thflags &= ~TH_URG;
1793 th->th_urp = 0;
1794 }
1795 }
1796 /*
1797 * If segment ends after window, drop trailing data (and PUSH and
1798 * FIN); if nothing left, just ACK.
1799 */
1800 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1801 if (todrop > 0) {
1802 TCPSTAT_INC(tcps_rcvpackafterwin);
1803 if (todrop >= tlen) {
1804 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1805 /*
1806 * If window is closed can only take segments at
1807 * window edge, and have to drop data and PUSH from
1808 * incoming segments. Continue processing, but
1809 * remember to ack. Otherwise, drop segment and
1810 * ack.
1811 */
1812 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1813 tp->t_flags |= TF_ACKNOW;
1814 TCPSTAT_INC(tcps_rcvwinprobe);
1815 } else {
1816 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1817 return (1);
1818 }
1819 } else
1820 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1821 m_adj(m, -todrop);
1822 tlen -= todrop;
1823 thflags &= ~(TH_PUSH | TH_FIN);
1824 }
1825 *thf = thflags;
1826 *tlenp = tlen;
1827 return (0);
1828 }
1829
1830 static struct rack_sendmap *
1831 rack_find_lowest_rsm(struct tcp_rack *rack)
1832 {
1833 struct rack_sendmap *rsm;
1834
1835 /*
1836 * Walk the time-order transmitted list looking for an rsm that is
1837 * not acked. This will be the one that was sent the longest time
1838 * ago that is still outstanding.
1839 */
1840 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1841 if (rsm->r_flags & RACK_ACKED) {
1842 continue;
1843 }
1844 goto finish;
1845 }
1846 finish:
1847 return (rsm);
1848 }
1849
1850 static struct rack_sendmap *
1851 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1852 {
1853 struct rack_sendmap *prsm;
1854
1855 /*
1856 * Walk the sequence order list backward until we arrive at
1857 * the highest seq not acked. In theory when this is called it
1858 * should be the last segment (which it was not).
1859 */
1860 counter_u64_add(rack_find_high, 1);
1861 prsm = rsm;
1862 TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
1863 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1864 continue;
1865 }
1866 return (prsm);
1867 }
1868 return (NULL);
1869 }
1870
1871
1872 static uint32_t
1873 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1874 {
1875 int32_t lro;
1876 uint32_t thresh;
1877
1878 /*
1879 * lro is the flag we use to determine if we have seen reordering.
1880 * If it gets set we have seen reordering. The reorder logic either
1881 * works in one of two ways:
1882 *
1883 * If reorder-fade is configured, then we track the last time we saw
1884 * re-ordering occur. If we reach the point where enough time has
1885 * passed we no longer consider reordering as occurring.
1886 *
1887 * Or if reorder-fade is 0, then once we see reordering we consider
1888 * the connection to always be subject to reordering and just set lro
1889 * to 1.
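* For example (hypothetical values, not part of the original comment):
* with rc_reorder_fade set to 60000 ms ticks, a reordering event
* recorded in rc_reorder_ts stops inflating the threshold once
* cts - rc_reorder_ts grows past 60000; at that point rc_reorder_ts is
* also cleared again below.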
1890 *
1891 * In the end if lro is non-zero we add the extra time for
1892 * reordering in.
1893 */
1894 if (srtt == 0)
1895 srtt = 1;
1896 if (rack->r_ctl.rc_reorder_ts) {
1897 if (rack->r_ctl.rc_reorder_fade) {
1898 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1899 lro = cts - rack->r_ctl.rc_reorder_ts;
1900 if (lro == 0) {
1901 /*
1902 * No time has passed since the last
1903 * reorder, mark it as reordering.
1904 */
1905 lro = 1;
1906 }
1907 } else {
1908 /* Negative time? */
1909 lro = 0;
1910 }
1911 if (lro > rack->r_ctl.rc_reorder_fade) {
1912 /* Turn off reordering seen too */
1913 rack->r_ctl.rc_reorder_ts = 0;
1914 lro = 0;
1915 }
1916 } else {
1917 /* Reordering does not fade */
1918 lro = 1;
1919 }
1920 } else {
1921 lro = 0;
1922 }
1923 thresh = srtt + rack->r_ctl.rc_pkt_delay;
1924 if (lro) {
1925 /* If the shift is not set, you get 1/4 rtt */
1926 if (rack->r_ctl.rc_reorder_shift)
1927 thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1928 else
1929 thresh += (srtt >> 2);
1930 } else {
1931 thresh += 1;
1932 }
1933 /* We don't let the rack timeout be above an RTO */
1934
1935 if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1936 thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1937 }
1938 /* And we don't want it above the RTO max either */
1939 if (thresh > rack_rto_max) {
1940 thresh = rack_rto_max;
1941 }
1942 return (thresh);
1943 }
1944
1945 static uint32_t
1946 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
1947 struct rack_sendmap *rsm, uint32_t srtt)
1948 {
1949 struct rack_sendmap *prsm;
1950 uint32_t thresh, len;
1951 int maxseg;
1952
1953 if (srtt == 0)
1954 srtt = 1;
1955 if (rack->r_ctl.rc_tlp_threshold)
1956 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
1957 else
1958 thresh = (srtt * 2);
1959
1960 /* Get the previous sent packet, if any */
1961 maxseg = tcp_maxseg(tp);
1962 counter_u64_add(rack_enter_tlp_calc, 1);
1963 len = rsm->r_end - rsm->r_start;
1964 if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
1965 /* Exactly like the ID */
1966 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
1967 uint32_t alt_thresh;
1968 /*
1969 * Compensate for delayed-ack with the d-ack time.
1970 */
1971 counter_u64_add(rack_used_tlpmethod, 1);
1972 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1973 if (alt_thresh > thresh)
1974 thresh = alt_thresh;
1975 }
1976 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
1977 /* 2.1 behavior */
1978 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
1979 if (prsm && (len <= maxseg)) {
1980 /*
1981 * Two packets outstanding, thresh should be (2*srtt) +
1982 * possible inter-packet delay (if any).
1983 */
1984 uint32_t inter_gap = 0;
1985 int idx, nidx;
1986
1987 counter_u64_add(rack_used_tlpmethod, 1);
1988 idx = rsm->r_rtr_cnt - 1;
1989 nidx = prsm->r_rtr_cnt - 1;
1990 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
1991 /* Yes it was sent later (or at the same time) */
1992 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
1993 }
1994 thresh += inter_gap;
1995 } else if (len <= maxseg) {
1996 /*
1997 * Possibly compensate for delayed-ack.
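* (Worked example with hypothetical values, not from the original
* comment: srtt = 40, rack_delayed_ack_time = 200 and rc_tlp_threshold
* unset, so the base threshold above came out as 2 * srtt = 80.
*
*   alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time
*              = 40   + 20         + 200 = 260
*
* Since 260 > 80 the delayed-ack compensated value wins; it is still
* subject to the RTO, rack_rto_max and rack_tlp_min clamps at the end
* of the function.)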
1998 */ 1999 uint32_t alt_thresh; 2000 2001 counter_u64_add(rack_used_tlpmethod2, 1); 2002 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 2003 if (alt_thresh > thresh) 2004 thresh = alt_thresh; 2005 } 2006 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 2007 /* 2.2 behavior */ 2008 if (len <= maxseg) { 2009 uint32_t alt_thresh; 2010 /* 2011 * Compensate for delayed-ack with the d-ack time. 2012 */ 2013 counter_u64_add(rack_used_tlpmethod, 1); 2014 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 2015 if (alt_thresh > thresh) 2016 thresh = alt_thresh; 2017 } 2018 } 2019 /* Not above an RTO */ 2020 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 2021 thresh = TICKS_2_MSEC(tp->t_rxtcur); 2022 } 2023 /* Not above a RTO max */ 2024 if (thresh > rack_rto_max) { 2025 thresh = rack_rto_max; 2026 } 2027 /* Apply user supplied min TLP */ 2028 if (thresh < rack_tlp_min) { 2029 thresh = rack_tlp_min; 2030 } 2031 return (thresh); 2032 } 2033 2034 static struct rack_sendmap * 2035 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 2036 { 2037 /* 2038 * Check to see that we don't need to fall into recovery. We will 2039 * need to do so if our oldest transmit is past the time we should 2040 * have had an ack. 2041 */ 2042 struct tcp_rack *rack; 2043 struct rack_sendmap *rsm; 2044 int32_t idx; 2045 uint32_t srtt_cur, srtt, thresh; 2046 2047 rack = (struct tcp_rack *)tp->t_fb_ptr; 2048 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 2049 return (NULL); 2050 } 2051 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 2052 srtt = TICKS_2_MSEC(srtt_cur); 2053 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 2054 srtt = rack->rc_rack_rtt; 2055 2056 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2057 if (rsm == NULL) 2058 return (NULL); 2059 2060 if (rsm->r_flags & RACK_ACKED) { 2061 rsm = rack_find_lowest_rsm(rack); 2062 if (rsm == NULL) 2063 return (NULL); 2064 } 2065 idx = rsm->r_rtr_cnt - 1; 2066 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 2067 if (tsused < rsm->r_tim_lastsent[idx]) { 2068 return (NULL); 2069 } 2070 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 2071 return (NULL); 2072 } 2073 /* Ok if we reach here we are over-due */ 2074 rack->r_ctl.rc_rsm_start = rsm->r_start; 2075 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 2076 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 2077 rack_cong_signal(tp, NULL, CC_NDUPACK); 2078 return (rsm); 2079 } 2080 2081 static uint32_t 2082 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 2083 { 2084 int32_t t; 2085 int32_t tt; 2086 uint32_t ret_val; 2087 2088 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 2089 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 2090 tcp_persmin, tcp_persmax); 2091 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 2092 tp->t_rxtshift++; 2093 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 2094 ret_val = (uint32_t)tt; 2095 return (ret_val); 2096 } 2097 2098 static uint32_t 2099 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2100 { 2101 /* 2102 * Start the FR timer, we do this based on getting the first one in 2103 * the rc_tmap. Note that if its NULL we must stop the timer. in all 2104 * events we need to stop the running timer (if its running) before 2105 * starting the new one. 
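* Informally, the selection below works out roughly as follows (an
* illustrative summary, not an exhaustive one):
*
*   in persist                         -> persist timer value
*   send map empty or only a FIN out   -> RXT timer, if data is pending
*   oldest unacked rsm was SACK-passed -> RACK reorder-loss timer
*   otherwise                          -> TLP, falling back to RXT when
*                                         a TLP is already in progress,
*                                         the peer shrank the window,
*                                         or the probe budget is spent.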
2106 */ 2107 uint32_t thresh, exp, to, srtt, time_since_sent; 2108 uint32_t srtt_cur; 2109 int32_t idx; 2110 int32_t is_tlp_timer = 0; 2111 struct rack_sendmap *rsm; 2112 2113 if (rack->t_timers_stopped) { 2114 /* All timers have been stopped none are to run */ 2115 return (0); 2116 } 2117 if (rack->rc_in_persist) { 2118 /* We can't start any timer in persists */ 2119 return (rack_get_persists_timer_val(tp, rack)); 2120 } 2121 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2122 if (rsm == NULL) { 2123 /* Nothing on the send map */ 2124 activate_rxt: 2125 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 2126 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 2127 to = TICKS_2_MSEC(tp->t_rxtcur); 2128 if (to == 0) 2129 to = 1; 2130 return (to); 2131 } 2132 return (0); 2133 } 2134 if (rsm->r_flags & RACK_ACKED) { 2135 rsm = rack_find_lowest_rsm(rack); 2136 if (rsm == NULL) { 2137 /* No lowest? */ 2138 goto activate_rxt; 2139 } 2140 } 2141 /* Convert from ms to usecs */ 2142 if (rsm->r_flags & RACK_SACK_PASSED) { 2143 if ((tp->t_flags & TF_SENTFIN) && 2144 ((tp->snd_max - tp->snd_una) == 1) && 2145 (rsm->r_flags & RACK_HAS_FIN)) { 2146 /* 2147 * We don't start a rack timer if all we have is a 2148 * FIN outstanding. 2149 */ 2150 goto activate_rxt; 2151 } 2152 if (tp->t_srtt) { 2153 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2154 srtt = TICKS_2_MSEC(srtt_cur); 2155 } else 2156 srtt = RACK_INITIAL_RTO; 2157 2158 thresh = rack_calc_thresh_rack(rack, srtt, cts); 2159 idx = rsm->r_rtr_cnt - 1; 2160 exp = rsm->r_tim_lastsent[idx] + thresh; 2161 if (SEQ_GEQ(exp, cts)) { 2162 to = exp - cts; 2163 if (to < rack->r_ctl.rc_min_to) { 2164 to = rack->r_ctl.rc_min_to; 2165 } 2166 } else { 2167 to = rack->r_ctl.rc_min_to; 2168 } 2169 } else { 2170 /* Ok we need to do a TLP not RACK */ 2171 if ((rack->rc_tlp_in_progress != 0) || 2172 (rack->r_ctl.rc_tlp_rtx_out != 0)) { 2173 /* 2174 * The previous send was a TLP or a tlp_rtx is in 2175 * process. 2176 */ 2177 goto activate_rxt; 2178 } 2179 if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) { 2180 /* 2181 * Peer collapsed rwnd, don't do TLP. 2182 */ 2183 goto activate_rxt; 2184 } 2185 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 2186 if (rsm == NULL) { 2187 /* We found no rsm to TLP with. */ 2188 goto activate_rxt; 2189 } 2190 if (rsm->r_flags & RACK_HAS_FIN) { 2191 /* If its a FIN we dont do TLP */ 2192 rsm = NULL; 2193 goto activate_rxt; 2194 } 2195 idx = rsm->r_rtr_cnt - 1; 2196 if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) 2197 time_since_sent = cts - rsm->r_tim_lastsent[idx]; 2198 else 2199 time_since_sent = 0; 2200 is_tlp_timer = 1; 2201 if (tp->t_srtt) { 2202 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 2203 srtt = TICKS_2_MSEC(srtt_cur); 2204 } else 2205 srtt = RACK_INITIAL_RTO; 2206 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 2207 if (thresh > time_since_sent) 2208 to = thresh - time_since_sent; 2209 else 2210 to = rack->r_ctl.rc_min_to; 2211 if (to > TCPTV_REXMTMAX) { 2212 /* 2213 * If the TLP time works out to larger than the max 2214 * RTO lets not do TLP.. just RTO. 
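* For example (hypothetical numbers, not part of the original comment):
* with thresh = 260 ms and the candidate segment last sent 30 ms ago,
* the probe is armed for to = 260 - 30 = 230 ms; had the segment been
* idle longer than the threshold we would have fallen back to
* rc_min_to instead.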
2215 */ 2216 goto activate_rxt; 2217 } 2218 if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) { 2219 /* 2220 * The tail is no longer the last one I did a probe 2221 * on 2222 */ 2223 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2224 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2225 } 2226 } 2227 if (is_tlp_timer == 0) { 2228 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 2229 } else { 2230 if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) || 2231 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2232 /* 2233 * We have exceeded how many times we can retran the 2234 * current TLP timer, switch to the RTO timer. 2235 */ 2236 goto activate_rxt; 2237 } else { 2238 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 2239 } 2240 } 2241 if (to == 0) 2242 to = 1; 2243 return (to); 2244 } 2245 2246 static void 2247 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2248 { 2249 if (rack->rc_in_persist == 0) { 2250 if (((tp->t_flags & TF_SENTFIN) == 0) && 2251 (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) 2252 /* Must need to send more data to enter persist */ 2253 return; 2254 rack->r_ctl.rc_went_idle_time = cts; 2255 rack_timer_cancel(tp, rack, cts, __LINE__); 2256 tp->t_rxtshift = 0; 2257 rack->rc_in_persist = 1; 2258 } 2259 } 2260 2261 static void 2262 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack) 2263 { 2264 if (rack->rc_inp->inp_in_hpts) { 2265 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 2266 rack->r_ctl.rc_hpts_flags = 0; 2267 } 2268 rack->rc_in_persist = 0; 2269 rack->r_ctl.rc_went_idle_time = 0; 2270 tp->t_flags &= ~TF_FORCEDATA; 2271 tp->t_rxtshift = 0; 2272 } 2273 2274 static void 2275 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, 2276 int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) 2277 { 2278 struct inpcb *inp; 2279 uint32_t delayed_ack = 0; 2280 uint32_t hpts_timeout; 2281 uint8_t stopped; 2282 uint32_t left = 0; 2283 2284 inp = tp->t_inpcb; 2285 if (inp->inp_in_hpts) { 2286 /* A previous call is already set up */ 2287 return; 2288 } 2289 2290 if ((tp->t_state == TCPS_CLOSED) || 2291 (tp->t_state == TCPS_LISTEN)) { 2292 return; 2293 } 2294 stopped = rack->rc_tmr_stopped; 2295 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 2296 left = rack->r_ctl.rc_timer_exp - cts; 2297 } 2298 rack->r_ctl.rc_timer_exp = 0; 2299 if (rack->rc_inp->inp_in_hpts == 0) { 2300 rack->r_ctl.rc_hpts_flags = 0; 2301 } 2302 if (slot) { 2303 /* We are hptsi too */ 2304 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 2305 } else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 2306 /* 2307 * We are still left on the hpts when the to goes 2308 * it will be for output. 2309 */ 2310 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) 2311 slot = rack->r_ctl.rc_last_output_to - cts; 2312 else 2313 slot = 1; 2314 } 2315 if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 2316 /* No send window.. we must enter persist */ 2317 rack_enter_persist(tp, rack, cts); 2318 } else if ((frm_out_sbavail && 2319 (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && 2320 (tp->snd_wnd < tp->t_maxseg)) && 2321 TCPS_HAVEESTABLISHED(tp->t_state)) { 2322 /* 2323 * If we have no window or we can't send a segment (and have 2324 * data to send.. we cheat here and frm_out_sbavail is 2325 * passed in with the sbavail(sb) only from bbr_output) and 2326 * we are established, then we must enter persits (if not 2327 * already in persits). 
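* For example (hypothetical values, not from the original comment):
* with 3000 bytes sitting in the socket buffer, only 500 bytes in
* flight and the peer advertising a 512 byte window against a t_maxseg
* of 1460, there is un-sent data but no room for a full segment, so
* the connection is parked in persist until the window reopens.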
2328 */ 2329 rack_enter_persist(tp, rack, cts); 2330 } 2331 hpts_timeout = rack_timer_start(tp, rack, cts); 2332 if (tp->t_flags & TF_DELACK) { 2333 delayed_ack = tcp_delacktime; 2334 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 2335 } 2336 if (delayed_ack && ((hpts_timeout == 0) || 2337 (delayed_ack < hpts_timeout))) 2338 hpts_timeout = delayed_ack; 2339 else 2340 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2341 /* 2342 * If no timers are going to run and we will fall off the hptsi 2343 * wheel, we resort to a keep-alive timer if its configured. 2344 */ 2345 if ((hpts_timeout == 0) && 2346 (slot == 0)) { 2347 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2348 (tp->t_state <= TCPS_CLOSING)) { 2349 /* 2350 * Ok we have no timer (persists, rack, tlp, rxt or 2351 * del-ack), we don't have segments being paced. So 2352 * all that is left is the keepalive timer. 2353 */ 2354 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2355 /* Get the established keep-alive time */ 2356 hpts_timeout = TP_KEEPIDLE(tp); 2357 } else { 2358 /* Get the initial setup keep-alive time */ 2359 hpts_timeout = TP_KEEPINIT(tp); 2360 } 2361 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 2362 } 2363 } 2364 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 2365 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 2366 /* 2367 * RACK, TLP, persists and RXT timers all are restartable 2368 * based on actions input .. i.e we received a packet (ack 2369 * or sack) and that changes things (rw, or snd_una etc). 2370 * Thus we can restart them with a new value. For 2371 * keep-alive, delayed_ack we keep track of what was left 2372 * and restart the timer with a smaller value. 2373 */ 2374 if (left < hpts_timeout) 2375 hpts_timeout = left; 2376 } 2377 if (hpts_timeout) { 2378 /* 2379 * Hack alert for now we can't time-out over 2,147,483 2380 * seconds (a bit more than 596 hours), which is probably ok 2381 * :). 2382 */ 2383 if (hpts_timeout > 0x7ffffffe) 2384 hpts_timeout = 0x7ffffffe; 2385 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 2386 } 2387 if (slot) { 2388 rack->r_ctl.rc_last_output_to = cts + slot; 2389 if ((hpts_timeout == 0) || (hpts_timeout > slot)) { 2390 if (rack->rc_inp->inp_in_hpts == 0) 2391 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot)); 2392 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 2393 } else { 2394 /* 2395 * Arrange for the hpts to kick back in after the 2396 * t-o if the t-o does not cause a send. 2397 */ 2398 if (rack->rc_inp->inp_in_hpts == 0) 2399 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2400 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2401 } 2402 } else if (hpts_timeout) { 2403 if (rack->rc_inp->inp_in_hpts == 0) 2404 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); 2405 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 2406 } else { 2407 /* No timer starting */ 2408 #ifdef INVARIANTS 2409 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 2410 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 2411 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 2412 } 2413 #endif 2414 } 2415 rack->rc_tmr_stopped = 0; 2416 if (slot) 2417 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts); 2418 } 2419 2420 /* 2421 * RACK Timer, here we simply do logging and house keeping. 2422 * the normal rack_output() function will call the 2423 * appropriate thing to check if we need to do a RACK retransmit. 
2424 * We return 1, saying don't proceed with rack_output only 2425 * when all timers have been stopped (destroyed PCB?). 2426 */ 2427 static int 2428 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2429 { 2430 /* 2431 * This timer simply provides an internal trigger to send out data. 2432 * The check_recovery_mode call will see if there are needed 2433 * retransmissions, if so we will enter fast-recovery. The output 2434 * call may or may not do the same thing depending on sysctl 2435 * settings. 2436 */ 2437 struct rack_sendmap *rsm; 2438 int32_t recovery; 2439 2440 if (tp->t_timers->tt_flags & TT_STOPPED) { 2441 return (1); 2442 } 2443 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2444 /* Its not time yet */ 2445 return (0); 2446 } 2447 rack_log_to_event(rack, RACK_TO_FRM_RACK); 2448 recovery = IN_RECOVERY(tp->t_flags); 2449 counter_u64_add(rack_to_tot, 1); 2450 if (rack->r_state && (rack->r_state != tp->t_state)) 2451 rack_set_state(tp, rack); 2452 rsm = rack_check_recovery_mode(tp, cts); 2453 if (rsm) { 2454 uint32_t rtt; 2455 2456 rtt = rack->rc_rack_rtt; 2457 if (rtt == 0) 2458 rtt = 1; 2459 if ((recovery == 0) && 2460 (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { 2461 /* 2462 * The rack-timeout that enter's us into recovery 2463 * will force out one MSS and set us up so that we 2464 * can do one more send in 2*rtt (transitioning the 2465 * rack timeout into a rack-tlp). 2466 */ 2467 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2468 } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && 2469 ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { 2470 /* 2471 * When a rack timer goes, we have to send at 2472 * least one segment. They will be paced a min of 1ms 2473 * apart via the next rack timer (or further 2474 * if the rack timer dictates it). 2475 */ 2476 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2477 } 2478 } else { 2479 /* This is a case that should happen rarely if ever */ 2480 counter_u64_add(rack_tlp_does_nada, 1); 2481 #ifdef TCP_BLACKBOX 2482 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2483 #endif 2484 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2485 } 2486 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 2487 return (0); 2488 } 2489 2490 static struct rack_sendmap * 2491 rack_merge_rsm(struct tcp_rack *rack, 2492 struct rack_sendmap *l_rsm, 2493 struct rack_sendmap *r_rsm) 2494 { 2495 /* 2496 * We are merging two ack'd RSM's, 2497 * the l_rsm is on the left (lower seq 2498 * values) and the r_rsm is on the right 2499 * (higher seq value). The simplest way 2500 * to merge these is to move the right 2501 * one into the left. I don't think there 2502 * is any reason we need to try to find 2503 * the oldest (or last oldest retransmitted). 
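* For example (hypothetical sequence numbers, not part of the original
* comment): merging l_rsm [1000, 1500) with r_rsm [1500, 2000) simply
* stretches l_rsm to [1000, 2000), folds r_rsm's retransmitted byte
* count into it, carries over the FIN and TLP flags, and then frees
* r_rsm.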
2504 */ 2505 l_rsm->r_end = r_rsm->r_end; 2506 if (r_rsm->r_rtr_bytes) 2507 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 2508 if (r_rsm->r_in_tmap) { 2509 /* This really should not happen */ 2510 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 2511 } 2512 /* Now the flags */ 2513 if (r_rsm->r_flags & RACK_HAS_FIN) 2514 l_rsm->r_flags |= RACK_HAS_FIN; 2515 if (r_rsm->r_flags & RACK_TLP) 2516 l_rsm->r_flags |= RACK_TLP; 2517 TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next); 2518 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 2519 /* Transfer the split limit to the map we free */ 2520 r_rsm->r_limit_type = l_rsm->r_limit_type; 2521 l_rsm->r_limit_type = 0; 2522 } 2523 rack_free(rack, r_rsm); 2524 return(l_rsm); 2525 } 2526 2527 /* 2528 * TLP Timer, here we simply setup what segment we want to 2529 * have the TLP expire on, the normal rack_output() will then 2530 * send it out. 2531 * 2532 * We return 1, saying don't proceed with rack_output only 2533 * when all timers have been stopped (destroyed PCB?). 2534 */ 2535 static int 2536 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2537 { 2538 /* 2539 * Tail Loss Probe. 2540 */ 2541 struct rack_sendmap *rsm = NULL; 2542 struct socket *so; 2543 uint32_t amm, old_prr_snd = 0; 2544 uint32_t out, avail; 2545 2546 if (tp->t_timers->tt_flags & TT_STOPPED) { 2547 return (1); 2548 } 2549 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 2550 /* Its not time yet */ 2551 return (0); 2552 } 2553 if (rack_progress_timeout_check(tp)) { 2554 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 2555 return (1); 2556 } 2557 /* 2558 * A TLP timer has expired. We have been idle for 2 rtts. So we now 2559 * need to figure out how to force a full MSS segment out. 2560 */ 2561 rack_log_to_event(rack, RACK_TO_FRM_TLP); 2562 counter_u64_add(rack_tlp_tot, 1); 2563 if (rack->r_state && (rack->r_state != tp->t_state)) 2564 rack_set_state(tp, rack); 2565 so = tp->t_inpcb->inp_socket; 2566 avail = sbavail(&so->so_snd); 2567 out = tp->snd_max - tp->snd_una; 2568 rack->rc_timer_up = 1; 2569 /* 2570 * If we are in recovery we can jazz out a segment if new data is 2571 * present simply by setting rc_prr_sndcnt to a segment. 2572 */ 2573 if ((avail > out) && 2574 ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { 2575 /* New data is available */ 2576 amm = avail - out; 2577 if (amm > tp->t_maxseg) { 2578 amm = tp->t_maxseg; 2579 } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { 2580 /* not enough to fill a MTU and no-delay is off */ 2581 goto need_retran; 2582 } 2583 if (IN_RECOVERY(tp->t_flags)) { 2584 /* Unlikely */ 2585 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 2586 if (out + amm <= tp->snd_wnd) 2587 rack->r_ctl.rc_prr_sndcnt = amm; 2588 else 2589 goto need_retran; 2590 } else { 2591 /* Set the send-new override */ 2592 if (out + amm <= tp->snd_wnd) 2593 rack->r_ctl.rc_tlp_new_data = amm; 2594 else 2595 goto need_retran; 2596 } 2597 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2598 rack->r_ctl.rc_last_tlp_seq = tp->snd_max; 2599 rack->r_ctl.rc_tlpsend = NULL; 2600 counter_u64_add(rack_tlp_newdata, 1); 2601 goto send; 2602 } 2603 need_retran: 2604 /* 2605 * Ok we need to arrange the last un-acked segment to be re-sent, or 2606 * optionally the first un-acked segment. 
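* If the chosen rsm spans more than one segment it is split below so
* that only the tail is probed. For example (hypothetical numbers, not
* from the original comment): with rsm = [1000, 5000) and t_maxseg =
* 1460, the new tail entry becomes [3540, 5000), the original shrinks
* to [1000, 3540), and only the tail is handed to rc_tlpsend.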
2607 */ 2608 if (rack_always_send_oldest) 2609 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 2610 else { 2611 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 2612 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 2613 rsm = rack_find_high_nonack(rack, rsm); 2614 } 2615 } 2616 if (rsm == NULL) { 2617 counter_u64_add(rack_tlp_does_nada, 1); 2618 #ifdef TCP_BLACKBOX 2619 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 2620 #endif 2621 goto out; 2622 } 2623 if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { 2624 /* 2625 * We need to split this the last segment in two. 2626 */ 2627 int32_t idx; 2628 struct rack_sendmap *nrsm; 2629 2630 nrsm = rack_alloc_full_limit(rack); 2631 if (nrsm == NULL) { 2632 /* 2633 * No memory to split, we will just exit and punt 2634 * off to the RXT timer. 2635 */ 2636 counter_u64_add(rack_tlp_does_nada, 1); 2637 goto out; 2638 } 2639 nrsm->r_start = (rsm->r_end - tp->t_maxseg); 2640 nrsm->r_end = rsm->r_end; 2641 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 2642 nrsm->r_flags = rsm->r_flags; 2643 nrsm->r_sndcnt = rsm->r_sndcnt; 2644 nrsm->r_rtr_bytes = 0; 2645 rsm->r_end = nrsm->r_start; 2646 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 2647 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 2648 } 2649 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 2650 if (rsm->r_in_tmap) { 2651 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 2652 nrsm->r_in_tmap = 1; 2653 } 2654 rsm->r_flags &= (~RACK_HAS_FIN); 2655 rsm = nrsm; 2656 } 2657 rack->r_ctl.rc_tlpsend = rsm; 2658 rack->r_ctl.rc_tlp_rtx_out = 1; 2659 if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) { 2660 rack->r_ctl.rc_tlp_seg_send_cnt++; 2661 tp->t_rxtshift++; 2662 } else { 2663 rack->r_ctl.rc_last_tlp_seq = rsm->r_start; 2664 rack->r_ctl.rc_tlp_seg_send_cnt = 1; 2665 } 2666 send: 2667 rack->r_ctl.rc_tlp_send_cnt++; 2668 if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) { 2669 /* 2670 * Can't [re]/transmit a segment we have not heard from the 2671 * peer in max times. We need the retransmit timer to take 2672 * over. 2673 */ 2674 restore: 2675 rack->r_ctl.rc_tlpsend = NULL; 2676 if (rsm) 2677 rsm->r_flags &= ~RACK_TLP; 2678 rack->r_ctl.rc_prr_sndcnt = old_prr_snd; 2679 counter_u64_add(rack_tlp_retran_fail, 1); 2680 goto out; 2681 } else if (rsm) { 2682 rsm->r_flags |= RACK_TLP; 2683 } 2684 if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) && 2685 (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) { 2686 /* 2687 * We don't want to send a single segment more than the max 2688 * either. 2689 */ 2690 goto restore; 2691 } 2692 rack->r_timer_override = 1; 2693 rack->r_tlp_running = 1; 2694 rack->rc_tlp_in_progress = 1; 2695 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2696 return (0); 2697 out: 2698 rack->rc_timer_up = 0; 2699 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 2700 return (0); 2701 } 2702 2703 /* 2704 * Delayed ack Timer, here we simply need to setup the 2705 * ACK_NOW flag and remove the DELACK flag. From there 2706 * the output routine will send the ack out. 2707 * 2708 * We only return 1, saying don't proceed, if all timers 2709 * are stopped (destroyed PCB?). 
2710 */ 2711 static int 2712 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2713 { 2714 if (tp->t_timers->tt_flags & TT_STOPPED) { 2715 return (1); 2716 } 2717 rack_log_to_event(rack, RACK_TO_FRM_DELACK); 2718 tp->t_flags &= ~TF_DELACK; 2719 tp->t_flags |= TF_ACKNOW; 2720 TCPSTAT_INC(tcps_delack); 2721 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 2722 return (0); 2723 } 2724 2725 /* 2726 * Persists timer, here we simply need to setup the 2727 * FORCE-DATA flag the output routine will send 2728 * the one byte send. 2729 * 2730 * We only return 1, saying don't proceed, if all timers 2731 * are stopped (destroyed PCB?). 2732 */ 2733 static int 2734 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2735 { 2736 struct inpcb *inp; 2737 int32_t retval = 0; 2738 2739 inp = tp->t_inpcb; 2740 2741 if (tp->t_timers->tt_flags & TT_STOPPED) { 2742 return (1); 2743 } 2744 if (rack->rc_in_persist == 0) 2745 return (0); 2746 if (rack_progress_timeout_check(tp)) { 2747 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2748 return (1); 2749 } 2750 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 2751 /* 2752 * Persistence timer into zero window. Force a byte to be output, if 2753 * possible. 2754 */ 2755 TCPSTAT_INC(tcps_persisttimeo); 2756 /* 2757 * Hack: if the peer is dead/unreachable, we do not time out if the 2758 * window is closed. After a full backoff, drop the connection if 2759 * the idle time (no responses to probes) reaches the maximum 2760 * backoff that we would use if retransmitting. 2761 */ 2762 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 2763 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 2764 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 2765 TCPSTAT_INC(tcps_persistdrop); 2766 retval = 1; 2767 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2768 goto out; 2769 } 2770 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 2771 tp->snd_una == tp->snd_max) 2772 rack_exit_persist(tp, rack); 2773 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 2774 /* 2775 * If the user has closed the socket then drop a persisting 2776 * connection after a much reduced timeout. 2777 */ 2778 if (tp->t_state > TCPS_CLOSE_WAIT && 2779 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 2780 retval = 1; 2781 TCPSTAT_INC(tcps_persistdrop); 2782 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2783 goto out; 2784 } 2785 tp->t_flags |= TF_FORCEDATA; 2786 out: 2787 rack_log_to_event(rack, RACK_TO_FRM_PERSIST); 2788 return (retval); 2789 } 2790 2791 /* 2792 * If a keepalive goes off, we had no other timers 2793 * happening. We always return 1 here since this 2794 * routine either drops the connection or sends 2795 * out a segment with respond. 2796 */ 2797 static int 2798 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2799 { 2800 struct tcptemp *t_template; 2801 struct inpcb *inp; 2802 2803 if (tp->t_timers->tt_flags & TT_STOPPED) { 2804 return (1); 2805 } 2806 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 2807 inp = tp->t_inpcb; 2808 rack_log_to_event(rack, RACK_TO_FRM_KEEP); 2809 /* 2810 * Keep-alive timer went off; send something or drop connection if 2811 * idle for too long. 
2812 */ 2813 TCPSTAT_INC(tcps_keeptimeo); 2814 if (tp->t_state < TCPS_ESTABLISHED) 2815 goto dropit; 2816 if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 2817 tp->t_state <= TCPS_CLOSING) { 2818 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 2819 goto dropit; 2820 /* 2821 * Send a packet designed to force a response if the peer is 2822 * up and reachable: either an ACK if the connection is 2823 * still alive, or an RST if the peer has closed the 2824 * connection due to timeout or reboot. Using sequence 2825 * number tp->snd_una-1 causes the transmitted zero-length 2826 * segment to lie outside the receive window; by the 2827 * protocol spec, this requires the correspondent TCP to 2828 * respond. 2829 */ 2830 TCPSTAT_INC(tcps_keepprobe); 2831 t_template = tcpip_maketemplate(inp); 2832 if (t_template) { 2833 tcp_respond(tp, t_template->tt_ipgen, 2834 &t_template->tt_t, (struct mbuf *)NULL, 2835 tp->rcv_nxt, tp->snd_una - 1, 0); 2836 free(t_template, M_TEMP); 2837 } 2838 } 2839 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 2840 return (1); 2841 dropit: 2842 TCPSTAT_INC(tcps_keepdrops); 2843 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 2844 return (1); 2845 } 2846 2847 /* 2848 * Retransmit helper function, clear up all the ack 2849 * flags and take care of important book keeping. 2850 */ 2851 static void 2852 rack_remxt_tmr(struct tcpcb *tp) 2853 { 2854 /* 2855 * The retransmit timer went off, all sack'd blocks must be 2856 * un-acked. 2857 */ 2858 struct rack_sendmap *rsm, *trsm = NULL; 2859 struct tcp_rack *rack; 2860 int32_t cnt = 0; 2861 2862 rack = (struct tcp_rack *)tp->t_fb_ptr; 2863 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 2864 rack_log_to_event(rack, RACK_TO_FRM_TMR); 2865 if (rack->r_state && (rack->r_state != tp->t_state)) 2866 rack_set_state(tp, rack); 2867 /* 2868 * Ideally we would like to be able to 2869 * mark SACK-PASS on anything not acked here. 2870 * However, if we do that we would burst out 2871 * all that data 1ms apart. This would be unwise, 2872 * so for now we will just let the normal rxt timer 2873 * and tlp timer take care of it. 2874 */ 2875 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 2876 if (rsm->r_flags & RACK_ACKED) { 2877 cnt++; 2878 rsm->r_sndcnt = 0; 2879 if (rsm->r_in_tmap == 0) { 2880 /* We must re-add it back to the tlist */ 2881 if (trsm == NULL) { 2882 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 2883 } else { 2884 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 2885 } 2886 rsm->r_in_tmap = 1; 2887 trsm = rsm; 2888 } 2889 } 2890 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 2891 } 2892 /* Clear the count (we just un-acked them) */ 2893 rack->r_ctl.rc_sacked = 0; 2894 /* Clear the tlp rtx mark */ 2895 rack->r_ctl.rc_tlp_rtx_out = 0; 2896 rack->r_ctl.rc_tlp_seg_send_cnt = 0; 2897 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); 2898 /* Setup so we send one segment */ 2899 if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) 2900 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 2901 rack->r_timer_override = 1; 2902 } 2903 2904 /* 2905 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 2906 * we will setup to retransmit the lowest seq number outstanding. 
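* The interval doubles on every expiry. For example (hypothetical
* values, not from the original comment): if TCP_REXMTVAL(tp) works
* out to 400 ms, successive timeouts back the timer off through
* tcp_backoff[] to roughly 800 ms, 1.6 s, 3.2 s and so on, clamped
* between rack_rto_min and rack_rto_max, until TCP_MAXRXTSHIFT
* expiries finally drop the connection.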
2907 */ 2908 static int 2909 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 2910 { 2911 int32_t rexmt; 2912 struct inpcb *inp; 2913 int32_t retval = 0; 2914 2915 inp = tp->t_inpcb; 2916 if (tp->t_timers->tt_flags & TT_STOPPED) { 2917 return (1); 2918 } 2919 if (rack_progress_timeout_check(tp)) { 2920 tcp_set_inp_to_drop(inp, ETIMEDOUT); 2921 return (1); 2922 } 2923 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 2924 if (TCPS_HAVEESTABLISHED(tp->t_state) && 2925 (tp->snd_una == tp->snd_max)) { 2926 /* Nothing outstanding .. nothing to do */ 2927 return (0); 2928 } 2929 /* 2930 * Retransmission timer went off. Message has not been acked within 2931 * retransmit interval. Back off to a longer retransmit interval 2932 * and retransmit one segment. 2933 */ 2934 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 2935 tp->t_rxtshift = TCP_MAXRXTSHIFT; 2936 TCPSTAT_INC(tcps_timeoutdrop); 2937 retval = 1; 2938 tcp_set_inp_to_drop(rack->rc_inp, 2939 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 2940 goto out; 2941 } 2942 rack_remxt_tmr(tp); 2943 if (tp->t_state == TCPS_SYN_SENT) { 2944 /* 2945 * If the SYN was retransmitted, indicate CWND to be limited 2946 * to 1 segment in cc_conn_init(). 2947 */ 2948 tp->snd_cwnd = 1; 2949 } else if (tp->t_rxtshift == 1) { 2950 /* 2951 * first retransmit; record ssthresh and cwnd so they can be 2952 * recovered if this turns out to be a "bad" retransmit. A 2953 * retransmit is considered "bad" if an ACK for this segment 2954 * is received within RTT/2 interval; the assumption here is 2955 * that the ACK was already in flight. See "On Estimating 2956 * End-to-End Network Path Properties" by Allman and Paxson 2957 * for more details. 2958 */ 2959 tp->snd_cwnd_prev = tp->snd_cwnd; 2960 tp->snd_ssthresh_prev = tp->snd_ssthresh; 2961 tp->snd_recover_prev = tp->snd_recover; 2962 if (IN_FASTRECOVERY(tp->t_flags)) 2963 tp->t_flags |= TF_WASFRECOVERY; 2964 else 2965 tp->t_flags &= ~TF_WASFRECOVERY; 2966 if (IN_CONGRECOVERY(tp->t_flags)) 2967 tp->t_flags |= TF_WASCRECOVERY; 2968 else 2969 tp->t_flags &= ~TF_WASCRECOVERY; 2970 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 2971 tp->t_flags |= TF_PREVVALID; 2972 } else 2973 tp->t_flags &= ~TF_PREVVALID; 2974 TCPSTAT_INC(tcps_rexmttimeo); 2975 if ((tp->t_state == TCPS_SYN_SENT) || 2976 (tp->t_state == TCPS_SYN_RECEIVED)) 2977 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); 2978 else 2979 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 2980 TCPT_RANGESET(tp->t_rxtcur, rexmt, 2981 max(MSEC_2_TICKS(rack_rto_min), rexmt), 2982 MSEC_2_TICKS(rack_rto_max)); 2983 /* 2984 * We enter the path for PLMTUD if connection is established or, if 2985 * connection is FIN_WAIT_1 status, reason for the last is that if 2986 * amount of data we send is very small, we could send it in couple 2987 * of packets and process straight to FIN. In that case we won't 2988 * catch ESTABLISHED state. 2989 */ 2990 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 2991 || (tp->t_state == TCPS_FIN_WAIT_1))) { 2992 #ifdef INET6 2993 int32_t isipv6; 2994 #endif 2995 2996 /* 2997 * Idea here is that at each stage of mtu probe (usually, 2998 * 1448 -> 1188 -> 524) should be given 2 chances to recover 2999 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 3000 * should take care of that. 
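* (An illustrative reading of the checks below, not an addition to
* them: within the t_rxtshift window of 2..5 and the every-second-
* expiry test, shift 2 clamps the MSS to the blackhole value, shift 4
* clamps it again to the default or minimum MSS with PMTUD disabled,
* and once shift reaches 6 with no progress the saved MSS and PMTUD
* flags are restored on the assumption that MTU was not the problem
* after all.)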
3001 */ 3002 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 3003 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 3004 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 3005 tp->t_rxtshift % 2 == 0)) { 3006 /* 3007 * Enter Path MTU Black-hole Detection mechanism: - 3008 * Disable Path MTU Discovery (IP "DF" bit). - 3009 * Reduce MTU to lower value than what we negotiated 3010 * with peer. 3011 */ 3012 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 3013 /* Record that we may have found a black hole. */ 3014 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 3015 /* Keep track of previous MSS. */ 3016 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 3017 } 3018 3019 /* 3020 * Reduce the MSS to blackhole value or to the 3021 * default in an attempt to retransmit. 3022 */ 3023 #ifdef INET6 3024 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 3025 if (isipv6 && 3026 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 3027 /* Use the sysctl tuneable blackhole MSS. */ 3028 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 3029 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 3030 } else if (isipv6) { 3031 /* Use the default MSS. */ 3032 tp->t_maxseg = V_tcp_v6mssdflt; 3033 /* 3034 * Disable Path MTU Discovery when we switch 3035 * to minmss. 3036 */ 3037 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 3038 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 3039 } 3040 #endif 3041 #if defined(INET6) && defined(INET) 3042 else 3043 #endif 3044 #ifdef INET 3045 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 3046 /* Use the sysctl tuneable blackhole MSS. */ 3047 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 3048 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 3049 } else { 3050 /* Use the default MSS. */ 3051 tp->t_maxseg = V_tcp_mssdflt; 3052 /* 3053 * Disable Path MTU Discovery when we switch 3054 * to minmss. 3055 */ 3056 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 3057 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 3058 } 3059 #endif 3060 } else { 3061 /* 3062 * If further retransmissions are still unsuccessful 3063 * with a lowered MTU, maybe this isn't a blackhole 3064 * and we restore the previous MSS and blackhole 3065 * detection flags. The limit '6' is determined by 3066 * giving each probe stage (1448, 1188, 524) 2 3067 * chances to recover. 3068 */ 3069 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 3070 (tp->t_rxtshift >= 6)) { 3071 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 3072 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 3073 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 3074 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 3075 } 3076 } 3077 } 3078 /* 3079 * Disable RFC1323 and SACK if we haven't got any response to our 3080 * third SYN to work-around some broken terminal servers (most of 3081 * which have hopefully been retired) that have bad VJ header 3082 * compression code which trashes TCP segments containing 3083 * unknown-to-them TCP options. 3084 */ 3085 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 3086 (tp->t_rxtshift == 3)) 3087 tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); 3088 /* 3089 * If we backed off this far, our srtt estimate is probably bogus. 3090 * Clobber it so we'll take the next rtt measurement as our srtt; 3091 * move the current srtt into rttvar to keep the current retransmit 3092 * times until then. 
3093 */ 3094 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 3095 #ifdef INET6 3096 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 3097 in6_losing(tp->t_inpcb); 3098 else 3099 #endif 3100 in_losing(tp->t_inpcb); 3101 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 3102 tp->t_srtt = 0; 3103 } 3104 if (rack_use_sack_filter) 3105 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 3106 tp->snd_recover = tp->snd_max; 3107 tp->t_flags |= TF_ACKNOW; 3108 tp->t_rtttime = 0; 3109 rack_cong_signal(tp, NULL, CC_RTO); 3110 out: 3111 return (retval); 3112 } 3113 3114 static int 3115 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 3116 { 3117 int32_t ret = 0; 3118 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 3119 3120 if (timers == 0) { 3121 return (0); 3122 } 3123 if (tp->t_state == TCPS_LISTEN) { 3124 /* no timers on listen sockets */ 3125 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 3126 return (0); 3127 return (1); 3128 } 3129 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 3130 uint32_t left; 3131 3132 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 3133 ret = -1; 3134 rack_log_to_processing(rack, cts, ret, 0); 3135 return (0); 3136 } 3137 if (hpts_calling == 0) { 3138 ret = -2; 3139 rack_log_to_processing(rack, cts, ret, 0); 3140 return (0); 3141 } 3142 /* 3143 * Ok our timer went off early and we are not paced false 3144 * alarm, go back to sleep. 3145 */ 3146 ret = -3; 3147 left = rack->r_ctl.rc_timer_exp - cts; 3148 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 3149 rack_log_to_processing(rack, cts, ret, left); 3150 rack->rc_last_pto_set = 0; 3151 return (1); 3152 } 3153 rack->rc_tmr_stopped = 0; 3154 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 3155 if (timers & PACE_TMR_DELACK) { 3156 ret = rack_timeout_delack(tp, rack, cts); 3157 } else if (timers & PACE_TMR_RACK) { 3158 ret = rack_timeout_rack(tp, rack, cts); 3159 } else if (timers & PACE_TMR_TLP) { 3160 ret = rack_timeout_tlp(tp, rack, cts); 3161 } else if (timers & PACE_TMR_RXT) { 3162 ret = rack_timeout_rxt(tp, rack, cts); 3163 } else if (timers & PACE_TMR_PERSIT) { 3164 ret = rack_timeout_persist(tp, rack, cts); 3165 } else if (timers & PACE_TMR_KEEP) { 3166 ret = rack_timeout_keepalive(tp, rack, cts); 3167 } 3168 rack_log_to_processing(rack, cts, ret, timers); 3169 return (ret); 3170 } 3171 3172 static void 3173 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 3174 { 3175 uint8_t hpts_removed = 0; 3176 3177 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 3178 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 3179 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3180 hpts_removed = 1; 3181 } 3182 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 3183 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 3184 if (rack->rc_inp->inp_in_hpts && 3185 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 3186 /* 3187 * Canceling timer's when we have no output being 3188 * paced. We also must remove ourselves from the 3189 * hpts. 
3190 */ 3191 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 3192 hpts_removed = 1; 3193 } 3194 rack_log_to_cancel(rack, hpts_removed, line); 3195 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 3196 } 3197 } 3198 3199 static void 3200 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 3201 { 3202 return; 3203 } 3204 3205 static int 3206 rack_stopall(struct tcpcb *tp) 3207 { 3208 struct tcp_rack *rack; 3209 rack = (struct tcp_rack *)tp->t_fb_ptr; 3210 rack->t_timers_stopped = 1; 3211 return (0); 3212 } 3213 3214 static void 3215 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 3216 { 3217 return; 3218 } 3219 3220 static int 3221 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 3222 { 3223 return (0); 3224 } 3225 3226 static void 3227 rack_stop_all_timers(struct tcpcb *tp) 3228 { 3229 struct tcp_rack *rack; 3230 3231 /* 3232 * Assure no timers are running. 3233 */ 3234 if (tcp_timer_active(tp, TT_PERSIST)) { 3235 /* We enter in persists, set the flag appropriately */ 3236 rack = (struct tcp_rack *)tp->t_fb_ptr; 3237 rack->rc_in_persist = 1; 3238 } 3239 tcp_timer_suspend(tp, TT_PERSIST); 3240 tcp_timer_suspend(tp, TT_REXMT); 3241 tcp_timer_suspend(tp, TT_KEEP); 3242 tcp_timer_suspend(tp, TT_DELACK); 3243 } 3244 3245 static void 3246 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 3247 struct rack_sendmap *rsm, uint32_t ts) 3248 { 3249 int32_t idx; 3250 3251 rsm->r_rtr_cnt++; 3252 rsm->r_sndcnt++; 3253 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 3254 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 3255 rsm->r_flags |= RACK_OVERMAX; 3256 } 3257 if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) { 3258 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 3259 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 3260 } 3261 idx = rsm->r_rtr_cnt - 1; 3262 rsm->r_tim_lastsent[idx] = ts; 3263 if (rsm->r_flags & RACK_ACKED) { 3264 /* Problably MTU discovery messing with us */ 3265 rsm->r_flags &= ~RACK_ACKED; 3266 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 3267 } 3268 if (rsm->r_in_tmap) { 3269 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3270 } 3271 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3272 rsm->r_in_tmap = 1; 3273 if (rsm->r_flags & RACK_SACK_PASSED) { 3274 /* We have retransmitted due to the SACK pass */ 3275 rsm->r_flags &= ~RACK_SACK_PASSED; 3276 rsm->r_flags |= RACK_WAS_SACKPASS; 3277 } 3278 /* Update memory for next rtr */ 3279 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3280 } 3281 3282 3283 static uint32_t 3284 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 3285 struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp) 3286 { 3287 /* 3288 * We (re-)transmitted starting at rsm->r_start for some length 3289 * (possibly less than r_end. 3290 */ 3291 struct rack_sendmap *nrsm; 3292 uint32_t c_end; 3293 int32_t len; 3294 int32_t idx; 3295 3296 len = *lenp; 3297 c_end = rsm->r_start + len; 3298 if (SEQ_GEQ(c_end, rsm->r_end)) { 3299 /* 3300 * We retransmitted the whole piece or more than the whole 3301 * slopping into the next rsm. 3302 */ 3303 rack_update_rsm(tp, rack, rsm, ts); 3304 if (c_end == rsm->r_end) { 3305 *lenp = 0; 3306 return (0); 3307 } else { 3308 int32_t act_len; 3309 3310 /* Hangs over the end return whats left */ 3311 act_len = rsm->r_end - rsm->r_start; 3312 *lenp = (len - act_len); 3313 return (rsm->r_end); 3314 } 3315 /* We don't get out of this block. 
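* (Illustrative numbers, not from the original comment: if the rsm
* covers [1000, 2000) and we retransmitted 1500 bytes starting at
* 1000, then act_len = 1000, the caller is told 500 bytes remain, and
* the returned 2000 lets it continue with the next rsm.)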
*/ 3316 } 3317 /* 3318 * Here we retransmitted less than the whole thing which means we 3319 * have to split this into what was transmitted and what was not. 3320 */ 3321 nrsm = rack_alloc_full_limit(rack); 3322 if (nrsm == NULL) { 3323 /* 3324 * We can't get memory, so lets not proceed. 3325 */ 3326 *lenp = 0; 3327 return (0); 3328 } 3329 /* 3330 * So here we are going to take the original rsm and make it what we 3331 * retransmitted. nrsm will be the tail portion we did not 3332 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 3333 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 3334 * 1, 6 and the new piece will be 6, 11. 3335 */ 3336 nrsm->r_start = c_end; 3337 nrsm->r_end = rsm->r_end; 3338 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3339 nrsm->r_flags = rsm->r_flags; 3340 nrsm->r_sndcnt = rsm->r_sndcnt; 3341 nrsm->r_rtr_bytes = 0; 3342 rsm->r_end = c_end; 3343 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3344 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3345 } 3346 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3347 if (rsm->r_in_tmap) { 3348 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3349 nrsm->r_in_tmap = 1; 3350 } 3351 rsm->r_flags &= (~RACK_HAS_FIN); 3352 rack_update_rsm(tp, rack, rsm, ts); 3353 *lenp = 0; 3354 return (0); 3355 } 3356 3357 3358 static void 3359 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 3360 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 3361 uint8_t pass, struct rack_sendmap *hintrsm) 3362 { 3363 struct tcp_rack *rack; 3364 struct rack_sendmap *rsm, *nrsm; 3365 register uint32_t snd_max, snd_una; 3366 int32_t idx; 3367 3368 /* 3369 * Add to the RACK log of packets in flight or retransmitted. If 3370 * there is a TS option we will use the TS echoed, if not we will 3371 * grab a TS. 3372 * 3373 * Retransmissions will increment the count and move the ts to its 3374 * proper place. Note that if options do not include TS's then we 3375 * won't be able to effectively use the ACK for an RTT on a retran. 3376 * 3377 * Notes about r_start and r_end. Lets consider a send starting at 3378 * sequence 1 for 10 bytes. In such an example the r_start would be 3379 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 3380 * This means that r_end is actually the first sequence for the next 3381 * slot (11). 3382 * 3383 */ 3384 /* 3385 * If err is set what do we do XXXrrs? should we not add the thing? 3386 * -- i.e. return if err != 0 or should we pretend we sent it? -- 3387 * i.e. proceed with add ** do this for now. 3388 */ 3389 INP_WLOCK_ASSERT(tp->t_inpcb); 3390 if (err) 3391 /* 3392 * We don't log errors -- we could but snd_max does not 3393 * advance in this case either. 3394 */ 3395 return; 3396 3397 if (th_flags & TH_RST) { 3398 /* 3399 * We don't log resets and we return immediately from 3400 * sending 3401 */ 3402 return; 3403 } 3404 rack = (struct tcp_rack *)tp->t_fb_ptr; 3405 snd_una = tp->snd_una; 3406 if (SEQ_LEQ((seq_out + len), snd_una)) { 3407 /* Are sending an old segment to induce an ack (keep-alive)? */ 3408 return; 3409 } 3410 if (SEQ_LT(seq_out, snd_una)) { 3411 /* huh? should we panic? */ 3412 uint32_t end; 3413 3414 end = seq_out + len; 3415 seq_out = snd_una; 3416 len = end - seq_out; 3417 } 3418 snd_max = tp->snd_max; 3419 if (th_flags & (TH_SYN | TH_FIN)) { 3420 /* 3421 * The call to rack_log_output is made before bumping 3422 * snd_max. 
This means we can record one extra byte on a SYN 3423 * or FIN if seq_out is adding more on and a FIN is present 3424 * (and we are not resending). 3425 */ 3426 if (th_flags & TH_SYN) 3427 len++; 3428 if (th_flags & TH_FIN) 3429 len++; 3430 if (SEQ_LT(snd_max, tp->snd_nxt)) { 3431 /* 3432 * The add/update as not been done for the FIN/SYN 3433 * yet. 3434 */ 3435 snd_max = tp->snd_nxt; 3436 } 3437 } 3438 if (len == 0) { 3439 /* We don't log zero window probes */ 3440 return; 3441 } 3442 rack->r_ctl.rc_time_last_sent = ts; 3443 if (IN_RECOVERY(tp->t_flags)) { 3444 rack->r_ctl.rc_prr_out += len; 3445 } 3446 /* First question is it a retransmission? */ 3447 if (seq_out == snd_max) { 3448 again: 3449 rsm = rack_alloc(rack); 3450 if (rsm == NULL) { 3451 /* 3452 * Hmm out of memory and the tcb got destroyed while 3453 * we tried to wait. 3454 */ 3455 return; 3456 } 3457 if (th_flags & TH_FIN) { 3458 rsm->r_flags = RACK_HAS_FIN; 3459 } else { 3460 rsm->r_flags = 0; 3461 } 3462 rsm->r_tim_lastsent[0] = ts; 3463 rsm->r_rtr_cnt = 1; 3464 rsm->r_rtr_bytes = 0; 3465 rsm->r_start = seq_out; 3466 rsm->r_end = rsm->r_start + len; 3467 rsm->r_sndcnt = 0; 3468 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 3469 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 3470 rsm->r_in_tmap = 1; 3471 return; 3472 } 3473 /* 3474 * If we reach here its a retransmission and we need to find it. 3475 */ 3476 more: 3477 if (hintrsm && (hintrsm->r_start == seq_out)) { 3478 rsm = hintrsm; 3479 hintrsm = NULL; 3480 } else if (rack->r_ctl.rc_next) { 3481 /* We have a hint from a previous run */ 3482 rsm = rack->r_ctl.rc_next; 3483 } else { 3484 /* No hints sorry */ 3485 rsm = NULL; 3486 } 3487 if ((rsm) && (rsm->r_start == seq_out)) { 3488 /* 3489 * We used rc_next or hintrsm to retransmit, hopefully the 3490 * likely case. 3491 */ 3492 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3493 if (len == 0) { 3494 return; 3495 } else { 3496 goto more; 3497 } 3498 } 3499 /* Ok it was not the last pointer go through it the hard way. */ 3500 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3501 if (rsm->r_start == seq_out) { 3502 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 3503 rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); 3504 if (len == 0) { 3505 return; 3506 } else { 3507 continue; 3508 } 3509 } 3510 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 3511 /* Transmitted within this piece */ 3512 /* 3513 * Ok we must split off the front and then let the 3514 * update do the rest 3515 */ 3516 nrsm = rack_alloc_full_limit(rack); 3517 if (nrsm == NULL) { 3518 rack_update_rsm(tp, rack, rsm, ts); 3519 return; 3520 } 3521 /* 3522 * copy rsm to nrsm and then trim the front of rsm 3523 * to not include this part. 3524 */ 3525 nrsm->r_start = seq_out; 3526 nrsm->r_end = rsm->r_end; 3527 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 3528 nrsm->r_flags = rsm->r_flags; 3529 nrsm->r_sndcnt = rsm->r_sndcnt; 3530 nrsm->r_rtr_bytes = 0; 3531 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 3532 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 3533 } 3534 rsm->r_end = nrsm->r_start; 3535 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 3536 if (rsm->r_in_tmap) { 3537 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 3538 nrsm->r_in_tmap = 1; 3539 } 3540 rsm->r_flags &= (~RACK_HAS_FIN); 3541 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 3542 if (len == 0) { 3543 return; 3544 } 3545 } 3546 } 3547 /* 3548 * Hmm not found in map did they retransmit both old and on into the 3549 * new? 
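 * If seq_out has reached snd_max the remainder is brand new data and
 * we simply allocate a fresh map entry for it (the "again" label
 * above); anything still below snd_max should have matched an existing
 * entry, which is why the INVARIANTS build dumps the map and panics
 * below.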
3550 */ 3551 if (seq_out == tp->snd_max) { 3552 goto again; 3553 } else if (SEQ_LT(seq_out, tp->snd_max)) { 3554 #ifdef INVARIANTS 3555 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 3556 seq_out, len, tp->snd_una, tp->snd_max); 3557 printf("Starting Dump of all rack entries\n"); 3558 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { 3559 printf("rsm:%p start:%u end:%u\n", 3560 rsm, rsm->r_start, rsm->r_end); 3561 } 3562 printf("Dump complete\n"); 3563 panic("seq_out not found rack:%p tp:%p", 3564 rack, tp); 3565 #endif 3566 } else { 3567 #ifdef INVARIANTS 3568 /* 3569 * Hmm beyond sndmax? (only if we are using the new rtt-pack 3570 * flag) 3571 */ 3572 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 3573 seq_out, len, tp->snd_max, tp); 3574 #endif 3575 } 3576 } 3577 3578 /* 3579 * Record one of the RTT updates from an ack into 3580 * our sample structure. 3581 */ 3582 static void 3583 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt) 3584 { 3585 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3586 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 3587 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 3588 } 3589 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 3590 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 3591 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 3592 } 3593 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 3594 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 3595 rack->r_ctl.rack_rs.rs_rtt_cnt++; 3596 } 3597 3598 /* 3599 * Collect new round-trip time estimate 3600 * and update averages and current timeout. 3601 */ 3602 static void 3603 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 3604 { 3605 int32_t delta; 3606 uint32_t o_srtt, o_var; 3607 int32_t rtt; 3608 3609 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 3610 /* No valid sample */ 3611 return; 3612 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 3613 /* We are to use the lowest RTT seen in a single ack */ 3614 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 3615 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 3616 /* We are to use the highest RTT seen in a single ack */ 3617 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 3618 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 3619 /* We are to use the average RTT seen in a single ack */ 3620 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 3621 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 3622 } else { 3623 #ifdef INVARIANTS 3624 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 3625 #endif 3626 return; 3627 } 3628 if (rtt == 0) 3629 rtt = 1; 3630 rack_log_rtt_sample(rack, rtt); 3631 o_srtt = tp->t_srtt; 3632 o_var = tp->t_rttvar; 3633 rack = (struct tcp_rack *)tp->t_fb_ptr; 3634 if (tp->t_srtt != 0) { 3635 /* 3636 * srtt is stored as fixed point with 5 bits after the 3637 * binary point (i.e., scaled by 8). The following magic is 3638 * equivalent to the smoothing algorithm in rfc793 with an 3639 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 3640 * Adjust rtt to origin 0. 3641 */ 3642 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3643 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3644 3645 tp->t_srtt += delta; 3646 if (tp->t_srtt <= 0) 3647 tp->t_srtt = 1; 3648 3649 /* 3650 * We accumulate a smoothed rtt variance (actually, a 3651 * smoothed mean difference), then set the retransmit timer 3652 * to smoothed rtt + 4 times the smoothed variance. rttvar 3653 * is stored as fixed point with 4 bits after the binary 3654 * point (scaled by 16). 
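 * (For illustration, assuming TCP_RTT_SHIFT == 5, TCP_RTTVAR_SHIFT == 4
 * and TCP_DELTA_SHIFT == 2: a sample of rtt = 12 ticks arriving with
 * t_srtt = 320, i.e. 10 ticks scaled by 32, gives
 * delta = (11 << 2) - (320 >> 3) = 4, so t_srtt becomes 324; with
 * t_rttvar = 48, i.e. 3 ticks scaled by 16, the same delta less
 * t_rttvar >> 2 is -8, so t_rttvar settles at 40.)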
The following is equivalent to 3655 * rfc793 smoothing with an alpha of .75 (rttvar = 3656 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 3657 * wired-in beta. 3658 */ 3659 if (delta < 0) 3660 delta = -delta; 3661 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3662 tp->t_rttvar += delta; 3663 if (tp->t_rttvar <= 0) 3664 tp->t_rttvar = 1; 3665 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 3666 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3667 } else { 3668 /* 3669 * No rtt measurement yet - use the unsmoothed rtt. Set the 3670 * variance to half the rtt (so our first retransmit happens 3671 * at 3*rtt). 3672 */ 3673 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3674 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3675 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 3676 } 3677 TCPSTAT_INC(tcps_rttupdated); 3678 rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var); 3679 tp->t_rttupdated++; 3680 #ifdef NETFLIX_STATS 3681 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 3682 #endif 3683 tp->t_rxtshift = 0; 3684 3685 /* 3686 * the retransmit should happen at rtt + 4 * rttvar. Because of the 3687 * way we do the smoothing, srtt and rttvar will each average +1/2 3688 * tick of bias. When we compute the retransmit timer, we want 1/2 3689 * tick of rounding and 1 extra tick because of +-1/2 tick 3690 * uncertainty in the firing of the timer. The bias will give us 3691 * exactly the 1.5 tick we need. But, because the bias is 3692 * statistical, we have to test that we don't drop below the minimum 3693 * feasible timer (which is 2 ticks). 3694 */ 3695 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3696 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 3697 tp->t_softerror = 0; 3698 } 3699 3700 static void 3701 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 3702 uint32_t t, uint32_t cts) 3703 { 3704 /* 3705 * For this RSM, we acknowledged the data from a previous 3706 * transmission, not the last one we made. This means we did a false 3707 * retransmit. 3708 */ 3709 struct tcp_rack *rack; 3710 3711 if (rsm->r_flags & RACK_HAS_FIN) { 3712 /* 3713 * The FIN is often sent multiple times when we 3714 * have everything outstanding ack'd. We ignore this case 3715 * since it's over now. 3716 */ 3717 return; 3718 } 3719 if (rsm->r_flags & RACK_TLP) { 3720 /* 3721 * We expect TLP's to have this occur. 3722 */ 3723 return; 3724 } 3725 rack = (struct tcp_rack *)tp->t_fb_ptr; 3726 /* should we undo cc changes and exit recovery? */ 3727 if (IN_RECOVERY(tp->t_flags)) { 3728 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 3729 /* 3730 * Undo what we ratcheted down and exit recovery if 3731 * possible 3732 */ 3733 EXIT_RECOVERY(tp->t_flags); 3734 tp->snd_recover = tp->snd_una; 3735 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 3736 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 3737 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 3738 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 3739 } 3740 } 3741 if (rsm->r_flags & RACK_WAS_SACKPASS) { 3742 /* 3743 * We retransmitted based on a sack and the earlier 3744 * retransmission ack'd it - re-ordering is occurring.
3745 */ 3746 counter_u64_add(rack_reorder_seen, 1); 3747 rack->r_ctl.rc_reorder_ts = cts; 3748 } 3749 counter_u64_add(rack_badfr, 1); 3750 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 3751 } 3752 3753 3754 static int 3755 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 3756 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type) 3757 { 3758 int32_t i; 3759 uint32_t t; 3760 3761 if (rsm->r_flags & RACK_ACKED) 3762 /* Already done */ 3763 return (0); 3764 3765 3766 if ((rsm->r_rtr_cnt == 1) || 3767 ((ack_type == CUM_ACKED) && 3768 (to->to_flags & TOF_TS) && 3769 (to->to_tsecr) && 3770 (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr)) 3771 ) { 3772 /* 3773 * We will only find a matching timestamp if its cum-acked. 3774 * But if its only one retransmission its for-sure matching 3775 * :-) 3776 */ 3777 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3778 if ((int)t <= 0) 3779 t = 1; 3780 if (!tp->t_rttlow || tp->t_rttlow > t) 3781 tp->t_rttlow = t; 3782 if (!rack->r_ctl.rc_rack_min_rtt || 3783 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3784 rack->r_ctl.rc_rack_min_rtt = t; 3785 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3786 rack->r_ctl.rc_rack_min_rtt = 1; 3787 } 3788 } 3789 tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); 3790 if ((rsm->r_flags & RACK_TLP) && 3791 (!IN_RECOVERY(tp->t_flags))) { 3792 /* Segment was a TLP and our retrans matched */ 3793 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 3794 rack->r_ctl.rc_rsm_start = tp->snd_max; 3795 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 3796 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 3797 rack_cong_signal(tp, NULL, CC_NDUPACK); 3798 /* 3799 * When we enter recovery we need to assure 3800 * we send one packet. 3801 */ 3802 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 3803 } else 3804 rack->r_ctl.rc_tlp_rtx_out = 0; 3805 } 3806 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3807 /* New more recent rack_tmit_time */ 3808 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3809 rack->rc_rack_rtt = t; 3810 } 3811 return (1); 3812 } 3813 /* 3814 * We clear the soft/rxtshift since we got an ack. 3815 * There is no assurance we will call the commit() function 3816 * so we need to clear these to avoid incorrect handling. 3817 */ 3818 tp->t_rxtshift = 0; 3819 tp->t_softerror = 0; 3820 if ((to->to_flags & TOF_TS) && 3821 (ack_type == CUM_ACKED) && 3822 (to->to_tsecr) && 3823 ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) { 3824 /* 3825 * Now which timestamp does it match? In this block the ACK 3826 * must be coming from a previous transmission. 3827 */ 3828 for (i = 0; i < rsm->r_rtr_cnt; i++) { 3829 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 3830 t = cts - rsm->r_tim_lastsent[i]; 3831 if ((int)t <= 0) 3832 t = 1; 3833 if ((i + 1) < rsm->r_rtr_cnt) { 3834 /* Likely */ 3835 rack_earlier_retran(tp, rsm, t, cts); 3836 } 3837 if (!tp->t_rttlow || tp->t_rttlow > t) 3838 tp->t_rttlow = t; 3839 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3840 rack->r_ctl.rc_rack_min_rtt = t; 3841 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3842 rack->r_ctl.rc_rack_min_rtt = 1; 3843 } 3844 } 3845 /* 3846 * Note the following calls to 3847 * tcp_rack_xmit_timer() are being commented 3848 * out for now. They give us no more accuracy 3849 * and often lead to a wrong choice. We have 3850 * enough samples that have not been 3851 * retransmitted. 
I leave the commented out 3852 * code in here in case in the future we 3853 * decide to add it back (though I can't forsee 3854 * doing that). That way we will easily see 3855 * where they need to be placed. 3856 */ 3857 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 3858 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 3859 /* New more recent rack_tmit_time */ 3860 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 3861 rack->rc_rack_rtt = t; 3862 } 3863 return (1); 3864 } 3865 } 3866 goto ts_not_found; 3867 } else { 3868 /* 3869 * Ok its a SACK block that we retransmitted. or a windows 3870 * machine without timestamps. We can tell nothing from the 3871 * time-stamp since its not there or the time the peer last 3872 * recieved a segment that moved forward its cum-ack point. 3873 */ 3874 ts_not_found: 3875 i = rsm->r_rtr_cnt - 1; 3876 t = cts - rsm->r_tim_lastsent[i]; 3877 if ((int)t <= 0) 3878 t = 1; 3879 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3880 /* 3881 * We retransmitted and the ack came back in less 3882 * than the smallest rtt we have observed. We most 3883 * likey did an improper retransmit as outlined in 3884 * 4.2 Step 3 point 2 in the rack-draft. 3885 */ 3886 i = rsm->r_rtr_cnt - 2; 3887 t = cts - rsm->r_tim_lastsent[i]; 3888 rack_earlier_retran(tp, rsm, t, cts); 3889 } else if (rack->r_ctl.rc_rack_min_rtt) { 3890 /* 3891 * We retransmitted it and the retransmit did the 3892 * job. 3893 */ 3894 if (!rack->r_ctl.rc_rack_min_rtt || 3895 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 3896 rack->r_ctl.rc_rack_min_rtt = t; 3897 if (rack->r_ctl.rc_rack_min_rtt == 0) { 3898 rack->r_ctl.rc_rack_min_rtt = 1; 3899 } 3900 } 3901 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 3902 /* New more recent rack_tmit_time */ 3903 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 3904 rack->rc_rack_rtt = t; 3905 } 3906 return (1); 3907 } 3908 } 3909 return (0); 3910 } 3911 3912 /* 3913 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 3914 */ 3915 static void 3916 rack_log_sack_passed(struct tcpcb *tp, 3917 struct tcp_rack *rack, struct rack_sendmap *rsm) 3918 { 3919 struct rack_sendmap *nrsm; 3920 uint32_t ts; 3921 int32_t idx; 3922 3923 idx = rsm->r_rtr_cnt - 1; 3924 ts = rsm->r_tim_lastsent[idx]; 3925 nrsm = rsm; 3926 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 3927 rack_head, r_tnext) { 3928 if (nrsm == rsm) { 3929 /* Skip orginal segment he is acked */ 3930 continue; 3931 } 3932 if (nrsm->r_flags & RACK_ACKED) { 3933 /* Skip ack'd segments */ 3934 continue; 3935 } 3936 if (nrsm->r_flags & RACK_SACK_PASSED) { 3937 /* 3938 * We found one that is already marked 3939 * passed, we have been here before and 3940 * so all others below this are marked. 3941 */ 3942 break; 3943 } 3944 idx = nrsm->r_rtr_cnt - 1; 3945 if (ts == nrsm->r_tim_lastsent[idx]) { 3946 /* 3947 * For this case lets use seq no, if we sent in a 3948 * big block (TSO) we would have a bunch of segments 3949 * sent at the same time. 3950 * 3951 * We would only get a report if its SEQ is earlier. 3952 * If we have done multiple retransmits the times 3953 * would not be equal. 3954 */ 3955 if (SEQ_LT(nrsm->r_start, rsm->r_start)) { 3956 nrsm->r_flags |= RACK_SACK_PASSED; 3957 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3958 } 3959 } else { 3960 /* 3961 * Here they were sent at different times, not a big 3962 * block. 
Since we transmitted this one later and 3963 * see it sack'd then this must also be missing (or 3964 * we would have gotten a sack block for it) 3965 */ 3966 nrsm->r_flags |= RACK_SACK_PASSED; 3967 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 3968 } 3969 } 3970 } 3971 3972 static uint32_t 3973 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 3974 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) 3975 { 3976 int32_t idx; 3977 int32_t times = 0; 3978 uint32_t start, end, changed = 0; 3979 struct rack_sendmap *rsm, *nrsm; 3980 int32_t used_ref = 1; 3981 3982 start = sack->start; 3983 end = sack->end; 3984 rsm = *prsm; 3985 if (rsm && SEQ_LT(start, rsm->r_start)) { 3986 TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { 3987 if (SEQ_GEQ(start, rsm->r_start) && 3988 SEQ_LT(start, rsm->r_end)) { 3989 goto do_rest_ofb; 3990 } 3991 } 3992 } 3993 if (rsm == NULL) { 3994 start_at_beginning: 3995 rsm = NULL; 3996 used_ref = 0; 3997 } 3998 /* First lets locate the block where this guy is */ 3999 TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) { 4000 if (SEQ_GEQ(start, rsm->r_start) && 4001 SEQ_LT(start, rsm->r_end)) { 4002 break; 4003 } 4004 } 4005 do_rest_ofb: 4006 if (rsm == NULL) { 4007 /* 4008 * This happens when we get duplicate sack blocks with the 4009 * same end. For example SACK 4: 100 SACK 3: 100 The sort 4010 * will not change there location so we would just start at 4011 * the end of the first one and get lost. 4012 */ 4013 if (tp->t_flags & TF_SENTFIN) { 4014 /* 4015 * Check to see if we have not logged the FIN that 4016 * went out. 4017 */ 4018 nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4019 if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { 4020 /* 4021 * Ok we did not get the FIN logged. 4022 */ 4023 nrsm->r_end++; 4024 rsm = nrsm; 4025 goto do_rest_ofb; 4026 } 4027 } 4028 if (times == 1) { 4029 #ifdef INVARIANTS 4030 panic("tp:%p rack:%p sack:%p to:%p prsm:%p", 4031 tp, rack, sack, to, prsm); 4032 #else 4033 goto out; 4034 #endif 4035 } 4036 times++; 4037 counter_u64_add(rack_sack_proc_restart, 1); 4038 goto start_at_beginning; 4039 } 4040 /* Ok we have an ACK for some piece of rsm */ 4041 if (rsm->r_start != start) { 4042 /* 4043 * Need to split this in two pieces the before and after. 4044 */ 4045 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 4046 if (nrsm == NULL) { 4047 /* 4048 * failed XXXrrs what can we do but loose the sack 4049 * info? 4050 */ 4051 goto out; 4052 } 4053 nrsm->r_start = start; 4054 nrsm->r_rtr_bytes = 0; 4055 nrsm->r_end = rsm->r_end; 4056 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4057 nrsm->r_flags = rsm->r_flags; 4058 nrsm->r_sndcnt = rsm->r_sndcnt; 4059 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4060 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4061 } 4062 rsm->r_end = nrsm->r_start; 4063 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4064 if (rsm->r_in_tmap) { 4065 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4066 nrsm->r_in_tmap = 1; 4067 } 4068 rsm->r_flags &= (~RACK_HAS_FIN); 4069 rsm = nrsm; 4070 } 4071 if (SEQ_GEQ(end, rsm->r_end)) { 4072 /* 4073 * The end of this block is either beyond this guy or right 4074 * at this guy. 4075 */ 4076 4077 if ((rsm->r_flags & RACK_ACKED) == 0) { 4078 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4079 changed += (rsm->r_end - rsm->r_start); 4080 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4081 rack_log_sack_passed(tp, rack, rsm); 4082 /* Is Reordering occuring? 
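 * A SACK for a segment we sent later already passed over this rsm
 * (RACK_SACK_PASSED); now that this one is reported as well, delivery
 * happened out of send order, so note when we last saw reordering.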
*/ 4083 if (rsm->r_flags & RACK_SACK_PASSED) { 4084 counter_u64_add(rack_reorder_seen, 1); 4085 rack->r_ctl.rc_reorder_ts = cts; 4086 } 4087 rsm->r_flags |= RACK_ACKED; 4088 rsm->r_flags &= ~RACK_TLP; 4089 if (rsm->r_in_tmap) { 4090 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4091 rsm->r_in_tmap = 0; 4092 } 4093 } 4094 if (end == rsm->r_end) { 4095 /* This block only - done */ 4096 goto out; 4097 } 4098 /* There is more not covered by this rsm, move on */ 4099 start = rsm->r_end; 4100 nrsm = TAILQ_NEXT(rsm, r_next); 4101 rsm = nrsm; 4102 times = 0; 4103 goto do_rest_ofb; 4104 } 4105 /* Ok we need to split off this one at the tail */ 4106 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 4107 if (nrsm == NULL) { 4108 /* failed rrs what can we do but lose the sack info? */ 4109 goto out; 4110 } 4111 /* Clone it */ 4112 nrsm->r_start = end; 4113 nrsm->r_end = rsm->r_end; 4114 nrsm->r_rtr_bytes = 0; 4115 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 4116 nrsm->r_flags = rsm->r_flags; 4117 nrsm->r_sndcnt = rsm->r_sndcnt; 4118 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 4119 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 4120 } 4121 /* The sack block does not cover this guy fully */ 4122 rsm->r_flags &= (~RACK_HAS_FIN); 4123 rsm->r_end = end; 4124 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); 4125 if (rsm->r_in_tmap) { 4126 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 4127 nrsm->r_in_tmap = 1; 4128 } 4129 if (rsm->r_flags & RACK_ACKED) { 4130 /* Been here done that */ 4131 goto out; 4132 } 4133 rack_update_rtt(tp, rack, rsm, to, cts, SACKED); 4134 changed += (rsm->r_end - rsm->r_start); 4135 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 4136 rack_log_sack_passed(tp, rack, rsm); 4137 /* Is Reordering occurring? */ 4138 if (rsm->r_flags & RACK_SACK_PASSED) { 4139 counter_u64_add(rack_reorder_seen, 1); 4140 rack->r_ctl.rc_reorder_ts = cts; 4141 } 4142 rsm->r_flags |= RACK_ACKED; 4143 rsm->r_flags &= ~RACK_TLP; 4144 if (rsm->r_in_tmap) { 4145 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4146 rsm->r_in_tmap = 0; 4147 } 4148 out: 4149 if (rsm && (rsm->r_flags & RACK_ACKED)) { 4150 /* 4151 * Now can we merge this newly acked 4152 * block with either the previous or 4153 * next block? 4154 */ 4155 nrsm = TAILQ_NEXT(rsm, r_next); 4156 if (nrsm && 4157 (nrsm->r_flags & RACK_ACKED)) { 4158 /* yep this and next can be merged */ 4159 rsm = rack_merge_rsm(rack, rsm, nrsm); 4160 } 4161 /* Now what about the previous?
*/ 4162 nrsm = TAILQ_PREV(rsm, rack_head, r_next); 4163 if (nrsm && 4164 (nrsm->r_flags & RACK_ACKED)) { 4165 /* yep the previous and this can be merged */ 4166 rsm = rack_merge_rsm(rack, nrsm, rsm); 4167 } 4168 } 4169 if (used_ref == 0) { 4170 counter_u64_add(rack_sack_proc_all, 1); 4171 } else { 4172 counter_u64_add(rack_sack_proc_short, 1); 4173 } 4174 /* Save off where we last were */ 4175 if (rsm) 4176 rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); 4177 else 4178 rack->r_ctl.rc_sacklast = NULL; 4179 *prsm = rsm; 4180 return (changed); 4181 } 4182 4183 static void inline 4184 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 4185 { 4186 struct rack_sendmap *tmap; 4187 4188 tmap = NULL; 4189 while (rsm && (rsm->r_flags & RACK_ACKED)) { 4190 /* Its no longer sacked, mark it so */ 4191 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4192 #ifdef INVARIANTS 4193 if (rsm->r_in_tmap) { 4194 panic("rack:%p rsm:%p flags:0x%x in tmap?", 4195 rack, rsm, rsm->r_flags); 4196 } 4197 #endif 4198 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 4199 /* Rebuild it into our tmap */ 4200 if (tmap == NULL) { 4201 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4202 tmap = rsm; 4203 } else { 4204 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 4205 tmap = rsm; 4206 } 4207 tmap->r_in_tmap = 1; 4208 rsm = TAILQ_NEXT(rsm, r_next); 4209 } 4210 /* 4211 * Now lets possibly clear the sack filter so we start 4212 * recognizing sacks that cover this area. 4213 */ 4214 if (rack_use_sack_filter) 4215 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 4216 4217 } 4218 4219 static void 4220 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 4221 { 4222 uint32_t changed, last_seq, entered_recovery = 0; 4223 struct tcp_rack *rack; 4224 struct rack_sendmap *rsm; 4225 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 4226 register uint32_t th_ack; 4227 int32_t i, j, k, num_sack_blks = 0; 4228 uint32_t cts, acked, ack_point, sack_changed = 0; 4229 4230 INP_WLOCK_ASSERT(tp->t_inpcb); 4231 if (th->th_flags & TH_RST) { 4232 /* We don't log resets */ 4233 return; 4234 } 4235 rack = (struct tcp_rack *)tp->t_fb_ptr; 4236 cts = tcp_ts_getticks(); 4237 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4238 changed = 0; 4239 th_ack = th->th_ack; 4240 4241 if (SEQ_GT(th_ack, tp->snd_una)) { 4242 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 4243 tp->t_acktime = ticks; 4244 } 4245 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 4246 changed = th_ack - rsm->r_start; 4247 if (changed) { 4248 /* 4249 * The ACK point is advancing to th_ack, we must drop off 4250 * the packets in the rack log and calculate any eligble 4251 * RTT's. 4252 */ 4253 rack->r_wanted_output++; 4254 more: 4255 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4256 if (rsm == NULL) { 4257 if ((th_ack - 1) == tp->iss) { 4258 /* 4259 * For the SYN incoming case we will not 4260 * have called tcp_output for the sending of 4261 * the SYN, so there will be no map. All 4262 * other cases should probably be a panic. 
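 * Seeing th_ack - 1 == tp->iss therefore means the peer is acking
 * nothing beyond our SYN, which never went through this output path,
 * so an empty map is expected and we just move on to SACK processing.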
4263 */ 4264 goto proc_sack; 4265 } 4266 if (tp->t_flags & TF_SENTFIN) { 4267 /* if we send a FIN we will not hav a map */ 4268 goto proc_sack; 4269 } 4270 #ifdef INVARIANTS 4271 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 4272 tp, 4273 th, tp->t_state, rack, 4274 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 4275 #endif 4276 goto proc_sack; 4277 } 4278 if (SEQ_LT(th_ack, rsm->r_start)) { 4279 /* Huh map is missing this */ 4280 #ifdef INVARIANTS 4281 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 4282 rsm->r_start, 4283 th_ack, tp->t_state, rack->r_state); 4284 #endif 4285 goto proc_sack; 4286 } 4287 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED); 4288 /* Now do we consume the whole thing? */ 4289 if (SEQ_GEQ(th_ack, rsm->r_end)) { 4290 /* Its all consumed. */ 4291 uint32_t left; 4292 4293 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4294 rsm->r_rtr_bytes = 0; 4295 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 4296 if (rsm->r_in_tmap) { 4297 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 4298 rsm->r_in_tmap = 0; 4299 } 4300 if (rack->r_ctl.rc_next == rsm) { 4301 /* scoot along the marker */ 4302 rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map); 4303 } 4304 if (rsm->r_flags & RACK_ACKED) { 4305 /* 4306 * It was acked on the scoreboard -- remove 4307 * it from total 4308 */ 4309 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 4310 } else if (rsm->r_flags & RACK_SACK_PASSED) { 4311 /* 4312 * There are acked segments ACKED on the 4313 * scoreboard further up. We are seeing 4314 * reordering. 4315 */ 4316 counter_u64_add(rack_reorder_seen, 1); 4317 rsm->r_flags |= RACK_ACKED; 4318 rack->r_ctl.rc_reorder_ts = cts; 4319 } 4320 left = th_ack - rsm->r_end; 4321 if (rsm->r_rtr_cnt > 1) { 4322 /* 4323 * Technically we should make r_rtr_cnt be 4324 * monotonicly increasing and just mod it to 4325 * the timestamp it is replacing.. that way 4326 * we would have the last 3 retransmits. Now 4327 * rc_loss_count will be wrong if we 4328 * retransmit something more than 2 times in 4329 * recovery :( 4330 */ 4331 rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1); 4332 } 4333 /* Free back to zone */ 4334 rack_free(rack, rsm); 4335 if (left) { 4336 goto more; 4337 } 4338 goto proc_sack; 4339 } 4340 if (rsm->r_flags & RACK_ACKED) { 4341 /* 4342 * It was acked on the scoreboard -- remove it from 4343 * total for the part being cum-acked. 4344 */ 4345 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 4346 } 4347 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 4348 rsm->r_rtr_bytes = 0; 4349 rsm->r_start = th_ack; 4350 } 4351 proc_sack: 4352 /* Check for reneging */ 4353 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 4354 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 4355 /* 4356 * The peer has moved snd_una up to 4357 * the edge of this send, i.e. one 4358 * that it had previously acked. The only 4359 * way that can be true if the peer threw 4360 * away data (space issues) that it had 4361 * previously sacked (else it would have 4362 * given us snd_una up to (rsm->r_end). 4363 * We need to undo the acked markings here. 4364 * 4365 * Note we have to look to make sure th_ack is 4366 * our rsm->r_start in case we get an old ack 4367 * where th_ack is behind snd_una. 
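 * For illustration: if snd_una is 100 and the first map entry covers
 * 100-200 and is marked RACK_ACKED from an earlier SACK, a cumulative
 * ack of 100 (rather than 200 or beyond) tells us the peer threw that
 * sacked data away; the entry must go back to being un-acked and be
 * rebuilt into the transmit map.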
4368 */ 4369 rack_peer_reneges(rack, rsm, th->th_ack); 4370 } 4371 if ((to->to_flags & TOF_SACK) == 0) { 4372 /* We are done nothing left to log */ 4373 goto out; 4374 } 4375 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); 4376 if (rsm) { 4377 last_seq = rsm->r_end; 4378 } else { 4379 last_seq = tp->snd_max; 4380 } 4381 /* Sack block processing */ 4382 if (SEQ_GT(th_ack, tp->snd_una)) 4383 ack_point = th_ack; 4384 else 4385 ack_point = tp->snd_una; 4386 for (i = 0; i < to->to_nsacks; i++) { 4387 bcopy((to->to_sacks + i * TCPOLEN_SACK), 4388 &sack, sizeof(sack)); 4389 sack.start = ntohl(sack.start); 4390 sack.end = ntohl(sack.end); 4391 if (SEQ_GT(sack.end, sack.start) && 4392 SEQ_GT(sack.start, ack_point) && 4393 SEQ_LT(sack.start, tp->snd_max) && 4394 SEQ_GT(sack.end, ack_point) && 4395 SEQ_LEQ(sack.end, tp->snd_max)) { 4396 if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) && 4397 (SEQ_LT(sack.end, last_seq)) && 4398 ((sack.end - sack.start) < (tp->t_maxseg / 8))) { 4399 /* 4400 * Not the last piece and its smaller than 4401 * 1/8th of a MSS. We ignore this. 4402 */ 4403 counter_u64_add(rack_runt_sacks, 1); 4404 continue; 4405 } 4406 sack_blocks[num_sack_blks] = sack; 4407 num_sack_blks++; 4408 } else if (SEQ_LEQ(sack.start, th_ack) && 4409 SEQ_LEQ(sack.end, th_ack)) { 4410 /* 4411 * Its a D-SACK block. 4412 */ 4413 /* tcp_record_dsack(sack.start, sack.end); */ 4414 } 4415 } 4416 if (num_sack_blks == 0) 4417 goto out; 4418 /* 4419 * Sort the SACK blocks so we can update the rack scoreboard with 4420 * just one pass. 4421 */ 4422 if (rack_use_sack_filter) { 4423 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 4424 num_sack_blks, th->th_ack); 4425 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 4426 } 4427 if (num_sack_blks < 2) { 4428 goto do_sack_work; 4429 } 4430 /* Sort the sacks */ 4431 for (i = 0; i < num_sack_blks; i++) { 4432 for (j = i + 1; j < num_sack_blks; j++) { 4433 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 4434 sack = sack_blocks[i]; 4435 sack_blocks[i] = sack_blocks[j]; 4436 sack_blocks[j] = sack; 4437 } 4438 } 4439 } 4440 /* 4441 * Now are any of the sack block ends the same (yes some 4442 * implememtations send these)? 4443 */ 4444 again: 4445 if (num_sack_blks > 1) { 4446 for (i = 0; i < num_sack_blks; i++) { 4447 for (j = i + 1; j < num_sack_blks; j++) { 4448 if (sack_blocks[i].end == sack_blocks[j].end) { 4449 /* 4450 * Ok these two have the same end we 4451 * want the smallest end and then 4452 * throw away the larger and start 4453 * again. 
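 * For example, blocks 200-300 and 100-300 collapse into a single
 * 100-300 entry: the smaller start is kept, the duplicate is squeezed
 * out of the array, num_sack_blks drops by one and we rescan.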
4454 */ 4455 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 4456 /* 4457 * The second block covers 4458 * more area use that 4459 */ 4460 sack_blocks[i].start = sack_blocks[j].start; 4461 } 4462 /* 4463 * Now collapse out the dup-sack and 4464 * lower the count 4465 */ 4466 for (k = (j + 1); k < num_sack_blks; k++) { 4467 sack_blocks[j].start = sack_blocks[k].start; 4468 sack_blocks[j].end = sack_blocks[k].end; 4469 j++; 4470 } 4471 num_sack_blks--; 4472 goto again; 4473 } 4474 } 4475 } 4476 } 4477 do_sack_work: 4478 rsm = rack->r_ctl.rc_sacklast; 4479 for (i = 0; i < num_sack_blks; i++) { 4480 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts); 4481 if (acked) { 4482 rack->r_wanted_output++; 4483 changed += acked; 4484 sack_changed += acked; 4485 } 4486 } 4487 out: 4488 if (changed) { 4489 /* Something changed cancel the rack timer */ 4490 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4491 } 4492 if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { 4493 /* 4494 * Ok we have a high probability that we need to go in to 4495 * recovery since we have data sack'd 4496 */ 4497 struct rack_sendmap *rsm; 4498 uint32_t tsused; 4499 4500 tsused = tcp_ts_getticks(); 4501 rsm = tcp_rack_output(tp, rack, tsused); 4502 if (rsm) { 4503 /* Enter recovery */ 4504 rack->r_ctl.rc_rsm_start = rsm->r_start; 4505 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4506 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4507 entered_recovery = 1; 4508 rack_cong_signal(tp, NULL, CC_NDUPACK); 4509 /* 4510 * When we enter recovery we need to assure we send 4511 * one packet. 4512 */ 4513 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 4514 rack->r_timer_override = 1; 4515 } 4516 } 4517 if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { 4518 /* Deal with changed an PRR here (in recovery only) */ 4519 uint32_t pipe, snd_una; 4520 4521 rack->r_ctl.rc_prr_delivered += changed; 4522 /* Compute prr_sndcnt */ 4523 if (SEQ_GT(tp->snd_una, th_ack)) { 4524 snd_una = tp->snd_una; 4525 } else { 4526 snd_una = th_ack; 4527 } 4528 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 4529 if (pipe > tp->snd_ssthresh) { 4530 long sndcnt; 4531 4532 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 4533 if (rack->r_ctl.rc_prr_recovery_fs > 0) 4534 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 4535 else { 4536 rack->r_ctl.rc_prr_sndcnt = 0; 4537 sndcnt = 0; 4538 } 4539 sndcnt++; 4540 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 4541 sndcnt -= rack->r_ctl.rc_prr_out; 4542 else 4543 sndcnt = 0; 4544 rack->r_ctl.rc_prr_sndcnt = sndcnt; 4545 } else { 4546 uint32_t limit; 4547 4548 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 4549 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 4550 else 4551 limit = 0; 4552 if (changed > limit) 4553 limit = changed; 4554 limit += tp->t_maxseg; 4555 if (tp->snd_ssthresh > pipe) { 4556 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 4557 } else { 4558 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 4559 } 4560 } 4561 if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { 4562 rack->r_timer_override = 1; 4563 } 4564 } 4565 } 4566 4567 /* 4568 * Return value of 1, we do not need to call rack_process_data(). 4569 * return value of 0, rack_process_data can be called. 4570 * For ret_val if its 0 the TCP is locked, if its non-zero 4571 * its unlocked and probably unsafe to touch the TCB. 
4572 */ 4573 static int 4574 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 4575 struct tcpcb *tp, struct tcpopt *to, 4576 uint32_t tiwin, int32_t tlen, 4577 int32_t * ofia, int32_t thflags, int32_t * ret_val) 4578 { 4579 int32_t ourfinisacked = 0; 4580 int32_t nsegs, acked_amount; 4581 int32_t acked; 4582 struct mbuf *mfree; 4583 struct tcp_rack *rack; 4584 int32_t recovery = 0; 4585 4586 rack = (struct tcp_rack *)tp->t_fb_ptr; 4587 if (SEQ_GT(th->th_ack, tp->snd_max)) { 4588 rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 4589 return (1); 4590 } 4591 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 4592 rack_log_ack(tp, to, th); 4593 } 4594 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 4595 /* 4596 * Old ack, behind (or duplicate to) the last one rcv'd 4597 * Note: Should mark reordering is occuring! We should also 4598 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 4599 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 4600 * retran and> ack 3 4601 */ 4602 return (0); 4603 } 4604 /* 4605 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 4606 * something we sent. 4607 */ 4608 if (tp->t_flags & TF_NEEDSYN) { 4609 /* 4610 * T/TCP: Connection was half-synchronized, and our SYN has 4611 * been ACK'd (so connection is now fully synchronized). Go 4612 * to non-starred state, increment snd_una for ACK of SYN, 4613 * and check if we can do window scaling. 4614 */ 4615 tp->t_flags &= ~TF_NEEDSYN; 4616 tp->snd_una++; 4617 /* Do window scaling? */ 4618 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 4619 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 4620 tp->rcv_scale = tp->request_r_scale; 4621 /* Send window already scaled. */ 4622 } 4623 } 4624 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4625 INP_WLOCK_ASSERT(tp->t_inpcb); 4626 4627 acked = BYTES_THIS_ACK(tp, th); 4628 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 4629 TCPSTAT_ADD(tcps_rcvackbyte, acked); 4630 4631 /* 4632 * If we just performed our first retransmit, and the ACK arrives 4633 * within our recovery window, then it was a mistake to do the 4634 * retransmit in the first place. Recover our original cwnd and 4635 * ssthresh, and proceed to transmit where we left off. 4636 */ 4637 if (tp->t_flags & TF_PREVVALID) { 4638 tp->t_flags &= ~TF_PREVVALID; 4639 if (tp->t_rxtshift == 1 && 4640 (int)(ticks - tp->t_badrxtwin) < 0) 4641 rack_cong_signal(tp, th, CC_RTO_ERR); 4642 } 4643 /* 4644 * If we have a timestamp reply, update smoothed round trip time. If 4645 * no timestamp is present but transmit timer is running and timed 4646 * sequence number was acked, update smoothed round trip time. Since 4647 * we now have an rtt measurement, cancel the timer backoff (cf., 4648 * Phil Karn's retransmit alg.). Recompute the initial retransmit 4649 * timer. 4650 * 4651 * Some boxes send broken timestamp replies during the SYN+ACK 4652 * phase, ignore timestamps of 0 or we could calculate a huge RTT 4653 * and blow up the retransmit timer. 4654 */ 4655 /* 4656 * If all outstanding data is acked, stop retransmit timer and 4657 * remember to restart (more output or persist). If there is more 4658 * data to be acked, restart retransmit timer, using current 4659 * (possibly backed-off) value. 4660 */ 4661 if (th->th_ack == tp->snd_max) { 4662 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4663 rack->r_wanted_output++; 4664 } 4665 /* 4666 * If no data (only SYN) was ACK'd, skip rest of ACK processing. 
4667 */ 4668 if (acked == 0) { 4669 if (ofia) 4670 *ofia = ourfinisacked; 4671 return (0); 4672 } 4673 if (rack->r_ctl.rc_early_recovery) { 4674 if (IN_RECOVERY(tp->t_flags)) { 4675 if (SEQ_LT(th->th_ack, tp->snd_recover) && 4676 (SEQ_LT(th->th_ack, tp->snd_max))) { 4677 tcp_rack_partialack(tp, th); 4678 } else { 4679 rack_post_recovery(tp, th); 4680 recovery = 1; 4681 } 4682 } 4683 } 4684 /* 4685 * Let the congestion control algorithm update congestion control 4686 * related information. This typically means increasing the 4687 * congestion window. 4688 */ 4689 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 4690 SOCKBUF_LOCK(&so->so_snd); 4691 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 4692 tp->snd_wnd -= acked_amount; 4693 mfree = sbcut_locked(&so->so_snd, acked_amount); 4694 if ((sbused(&so->so_snd) == 0) && 4695 (acked > acked_amount) && 4696 (tp->t_state >= TCPS_FIN_WAIT_1)) { 4697 ourfinisacked = 1; 4698 } 4699 /* NB: sowwakeup_locked() does an implicit unlock. */ 4700 sowwakeup_locked(so); 4701 m_freem(mfree); 4702 if (rack->r_ctl.rc_early_recovery == 0) { 4703 if (IN_RECOVERY(tp->t_flags)) { 4704 if (SEQ_LT(th->th_ack, tp->snd_recover) && 4705 (SEQ_LT(th->th_ack, tp->snd_max))) { 4706 tcp_rack_partialack(tp, th); 4707 } else { 4708 rack_post_recovery(tp, th); 4709 } 4710 } 4711 } 4712 tp->snd_una = th->th_ack; 4713 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 4714 tp->snd_recover = tp->snd_una; 4715 4716 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 4717 tp->snd_nxt = tp->snd_una; 4718 } 4719 if (tp->snd_una == tp->snd_max) { 4720 /* Nothing left outstanding */ 4721 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 4722 tp->t_acktime = 0; 4723 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4724 /* Set need output so persist might get set */ 4725 rack->r_wanted_output++; 4726 if (rack_use_sack_filter) 4727 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 4728 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 4729 (sbavail(&so->so_snd) == 0) && 4730 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 4731 /* 4732 * The socket was gone and the 4733 * peer sent data, time to 4734 * reset him. 4735 */ 4736 *ret_val = 1; 4737 tp = tcp_close(tp); 4738 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 4739 return (1); 4740 } 4741 } 4742 if (ofia) 4743 *ofia = ourfinisacked; 4744 return (0); 4745 } 4746 4747 4748 /* 4749 * Return value of 1, the TCB is unlocked and most 4750 * likely gone, return value of 0, the TCP is still 4751 * locked. 4752 */ 4753 static int 4754 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 4755 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 4756 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 4757 { 4758 /* 4759 * Update window information. Don't look at window if no ACK: TAC's 4760 * send garbage on first SYN. 
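 * A window update is accepted when the segment is newer by sequence
 * (snd_wl1), or carries the same sequence but a newer ack (snd_wl2),
 * or repeats both yet advertises a larger window.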
4761 */ 4762 int32_t nsegs; 4763 #ifdef TCP_RFC7413 4764 int32_t tfo_syn; 4765 #else 4766 #define tfo_syn (FALSE) 4767 #endif 4768 struct tcp_rack *rack; 4769 4770 rack = (struct tcp_rack *)tp->t_fb_ptr; 4771 INP_WLOCK_ASSERT(tp->t_inpcb); 4772 nsegs = max(1, m->m_pkthdr.lro_nsegs); 4773 if ((thflags & TH_ACK) && 4774 (SEQ_LT(tp->snd_wl1, th->th_seq) || 4775 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 4776 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 4777 /* keep track of pure window updates */ 4778 if (tlen == 0 && 4779 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 4780 TCPSTAT_INC(tcps_rcvwinupd); 4781 tp->snd_wnd = tiwin; 4782 tp->snd_wl1 = th->th_seq; 4783 tp->snd_wl2 = th->th_ack; 4784 if (tp->snd_wnd > tp->max_sndwnd) 4785 tp->max_sndwnd = tp->snd_wnd; 4786 rack->r_wanted_output++; 4787 } else if (thflags & TH_ACK) { 4788 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 4789 tp->snd_wnd = tiwin; 4790 tp->snd_wl1 = th->th_seq; 4791 tp->snd_wl2 = th->th_ack; 4792 } 4793 } 4794 /* Was persist timer active and now we have window space? */ 4795 if ((rack->rc_in_persist != 0) && tp->snd_wnd) { 4796 rack_exit_persist(tp, rack); 4797 tp->snd_nxt = tp->snd_max; 4798 /* Make sure we output to start the timer */ 4799 rack->r_wanted_output++; 4800 } 4801 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 4802 m_freem(m); 4803 return (0); 4804 } 4805 /* 4806 * Process segments with URG. 4807 */ 4808 if ((thflags & TH_URG) && th->th_urp && 4809 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4810 /* 4811 * This is a kludge, but if we receive and accept random 4812 * urgent pointers, we'll crash in soreceive. It's hard to 4813 * imagine someone actually wanting to send this much urgent 4814 * data. 4815 */ 4816 SOCKBUF_LOCK(&so->so_rcv); 4817 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 4818 th->th_urp = 0; /* XXX */ 4819 thflags &= ~TH_URG; /* XXX */ 4820 SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ 4821 goto dodata; /* XXX */ 4822 } 4823 /* 4824 * If this segment advances the known urgent pointer, then 4825 * mark the data stream. This should not happen in 4826 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a 4827 * FIN has been received from the remote side. In these 4828 * states we ignore the URG. 4829 * 4830 * According to RFC961 (Assigned Protocols), the urgent 4831 * pointer points to the last octet of urgent data. We 4832 * continue, however, to consider it to indicate the first 4833 * octet of data past the urgent section as the original 4834 * spec states (in one of two places). 4835 */ 4836 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { 4837 tp->rcv_up = th->th_seq + th->th_urp; 4838 so->so_oobmark = sbavail(&so->so_rcv) + 4839 (tp->rcv_up - tp->rcv_nxt) - 1; 4840 if (so->so_oobmark == 0) 4841 so->so_rcv.sb_state |= SBS_RCVATMARK; 4842 sohasoutofband(so); 4843 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 4844 } 4845 SOCKBUF_UNLOCK(&so->so_rcv); 4846 /* 4847 * Remove out of band data so doesn't get presented to user. 4848 * This can happen independent of advancing the URG pointer, 4849 * but if two URG's are pending at once, some out-of-band 4850 * data may creep in... ick. 4851 */ 4852 if (th->th_urp <= (uint32_t) tlen && 4853 !(so->so_options & SO_OOBINLINE)) { 4854 /* hdr drop is delayed */ 4855 tcp_pulloutofband(so, th, m, drop_hdrlen); 4856 } 4857 } else { 4858 /* 4859 * If no out of band data is expected, pull receive urgent 4860 * pointer along with the receive window. 
4861 */ 4862 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 4863 tp->rcv_up = tp->rcv_nxt; 4864 } 4865 dodata: /* XXX */ 4866 INP_WLOCK_ASSERT(tp->t_inpcb); 4867 4868 /* 4869 * Process the segment text, merging it into the TCP sequencing 4870 * queue, and arranging for acknowledgment of receipt if necessary. 4871 * This process logically involves adjusting tp->rcv_wnd as data is 4872 * presented to the user (this happens in tcp_usrreq.c, case 4873 * PRU_RCVD). If a FIN has already been received on this connection 4874 * then we just ignore the text. 4875 */ 4876 #ifdef TCP_RFC7413 4877 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 4878 (tp->t_flags & TF_FASTOPEN)); 4879 #endif 4880 if ((tlen || (thflags & TH_FIN) || tfo_syn) && 4881 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4882 tcp_seq save_start = th->th_seq; 4883 tcp_seq save_rnxt = tp->rcv_nxt; 4884 int save_tlen = tlen; 4885 4886 m_adj(m, drop_hdrlen); /* delayed header drop */ 4887 /* 4888 * Insert segment which includes th into TCP reassembly 4889 * queue with control block tp. Set thflags to whether 4890 * reassembly now includes a segment with FIN. This handles 4891 * the common case inline (segment is the next to be 4892 * received on an established connection, and the queue is 4893 * empty), avoiding linkage into and removal from the queue 4894 * and repetition of various conversions. Set DELACK for 4895 * segments received in order, but ack immediately when 4896 * segments are out of order (so fast retransmit can work). 4897 */ 4898 if (th->th_seq == tp->rcv_nxt && 4899 SEGQ_EMPTY(tp) && 4900 (TCPS_HAVEESTABLISHED(tp->t_state) || 4901 tfo_syn)) { 4902 if (DELAY_ACK(tp, tlen) || tfo_syn) { 4903 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4904 tp->t_flags |= TF_DELACK; 4905 } else { 4906 rack->r_wanted_output++; 4907 tp->t_flags |= TF_ACKNOW; 4908 } 4909 tp->rcv_nxt += tlen; 4910 thflags = th->th_flags & TH_FIN; 4911 TCPSTAT_ADD(tcps_rcvpack, nsegs); 4912 TCPSTAT_ADD(tcps_rcvbyte, tlen); 4913 SOCKBUF_LOCK(&so->so_rcv); 4914 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 4915 m_freem(m); 4916 else 4917 sbappendstream_locked(&so->so_rcv, m, 0); 4918 /* NB: sorwakeup_locked() does an implicit unlock. */ 4919 sorwakeup_locked(so); 4920 } else { 4921 /* 4922 * XXX: Due to the header drop above "th" is 4923 * theoretically invalid by now. Fortunately 4924 * m_adj() doesn't actually frees any mbufs when 4925 * trimming from the head. 4926 */ 4927 tcp_seq temp = save_start; 4928 thflags = tcp_reass(tp, th, &temp, &tlen, m); 4929 tp->t_flags |= TF_ACKNOW; 4930 } 4931 if (((tlen == 0) && (save_tlen > 0) && 4932 (SEQ_LT(save_start, save_rnxt)))) { 4933 /* 4934 * DSACK actually handled in the fastpath 4935 * above. 4936 */ 4937 tcp_update_sack_list(tp, save_start, save_start + save_tlen); 4938 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 4939 /* 4940 * Cleaning sackblks by using zero length 4941 * update. 4942 */ 4943 tcp_update_sack_list(tp, save_start, save_start); 4944 } else if ((tlen > 0) && (tlen >= save_tlen)) { 4945 /* Update of sackblks. */ 4946 tcp_update_sack_list(tp, save_start, save_start + save_tlen); 4947 } else if (tlen > 0) { 4948 tcp_update_sack_list(tp, save_start, save_start+tlen); 4949 } 4950 } else { 4951 m_freem(m); 4952 thflags &= ~TH_FIN; 4953 } 4954 4955 /* 4956 * If FIN is received ACK the FIN and let the user know that the 4957 * connection is closing. 
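 * Depending on the state this also moves the connection along:
 * SYN_RECEIVED and ESTABLISHED enter CLOSE_WAIT, FIN_WAIT_1 (our FIN
 * not yet acked) enters CLOSING, and FIN_WAIT_2 hands off to
 * tcp_twstart() for time-wait handling.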
4958 */ 4959 if (thflags & TH_FIN) { 4960 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 4961 socantrcvmore(so); 4962 /* 4963 * If connection is half-synchronized (ie NEEDSYN 4964 * flag on) then delay ACK, so it may be piggybacked 4965 * when SYN is sent. Otherwise, since we received a 4966 * FIN then no more input can be expected, send ACK 4967 * now. 4968 */ 4969 if (tp->t_flags & TF_NEEDSYN) { 4970 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4971 tp->t_flags |= TF_DELACK; 4972 } else { 4973 tp->t_flags |= TF_ACKNOW; 4974 } 4975 tp->rcv_nxt++; 4976 } 4977 switch (tp->t_state) { 4978 4979 /* 4980 * In SYN_RECEIVED and ESTABLISHED STATES enter the 4981 * CLOSE_WAIT state. 4982 */ 4983 case TCPS_SYN_RECEIVED: 4984 tp->t_starttime = ticks; 4985 /* FALLTHROUGH */ 4986 case TCPS_ESTABLISHED: 4987 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4988 tcp_state_change(tp, TCPS_CLOSE_WAIT); 4989 break; 4990 4991 /* 4992 * If still in FIN_WAIT_1 STATE FIN has not been 4993 * acked so enter the CLOSING state. 4994 */ 4995 case TCPS_FIN_WAIT_1: 4996 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 4997 tcp_state_change(tp, TCPS_CLOSING); 4998 break; 4999 5000 /* 5001 * In FIN_WAIT_2 state enter the TIME_WAIT state, 5002 * starting the time-wait timer, turning off the 5003 * other standard timers. 5004 */ 5005 case TCPS_FIN_WAIT_2: 5006 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5007 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5008 tcp_twstart(tp); 5009 return (1); 5010 } 5011 } 5012 /* 5013 * Return any desired output. 5014 */ 5015 if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 5016 rack->r_wanted_output++; 5017 } 5018 INP_WLOCK_ASSERT(tp->t_inpcb); 5019 return (0); 5020 } 5021 5022 /* 5023 * Here nothing is really faster, its just that we 5024 * have broken out the fast-data path also just like 5025 * the fast-ack. 5026 */ 5027 static int 5028 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 5029 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5030 uint32_t tiwin, int32_t nxt_pkt) 5031 { 5032 int32_t nsegs; 5033 int32_t newsize = 0; /* automatic sockbuf scaling */ 5034 struct tcp_rack *rack; 5035 #ifdef TCPDEBUG 5036 /* 5037 * The size of tcp_saveipgen must be the size of the max ip header, 5038 * now IPv6. 5039 */ 5040 u_char tcp_saveipgen[IP6_HDR_LEN]; 5041 struct tcphdr tcp_savetcp; 5042 short ostate = 0; 5043 5044 #endif 5045 /* 5046 * If last ACK falls within this segment's sequence numbers, record 5047 * the timestamp. NOTE that the test is modified according to the 5048 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
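 * Before that, everything below bails back to the slow path unless
 * this is a pure, in-order data segment: the next expected sequence,
 * nothing being retransmitted, an unchanged advertised window, no
 * deferred SYN/FIN, a timestamp no older than ts_recent, an ack of
 * exactly snd_una, and room for the payload in the receive buffer.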
5049 */ 5050 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 5051 return (0); 5052 } 5053 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5054 return (0); 5055 } 5056 if (tiwin && tiwin != tp->snd_wnd) { 5057 return (0); 5058 } 5059 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 5060 return (0); 5061 } 5062 if (__predict_false((to->to_flags & TOF_TS) && 5063 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 5064 return (0); 5065 } 5066 if (__predict_false((th->th_ack != tp->snd_una))) { 5067 return (0); 5068 } 5069 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 5070 return (0); 5071 } 5072 if ((to->to_flags & TOF_TS) != 0 && 5073 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5074 tp->ts_recent_age = tcp_ts_getticks(); 5075 tp->ts_recent = to->to_tsval; 5076 } 5077 rack = (struct tcp_rack *)tp->t_fb_ptr; 5078 /* 5079 * This is a pure, in-sequence data packet with nothing on the 5080 * reassembly queue and we have enough buffer space to take it. 5081 */ 5082 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5083 5084 5085 /* Clean receiver SACK report if present */ 5086 if (tp->rcv_numsacks) 5087 tcp_clean_sackreport(tp); 5088 TCPSTAT_INC(tcps_preddat); 5089 tp->rcv_nxt += tlen; 5090 /* 5091 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 5092 */ 5093 tp->snd_wl1 = th->th_seq; 5094 /* 5095 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 5096 */ 5097 tp->rcv_up = tp->rcv_nxt; 5098 TCPSTAT_ADD(tcps_rcvpack, nsegs); 5099 TCPSTAT_ADD(tcps_rcvbyte, tlen); 5100 #ifdef TCPDEBUG 5101 if (so->so_options & SO_DEBUG) 5102 tcp_trace(TA_INPUT, ostate, tp, 5103 (void *)tcp_saveipgen, &tcp_savetcp, 0); 5104 #endif 5105 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 5106 5107 /* Add data to socket buffer. */ 5108 SOCKBUF_LOCK(&so->so_rcv); 5109 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5110 m_freem(m); 5111 } else { 5112 /* 5113 * Set new socket buffer size. Give up when limit is 5114 * reached. 5115 */ 5116 if (newsize) 5117 if (!sbreserve_locked(&so->so_rcv, 5118 newsize, so, NULL)) 5119 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 5120 m_adj(m, drop_hdrlen); /* delayed header drop */ 5121 sbappendstream_locked(&so->so_rcv, m, 0); 5122 rack_calc_rwin(so, tp); 5123 } 5124 /* NB: sorwakeup_locked() does an implicit unlock. */ 5125 sorwakeup_locked(so); 5126 if (DELAY_ACK(tp, tlen)) { 5127 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5128 tp->t_flags |= TF_DELACK; 5129 } else { 5130 tp->t_flags |= TF_ACKNOW; 5131 rack->r_wanted_output++; 5132 } 5133 if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter) 5134 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5135 return (1); 5136 } 5137 5138 /* 5139 * This subfunction is used to try to highly optimize the 5140 * fast path. We again allow window updates that are 5141 * in sequence to remain in the fast-path. We also add 5142 * in the __predict's to attempt to help the compiler. 5143 * Note that if we return a 0, then we can *not* process 5144 * it and the caller should push the packet into the 5145 * slow-path. 5146 */ 5147 static int 5148 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 5149 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5150 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 5151 { 5152 int32_t acked; 5153 int32_t nsegs; 5154 5155 #ifdef TCPDEBUG 5156 /* 5157 * The size of tcp_saveipgen must be the size of the max ip header, 5158 * now IPv6. 
5159 */ 5160 u_char tcp_saveipgen[IP6_HDR_LEN]; 5161 struct tcphdr tcp_savetcp; 5162 short ostate = 0; 5163 5164 #endif 5165 struct tcp_rack *rack; 5166 5167 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 5168 /* Old ack, behind (or duplicate to) the last one rcv'd */ 5169 return (0); 5170 } 5171 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 5172 /* Above what we have sent? */ 5173 return (0); 5174 } 5175 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 5176 /* We are retransmitting */ 5177 return (0); 5178 } 5179 if (__predict_false(tiwin == 0)) { 5180 /* zero window */ 5181 return (0); 5182 } 5183 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 5184 /* We need a SYN or a FIN, unlikely.. */ 5185 return (0); 5186 } 5187 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 5188 /* Timestamp is behind .. old ack with seq wrap? */ 5189 return (0); 5190 } 5191 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 5192 /* Still recovering */ 5193 return (0); 5194 } 5195 rack = (struct tcp_rack *)tp->t_fb_ptr; 5196 if (rack->r_ctl.rc_sacked) { 5197 /* We have sack holes on our scoreboard */ 5198 return (0); 5199 } 5200 /* Ok if we reach here, we can process a fast-ack */ 5201 nsegs = max(1, m->m_pkthdr.lro_nsegs); 5202 rack_log_ack(tp, to, th); 5203 /* Did the window get updated? */ 5204 if (tiwin != tp->snd_wnd) { 5205 tp->snd_wnd = tiwin; 5206 tp->snd_wl1 = th->th_seq; 5207 if (tp->snd_wnd > tp->max_sndwnd) 5208 tp->max_sndwnd = tp->snd_wnd; 5209 } 5210 if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { 5211 rack_exit_persist(tp, rack); 5212 } 5213 /* 5214 * If last ACK falls within this segment's sequence numbers, record 5215 * the timestamp. NOTE that the test is modified according to the 5216 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 5217 */ 5218 if ((to->to_flags & TOF_TS) != 0 && 5219 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 5220 tp->ts_recent_age = tcp_ts_getticks(); 5221 tp->ts_recent = to->to_tsval; 5222 } 5223 /* 5224 * This is a pure ack for outstanding data. 5225 */ 5226 TCPSTAT_INC(tcps_predack); 5227 5228 /* 5229 * "bad retransmit" recovery. 5230 */ 5231 if (tp->t_flags & TF_PREVVALID) { 5232 tp->t_flags &= ~TF_PREVVALID; 5233 if (tp->t_rxtshift == 1 && 5234 (int)(ticks - tp->t_badrxtwin) < 0) 5235 rack_cong_signal(tp, th, CC_RTO_ERR); 5236 } 5237 /* 5238 * Recalculate the transmit timer / rtt. 5239 * 5240 * Some boxes send broken timestamp replies during the SYN+ACK 5241 * phase, ignore timestamps of 0 or we could calculate a huge RTT 5242 * and blow up the retransmit timer. 5243 */ 5244 acked = BYTES_THIS_ACK(tp, th); 5245 5246 #ifdef TCP_HHOOK 5247 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 5248 hhook_run_tcp_est_in(tp, th, to); 5249 #endif 5250 5251 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 5252 TCPSTAT_ADD(tcps_rcvackbyte, acked); 5253 sbdrop(&so->so_snd, acked); 5254 /* 5255 * Let the congestion control algorithm update congestion control 5256 * related information. This typically means increasing the 5257 * congestion window. 5258 */ 5259 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 5260 5261 tp->snd_una = th->th_ack; 5262 /* 5263 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 5264 */ 5265 tp->snd_wl2 = th->th_ack; 5266 tp->t_dupacks = 0; 5267 m_freem(m); 5268 /* ND6_HINT(tp); *//* Some progress has been made. 
*/ 5269 5270 /* 5271 * If all outstanding data are acked, stop retransmit timer, 5272 * otherwise restart timer using current (possibly backed-off) 5273 * value. If process is waiting for space, wakeup/selwakeup/signal. 5274 * If data are ready to send, let tcp_output decide between more 5275 * output or persist. 5276 */ 5277 #ifdef TCPDEBUG 5278 if (so->so_options & SO_DEBUG) 5279 tcp_trace(TA_INPUT, ostate, tp, 5280 (void *)tcp_saveipgen, 5281 &tcp_savetcp, 0); 5282 #endif 5283 if (tp->snd_una == tp->snd_max) { 5284 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 5285 tp->t_acktime = 0; 5286 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 5287 } 5288 /* Wake up the socket if we have room to write more */ 5289 sowwakeup(so); 5290 if (sbavail(&so->so_snd)) { 5291 rack->r_wanted_output++; 5292 } 5293 return (1); 5294 } 5295 5296 /* 5297 * Return value of 1, the TCB is unlocked and most 5298 * likely gone, return value of 0, the TCP is still 5299 * locked. 5300 */ 5301 static int 5302 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 5303 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5304 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5305 { 5306 int32_t ret_val = 0; 5307 int32_t todrop; 5308 int32_t ourfinisacked = 0; 5309 5310 rack_calc_rwin(so, tp); 5311 /* 5312 * If the state is SYN_SENT: if seg contains an ACK, but not for our 5313 * SYN, drop the input. if seg contains a RST, then drop the 5314 * connection. if seg does not contain SYN, then drop it. Otherwise 5315 * this is an acceptable SYN segment initialize tp->rcv_nxt and 5316 * tp->irs if seg contains ack then advance tp->snd_una if seg 5317 * contains an ECE and ECN support is enabled, the stream is ECN 5318 * capable. if SYN has been acked change to ESTABLISHED else 5319 * SYN_RCVD state arrange for segment to be acked (eventually) 5320 * continue processing rest of data/controls, beginning with URG 5321 */ 5322 if ((thflags & TH_ACK) && 5323 (SEQ_LEQ(th->th_ack, tp->iss) || 5324 SEQ_GT(th->th_ack, tp->snd_max))) { 5325 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5326 return (1); 5327 } 5328 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 5329 TCP_PROBE5(connect__refused, NULL, tp, 5330 mtod(m, const char *), tp, th); 5331 tp = tcp_drop(tp, ECONNREFUSED); 5332 rack_do_drop(m, tp); 5333 return (1); 5334 } 5335 if (thflags & TH_RST) { 5336 rack_do_drop(m, tp); 5337 return (1); 5338 } 5339 if (!(thflags & TH_SYN)) { 5340 rack_do_drop(m, tp); 5341 return (1); 5342 } 5343 tp->irs = th->th_seq; 5344 tcp_rcvseqinit(tp); 5345 if (thflags & TH_ACK) { 5346 TCPSTAT_INC(tcps_connects); 5347 soisconnected(so); 5348 #ifdef MAC 5349 mac_socketpeer_set_from_mbuf(m, so); 5350 #endif 5351 /* Do window scaling on this connection? */ 5352 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5353 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5354 tp->rcv_scale = tp->request_r_scale; 5355 } 5356 tp->rcv_adv += min(tp->rcv_wnd, 5357 TCP_MAXWIN << tp->rcv_scale); 5358 /* 5359 * If there's data, delay ACK; if there's also a FIN ACKNOW 5360 * will be turned on later. 
5361 */ 5362 if (DELAY_ACK(tp, tlen) && tlen != 0) { 5363 rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, 5364 ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); 5365 tp->t_flags |= TF_DELACK; 5366 } else { 5367 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; 5368 tp->t_flags |= TF_ACKNOW; 5369 } 5370 5371 if ((thflags & TH_ECE) && V_tcp_do_ecn) { 5372 tp->t_flags |= TF_ECN_PERMIT; 5373 TCPSTAT_INC(tcps_ecn_shs); 5374 } 5375 /* 5376 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 5377 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 5378 */ 5379 tp->t_starttime = ticks; 5380 if (tp->t_flags & TF_NEEDFIN) { 5381 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5382 tp->t_flags &= ~TF_NEEDFIN; 5383 thflags &= ~TH_SYN; 5384 } else { 5385 tcp_state_change(tp, TCPS_ESTABLISHED); 5386 TCP_PROBE5(connect__established, NULL, tp, 5387 mtod(m, const char *), tp, th); 5388 cc_conn_init(tp); 5389 } 5390 } else { 5391 /* 5392 * Received initial SYN in SYN-SENT[*] state => simultaneous 5393 * open. If segment contains CC option and there is a 5394 * cached CC, apply TAO test. If it succeeds, connection is * 5395 * half-synchronized. Otherwise, do 3-way handshake: 5396 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 5397 * there was no CC option, clear cached CC value. 5398 */ 5399 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 5400 tcp_state_change(tp, TCPS_SYN_RECEIVED); 5401 } 5402 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5403 INP_WLOCK_ASSERT(tp->t_inpcb); 5404 /* 5405 * Advance th->th_seq to correspond to first data byte. If data, 5406 * trim to stay within window, dropping FIN if necessary. 5407 */ 5408 th->th_seq++; 5409 if (tlen > tp->rcv_wnd) { 5410 todrop = tlen - tp->rcv_wnd; 5411 m_adj(m, -todrop); 5412 tlen = tp->rcv_wnd; 5413 thflags &= ~TH_FIN; 5414 TCPSTAT_INC(tcps_rcvpackafterwin); 5415 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 5416 } 5417 tp->snd_wl1 = th->th_seq - 1; 5418 tp->rcv_up = th->th_seq; 5419 /* 5420 * Client side of transaction: already sent SYN and data. If the 5421 * remote host used T/TCP to validate the SYN, our data will be 5422 * ACK'd; if so, enter normal data segment processing in the middle 5423 * of step 5, ack processing. Otherwise, goto step 6. 5424 */ 5425 if (thflags & TH_ACK) { 5426 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 5427 return (ret_val); 5428 /* We may have changed to FIN_WAIT_1 above */ 5429 if (tp->t_state == TCPS_FIN_WAIT_1) { 5430 /* 5431 * In FIN_WAIT_1 STATE in addition to the processing 5432 * for the ESTABLISHED state if our FIN is now 5433 * acknowledged then enter FIN_WAIT_2. 5434 */ 5435 if (ourfinisacked) { 5436 /* 5437 * If we can't receive any more data, then 5438 * closing user can proceed. Starting the 5439 * timer is contrary to the specification, 5440 * but if we don't get a FIN we'll hang 5441 * forever. 5442 * 5443 * XXXjl: we should release the tp also, and 5444 * use a compressed state. 5445 */ 5446 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5447 soisdisconnected(so); 5448 tcp_timer_activate(tp, TT_2MSL, 5449 (tcp_fast_finwait2_recycle ? 5450 tcp_finwait2_timeout : 5451 TP_MAXIDLE(tp))); 5452 } 5453 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5454 } 5455 } 5456 } 5457 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5458 tiwin, thflags, nxt_pkt)); 5459 } 5460 5461 /* 5462 * Return value of 1, the TCB is unlocked and most 5463 * likely gone, return value of 0, the TCP is still 5464 * locked. 
5465 */ 5466 static int 5467 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 5468 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5469 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5470 { 5471 int32_t ret_val = 0; 5472 int32_t ourfinisacked = 0; 5473 5474 rack_calc_rwin(so, tp); 5475 5476 if ((thflags & TH_ACK) && 5477 (SEQ_LEQ(th->th_ack, tp->snd_una) || 5478 SEQ_GT(th->th_ack, tp->snd_max))) { 5479 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5480 return (1); 5481 } 5482 #ifdef TCP_RFC7413 5483 if (tp->t_flags & TF_FASTOPEN) { 5484 /* 5485 * When a TFO connection is in SYN_RECEIVED, the only valid 5486 * packets are the initial SYN, a retransmit/copy of the 5487 * initial SYN (possibly with a subset of the original 5488 * data), a valid ACK, a FIN, or a RST. 5489 */ 5490 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 5491 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5492 return (1); 5493 } else if (thflags & TH_SYN) { 5494 /* non-initial SYN is ignored */ 5495 struct tcp_rack *rack; 5496 5497 rack = (struct tcp_rack *)tp->t_fb_ptr; 5498 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 5499 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 5500 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 5501 rack_do_drop(m, NULL); 5502 return (0); 5503 } 5504 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 5505 rack_do_drop(m, NULL); 5506 return (0); 5507 } 5508 } 5509 #endif 5510 if (thflags & TH_RST) 5511 return (rack_process_rst(m, th, so, tp)); 5512 /* 5513 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5514 * synchronized state. 5515 */ 5516 if (thflags & TH_SYN) { 5517 rack_challenge_ack(m, th, tp, &ret_val); 5518 return (ret_val); 5519 } 5520 /* 5521 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5522 * it's less than ts_recent, drop it. 5523 */ 5524 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5525 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5526 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5527 return (ret_val); 5528 } 5529 /* 5530 * In the SYN-RECEIVED state, validate that the packet belongs to 5531 * this connection before trimming the data to fit the receive 5532 * window. Check the sequence number versus IRS since we know the 5533 * sequence numbers haven't wrapped. This is a partial fix for the 5534 * "LAND" DoS attack. 5535 */ 5536 if (SEQ_LT(th->th_seq, tp->irs)) { 5537 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5538 return (1); 5539 } 5540 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5541 return (ret_val); 5542 } 5543 /* 5544 * If last ACK falls within this segment's sequence numbers, record 5545 * its timestamp. NOTE: 1) That the test incorporates suggestions 5546 * from the latest proposal of the tcplw@cray.com list (Braden 5547 * 1993/04/26). 2) That updating only on newer timestamps interferes 5548 * with our earlier PAWS tests, so this check should be solely 5549 * predicated on the sequence space of this segment. 3) That we 5550 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5551 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5552 * SEG.Len, This modified check allows us to overcome RFC1323's 5553 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5554 * p.869. In such cases, we can still calculate the RTT correctly 5555 * when RCV.NXT == Last.ACK.Sent. 
5556 */ 5557 if ((to->to_flags & TOF_TS) != 0 && 5558 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5559 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5560 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5561 tp->ts_recent_age = tcp_ts_getticks(); 5562 tp->ts_recent = to->to_tsval; 5563 } 5564 /* 5565 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5566 * is on (half-synchronized state), then queue data for later 5567 * processing; else drop segment and return. 5568 */ 5569 if ((thflags & TH_ACK) == 0) { 5570 #ifdef TCP_RFC7413 5571 if (tp->t_flags & TF_FASTOPEN) { 5572 tp->snd_wnd = tiwin; 5573 cc_conn_init(tp); 5574 } 5575 #endif 5576 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5577 tiwin, thflags, nxt_pkt)); 5578 } 5579 TCPSTAT_INC(tcps_connects); 5580 soisconnected(so); 5581 /* Do window scaling? */ 5582 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 5583 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 5584 tp->rcv_scale = tp->request_r_scale; 5585 tp->snd_wnd = tiwin; 5586 } 5587 /* 5588 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 5589 * FIN-WAIT-1 5590 */ 5591 tp->t_starttime = ticks; 5592 if (tp->t_flags & TF_NEEDFIN) { 5593 tcp_state_change(tp, TCPS_FIN_WAIT_1); 5594 tp->t_flags &= ~TF_NEEDFIN; 5595 } else { 5596 tcp_state_change(tp, TCPS_ESTABLISHED); 5597 TCP_PROBE5(accept__established, NULL, tp, 5598 mtod(m, const char *), tp, th); 5599 #ifdef TCP_RFC7413 5600 if (tp->t_tfo_pending) { 5601 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 5602 tp->t_tfo_pending = NULL; 5603 5604 /* 5605 * Account for the ACK of our SYN prior to regular 5606 * ACK processing below. 5607 */ 5608 tp->snd_una++; 5609 } 5610 /* 5611 * TFO connections call cc_conn_init() during SYN 5612 * processing. Calling it again here for such connections 5613 * is not harmless as it would undo the snd_cwnd reduction 5614 * that occurs when a TFO SYN|ACK is retransmitted. 5615 */ 5616 if (!(tp->t_flags & TF_FASTOPEN)) 5617 #endif 5618 cc_conn_init(tp); 5619 } 5620 /* 5621 * If segment contains data or ACK, will call tcp_reass() later; if 5622 * not, do so now to pass queued data to user. 5623 */ 5624 if (tlen == 0 && (thflags & TH_FIN) == 0) 5625 (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 5626 (struct mbuf *)0); 5627 tp->snd_wl1 = th->th_seq - 1; 5628 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5629 return (ret_val); 5630 } 5631 if (tp->t_state == TCPS_FIN_WAIT_1) { 5632 /* We could have went to FIN_WAIT_1 (or EST) above */ 5633 /* 5634 * In FIN_WAIT_1 STATE in addition to the processing for the 5635 * ESTABLISHED state if our FIN is now acknowledged then 5636 * enter FIN_WAIT_2. 5637 */ 5638 if (ourfinisacked) { 5639 /* 5640 * If we can't receive any more data, then closing 5641 * user can proceed. Starting the timer is contrary 5642 * to the specification, but if we don't get a FIN 5643 * we'll hang forever. 5644 * 5645 * XXXjl: we should release the tp also, and use a 5646 * compressed state. 5647 */ 5648 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 5649 soisdisconnected(so); 5650 tcp_timer_activate(tp, TT_2MSL, 5651 (tcp_fast_finwait2_recycle ? 5652 tcp_finwait2_timeout : 5653 TP_MAXIDLE(tp))); 5654 } 5655 tcp_state_change(tp, TCPS_FIN_WAIT_2); 5656 } 5657 } 5658 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5659 tiwin, thflags, nxt_pkt)); 5660 } 5661 5662 /* 5663 * Return value of 1, the TCB is unlocked and most 5664 * likely gone, return value of 0, the TCP is still 5665 * locked. 
5666 */ 5667 static int 5668 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 5669 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5670 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5671 { 5672 int32_t ret_val = 0; 5673 5674 /* 5675 * Header prediction: check for the two common cases of a 5676 * uni-directional data xfer. If the packet has no control flags, 5677 * is in-sequence, the window didn't change and we're not 5678 * retransmitting, it's a candidate. If the length is zero and the 5679 * ack moved forward, we're the sender side of the xfer. Just free 5680 * the data acked & wake any higher level process that was blocked 5681 * waiting for space. If the length is non-zero and the ack didn't 5682 * move, we're the receiver side. If we're getting packets in-order 5683 * (the reassembly queue is empty), add the data toc The socket 5684 * buffer and note that we need a delayed ack. Make sure that the 5685 * hidden state-flags are also off. Since we check for 5686 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 5687 */ 5688 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 5689 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && 5690 __predict_true(SEGQ_EMPTY(tp)) && 5691 __predict_true(th->th_seq == tp->rcv_nxt)) { 5692 struct tcp_rack *rack; 5693 5694 rack = (struct tcp_rack *)tp->t_fb_ptr; 5695 if (tlen == 0) { 5696 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 5697 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 5698 return (0); 5699 } 5700 } else { 5701 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 5702 tiwin, nxt_pkt)) { 5703 return (0); 5704 } 5705 } 5706 } 5707 rack_calc_rwin(so, tp); 5708 5709 if (thflags & TH_RST) 5710 return (rack_process_rst(m, th, so, tp)); 5711 5712 /* 5713 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5714 * synchronized state. 5715 */ 5716 if (thflags & TH_SYN) { 5717 rack_challenge_ack(m, th, tp, &ret_val); 5718 return (ret_val); 5719 } 5720 /* 5721 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5722 * it's less than ts_recent, drop it. 5723 */ 5724 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5725 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5726 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5727 return (ret_val); 5728 } 5729 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5730 return (ret_val); 5731 } 5732 /* 5733 * If last ACK falls within this segment's sequence numbers, record 5734 * its timestamp. NOTE: 1) That the test incorporates suggestions 5735 * from the latest proposal of the tcplw@cray.com list (Braden 5736 * 1993/04/26). 2) That updating only on newer timestamps interferes 5737 * with our earlier PAWS tests, so this check should be solely 5738 * predicated on the sequence space of this segment. 3) That we 5739 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5740 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5741 * SEG.Len, This modified check allows us to overcome RFC1323's 5742 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5743 * p.869. In such cases, we can still calculate the RTT correctly 5744 * when RCV.NXT == Last.ACK.Sent. 
5745 */ 5746 if ((to->to_flags & TOF_TS) != 0 && 5747 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5748 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5749 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5750 tp->ts_recent_age = tcp_ts_getticks(); 5751 tp->ts_recent = to->to_tsval; 5752 } 5753 /* 5754 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5755 * is on (half-synchronized state), then queue data for later 5756 * processing; else drop segment and return. 5757 */ 5758 if ((thflags & TH_ACK) == 0) { 5759 if (tp->t_flags & TF_NEEDSYN) { 5760 5761 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5762 tiwin, thflags, nxt_pkt)); 5763 5764 } else if (tp->t_flags & TF_ACKNOW) { 5765 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5766 return (ret_val); 5767 } else { 5768 rack_do_drop(m, NULL); 5769 return (0); 5770 } 5771 } 5772 /* 5773 * Ack processing. 5774 */ 5775 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 5776 return (ret_val); 5777 } 5778 if (sbavail(&so->so_snd)) { 5779 if (rack_progress_timeout_check(tp)) { 5780 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5781 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5782 return (1); 5783 } 5784 } 5785 /* State changes only happen in rack_process_data() */ 5786 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5787 tiwin, thflags, nxt_pkt)); 5788 } 5789 5790 /* 5791 * Return value of 1, the TCB is unlocked and most 5792 * likely gone, return value of 0, the TCP is still 5793 * locked. 5794 */ 5795 static int 5796 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 5797 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5798 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5799 { 5800 int32_t ret_val = 0; 5801 5802 rack_calc_rwin(so, tp); 5803 if (thflags & TH_RST) 5804 return (rack_process_rst(m, th, so, tp)); 5805 /* 5806 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5807 * synchronized state. 5808 */ 5809 if (thflags & TH_SYN) { 5810 rack_challenge_ack(m, th, tp, &ret_val); 5811 return (ret_val); 5812 } 5813 /* 5814 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5815 * it's less than ts_recent, drop it. 5816 */ 5817 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5818 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5819 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5820 return (ret_val); 5821 } 5822 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5823 return (ret_val); 5824 } 5825 /* 5826 * If last ACK falls within this segment's sequence numbers, record 5827 * its timestamp. NOTE: 1) That the test incorporates suggestions 5828 * from the latest proposal of the tcplw@cray.com list (Braden 5829 * 1993/04/26). 2) That updating only on newer timestamps interferes 5830 * with our earlier PAWS tests, so this check should be solely 5831 * predicated on the sequence space of this segment. 3) That we 5832 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5833 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5834 * SEG.Len, This modified check allows us to overcome RFC1323's 5835 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5836 * p.869. In such cases, we can still calculate the RTT correctly 5837 * when RCV.NXT == Last.ACK.Sent. 
5838 */ 5839 if ((to->to_flags & TOF_TS) != 0 && 5840 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5841 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5842 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5843 tp->ts_recent_age = tcp_ts_getticks(); 5844 tp->ts_recent = to->to_tsval; 5845 } 5846 /* 5847 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5848 * is on (half-synchronized state), then queue data for later 5849 * processing; else drop segment and return. 5850 */ 5851 if ((thflags & TH_ACK) == 0) { 5852 if (tp->t_flags & TF_NEEDSYN) { 5853 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5854 tiwin, thflags, nxt_pkt)); 5855 5856 } else if (tp->t_flags & TF_ACKNOW) { 5857 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5858 return (ret_val); 5859 } else { 5860 rack_do_drop(m, NULL); 5861 return (0); 5862 } 5863 } 5864 /* 5865 * Ack processing. 5866 */ 5867 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 5868 return (ret_val); 5869 } 5870 if (sbavail(&so->so_snd)) { 5871 if (rack_progress_timeout_check(tp)) { 5872 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5873 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 5874 return (1); 5875 } 5876 } 5877 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5878 tiwin, thflags, nxt_pkt)); 5879 } 5880 5881 static int 5882 rack_check_data_after_close(struct mbuf *m, 5883 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 5884 { 5885 struct tcp_rack *rack; 5886 5887 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 5888 rack = (struct tcp_rack *)tp->t_fb_ptr; 5889 if (rack->rc_allow_data_af_clo == 0) { 5890 close_now: 5891 tp = tcp_close(tp); 5892 TCPSTAT_INC(tcps_rcvafterclose); 5893 rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 5894 return (1); 5895 } 5896 if (sbavail(&so->so_snd) == 0) 5897 goto close_now; 5898 /* Ok we allow data that is ignored and a followup reset */ 5899 tp->rcv_nxt = th->th_seq + *tlen; 5900 tp->t_flags2 |= TF2_DROP_AF_DATA; 5901 rack->r_wanted_output = 1; 5902 *tlen = 0; 5903 return (0); 5904 } 5905 5906 /* 5907 * Return value of 1, the TCB is unlocked and most 5908 * likely gone, return value of 0, the TCP is still 5909 * locked. 5910 */ 5911 static int 5912 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 5913 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 5914 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 5915 { 5916 int32_t ret_val = 0; 5917 int32_t ourfinisacked = 0; 5918 5919 rack_calc_rwin(so, tp); 5920 5921 if (thflags & TH_RST) 5922 return (rack_process_rst(m, th, so, tp)); 5923 /* 5924 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 5925 * synchronized state. 5926 */ 5927 if (thflags & TH_SYN) { 5928 rack_challenge_ack(m, th, tp, &ret_val); 5929 return (ret_val); 5930 } 5931 /* 5932 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 5933 * it's less than ts_recent, drop it. 5934 */ 5935 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 5936 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 5937 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 5938 return (ret_val); 5939 } 5940 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 5941 return (ret_val); 5942 } 5943 /* 5944 * If new data are received on a connection after the user processes 5945 * are gone, then RST the other end. 
5946 */ 5947 if ((so->so_state & SS_NOFDREF) && tlen) { 5948 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 5949 return (1); 5950 } 5951 /* 5952 * If last ACK falls within this segment's sequence numbers, record 5953 * its timestamp. NOTE: 1) That the test incorporates suggestions 5954 * from the latest proposal of the tcplw@cray.com list (Braden 5955 * 1993/04/26). 2) That updating only on newer timestamps interferes 5956 * with our earlier PAWS tests, so this check should be solely 5957 * predicated on the sequence space of this segment. 3) That we 5958 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 5959 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 5960 * SEG.Len, This modified check allows us to overcome RFC1323's 5961 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 5962 * p.869. In such cases, we can still calculate the RTT correctly 5963 * when RCV.NXT == Last.ACK.Sent. 5964 */ 5965 if ((to->to_flags & TOF_TS) != 0 && 5966 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 5967 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 5968 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 5969 tp->ts_recent_age = tcp_ts_getticks(); 5970 tp->ts_recent = to->to_tsval; 5971 } 5972 /* 5973 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 5974 * is on (half-synchronized state), then queue data for later 5975 * processing; else drop segment and return. 5976 */ 5977 if ((thflags & TH_ACK) == 0) { 5978 if (tp->t_flags & TF_NEEDSYN) { 5979 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 5980 tiwin, thflags, nxt_pkt)); 5981 } else if (tp->t_flags & TF_ACKNOW) { 5982 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 5983 return (ret_val); 5984 } else { 5985 rack_do_drop(m, NULL); 5986 return (0); 5987 } 5988 } 5989 /* 5990 * Ack processing. 5991 */ 5992 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 5993 return (ret_val); 5994 } 5995 if (ourfinisacked) { 5996 /* 5997 * If we can't receive any more data, then closing user can 5998 * proceed. Starting the timer is contrary to the 5999 * specification, but if we don't get a FIN we'll hang 6000 * forever. 6001 * 6002 * XXXjl: we should release the tp also, and use a 6003 * compressed state. 6004 */ 6005 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 6006 soisdisconnected(so); 6007 tcp_timer_activate(tp, TT_2MSL, 6008 (tcp_fast_finwait2_recycle ? 6009 tcp_finwait2_timeout : 6010 TP_MAXIDLE(tp))); 6011 } 6012 tcp_state_change(tp, TCPS_FIN_WAIT_2); 6013 } 6014 if (sbavail(&so->so_snd)) { 6015 if (rack_progress_timeout_check(tp)) { 6016 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6017 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6018 return (1); 6019 } 6020 } 6021 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6022 tiwin, thflags, nxt_pkt)); 6023 } 6024 6025 /* 6026 * Return value of 1, the TCB is unlocked and most 6027 * likely gone, return value of 0, the TCP is still 6028 * locked. 6029 */ 6030 static int 6031 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 6032 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6033 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6034 { 6035 int32_t ret_val = 0; 6036 int32_t ourfinisacked = 0; 6037 6038 rack_calc_rwin(so, tp); 6039 6040 if (thflags & TH_RST) 6041 return (rack_process_rst(m, th, so, tp)); 6042 /* 6043 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6044 * synchronized state. 
6045 */ 6046 if (thflags & TH_SYN) { 6047 rack_challenge_ack(m, th, tp, &ret_val); 6048 return (ret_val); 6049 } 6050 /* 6051 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6052 * it's less than ts_recent, drop it. 6053 */ 6054 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6055 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6056 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6057 return (ret_val); 6058 } 6059 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6060 return (ret_val); 6061 } 6062 /* 6063 * If new data are received on a connection after the user processes 6064 * are gone, then RST the other end. 6065 */ 6066 if ((so->so_state & SS_NOFDREF) && tlen) { 6067 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 6068 return (1); 6069 } 6070 /* 6071 * If last ACK falls within this segment's sequence numbers, record 6072 * its timestamp. NOTE: 1) That the test incorporates suggestions 6073 * from the latest proposal of the tcplw@cray.com list (Braden 6074 * 1993/04/26). 2) That updating only on newer timestamps interferes 6075 * with our earlier PAWS tests, so this check should be solely 6076 * predicated on the sequence space of this segment. 3) That we 6077 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6078 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6079 * SEG.Len, This modified check allows us to overcome RFC1323's 6080 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6081 * p.869. In such cases, we can still calculate the RTT correctly 6082 * when RCV.NXT == Last.ACK.Sent. 6083 */ 6084 if ((to->to_flags & TOF_TS) != 0 && 6085 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6086 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6087 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6088 tp->ts_recent_age = tcp_ts_getticks(); 6089 tp->ts_recent = to->to_tsval; 6090 } 6091 /* 6092 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6093 * is on (half-synchronized state), then queue data for later 6094 * processing; else drop segment and return. 6095 */ 6096 if ((thflags & TH_ACK) == 0) { 6097 if (tp->t_flags & TF_NEEDSYN) { 6098 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6099 tiwin, thflags, nxt_pkt)); 6100 } else if (tp->t_flags & TF_ACKNOW) { 6101 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6102 return (ret_val); 6103 } else { 6104 rack_do_drop(m, NULL); 6105 return (0); 6106 } 6107 } 6108 /* 6109 * Ack processing. 6110 */ 6111 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6112 return (ret_val); 6113 } 6114 if (ourfinisacked) { 6115 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6116 tcp_twstart(tp); 6117 m_freem(m); 6118 return (1); 6119 } 6120 if (sbavail(&so->so_snd)) { 6121 if (rack_progress_timeout_check(tp)) { 6122 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6123 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6124 return (1); 6125 } 6126 } 6127 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6128 tiwin, thflags, nxt_pkt)); 6129 } 6130 6131 /* 6132 * Return value of 1, the TCB is unlocked and most 6133 * likely gone, return value of 0, the TCP is still 6134 * locked. 
6135 */ 6136 static int 6137 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 6138 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6139 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6140 { 6141 int32_t ret_val = 0; 6142 int32_t ourfinisacked = 0; 6143 6144 rack_calc_rwin(so, tp); 6145 6146 if (thflags & TH_RST) 6147 return (rack_process_rst(m, th, so, tp)); 6148 /* 6149 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6150 * synchronized state. 6151 */ 6152 if (thflags & TH_SYN) { 6153 rack_challenge_ack(m, th, tp, &ret_val); 6154 return (ret_val); 6155 } 6156 /* 6157 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6158 * it's less than ts_recent, drop it. 6159 */ 6160 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6161 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6162 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6163 return (ret_val); 6164 } 6165 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6166 return (ret_val); 6167 } 6168 /* 6169 * If new data are received on a connection after the user processes 6170 * are gone, then RST the other end. 6171 */ 6172 if ((so->so_state & SS_NOFDREF) && tlen) { 6173 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 6174 return (1); 6175 } 6176 /* 6177 * If last ACK falls within this segment's sequence numbers, record 6178 * its timestamp. NOTE: 1) That the test incorporates suggestions 6179 * from the latest proposal of the tcplw@cray.com list (Braden 6180 * 1993/04/26). 2) That updating only on newer timestamps interferes 6181 * with our earlier PAWS tests, so this check should be solely 6182 * predicated on the sequence space of this segment. 3) That we 6183 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6184 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6185 * SEG.Len, This modified check allows us to overcome RFC1323's 6186 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6187 * p.869. In such cases, we can still calculate the RTT correctly 6188 * when RCV.NXT == Last.ACK.Sent. 6189 */ 6190 if ((to->to_flags & TOF_TS) != 0 && 6191 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6192 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6193 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6194 tp->ts_recent_age = tcp_ts_getticks(); 6195 tp->ts_recent = to->to_tsval; 6196 } 6197 /* 6198 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6199 * is on (half-synchronized state), then queue data for later 6200 * processing; else drop segment and return. 6201 */ 6202 if ((thflags & TH_ACK) == 0) { 6203 if (tp->t_flags & TF_NEEDSYN) { 6204 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6205 tiwin, thflags, nxt_pkt)); 6206 } else if (tp->t_flags & TF_ACKNOW) { 6207 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6208 return (ret_val); 6209 } else { 6210 rack_do_drop(m, NULL); 6211 return (0); 6212 } 6213 } 6214 /* 6215 * case TCPS_LAST_ACK: Ack processing. 
6216 */ 6217 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6218 return (ret_val); 6219 } 6220 if (ourfinisacked) { 6221 INP_INFO_RLOCK_ASSERT(&V_tcbinfo); 6222 tp = tcp_close(tp); 6223 rack_do_drop(m, tp); 6224 return (1); 6225 } 6226 if (sbavail(&so->so_snd)) { 6227 if (rack_progress_timeout_check(tp)) { 6228 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6229 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6230 return (1); 6231 } 6232 } 6233 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6234 tiwin, thflags, nxt_pkt)); 6235 } 6236 6237 6238 /* 6239 * Return value of 1, the TCB is unlocked and most 6240 * likely gone, return value of 0, the TCP is still 6241 * locked. 6242 */ 6243 static int 6244 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 6245 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 6246 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 6247 { 6248 int32_t ret_val = 0; 6249 int32_t ourfinisacked = 0; 6250 6251 rack_calc_rwin(so, tp); 6252 6253 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 6254 if (thflags & TH_RST) 6255 return (rack_process_rst(m, th, so, tp)); 6256 /* 6257 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 6258 * synchronized state. 6259 */ 6260 if (thflags & TH_SYN) { 6261 rack_challenge_ack(m, th, tp, &ret_val); 6262 return (ret_val); 6263 } 6264 /* 6265 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 6266 * it's less than ts_recent, drop it. 6267 */ 6268 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 6269 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 6270 if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) 6271 return (ret_val); 6272 } 6273 if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 6274 return (ret_val); 6275 } 6276 /* 6277 * If new data are received on a connection after the user processes 6278 * are gone, then RST the other end. 6279 */ 6280 if ((so->so_state & SS_NOFDREF) && 6281 tlen) { 6282 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 6283 return (1); 6284 } 6285 /* 6286 * If last ACK falls within this segment's sequence numbers, record 6287 * its timestamp. NOTE: 1) That the test incorporates suggestions 6288 * from the latest proposal of the tcplw@cray.com list (Braden 6289 * 1993/04/26). 2) That updating only on newer timestamps interferes 6290 * with our earlier PAWS tests, so this check should be solely 6291 * predicated on the sequence space of this segment. 3) That we 6292 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 6293 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 6294 * SEG.Len, This modified check allows us to overcome RFC1323's 6295 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 6296 * p.869. In such cases, we can still calculate the RTT correctly 6297 * when RCV.NXT == Last.ACK.Sent. 6298 */ 6299 if ((to->to_flags & TOF_TS) != 0 && 6300 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 6301 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 6302 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 6303 tp->ts_recent_age = tcp_ts_getticks(); 6304 tp->ts_recent = to->to_tsval; 6305 } 6306 /* 6307 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 6308 * is on (half-synchronized state), then queue data for later 6309 * processing; else drop segment and return. 
6310 */ 6311 if ((thflags & TH_ACK) == 0) { 6312 if (tp->t_flags & TF_NEEDSYN) { 6313 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6314 tiwin, thflags, nxt_pkt)); 6315 } else if (tp->t_flags & TF_ACKNOW) { 6316 rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 6317 return (ret_val); 6318 } else { 6319 rack_do_drop(m, NULL); 6320 return (0); 6321 } 6322 } 6323 /* 6324 * Ack processing. 6325 */ 6326 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 6327 return (ret_val); 6328 } 6329 if (sbavail(&so->so_snd)) { 6330 if (rack_progress_timeout_check(tp)) { 6331 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6332 rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 6333 return (1); 6334 } 6335 } 6336 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 6337 tiwin, thflags, nxt_pkt)); 6338 } 6339 6340 6341 static void inline 6342 rack_clear_rate_sample(struct tcp_rack *rack) 6343 { 6344 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 6345 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 6346 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 6347 } 6348 6349 static int 6350 rack_init(struct tcpcb *tp) 6351 { 6352 struct tcp_rack *rack = NULL; 6353 6354 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 6355 if (tp->t_fb_ptr == NULL) { 6356 /* 6357 * We need to allocate memory but cant. The INP and INP_INFO 6358 * locks and they are recusive (happens during setup. So a 6359 * scheme to drop the locks fails :( 6360 * 6361 */ 6362 return (ENOMEM); 6363 } 6364 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 6365 6366 rack = (struct tcp_rack *)tp->t_fb_ptr; 6367 TAILQ_INIT(&rack->r_ctl.rc_map); 6368 TAILQ_INIT(&rack->r_ctl.rc_free); 6369 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6370 rack->rc_tp = tp; 6371 if (tp->t_inpcb) { 6372 rack->rc_inp = tp->t_inpcb; 6373 } 6374 /* Probably not needed but lets be sure */ 6375 rack_clear_rate_sample(rack); 6376 rack->r_cpu = 0; 6377 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 6378 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 6379 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 6380 rack->rc_pace_reduce = rack_slot_reduction; 6381 if (V_tcp_delack_enabled) 6382 tp->t_delayed_ack = 1; 6383 else 6384 tp->t_delayed_ack = 0; 6385 rack->rc_pace_max_segs = rack_hptsi_segments; 6386 rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; 6387 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 6388 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 6389 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; 6390 rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; 6391 rack->r_enforce_min_pace = rack_min_pace_time; 6392 rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; 6393 rack->r_ctl.rc_prop_rate = rack_proportional_rate; 6394 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 6395 rack->r_ctl.rc_early_recovery = rack_early_recovery; 6396 rack->rc_always_pace = rack_pace_every_seg; 6397 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 6398 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 6399 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 6400 rack->r_ctl.rc_min_to = rack_min_to; 6401 rack->r_ctl.rc_prr_inc_var = rack_inc_var; 6402 if (tp->snd_una != tp->snd_max) { 6403 /* Create a send map for the current outstanding data */ 6404 struct rack_sendmap *rsm; 6405 6406 rsm = rack_alloc(rack); 6407 if (rsm == NULL) { 6408 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6409 tp->t_fb_ptr = NULL; 6410 return (ENOMEM); 6411 } 6412 rsm->r_flags = RACK_OVERMAX; 6413 
rsm->r_tim_lastsent[0] = tcp_ts_getticks(); 6414 rsm->r_rtr_cnt = 1; 6415 rsm->r_rtr_bytes = 0; 6416 rsm->r_start = tp->snd_una; 6417 rsm->r_end = tp->snd_max; 6418 rsm->r_sndcnt = 0; 6419 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); 6420 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6421 rsm->r_in_tmap = 1; 6422 } 6423 rack_stop_all_timers(tp); 6424 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6425 return (0); 6426 } 6427 6428 static int 6429 rack_handoff_ok(struct tcpcb *tp) 6430 { 6431 if ((tp->t_state == TCPS_CLOSED) || 6432 (tp->t_state == TCPS_LISTEN)) { 6433 /* Sure no problem though it may not stick */ 6434 return (0); 6435 } 6436 if ((tp->t_state == TCPS_SYN_SENT) || 6437 (tp->t_state == TCPS_SYN_RECEIVED)) { 6438 /* 6439 * We really don't know you have to get to ESTAB or beyond 6440 * to tell. 6441 */ 6442 return (EAGAIN); 6443 } 6444 if (tp->t_flags & TF_SACK_PERMIT) { 6445 return (0); 6446 } 6447 /* 6448 * If we reach here we don't do SACK on this connection so we can 6449 * never do rack. 6450 */ 6451 return (EINVAL); 6452 } 6453 6454 static void 6455 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 6456 { 6457 if (tp->t_fb_ptr) { 6458 struct tcp_rack *rack; 6459 struct rack_sendmap *rsm; 6460 6461 rack = (struct tcp_rack *)tp->t_fb_ptr; 6462 #ifdef TCP_BLACKBOX 6463 tcp_log_flowend(tp); 6464 #endif 6465 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6466 while (rsm) { 6467 TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); 6468 uma_zfree(rack_zone, rsm); 6469 rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); 6470 } 6471 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6472 while (rsm) { 6473 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); 6474 uma_zfree(rack_zone, rsm); 6475 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 6476 } 6477 rack->rc_free_cnt = 0; 6478 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 6479 tp->t_fb_ptr = NULL; 6480 } 6481 /* Make sure snd_nxt is correctly set */ 6482 tp->snd_nxt = tp->snd_max; 6483 } 6484 6485 static void 6486 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 6487 { 6488 switch (tp->t_state) { 6489 case TCPS_SYN_SENT: 6490 rack->r_state = TCPS_SYN_SENT; 6491 rack->r_substate = rack_do_syn_sent; 6492 break; 6493 case TCPS_SYN_RECEIVED: 6494 rack->r_state = TCPS_SYN_RECEIVED; 6495 rack->r_substate = rack_do_syn_recv; 6496 break; 6497 case TCPS_ESTABLISHED: 6498 rack->r_state = TCPS_ESTABLISHED; 6499 rack->r_substate = rack_do_established; 6500 break; 6501 case TCPS_CLOSE_WAIT: 6502 rack->r_state = TCPS_CLOSE_WAIT; 6503 rack->r_substate = rack_do_close_wait; 6504 break; 6505 case TCPS_FIN_WAIT_1: 6506 rack->r_state = TCPS_FIN_WAIT_1; 6507 rack->r_substate = rack_do_fin_wait_1; 6508 break; 6509 case TCPS_CLOSING: 6510 rack->r_state = TCPS_CLOSING; 6511 rack->r_substate = rack_do_closing; 6512 break; 6513 case TCPS_LAST_ACK: 6514 rack->r_state = TCPS_LAST_ACK; 6515 rack->r_substate = rack_do_lastack; 6516 break; 6517 case TCPS_FIN_WAIT_2: 6518 rack->r_state = TCPS_FIN_WAIT_2; 6519 rack->r_substate = rack_do_fin_wait_2; 6520 break; 6521 case TCPS_LISTEN: 6522 case TCPS_CLOSED: 6523 case TCPS_TIME_WAIT: 6524 default: 6525 break; 6526 }; 6527 } 6528 6529 6530 static void 6531 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 6532 { 6533 /* 6534 * We received an ack, and then did not 6535 * call send or were bounced out due to the 6536 * hpts was running. Now a timer is up as well, is 6537 * it the right timer? 
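 *
 * Roughly, the checks below accept: the persist timer while in
 * persist; RXT when nothing is outstanding, before ESTABLISHED, or
 * with data in the send buffer after an ENOBUFS-style stall; the
 * keepalive timer on an idle connection with keepalives enabled;
 * the delayed-ACK timer; the RACK timer once a sack-passed entry
 * exists (except when only the FIN remains, which wants RXT); and
 * TLP or RXT while data is outstanding without sack-passed marks.
 * Anything else is cancelled and the hpts timer is restarted.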
6538 */ 6539 struct rack_sendmap *rsm; 6540 int tmr_up; 6541 6542 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 6543 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 6544 return; 6545 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6546 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 6547 (tmr_up == PACE_TMR_RXT)) { 6548 /* Should be an RXT */ 6549 return; 6550 } 6551 if (rsm == NULL) { 6552 /* Nothing outstanding? */ 6553 if (tp->t_flags & TF_DELACK) { 6554 if (tmr_up == PACE_TMR_DELACK) 6555 /* We are supposed to have delayed ack up and we do */ 6556 return; 6557 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 6558 /* 6559 * if we hit enobufs then we would expect the possiblity 6560 * of nothing outstanding and the RXT up (and the hptsi timer). 6561 */ 6562 return; 6563 } else if (((tcp_always_keepalive || 6564 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6565 (tp->t_state <= TCPS_CLOSING)) && 6566 (tmr_up == PACE_TMR_KEEP) && 6567 (tp->snd_max == tp->snd_una)) { 6568 /* We should have keep alive up and we do */ 6569 return; 6570 } 6571 } 6572 if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { 6573 if ((tp->t_flags & TF_SENTFIN) && 6574 ((tp->snd_max - tp->snd_una) == 1) && 6575 (rsm->r_flags & RACK_HAS_FIN)) { 6576 /* needs to be a RXT */ 6577 if (tmr_up == PACE_TMR_RXT) 6578 return; 6579 } else if (tmr_up == PACE_TMR_RACK) 6580 return; 6581 } else if (SEQ_GT(tp->snd_max,tp->snd_una) && 6582 ((tmr_up == PACE_TMR_TLP) || 6583 (tmr_up == PACE_TMR_RXT))) { 6584 /* 6585 * Either a TLP or RXT is fine if no sack-passed 6586 * is in place and data is outstanding. 6587 */ 6588 return; 6589 } else if (tmr_up == PACE_TMR_DELACK) { 6590 /* 6591 * If the delayed ack was going to go off 6592 * before the rtx/tlp/rack timer were going to 6593 * expire, then that would be the timer in control. 6594 * Note we don't check the time here trusting the 6595 * code is correct. 6596 */ 6597 return; 6598 } 6599 /* 6600 * Ok the timer originally started is not what we want now. 6601 * We will force the hpts to be stopped if any, and restart 6602 * with the slot set to what was in the saved slot. 6603 */ 6604 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 6605 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6606 } 6607 6608 static void 6609 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6610 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 6611 int32_t nxt_pkt, struct timeval *tv) 6612 { 6613 int32_t thflags, retval, did_out = 0; 6614 int32_t way_out = 0; 6615 uint32_t cts; 6616 uint32_t tiwin; 6617 struct tcpopt to; 6618 struct tcp_rack *rack; 6619 struct rack_sendmap *rsm; 6620 int32_t prev_state = 0; 6621 6622 cts = tcp_tv_to_mssectick(tv); 6623 rack = (struct tcp_rack *)tp->t_fb_ptr; 6624 6625 kern_prefetch(rack, &prev_state); 6626 prev_state = 0; 6627 thflags = th->th_flags; 6628 /* 6629 * If this is either a state-changing packet or current state isn't 6630 * established, we require a read lock on tcbinfo. Otherwise, we 6631 * allow the tcbinfo to be in either locked or unlocked, as the 6632 * caller may have unnecessarily acquired a lock due to a race. 
6633 */ 6634 INP_WLOCK_ASSERT(tp->t_inpcb); 6635 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 6636 __func__)); 6637 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 6638 __func__)); 6639 { 6640 union tcp_log_stackspecific log; 6641 6642 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6643 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 6644 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 6645 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 6646 TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 6647 tlen, &log, true); 6648 } 6649 /* 6650 * Segment received on connection. Reset idle time and keep-alive 6651 * timer. XXX: This should be done after segment validation to 6652 * ignore broken/spoofed segs. 6653 */ 6654 if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { 6655 if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { 6656 counter_u64_add(rack_input_idle_reduces, 1); 6657 rack_cc_after_idle(tp, 6658 (rack->r_idle_reduce_largest ? 1 :0)); 6659 } 6660 } 6661 rack->r_ctl.rc_rcvtime = cts; 6662 tp->t_rcvtime = ticks; 6663 6664 /* 6665 * Unscale the window into a 32-bit value. For the SYN_SENT state 6666 * the scale is zero. 6667 */ 6668 tiwin = th->th_win << tp->snd_scale; 6669 #ifdef NETFLIX_STATS 6670 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 6671 #endif 6672 /* 6673 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 6674 * this to occur after we've validated the segment. 6675 */ 6676 if (tp->t_flags & TF_ECN_PERMIT) { 6677 if (thflags & TH_CWR) 6678 tp->t_flags &= ~TF_ECN_SND_ECE; 6679 switch (iptos & IPTOS_ECN_MASK) { 6680 case IPTOS_ECN_CE: 6681 tp->t_flags |= TF_ECN_SND_ECE; 6682 TCPSTAT_INC(tcps_ecn_ce); 6683 break; 6684 case IPTOS_ECN_ECT0: 6685 TCPSTAT_INC(tcps_ecn_ect0); 6686 break; 6687 case IPTOS_ECN_ECT1: 6688 TCPSTAT_INC(tcps_ecn_ect1); 6689 break; 6690 } 6691 /* Congestion experienced. */ 6692 if (thflags & TH_ECE) { 6693 rack_cong_signal(tp, th, CC_ECN); 6694 } 6695 } 6696 /* 6697 * Parse options on any incoming segment. 6698 */ 6699 tcp_dooptions(&to, (u_char *)(th + 1), 6700 (th->th_off << 2) - sizeof(struct tcphdr), 6701 (thflags & TH_SYN) ? TO_SYN : 0); 6702 6703 /* 6704 * If echoed timestamp is later than the current time, fall back to 6705 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 6706 * were used when this connection was established. 6707 */ 6708 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 6709 to.to_tsecr -= tp->ts_offset; 6710 if (TSTMP_GT(to.to_tsecr, cts)) 6711 to.to_tsecr = 0; 6712 } 6713 /* 6714 * If its the first time in we need to take care of options and 6715 * verify we can do SACK for rack! 6716 */ 6717 if (rack->r_state == 0) { 6718 /* Should be init'd by rack_init() */ 6719 KASSERT(rack->rc_inp != NULL, 6720 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 6721 if (rack->rc_inp == NULL) { 6722 rack->rc_inp = tp->t_inpcb; 6723 } 6724 6725 /* 6726 * Process options only when we get SYN/ACK back. The SYN 6727 * case for incoming connections is handled in tcp_syncache. 6728 * According to RFC1323 the window field in a SYN (i.e., a 6729 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 6730 * this is traditional behavior, may need to be cleaned up. 6731 */ 6732 rack->r_cpu = inp_to_cpuid(tp->t_inpcb); 6733 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 6734 if ((to.to_flags & TOF_SCALE) && 6735 (tp->t_flags & TF_REQ_SCALE)) { 6736 tp->t_flags |= TF_RCVD_SCALE; 6737 tp->snd_scale = to.to_wscale; 6738 } 6739 /* 6740 * Initial send window. 
It will be updated with the 6741 * next incoming segment to the scaled value. 6742 */ 6743 tp->snd_wnd = th->th_win; 6744 if (to.to_flags & TOF_TS) { 6745 tp->t_flags |= TF_RCVD_TSTMP; 6746 tp->ts_recent = to.to_tsval; 6747 tp->ts_recent_age = cts; 6748 } 6749 if (to.to_flags & TOF_MSS) 6750 tcp_mss(tp, to.to_mss); 6751 if ((tp->t_flags & TF_SACK_PERMIT) && 6752 (to.to_flags & TOF_SACKPERM) == 0) 6753 tp->t_flags &= ~TF_SACK_PERMIT; 6754 } 6755 /* 6756 * At this point we are at the initial call. Here we decide 6757 * if we are doing RACK or not. We do this by seeing if 6758 * TF_SACK_PERMIT is set, if not rack is *not* possible and 6759 * we switch to the default code. 6760 */ 6761 if ((tp->t_flags & TF_SACK_PERMIT) == 0) { 6762 tcp_switch_back_to_default(tp); 6763 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 6764 tlen, iptos); 6765 return; 6766 } 6767 /* Set the flag */ 6768 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 6769 tcp_set_hpts(tp->t_inpcb); 6770 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 6771 } 6772 /* 6773 * This is the one exception case where we set the rack state 6774 * always. All other times (timers etc) we must have a rack-state 6775 * set (so we assure we have done the checks above for SACK). 6776 */ 6777 if (rack->r_state != tp->t_state) 6778 rack_set_state(tp, rack); 6779 if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL) 6780 kern_prefetch(rsm, &prev_state); 6781 prev_state = rack->r_state; 6782 rack->r_ctl.rc_tlp_send_cnt = 0; 6783 rack_clear_rate_sample(rack); 6784 retval = (*rack->r_substate) (m, th, so, 6785 tp, &to, drop_hdrlen, 6786 tlen, tiwin, thflags, nxt_pkt); 6787 #ifdef INVARIANTS 6788 if ((retval == 0) && 6789 (tp->t_inpcb == NULL)) { 6790 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 6791 retval, tp, prev_state); 6792 } 6793 #endif 6794 if (retval == 0) { 6795 /* 6796 * If retval is 1 the tcb is unlocked and most likely the tp 6797 * is gone. 6798 */ 6799 INP_WLOCK_ASSERT(tp->t_inpcb); 6800 tcp_rack_xmit_timer_commit(rack, tp); 6801 if (nxt_pkt == 0) { 6802 if (rack->r_wanted_output != 0) { 6803 did_out = 1; 6804 (void)tp->t_fb->tfb_tcp_output(tp); 6805 } 6806 rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); 6807 } 6808 if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 6809 (SEQ_GT(tp->snd_max, tp->snd_una) || 6810 (tp->t_flags & TF_DELACK) || 6811 ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 6812 (tp->t_state <= TCPS_CLOSING)))) { 6813 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 6814 if ((tp->snd_max == tp->snd_una) && 6815 ((tp->t_flags & TF_DELACK) == 0) && 6816 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 6817 /* keep alive not needed if we are hptsi output yet */ 6818 ; 6819 } else { 6820 if (rack->rc_inp->inp_in_hpts) 6821 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6822 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); 6823 } 6824 way_out = 1; 6825 } else { 6826 /* Do we have the correct timer running? 
*/ 6827 rack_timer_audit(tp, rack, &so->so_snd); 6828 way_out = 2; 6829 } 6830 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 6831 if (did_out) 6832 rack->r_wanted_output = 0; 6833 #ifdef INVARIANTS 6834 if (tp->t_inpcb == NULL) { 6835 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 6836 did_out, 6837 retval, tp, prev_state); 6838 } 6839 #endif 6840 INP_WUNLOCK(tp->t_inpcb); 6841 } 6842 } 6843 6844 void 6845 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 6846 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 6847 { 6848 struct timeval tv; 6849 #ifdef RSS 6850 struct tcp_function_block *tfb; 6851 struct tcp_rack *rack; 6852 struct inpcb *inp; 6853 6854 rack = (struct tcp_rack *)tp->t_fb_ptr; 6855 if (rack->r_state == 0) { 6856 /* 6857 * Initial input (ACK to SYN-ACK etc)lets go ahead and get 6858 * it processed 6859 */ 6860 tcp_get_usecs(&tv); 6861 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6862 tlen, iptos, 0, &tv); 6863 return; 6864 } 6865 tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); 6866 INP_WUNLOCK(tp->t_inpcb); 6867 #else 6868 tcp_get_usecs(&tv); 6869 rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, 6870 tlen, iptos, 0, &tv); 6871 #endif 6872 } 6873 6874 struct rack_sendmap * 6875 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 6876 { 6877 struct rack_sendmap *rsm = NULL; 6878 int32_t idx; 6879 uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0; 6880 6881 /* Return the next guy to be re-transmitted */ 6882 if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) { 6883 return (NULL); 6884 } 6885 if (tp->t_flags & TF_SENTFIN) { 6886 /* retran the end FIN? */ 6887 return (NULL); 6888 } 6889 /* ok lets look at this one */ 6890 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6891 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 6892 goto check_it; 6893 } 6894 rsm = rack_find_lowest_rsm(rack); 6895 if (rsm == NULL) { 6896 return (NULL); 6897 } 6898 check_it: 6899 srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT; 6900 srtt = TICKS_2_MSEC(srtt_cur); 6901 if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt)) 6902 srtt = rack->rc_rack_rtt; 6903 if (rsm->r_flags & RACK_ACKED) { 6904 return (NULL); 6905 } 6906 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 6907 /* Its not yet ready */ 6908 return (NULL); 6909 } 6910 idx = rsm->r_rtr_cnt - 1; 6911 ts_low = rsm->r_tim_lastsent[idx]; 6912 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6913 if (tsused <= ts_low) { 6914 return (NULL); 6915 } 6916 if ((tsused - ts_low) >= thresh) { 6917 return (rsm); 6918 } 6919 return (NULL); 6920 } 6921 6922 static int 6923 rack_output(struct tcpcb *tp) 6924 { 6925 struct socket *so; 6926 uint32_t recwin, sendwin; 6927 uint32_t sb_offset; 6928 int32_t len, flags, error = 0; 6929 struct mbuf *m; 6930 struct mbuf *mb; 6931 uint32_t if_hw_tsomaxsegcount = 0; 6932 uint32_t if_hw_tsomaxsegsize; 6933 long tot_len_this_send = 0; 6934 struct ip *ip = NULL; 6935 #ifdef TCPDEBUG 6936 struct ipovly *ipov = NULL; 6937 #endif 6938 #ifdef NETFLIX_TCP_O_UDP 6939 struct udphdr *udp = NULL; 6940 #endif 6941 struct tcp_rack *rack; 6942 struct tcphdr *th; 6943 uint8_t pass = 0; 6944 u_char opt[TCP_MAXOLEN]; 6945 unsigned ipoptlen, optlen, hdrlen; 6946 #ifdef NETFLIX_TCP_O_UDP 6947 unsigned ulen; 6948 #endif 6949 uint32_t rack_seq; 6950 6951 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 6952 unsigned ipsec_optlen = 0; 6953 6954 #endif 6955 int32_t idle, sendalot; 6956 int32_t sub_from_prr = 0; 6957 volatile int32_t sack_rxmit; 6958 struct rack_sendmap *rsm = NULL; 6959 
int32_t tso, mtu, would_have_fin = 0; 6960 struct tcpopt to; 6961 int32_t slot = 0; 6962 uint32_t cts; 6963 uint8_t hpts_calling, doing_tlp = 0; 6964 int32_t do_a_prefetch; 6965 int32_t prefetch_rsm = 0; 6966 int32_t prefetch_so_done = 0; 6967 struct tcp_log_buffer *lgb = NULL; 6968 struct inpcb *inp; 6969 struct sockbuf *sb; 6970 #ifdef INET6 6971 struct ip6_hdr *ip6 = NULL; 6972 int32_t isipv6; 6973 #endif 6974 /* setup and take the cache hits here */ 6975 rack = (struct tcp_rack *)tp->t_fb_ptr; 6976 inp = rack->rc_inp; 6977 so = inp->inp_socket; 6978 sb = &so->so_snd; 6979 kern_prefetch(sb, &do_a_prefetch); 6980 do_a_prefetch = 1; 6981 6982 INP_WLOCK_ASSERT(inp); 6983 #ifdef TCP_OFFLOAD 6984 if (tp->t_flags & TF_TOE) 6985 return (tcp_offload_output(tp)); 6986 #endif 6987 6988 #ifdef TCP_RFC7413 6989 /* 6990 * For TFO connections in SYN_RECEIVED, only allow the initial 6991 * SYN|ACK and those sent by the retransmit timer. 6992 */ 6993 if ((tp->t_flags & TF_FASTOPEN) && 6994 (tp->t_state == TCPS_SYN_RECEIVED) && 6995 SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ 6996 (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ 6997 return (0); 6998 #endif 6999 #ifdef INET6 7000 if (rack->r_state) { 7001 /* Use the cache line loaded if possible */ 7002 isipv6 = rack->r_is_v6; 7003 } else { 7004 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 7005 } 7006 #endif 7007 cts = tcp_ts_getticks(); 7008 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 7009 inp->inp_in_hpts) { 7010 /* 7011 * We are on the hpts for some timer but not hptsi output. 7012 * Remove from the hpts unconditionally. 7013 */ 7014 rack_timer_cancel(tp, rack, cts, __LINE__); 7015 } 7016 /* Mark that we have called rack_output(). */ 7017 if ((rack->r_timer_override) || 7018 (tp->t_flags & TF_FORCEDATA) || 7019 (tp->t_state < TCPS_ESTABLISHED)) { 7020 if (tp->t_inpcb->inp_in_hpts) 7021 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 7022 } else if (tp->t_inpcb->inp_in_hpts) { 7023 /* 7024 * On the hpts you can't pass even if ACKNOW is on, we will 7025 * when the hpts fires. 7026 */ 7027 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 7028 return (0); 7029 } 7030 hpts_calling = inp->inp_hpts_calls; 7031 inp->inp_hpts_calls = 0; 7032 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7033 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 7034 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 7035 return (0); 7036 } 7037 } 7038 rack->r_wanted_output = 0; 7039 rack->r_timer_override = 0; 7040 /* 7041 * Determine length of data that should be transmitted, and flags 7042 * that will be used. If there is some data or critical controls 7043 * (SYN, RST) to send, then transmit; otherwise, investigate 7044 * further. 7045 */ 7046 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 7047 if (tp->t_idle_reduce) { 7048 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 7049 rack_cc_after_idle(tp, 7050 (rack->r_idle_reduce_largest ? 1 :0)); 7051 } 7052 tp->t_flags &= ~TF_LASTIDLE; 7053 if (idle) { 7054 if (tp->t_flags & TF_MORETOCOME) { 7055 tp->t_flags |= TF_LASTIDLE; 7056 idle = 0; 7057 } 7058 } 7059 again: 7060 /* 7061 * If we've recently taken a timeout, snd_max will be greater than 7062 * snd_nxt. There may be SACK information that allows us to avoid 7063 * resending already delivered data. Adjust snd_nxt accordingly. 
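 * In this stack the choice of what to resend comes from the RACK
 * sendmap below (rc_tlpsend, rc_resend or tcp_rack_output()), so
 * already-SACKed ranges are normally skipped rather than resent.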
7064 */ 7065 sendalot = 0; 7066 cts = tcp_ts_getticks(); 7067 tso = 0; 7068 mtu = 0; 7069 sb_offset = tp->snd_max - tp->snd_una; 7070 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 7071 7072 flags = tcp_outflags[tp->t_state]; 7073 /* 7074 * Send any SACK-generated retransmissions. If we're explicitly 7075 * trying to send out new data (when sendalot is 1), bypass this 7076 * function. If we retransmit in fast recovery mode, decrement 7077 * snd_cwnd, since we're replacing a (future) new transmission with 7078 * a retransmission now, and we previously incremented snd_cwnd in 7079 * tcp_input(). 7080 */ 7081 /* 7082 * Still in sack recovery , reset rxmit flag to zero. 7083 */ 7084 while (rack->rc_free_cnt < rack_free_cache) { 7085 rsm = rack_alloc(rack); 7086 if (rsm == NULL) { 7087 if (inp->inp_hpts_calls) 7088 /* Retry in a ms */ 7089 slot = 1; 7090 goto just_return_nolock; 7091 } 7092 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); 7093 rack->rc_free_cnt++; 7094 rsm = NULL; 7095 } 7096 if (inp->inp_hpts_calls) 7097 inp->inp_hpts_calls = 0; 7098 sack_rxmit = 0; 7099 len = 0; 7100 rsm = NULL; 7101 if (flags & TH_RST) { 7102 SOCKBUF_LOCK(sb); 7103 goto send; 7104 } 7105 if (rack->r_ctl.rc_tlpsend) { 7106 /* Tail loss probe */ 7107 long cwin; 7108 long tlen; 7109 7110 doing_tlp = 1; 7111 rsm = rack->r_ctl.rc_tlpsend; 7112 rack->r_ctl.rc_tlpsend = NULL; 7113 sack_rxmit = 1; 7114 tlen = rsm->r_end - rsm->r_start; 7115 if (tlen > tp->t_maxseg) 7116 tlen = tp->t_maxseg; 7117 #ifdef INVARIANTS 7118 if (SEQ_GT(tp->snd_una, rsm->r_start)) { 7119 panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", 7120 tp, rack, tp->snd_una, rsm, rsm->r_start); 7121 } 7122 #endif 7123 sb_offset = rsm->r_start - tp->snd_una; 7124 cwin = min(tp->snd_wnd, tlen); 7125 len = cwin; 7126 } else if (rack->r_ctl.rc_resend) { 7127 /* Retransmit timer */ 7128 rsm = rack->r_ctl.rc_resend; 7129 rack->r_ctl.rc_resend = NULL; 7130 len = rsm->r_end - rsm->r_start; 7131 sack_rxmit = 1; 7132 sendalot = 0; 7133 sb_offset = rsm->r_start - tp->snd_una; 7134 if (len >= tp->t_maxseg) { 7135 len = tp->t_maxseg; 7136 } 7137 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7138 __func__, sb_offset)); 7139 } else if ((rack->rc_in_persist == 0) && 7140 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 7141 long tlen; 7142 7143 if ((!IN_RECOVERY(tp->t_flags)) && 7144 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 7145 /* Enter recovery if not induced by a time-out */ 7146 rack->r_ctl.rc_rsm_start = rsm->r_start; 7147 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7148 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7149 rack_cong_signal(tp, NULL, CC_NDUPACK); 7150 /* 7151 * When we enter recovery we need to assure we send 7152 * one packet. 7153 */ 7154 rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; 7155 } 7156 #ifdef INVARIANTS 7157 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 7158 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 7159 tp, rack, rsm, rsm->r_start, tp->snd_una); 7160 } 7161 #endif 7162 tlen = rsm->r_end - rsm->r_start; 7163 sb_offset = rsm->r_start - tp->snd_una; 7164 if (tlen > rack->r_ctl.rc_prr_sndcnt) { 7165 len = rack->r_ctl.rc_prr_sndcnt; 7166 } else { 7167 len = tlen; 7168 } 7169 if (len >= tp->t_maxseg) { 7170 sendalot = 1; 7171 len = tp->t_maxseg; 7172 } else { 7173 sendalot = 0; 7174 if ((rack->rc_timer_up == 0) && 7175 (len < tlen)) { 7176 /* 7177 * If its not a timer don't send a partial 7178 * segment. 
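 * Illustrative numbers: if rc_prr_sndcnt only allows, say, 500 bytes
 * of a 1448 byte rsm, we bail out here and wait for further ACKs to
 * grow the PRR budget rather than emit a runt retransmission.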
7179 */ 7180 len = 0; 7181 goto just_return_nolock; 7182 } 7183 } 7184 KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", 7185 __func__, sb_offset)); 7186 if (len > 0) { 7187 sub_from_prr = 1; 7188 sack_rxmit = 1; 7189 TCPSTAT_INC(tcps_sack_rexmits); 7190 TCPSTAT_ADD(tcps_sack_rexmit_bytes, 7191 min(len, tp->t_maxseg)); 7192 counter_u64_add(rack_rtm_prr_retran, 1); 7193 } 7194 } 7195 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 7196 /* we are retransmitting the fin */ 7197 len--; 7198 if (len) { 7199 /* 7200 * When retransmitting data do *not* include the 7201 * FIN. This could happen from a TLP probe. 7202 */ 7203 flags &= ~TH_FIN; 7204 } 7205 } 7206 #ifdef INVARIANTS 7207 /* For debugging */ 7208 rack->r_ctl.rc_rsm_at_retran = rsm; 7209 #endif 7210 /* 7211 * Enforce a connection sendmap count limit if set 7212 * as long as we are not retransmiting. 7213 */ 7214 if ((rsm == NULL) && 7215 (rack_map_entries_limit > 0) && 7216 (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { 7217 counter_u64_add(rack_to_alloc_limited, 1); 7218 if (!rack->alloc_limit_reported) { 7219 rack->alloc_limit_reported = 1; 7220 counter_u64_add(rack_alloc_limited_conns, 1); 7221 } 7222 goto just_return_nolock; 7223 } 7224 /* 7225 * Get standard flags, and add SYN or FIN if requested by 'hidden' 7226 * state flags. 7227 */ 7228 if (tp->t_flags & TF_NEEDFIN) 7229 flags |= TH_FIN; 7230 if (tp->t_flags & TF_NEEDSYN) 7231 flags |= TH_SYN; 7232 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 7233 void *end_rsm; 7234 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 7235 if (end_rsm) 7236 kern_prefetch(end_rsm, &prefetch_rsm); 7237 prefetch_rsm = 1; 7238 } 7239 SOCKBUF_LOCK(sb); 7240 /* 7241 * If in persist timeout with window of 0, send 1 byte. Otherwise, 7242 * if window is small but nonzero and time TF_SENTFIN expired, we 7243 * will send what we can and go to transmit state. 7244 */ 7245 if (tp->t_flags & TF_FORCEDATA) { 7246 if (sendwin == 0) { 7247 /* 7248 * If we still have some data to send, then clear 7249 * the FIN bit. Usually this would happen below 7250 * when it realizes that we aren't sending all the 7251 * data. However, if we have exactly 1 byte of 7252 * unsent data, then it won't clear the FIN bit 7253 * below, and if we are in persist state, we wind up 7254 * sending the packet without recording that we sent 7255 * the FIN bit. 7256 * 7257 * We can't just blindly clear the FIN bit, because 7258 * if we don't have any more data to send then the 7259 * probe will be the FIN itself. 7260 */ 7261 if (sb_offset < sbused(sb)) 7262 flags &= ~TH_FIN; 7263 sendwin = 1; 7264 } else { 7265 if (rack->rc_in_persist) 7266 rack_exit_persist(tp, rack); 7267 /* 7268 * If we are dropping persist mode then we need to 7269 * correct snd_nxt/snd_max and off. 7270 */ 7271 tp->snd_nxt = tp->snd_max; 7272 sb_offset = tp->snd_nxt - tp->snd_una; 7273 } 7274 } 7275 /* 7276 * If snd_nxt == snd_max and we have transmitted a FIN, the 7277 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 7278 * negative length. This can also occur when TCP opens up its 7279 * congestion window while receiving additional duplicate acks after 7280 * fast-retransmit because TCP will reset snd_nxt to snd_max after 7281 * the fast-retransmit. 7282 * 7283 * In the normal retransmit-FIN-only case, however, snd_nxt will be 7284 * set to snd_una, the sb_offset will be 0, and the length may wind 7285 * up 0. 
7286 * 7287 * If sack_rxmit is true we are retransmitting from the scoreboard 7288 * in which case len is already set. 7289 */ 7290 if (sack_rxmit == 0) { 7291 uint32_t avail; 7292 7293 avail = sbavail(sb); 7294 if (SEQ_GT(tp->snd_nxt, tp->snd_una)) 7295 sb_offset = tp->snd_nxt - tp->snd_una; 7296 else 7297 sb_offset = 0; 7298 if (IN_RECOVERY(tp->t_flags) == 0) { 7299 if (rack->r_ctl.rc_tlp_new_data) { 7300 /* TLP is forcing out new data */ 7301 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 7302 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 7303 } 7304 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 7305 len = tp->snd_wnd; 7306 else 7307 len = rack->r_ctl.rc_tlp_new_data; 7308 rack->r_ctl.rc_tlp_new_data = 0; 7309 doing_tlp = 1; 7310 } else { 7311 if (sendwin > avail) { 7312 /* use the available */ 7313 if (avail > sb_offset) { 7314 len = (int32_t)(avail - sb_offset); 7315 } else { 7316 len = 0; 7317 } 7318 } else { 7319 if (sendwin > sb_offset) { 7320 len = (int32_t)(sendwin - sb_offset); 7321 } else { 7322 len = 0; 7323 } 7324 } 7325 } 7326 } else { 7327 uint32_t outstanding; 7328 7329 /* 7330 * We are inside of a SACK recovery episode and are 7331 * sending new data, having retransmitted all the 7332 * data possible so far in the scoreboard. 7333 */ 7334 outstanding = tp->snd_max - tp->snd_una; 7335 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 7336 if (tp->snd_wnd > outstanding) { 7337 len = tp->snd_wnd - outstanding; 7338 /* Check to see if we have the data */ 7339 if (((sb_offset + len) > avail) && 7340 (avail > sb_offset)) 7341 len = avail - sb_offset; 7342 else 7343 len = 0; 7344 } else 7345 len = 0; 7346 } else if (avail > sb_offset) 7347 len = avail - sb_offset; 7348 else 7349 len = 0; 7350 if (len > 0) { 7351 if (len > rack->r_ctl.rc_prr_sndcnt) 7352 len = rack->r_ctl.rc_prr_sndcnt; 7353 7354 if (len > 0) { 7355 sub_from_prr = 1; 7356 counter_u64_add(rack_rtm_prr_newdata, 1); 7357 } 7358 } 7359 if (len > tp->t_maxseg) { 7360 /* 7361 * We should never send more than a MSS when 7362 * retransmitting or sending new data in prr 7363 * mode unless the override flag is on. Most 7364 * likely the PRR algorithm is not going to 7365 * let us send a lot as well :-) 7366 */ 7367 if (rack->r_ctl.rc_prr_sendalot == 0) 7368 len = tp->t_maxseg; 7369 } else if (len < tp->t_maxseg) { 7370 /* 7371 * Do we send any? The idea here is if the 7372 * send empty's the socket buffer we want to 7373 * do it. However if not then lets just wait 7374 * for our prr_sndcnt to get bigger. 7375 */ 7376 long leftinsb; 7377 7378 leftinsb = sbavail(sb) - sb_offset; 7379 if (leftinsb > len) { 7380 /* This send does not empty the sb */ 7381 len = 0; 7382 } 7383 } 7384 } 7385 } 7386 if (prefetch_so_done == 0) { 7387 kern_prefetch(so, &prefetch_so_done); 7388 prefetch_so_done = 1; 7389 } 7390 /* 7391 * Lop off SYN bit if it has already been sent. However, if this is 7392 * SYN-SENT state and if segment contains data and if we don't know 7393 * that foreign host supports TAO, suppress sending segment. 7394 */ 7395 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { 7396 if ((tp->t_state != TCPS_SYN_RECEIVED) && 7397 (tp->t_state != TCPS_SYN_SENT)) 7398 flags &= ~TH_SYN; 7399 #ifdef TCP_RFC7413 7400 /* 7401 * When sending additional segments following a TFO SYN|ACK, 7402 * do not include the SYN bit. 
7403 */ 7404 if ((tp->t_flags & TF_FASTOPEN) && 7405 (tp->t_state == TCPS_SYN_RECEIVED)) 7406 flags &= ~TH_SYN; 7407 #endif 7408 sb_offset--, len++; 7409 if (sbavail(sb) == 0) 7410 len = 0; 7411 } 7412 /* 7413 * Be careful not to send data and/or FIN on SYN segments. This 7414 * measure is needed to prevent interoperability problems with not 7415 * fully conformant TCP implementations. 7416 */ 7417 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 7418 len = 0; 7419 flags &= ~TH_FIN; 7420 } 7421 #ifdef TCP_RFC7413 7422 /* 7423 * When retransmitting SYN|ACK on a passively-created TFO socket, 7424 * don't include data, as the presence of data may have caused the 7425 * original SYN|ACK to have been dropped by a middlebox. 7426 */ 7427 if ((tp->t_flags & TF_FASTOPEN) && 7428 ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) 7429 len = 0; 7430 #endif 7431 if (len <= 0) { 7432 /* 7433 * If FIN has been sent but not acked, but we haven't been 7434 * called to retransmit, len will be < 0. Otherwise, window 7435 * shrank after we sent into it. If window shrank to 0, 7436 * cancel pending retransmit, pull snd_nxt back to (closed) 7437 * window, and set the persist timer if it isn't already 7438 * going. If the window didn't close completely, just wait 7439 * for an ACK. 7440 * 7441 * We also do a general check here to ensure that we will 7442 * set the persist timer when we have data to send, but a 7443 * 0-byte window. This makes sure the persist timer is set 7444 * even if the packet hits one of the "goto send" lines 7445 * below. 7446 */ 7447 len = 0; 7448 if ((tp->snd_wnd == 0) && 7449 (TCPS_HAVEESTABLISHED(tp->t_state)) && 7450 (sb_offset < (int)sbavail(sb))) { 7451 tp->snd_nxt = tp->snd_una; 7452 rack_enter_persist(tp, rack, cts); 7453 } 7454 } 7455 /* len will be >= 0 after this point. */ 7456 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7457 tcp_sndbuf_autoscale(tp, so, sendwin); 7458 /* 7459 * Decide if we can use TCP Segmentation Offloading (if supported by 7460 * hardware). 7461 * 7462 * TSO may only be used if we are in a pure bulk sending state. The 7463 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 7464 * options prevent using TSO. With TSO the TCP header is the same 7465 * (except for the sequence number) for all generated packets. This 7466 * makes it impossible to transmit any options which vary per 7467 * generated segment or packet. 7468 * 7469 * IPv4 handling has a clear separation of ip options and ip header 7470 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 7471 * the right thing below to provide length of just ip options and thus 7472 * checking for ipoptlen is enough to decide if ip options are present. 7473 */ 7474 7475 #ifdef INET6 7476 if (isipv6) 7477 ipoptlen = ip6_optlen(tp->t_inpcb); 7478 else 7479 #endif 7480 if (tp->t_inpcb->inp_options) 7481 ipoptlen = tp->t_inpcb->inp_options->m_len - 7482 offsetof(struct ipoption, ipopt_list); 7483 else 7484 ipoptlen = 0; 7485 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7486 /* 7487 * Pre-calculate here as we save another lookup into the darknesses 7488 * of IPsec that way and can actually decide if TSO is ok. 
7489 */ 7490 #ifdef INET6 7491 if (isipv6 && IPSEC_ENABLED(ipv6)) 7492 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 7493 #ifdef INET 7494 else 7495 #endif 7496 #endif /* INET6 */ 7497 #ifdef INET 7498 if (IPSEC_ENABLED(ipv4)) 7499 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 7500 #endif /* INET */ 7501 #endif 7502 7503 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7504 ipoptlen += ipsec_optlen; 7505 #endif 7506 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && 7507 #ifdef NETFLIX_TCP_O_UDP 7508 (tp->t_port == 0) && 7509 #endif 7510 ((tp->t_flags & TF_SIGNATURE) == 0) && 7511 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 7512 ipoptlen == 0) 7513 tso = 1; 7514 { 7515 uint32_t outstanding; 7516 7517 outstanding = tp->snd_max - tp->snd_una; 7518 if (tp->t_flags & TF_SENTFIN) { 7519 /* 7520 * If we sent a fin, snd_max is 1 higher than 7521 * snd_una 7522 */ 7523 outstanding--; 7524 } 7525 if (outstanding > 0) { 7526 /* 7527 * This is sub-optimal. We only send a stand alone 7528 * FIN on its own segment. 7529 */ 7530 if (flags & TH_FIN) { 7531 flags &= ~TH_FIN; 7532 would_have_fin = 1; 7533 } 7534 } else if (sack_rxmit) { 7535 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 7536 flags &= ~TH_FIN; 7537 } else { 7538 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 7539 sbused(sb))) 7540 flags &= ~TH_FIN; 7541 } 7542 } 7543 recwin = sbspace(&so->so_rcv); 7544 7545 /* 7546 * Sender silly window avoidance. We transmit under the following 7547 * conditions when len is non-zero: 7548 * 7549 * - We have a full segment (or more with TSO) - This is the last 7550 * buffer in a write()/send() and we are either idle or running 7551 * NODELAY - we've timed out (e.g. persist timer) - we have more 7552 * then 1/2 the maximum send window's worth of data (receiver may be 7553 * limited the window size) - we need to retransmit 7554 */ 7555 if (len) { 7556 if (len >= tp->t_maxseg) { 7557 pass = 1; 7558 goto send; 7559 } 7560 /* 7561 * NOTE! on localhost connections an 'ack' from the remote 7562 * end may occur synchronously with the output and cause us 7563 * to flush a buffer queued with moretocome. XXX 7564 * 7565 */ 7566 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 7567 (idle || (tp->t_flags & TF_NODELAY)) && 7568 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) && 7569 (tp->t_flags & TF_NOPUSH) == 0) { 7570 pass = 2; 7571 goto send; 7572 } 7573 if (tp->t_flags & TF_FORCEDATA) { /* typ. timeout case */ 7574 pass = 3; 7575 goto send; 7576 } 7577 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 7578 goto send; 7579 } 7580 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 7581 pass = 4; 7582 goto send; 7583 } 7584 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 7585 pass = 5; 7586 goto send; 7587 } 7588 if (sack_rxmit) { 7589 pass = 6; 7590 goto send; 7591 } 7592 } 7593 /* 7594 * Sending of standalone window updates. 7595 * 7596 * Window updates are important when we close our window due to a 7597 * full socket buffer and are opening it again after the application 7598 * reads data from it. Once the window has opened again and the 7599 * remote end starts to send again the ACK clock takes over and 7600 * provides the most current window information. 7601 * 7602 * We must avoid the silly window syndrome whereas every read from 7603 * the receive buffer, no matter how small, causes a window update 7604 * to be sent. 
We also should avoid sending a flurry of window 7605 * updates when the socket buffer had queued a lot of data and the 7606 * application is doing small reads. 7607 * 7608 * Prevent a flurry of pointless window updates by only sending an 7609 * update when we can increase the advertized window by more than 7610 * 1/4th of the socket buffer capacity. When the buffer is getting 7611 * full or is very small be more aggressive and send an update 7612 * whenever we can increase by two mss sized segments. In all other 7613 * situations the ACK's to new incoming data will carry further 7614 * window increases. 7615 * 7616 * Don't send an independent window update if a delayed ACK is 7617 * pending (it will get piggy-backed on it) or the remote side 7618 * already has done a half-close and won't send more data. Skip 7619 * this if the connection is in T/TCP half-open state. 7620 */ 7621 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 7622 !(tp->t_flags & TF_DELACK) && 7623 !TCPS_HAVERCVDFIN(tp->t_state)) { 7624 /* 7625 * "adv" is the amount we could increase the window, taking 7626 * into account that we are limited by TCP_MAXWIN << 7627 * tp->rcv_scale. 7628 */ 7629 int32_t adv; 7630 int oldwin; 7631 7632 adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale); 7633 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 7634 oldwin = (tp->rcv_adv - tp->rcv_nxt); 7635 adv -= oldwin; 7636 } else 7637 oldwin = 0; 7638 7639 /* 7640 * If the new window size ends up being the same as the old 7641 * size when it is scaled, then don't force a window update. 7642 */ 7643 if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) 7644 goto dontupdate; 7645 7646 if (adv >= (int32_t)(2 * tp->t_maxseg) && 7647 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 7648 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 7649 so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { 7650 pass = 7; 7651 goto send; 7652 } 7653 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) 7654 goto send; 7655 } 7656 dontupdate: 7657 7658 /* 7659 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 7660 * is also a catch-all for the retransmit timer timeout case. 7661 */ 7662 if (tp->t_flags & TF_ACKNOW) { 7663 pass = 8; 7664 goto send; 7665 } 7666 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 7667 pass = 9; 7668 goto send; 7669 } 7670 if (SEQ_GT(tp->snd_up, tp->snd_una)) { 7671 pass = 10; 7672 goto send; 7673 } 7674 /* 7675 * If our state indicates that FIN should be sent and we have not 7676 * yet done so, then we need to send. 7677 */ 7678 if (flags & TH_FIN) { 7679 if ((tp->t_flags & TF_SENTFIN) || 7680 (((tp->t_flags & TF_SENTFIN) == 0) && 7681 (tp->snd_nxt == tp->snd_una))) { 7682 pass = 11; 7683 goto send; 7684 } 7685 } 7686 /* 7687 * No reason to send a segment, just return. 7688 */ 7689 just_return: 7690 SOCKBUF_UNLOCK(sb); 7691 just_return_nolock: 7692 if (tot_len_this_send == 0) 7693 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 7694 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 7695 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); 7696 tp->t_flags &= ~TF_FORCEDATA; 7697 return (0); 7698 7699 send: 7700 if (doing_tlp == 0) { 7701 /* 7702 * Data not a TLP, and its not the rxt firing. If it is the 7703 * rxt firing, we want to leave the tlp_in_progress flag on 7704 * so we don't send another TLP. It has to be a rack timer 7705 * or normal send (response to acked data) to clear the tlp 7706 * in progress flag. 
7707 */ 7708 rack->rc_tlp_in_progress = 0; 7709 } 7710 SOCKBUF_LOCK_ASSERT(sb); 7711 if (len > 0) { 7712 if (len >= tp->t_maxseg) 7713 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 7714 else 7715 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 7716 } 7717 /* 7718 * Before ESTABLISHED, force sending of initial options unless TCP 7719 * set not to do any options. NOTE: we assume that the IP/TCP header 7720 * plus TCP options always fit in a single mbuf, leaving room for a 7721 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 7722 * + optlen <= MCLBYTES 7723 */ 7724 optlen = 0; 7725 #ifdef INET6 7726 if (isipv6) 7727 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 7728 else 7729 #endif 7730 hdrlen = sizeof(struct tcpiphdr); 7731 7732 /* 7733 * Compute options for segment. We only have to care about SYN and 7734 * established connection segments. Options for SYN-ACK segments 7735 * are handled in TCP syncache. 7736 */ 7737 to.to_flags = 0; 7738 if ((tp->t_flags & TF_NOOPT) == 0) { 7739 /* Maximum segment size. */ 7740 if (flags & TH_SYN) { 7741 tp->snd_nxt = tp->iss; 7742 to.to_mss = tcp_mssopt(&inp->inp_inc); 7743 #ifdef NETFLIX_TCP_O_UDP 7744 if (tp->t_port) 7745 to.to_mss -= V_tcp_udp_tunneling_overhead; 7746 #endif 7747 to.to_flags |= TOF_MSS; 7748 #ifdef TCP_RFC7413 7749 /* 7750 * Only include the TFO option on the first 7751 * transmission of the SYN|ACK on a 7752 * passively-created TFO socket, as the presence of 7753 * the TFO option may have caused the original 7754 * SYN|ACK to have been dropped by a middlebox. 7755 */ 7756 if ((tp->t_flags & TF_FASTOPEN) && 7757 (tp->t_state == TCPS_SYN_RECEIVED) && 7758 (tp->t_rxtshift == 0)) { 7759 to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; 7760 to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; 7761 to.to_flags |= TOF_FASTOPEN; 7762 } 7763 #endif 7764 } 7765 /* Window scaling. */ 7766 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 7767 to.to_wscale = tp->request_r_scale; 7768 to.to_flags |= TOF_SCALE; 7769 } 7770 /* Timestamps. */ 7771 if ((tp->t_flags & TF_RCVD_TSTMP) || 7772 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 7773 to.to_tsval = cts + tp->ts_offset; 7774 to.to_tsecr = tp->ts_recent; 7775 to.to_flags |= TOF_TS; 7776 } 7777 /* Set receive buffer autosizing timestamp. */ 7778 if (tp->rfbuf_ts == 0 && 7779 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 7780 tp->rfbuf_ts = tcp_ts_getticks(); 7781 /* Selective ACK's. */ 7782 if (flags & TH_SYN) 7783 to.to_flags |= TOF_SACKPERM; 7784 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 7785 tp->rcv_numsacks > 0) { 7786 to.to_flags |= TOF_SACK; 7787 to.to_nsacks = tp->rcv_numsacks; 7788 to.to_sacks = (u_char *)tp->sackblks; 7789 } 7790 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 7791 /* TCP-MD5 (RFC2385). */ 7792 if (tp->t_flags & TF_SIGNATURE) 7793 to.to_flags |= TOF_SIGNATURE; 7794 #endif /* TCP_SIGNATURE */ 7795 7796 /* Processing the options. */ 7797 hdrlen += optlen = tcp_addoptions(&to, opt); 7798 } 7799 #ifdef NETFLIX_TCP_O_UDP 7800 if (tp->t_port) { 7801 if (V_tcp_udp_tunneling_port == 0) { 7802 /* The port was removed?? */ 7803 SOCKBUF_UNLOCK(&so->so_snd); 7804 return (EHOSTUNREACH); 7805 } 7806 hdrlen += sizeof(struct udphdr); 7807 } 7808 #endif 7809 ipoptlen = 0; 7810 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 7811 ipoptlen += ipsec_optlen; 7812 #endif 7813 7814 /* 7815 * Adjust data length if insertion of options will bump the packet 7816 * length beyond the t_maxseg length. Clear the FIN bit because we 7817 * cut off the tail of the segment. 
7818 */ 7819 if (len + optlen + ipoptlen > tp->t_maxseg) { 7820 if (flags & TH_FIN) { 7821 would_have_fin = 1; 7822 flags &= ~TH_FIN; 7823 } 7824 if (tso) { 7825 uint32_t if_hw_tsomax; 7826 uint32_t moff; 7827 int32_t max_len; 7828 7829 /* extract TSO information */ 7830 if_hw_tsomax = tp->t_tsomax; 7831 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 7832 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 7833 KASSERT(ipoptlen == 0, 7834 ("%s: TSO can't do IP options", __func__)); 7835 7836 /* 7837 * Check if we should limit by maximum payload 7838 * length: 7839 */ 7840 if (if_hw_tsomax != 0) { 7841 /* compute maximum TSO length */ 7842 max_len = (if_hw_tsomax - hdrlen - 7843 max_linkhdr); 7844 if (max_len <= 0) { 7845 len = 0; 7846 } else if (len > max_len) { 7847 sendalot = 1; 7848 len = max_len; 7849 } 7850 } 7851 /* 7852 * Prevent the last segment from being fractional 7853 * unless the send sockbuf can be emptied: 7854 */ 7855 max_len = (tp->t_maxseg - optlen); 7856 if ((sb_offset + len) < sbavail(sb)) { 7857 moff = len % (u_int)max_len; 7858 if (moff != 0) { 7859 len -= moff; 7860 sendalot = 1; 7861 } 7862 } 7863 /* 7864 * In case there are too many small fragments don't 7865 * use TSO: 7866 */ 7867 if (len <= max_len) { 7868 len = max_len; 7869 sendalot = 1; 7870 tso = 0; 7871 } 7872 /* 7873 * Send the FIN in a separate segment after the bulk 7874 * sending is done. We don't trust the TSO 7875 * implementations to clear the FIN flag on all but 7876 * the last segment. 7877 */ 7878 if (tp->t_flags & TF_NEEDFIN) 7879 sendalot = 1; 7880 7881 } else { 7882 len = tp->t_maxseg - optlen - ipoptlen; 7883 sendalot = 1; 7884 } 7885 } else 7886 tso = 0; 7887 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 7888 ("%s: len > IP_MAXPACKET", __func__)); 7889 #ifdef DIAGNOSTIC 7890 #ifdef INET6 7891 if (max_linkhdr + hdrlen > MCLBYTES) 7892 #else 7893 if (max_linkhdr + hdrlen > MHLEN) 7894 #endif 7895 panic("tcphdr too big"); 7896 #endif 7897 7898 /* 7899 * This KASSERT is here to catch edge cases at a well defined place. 7900 * Before, those had triggered (random) panic conditions further 7901 * down. 7902 */ 7903 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 7904 if ((len == 0) && 7905 (flags & TH_FIN) && 7906 (sbused(sb))) { 7907 /* 7908 * We have outstanding data, don't send a fin by itself!. 7909 */ 7910 goto just_return; 7911 } 7912 /* 7913 * Grab a header mbuf, attaching a copy of data to be transmitted, 7914 * and initialize the header from the template for sends on this 7915 * connection. 7916 */ 7917 if (len) { 7918 uint32_t max_val; 7919 uint32_t moff; 7920 7921 if (rack->rc_pace_max_segs) 7922 max_val = rack->rc_pace_max_segs * tp->t_maxseg; 7923 else 7924 max_val = len; 7925 /* 7926 * We allow a limit on sending with hptsi. 7927 */ 7928 if (len > max_val) { 7929 len = max_val; 7930 } 7931 #ifdef INET6 7932 if (MHLEN < hdrlen + max_linkhdr) 7933 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 7934 else 7935 #endif 7936 m = m_gethdr(M_NOWAIT, MT_DATA); 7937 7938 if (m == NULL) { 7939 SOCKBUF_UNLOCK(sb); 7940 error = ENOBUFS; 7941 sack_rxmit = 0; 7942 goto out; 7943 } 7944 m->m_data += max_linkhdr; 7945 m->m_len = hdrlen; 7946 7947 /* 7948 * Start the m_copy functions from the closest mbuf to the 7949 * sb_offset in the socket buffer chain. 
7950 */ 7951 mb = sbsndptr_noadv(sb, sb_offset, &moff); 7952 if (len <= MHLEN - hdrlen - max_linkhdr) { 7953 m_copydata(mb, moff, (int)len, 7954 mtod(m, caddr_t)+hdrlen); 7955 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7956 sbsndptr_adv(sb, mb, len); 7957 m->m_len += len; 7958 } else { 7959 struct sockbuf *msb; 7960 7961 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7962 msb = NULL; 7963 else 7964 msb = sb; 7965 m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len, 7966 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/); 7967 if (len <= (tp->t_maxseg - optlen)) { 7968 /* 7969 * Must have ran out of mbufs for the copy 7970 * shorten it to no longer need tso. Lets 7971 * not put on sendalot since we are low on 7972 * mbufs. 7973 */ 7974 tso = 0; 7975 } 7976 if (m->m_next == NULL) { 7977 SOCKBUF_UNLOCK(sb); 7978 (void)m_free(m); 7979 error = ENOBUFS; 7980 sack_rxmit = 0; 7981 goto out; 7982 } 7983 } 7984 if ((tp->t_flags & TF_FORCEDATA) && len == 1) { 7985 TCPSTAT_INC(tcps_sndprobe); 7986 #ifdef NETFLIX_STATS 7987 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 7988 stats_voi_update_abs_u32(tp->t_stats, 7989 VOI_TCP_RETXPB, len); 7990 else 7991 stats_voi_update_abs_u64(tp->t_stats, 7992 VOI_TCP_TXPB, len); 7993 #endif 7994 } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 7995 if (rsm && (rsm->r_flags & RACK_TLP)) { 7996 /* 7997 * TLP should not count in retran count, but 7998 * in its own bin 7999 */ 8000 /* tp->t_sndtlppack++;*/ 8001 /* tp->t_sndtlpbyte += len;*/ 8002 counter_u64_add(rack_tlp_retran, 1); 8003 counter_u64_add(rack_tlp_retran_bytes, len); 8004 } else { 8005 tp->t_sndrexmitpack++; 8006 TCPSTAT_INC(tcps_sndrexmitpack); 8007 TCPSTAT_ADD(tcps_sndrexmitbyte, len); 8008 } 8009 #ifdef NETFLIX_STATS 8010 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 8011 len); 8012 #endif 8013 } else { 8014 TCPSTAT_INC(tcps_sndpack); 8015 TCPSTAT_ADD(tcps_sndbyte, len); 8016 #ifdef NETFLIX_STATS 8017 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 8018 len); 8019 #endif 8020 } 8021 /* 8022 * If we're sending everything we've got, set PUSH. (This 8023 * will keep happy those implementations which only give 8024 * data to the user when a buffer fills or a PUSH comes in.) 8025 */ 8026 if (sb_offset + len == sbused(sb) && 8027 sbused(sb) && 8028 !(flags & TH_SYN)) 8029 flags |= TH_PUSH; 8030 8031 /* 8032 * Are we doing hptsi, if so we must calculate the slot. We 8033 * only do hptsi in ESTABLISHED and with no RESET being 8034 * sent where we have data to send. 8035 */ 8036 if (((tp->t_state == TCPS_ESTABLISHED) || 8037 (tp->t_state == TCPS_CLOSE_WAIT) || 8038 ((tp->t_state == TCPS_FIN_WAIT_1) && 8039 ((tp->t_flags & TF_SENTFIN) == 0) && 8040 ((flags & TH_FIN) == 0))) && 8041 ((flags & TH_RST) == 0) && 8042 (rack->rc_always_pace)) { 8043 /* 8044 * We use the most optimistic possible cwnd/srtt for 8045 * sending calculations. This will make our 8046 * calculation anticipate getting more through 8047 * quicker then possible. But thats ok we don't want 8048 * the peer to have a gap in data sending. 
8049 */ 8050 uint32_t srtt, cwnd, tr_perms = 0; 8051 8052 if (rack->r_ctl.rc_rack_min_rtt) 8053 srtt = rack->r_ctl.rc_rack_min_rtt; 8054 else 8055 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); 8056 if (rack->r_ctl.rc_rack_largest_cwnd) 8057 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 8058 else 8059 cwnd = tp->snd_cwnd; 8060 tr_perms = cwnd / srtt; 8061 if (tr_perms == 0) { 8062 tr_perms = tp->t_maxseg; 8063 } 8064 tot_len_this_send += len; 8065 /* 8066 * Calculate how long this will take to drain, if 8067 * the calculation comes out to zero, thats ok we 8068 * will use send_a_lot to possibly spin around for 8069 * more increasing tot_len_this_send to the point 8070 * that its going to require a pace, or we hit the 8071 * cwnd. Which in that case we are just waiting for 8072 * a ACK. 8073 */ 8074 slot = tot_len_this_send / tr_perms; 8075 /* Now do we reduce the time so we don't run dry? */ 8076 if (slot && rack->rc_pace_reduce) { 8077 int32_t reduce; 8078 8079 reduce = (slot / rack->rc_pace_reduce); 8080 if (reduce < slot) { 8081 slot -= reduce; 8082 } else 8083 slot = 0; 8084 } 8085 if (rack->r_enforce_min_pace && 8086 (slot == 0) && 8087 (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { 8088 /* We are enforcing a minimum pace time of 1ms */ 8089 slot = rack->r_enforce_min_pace; 8090 } 8091 } 8092 SOCKBUF_UNLOCK(sb); 8093 } else { 8094 SOCKBUF_UNLOCK(sb); 8095 if (tp->t_flags & TF_ACKNOW) 8096 TCPSTAT_INC(tcps_sndacks); 8097 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 8098 TCPSTAT_INC(tcps_sndctrl); 8099 else if (SEQ_GT(tp->snd_up, tp->snd_una)) 8100 TCPSTAT_INC(tcps_sndurg); 8101 else 8102 TCPSTAT_INC(tcps_sndwinup); 8103 8104 m = m_gethdr(M_NOWAIT, MT_DATA); 8105 if (m == NULL) { 8106 error = ENOBUFS; 8107 sack_rxmit = 0; 8108 goto out; 8109 } 8110 #ifdef INET6 8111 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 8112 MHLEN >= hdrlen) { 8113 M_ALIGN(m, hdrlen); 8114 } else 8115 #endif 8116 m->m_data += max_linkhdr; 8117 m->m_len = hdrlen; 8118 } 8119 SOCKBUF_UNLOCK_ASSERT(sb); 8120 m->m_pkthdr.rcvif = (struct ifnet *)0; 8121 #ifdef MAC 8122 mac_inpcb_create_mbuf(inp, m); 8123 #endif 8124 #ifdef INET6 8125 if (isipv6) { 8126 ip6 = mtod(m, struct ip6_hdr *); 8127 #ifdef NETFLIX_TCP_O_UDP 8128 if (tp->t_port) { 8129 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 8130 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8131 udp->uh_dport = tp->t_port; 8132 ulen = hdrlen + len - sizeof(struct ip6_hdr); 8133 udp->uh_ulen = htons(ulen); 8134 th = (struct tcphdr *)(udp + 1); 8135 } else 8136 #endif 8137 th = (struct tcphdr *)(ip6 + 1); 8138 tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th); 8139 } else 8140 #endif /* INET6 */ 8141 { 8142 ip = mtod(m, struct ip *); 8143 #ifdef TCPDEBUG 8144 ipov = (struct ipovly *)ip; 8145 #endif 8146 #ifdef NETFLIX_TCP_O_UDP 8147 if (tp->t_port) { 8148 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 8149 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 8150 udp->uh_dport = tp->t_port; 8151 ulen = hdrlen + len - sizeof(struct ip); 8152 udp->uh_ulen = htons(ulen); 8153 th = (struct tcphdr *)(udp + 1); 8154 } else 8155 #endif 8156 th = (struct tcphdr *)(ip + 1); 8157 tcpip_fillheaders(inp,/*tp->t_port, */ ip, th); 8158 } 8159 /* 8160 * Fill in fields, remembering maximum advertised window for use in 8161 * delaying messages about window sizes. If resending a FIN, be sure 8162 * not to use a new sequence number. 
8163 */ 8164 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 8165 tp->snd_nxt == tp->snd_max) 8166 tp->snd_nxt--; 8167 /* 8168 * If we are starting a connection, send ECN setup SYN packet. If we 8169 * are on a retransmit, we may resend those bits a number of times 8170 * as per RFC 3168. 8171 */ 8172 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 8173 if (tp->t_rxtshift >= 1) { 8174 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 8175 flags |= TH_ECE | TH_CWR; 8176 } else 8177 flags |= TH_ECE | TH_CWR; 8178 } 8179 if (tp->t_state == TCPS_ESTABLISHED && 8180 (tp->t_flags & TF_ECN_PERMIT)) { 8181 /* 8182 * If the peer has ECN, mark data packets with ECN capable 8183 * transmission (ECT). Ignore pure ack packets, 8184 * retransmissions and window probes. 8185 */ 8186 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 8187 !((tp->t_flags & TF_FORCEDATA) && len == 1)) { 8188 #ifdef INET6 8189 if (isipv6) 8190 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 8191 else 8192 #endif 8193 ip->ip_tos |= IPTOS_ECN_ECT0; 8194 TCPSTAT_INC(tcps_ecn_ect0); 8195 } 8196 /* 8197 * Reply with proper ECN notifications. 8198 */ 8199 if (tp->t_flags & TF_ECN_SND_CWR) { 8200 flags |= TH_CWR; 8201 tp->t_flags &= ~TF_ECN_SND_CWR; 8202 } 8203 if (tp->t_flags & TF_ECN_SND_ECE) 8204 flags |= TH_ECE; 8205 } 8206 /* 8207 * If we are doing retransmissions, then snd_nxt will not reflect 8208 * the first unsent octet. For ACK only packets, we do not want the 8209 * sequence number of the retransmitted packet, we want the sequence 8210 * number of the next unsent octet. So, if there is no data (and no 8211 * SYN or FIN), use snd_max instead of snd_nxt when filling in 8212 * ti_seq. But if we are in persist state, snd_max might reflect 8213 * one byte beyond the right edge of the window, so use snd_nxt in 8214 * that case, since we know we aren't doing a retransmission. 8215 * (retransmit and persist are mutually exclusive...) 8216 */ 8217 if (sack_rxmit == 0) { 8218 if (len || (flags & (TH_SYN | TH_FIN)) || 8219 rack->rc_in_persist) { 8220 th->th_seq = htonl(tp->snd_nxt); 8221 rack_seq = tp->snd_nxt; 8222 } else if (flags & TH_RST) { 8223 /* 8224 * For a Reset send the last cum ack in sequence 8225 * (this like any other choice may still generate a 8226 * challenge ack, if a ack-update packet is in 8227 * flight). 8228 */ 8229 th->th_seq = htonl(tp->snd_una); 8230 rack_seq = tp->snd_una; 8231 } else { 8232 th->th_seq = htonl(tp->snd_max); 8233 rack_seq = tp->snd_max; 8234 } 8235 } else { 8236 th->th_seq = htonl(rsm->r_start); 8237 rack_seq = rsm->r_start; 8238 } 8239 th->th_ack = htonl(tp->rcv_nxt); 8240 if (optlen) { 8241 bcopy(opt, th + 1, optlen); 8242 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 8243 } 8244 th->th_flags = flags; 8245 /* 8246 * Calculate receive window. Don't shrink window, but avoid silly 8247 * window syndrome. 8248 */ 8249 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 8250 recwin < (long)tp->t_maxseg) 8251 recwin = 0; 8252 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 8253 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 8254 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 8255 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 8256 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 8257 8258 /* 8259 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 8260 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 8261 * handled in syncache. 
8262 */ 8263 if (flags & TH_SYN) 8264 th->th_win = htons((u_short) 8265 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 8266 else 8267 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 8268 /* 8269 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 8270 * window. This may cause the remote transmitter to stall. This 8271 * flag tells soreceive() to disable delayed acknowledgements when 8272 * draining the buffer. This can occur if the receiver is 8273 * attempting to read more data than can be buffered prior to 8274 * transmitting on the connection. 8275 */ 8276 if (th->th_win == 0) { 8277 tp->t_sndzerowin++; 8278 tp->t_flags |= TF_RXWIN0SENT; 8279 } else 8280 tp->t_flags &= ~TF_RXWIN0SENT; 8281 if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { 8282 th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); 8283 th->th_flags |= TH_URG; 8284 } else 8285 /* 8286 * If no urgent pointer to send, then we pull the urgent 8287 * pointer to the left edge of the send window so that it 8288 * doesn't drift into the send window on sequence number 8289 * wraparound. 8290 */ 8291 tp->snd_up = tp->snd_una; /* drag it along */ 8292 8293 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 8294 if (to.to_flags & TOF_SIGNATURE) { 8295 /* 8296 * Calculate MD5 signature and put it into the place 8297 * determined before. 8298 * NOTE: since TCP options buffer doesn't point into 8299 * mbuf's data, calculate offset and use it. 8300 */ 8301 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 8302 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 8303 /* 8304 * Do not send segment if the calculation of MD5 8305 * digest has failed. 8306 */ 8307 goto out; 8308 } 8309 } 8310 #endif 8311 8312 /* 8313 * Put TCP length in extended header, and then checksum extended 8314 * header and data. 8315 */ 8316 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 8317 #ifdef INET6 8318 if (isipv6) { 8319 /* 8320 * ip6_plen is not need to be filled now, and will be filled 8321 * in ip6_output. 
8322 */ 8323 #ifdef NETFLIX_TCP_O_UDP 8324 if (tp->t_port) { 8325 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 8326 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8327 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 8328 th->th_sum = htons(0); 8329 UDPSTAT_INC(udps_opackets); 8330 } else { 8331 #endif 8332 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 8333 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8334 th->th_sum = in6_cksum_pseudo(ip6, 8335 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 8336 0); 8337 #ifdef NETFLIX_TCP_O_UDP 8338 } 8339 #endif 8340 } 8341 #endif 8342 #if defined(INET6) && defined(INET) 8343 else 8344 #endif 8345 #ifdef INET 8346 { 8347 #ifdef NETFLIX_TCP_O_UDP 8348 if (tp->t_port) { 8349 m->m_pkthdr.csum_flags = CSUM_UDP; 8350 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 8351 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 8352 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 8353 th->th_sum = htons(0); 8354 UDPSTAT_INC(udps_opackets); 8355 } else { 8356 #endif 8357 m->m_pkthdr.csum_flags = CSUM_TCP; 8358 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 8359 th->th_sum = in_pseudo(ip->ip_src.s_addr, 8360 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 8361 IPPROTO_TCP + len + optlen)); 8362 #ifdef NETFLIX_TCP_O_UDP 8363 } 8364 #endif 8365 /* IP version must be set here for ipv4/ipv6 checking later */ 8366 KASSERT(ip->ip_v == IPVERSION, 8367 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 8368 } 8369 #endif 8370 8371 /* 8372 * Enable TSO and specify the size of the segments. The TCP pseudo 8373 * header checksum is always provided. XXX: Fixme: This is currently 8374 * not the case for IPv6. 8375 */ 8376 if (tso) { 8377 KASSERT(len > tp->t_maxseg - optlen, 8378 ("%s: len <= tso_segsz", __func__)); 8379 m->m_pkthdr.csum_flags |= CSUM_TSO; 8380 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 8381 } 8382 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 8383 KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL), 8384 ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u", 8385 __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL))); 8386 #else 8387 KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL), 8388 ("%s: mbuf chain shorter than expected: %d + %u + %u != %u", 8389 __func__, len, hdrlen, ipoptlen, m_length(m, NULL))); 8390 #endif 8391 8392 #ifdef TCP_HHOOK 8393 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 8394 hhook_run_tcp_est_out(tp, th, &to, len, tso); 8395 #endif 8396 8397 #ifdef TCPDEBUG 8398 /* 8399 * Trace. 8400 */ 8401 if (so->so_options & SO_DEBUG) { 8402 u_short save = 0; 8403 8404 #ifdef INET6 8405 if (!isipv6) 8406 #endif 8407 { 8408 save = ipov->ih_len; 8409 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 8410 * (th->th_off << 2) */ ); 8411 } 8412 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 8413 #ifdef INET6 8414 if (!isipv6) 8415 #endif 8416 ipov->ih_len = save; 8417 } 8418 #endif /* TCPDEBUG */ 8419 8420 /* We're getting ready to send; log now. 
*/ 8421 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 8422 union tcp_log_stackspecific log; 8423 8424 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 8425 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 8426 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 8427 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 8428 if (rsm || sack_rxmit) { 8429 log.u_bbr.flex8 = 1; 8430 } else { 8431 log.u_bbr.flex8 = 0; 8432 } 8433 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 8434 len, &log, false, NULL, NULL, 0, NULL); 8435 } else 8436 lgb = NULL; 8437 8438 /* 8439 * Fill in IP length and desired time to live and send to IP level. 8440 * There should be a better way to handle ttl and tos; we could keep 8441 * them in the template, but need a way to checksum without them. 8442 */ 8443 /* 8444 * m->m_pkthdr.len should have been set before cksum calcuration, 8445 * because in6_cksum() need it. 8446 */ 8447 #ifdef INET6 8448 if (isipv6) { 8449 /* 8450 * we separately set hoplimit for every segment, since the 8451 * user might want to change the value via setsockopt. Also, 8452 * desired default hop limit might be changed via Neighbor 8453 * Discovery. 8454 */ 8455 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 8456 8457 /* 8458 * Set the packet size here for the benefit of DTrace 8459 * probes. ip6_output() will set it properly; it's supposed 8460 * to include the option header lengths as well. 8461 */ 8462 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 8463 8464 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 8465 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8466 else 8467 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8468 8469 if (tp->t_state == TCPS_SYN_SENT) 8470 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 8471 8472 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 8473 /* TODO: IPv6 IP6TOS_ECT bit on */ 8474 error = ip6_output(m, tp->t_inpcb->in6p_outputopts, 8475 &inp->inp_route6, 8476 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 8477 NULL, NULL, inp); 8478 8479 if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL) 8480 mtu = inp->inp_route6.ro_rt->rt_mtu; 8481 } 8482 #endif /* INET6 */ 8483 #if defined(INET) && defined(INET6) 8484 else 8485 #endif 8486 #ifdef INET 8487 { 8488 ip->ip_len = htons(m->m_pkthdr.len); 8489 #ifdef INET6 8490 if (inp->inp_vflag & INP_IPV6PROTO) 8491 ip->ip_ttl = in6_selecthlim(inp, NULL); 8492 #endif /* INET6 */ 8493 /* 8494 * If we do path MTU discovery, then we set DF on every 8495 * packet. This might not be the best thing to do according 8496 * to RFC3390 Section 2. However the tcp hostcache migitates 8497 * the problem so it affects only the first tcp connection 8498 * with a host. 8499 * 8500 * NB: Don't set DF on small MTU/MSS to have a safe 8501 * fallback. 8502 */ 8503 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 8504 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8505 if (tp->t_port == 0 || len < V_tcp_minmss) { 8506 ip->ip_off |= htons(IP_DF); 8507 } 8508 } else { 8509 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8510 } 8511 8512 if (tp->t_state == TCPS_SYN_SENT) 8513 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 8514 8515 TCP_PROBE5(send, NULL, tp, ip, tp, th); 8516 8517 error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route, 8518 ((so->so_options & SO_DONTROUTE) ? 
IP_ROUTETOIF : 0), 0, 8519 inp); 8520 if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL) 8521 mtu = inp->inp_route.ro_rt->rt_mtu; 8522 } 8523 #endif /* INET */ 8524 8525 out: 8526 if (lgb) { 8527 lgb->tlb_errno = error; 8528 lgb = NULL; 8529 } 8530 /* 8531 * In transmit state, time the transmission and arrange for the 8532 * retransmit. In persist state, just set snd_max. 8533 */ 8534 if (error == 0) { 8535 if (TCPS_HAVEESTABLISHED(tp->t_state) && 8536 (tp->t_flags & TF_SACK_PERMIT) && 8537 tp->rcv_numsacks > 0) 8538 tcp_clean_dsack_blocks(tp); 8539 if (len == 0) 8540 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 8541 else if (len == 1) { 8542 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8543 } else if (len > 1) { 8544 int idx; 8545 8546 idx = (len / tp->t_maxseg) + 3; 8547 if (idx >= TCP_MSS_ACCT_ATIMER) 8548 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 8549 else 8550 counter_u64_add(rack_out_size[idx], 1); 8551 } 8552 } 8553 if (sub_from_prr && (error == 0)) { 8554 if (rack->r_ctl.rc_prr_sndcnt >= len) 8555 rack->r_ctl.rc_prr_sndcnt -= len; 8556 else 8557 rack->r_ctl.rc_prr_sndcnt = 0; 8558 } 8559 sub_from_prr = 0; 8560 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 8561 pass, rsm); 8562 if ((tp->t_flags & TF_FORCEDATA) == 0 || 8563 (rack->rc_in_persist == 0)) { 8564 #ifdef NETFLIX_STATS 8565 tcp_seq startseq = tp->snd_nxt; 8566 #endif 8567 /* 8568 * Advance snd_nxt over sequence space of this segment. 8569 */ 8570 if (error) 8571 /* We don't log or do anything with errors */ 8572 goto timer; 8573 8574 if (flags & (TH_SYN | TH_FIN)) { 8575 if (flags & TH_SYN) 8576 tp->snd_nxt++; 8577 if (flags & TH_FIN) { 8578 tp->snd_nxt++; 8579 tp->t_flags |= TF_SENTFIN; 8580 } 8581 } 8582 /* In the ENOBUFS case we do *not* update snd_max */ 8583 if (sack_rxmit) 8584 goto timer; 8585 8586 tp->snd_nxt += len; 8587 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 8588 if (tp->snd_una == tp->snd_max) { 8589 /* 8590 * Update the time we just added data since 8591 * none was outstanding. 8592 */ 8593 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8594 tp->t_acktime = ticks; 8595 } 8596 tp->snd_max = tp->snd_nxt; 8597 #ifdef NETFLIX_STATS 8598 if (!(tp->t_flags & TF_GPUTINPROG) && len) { 8599 tp->t_flags |= TF_GPUTINPROG; 8600 tp->gput_seq = startseq; 8601 tp->gput_ack = startseq + 8602 ulmin(sbavail(sb) - sb_offset, sendwin); 8603 tp->gput_ts = tcp_ts_getticks(); 8604 } 8605 #endif 8606 } 8607 /* 8608 * Set retransmit timer if not currently set, and not doing 8609 * a pure ack or a keep-alive probe. Initial value for 8610 * retransmit timer is smoothed round-trip time + 2 * 8611 * round-trip time variance. Initialize shift counter which 8612 * is used for backoff of retransmit time. 8613 */ 8614 timer: 8615 if ((tp->snd_wnd == 0) && 8616 TCPS_HAVEESTABLISHED(tp->t_state)) { 8617 /* 8618 * If the persists timer was set above (right before 8619 * the goto send), and still needs to be on. Lets 8620 * make sure all is canceled. If the persist timer 8621 * is not running, we want to get it up. 8622 */ 8623 if (rack->rc_in_persist == 0) { 8624 rack_enter_persist(tp, rack, cts); 8625 } 8626 } 8627 } else { 8628 /* 8629 * Persist case, update snd_max but since we are in persist 8630 * mode (no window) we do not update snd_nxt. 
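 * xlen below also counts SYN and FIN sequence space, so the snd_max
 * comparison reflects everything actually placed on the wire.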
8631 */ 8632 int32_t xlen = len; 8633 8634 if (error) 8635 goto nomore; 8636 8637 if (flags & TH_SYN) 8638 ++xlen; 8639 if (flags & TH_FIN) { 8640 ++xlen; 8641 tp->t_flags |= TF_SENTFIN; 8642 } 8643 /* In the ENOBUFS case we do *not* update snd_max */ 8644 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 8645 if (tp->snd_una == tp->snd_max) { 8646 /* 8647 * Update the time we just added data since 8648 * none was outstanding. 8649 */ 8650 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 8651 tp->t_acktime = ticks; 8652 } 8653 tp->snd_max = tp->snd_nxt + len; 8654 } 8655 } 8656 nomore: 8657 if (error) { 8658 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 8659 /* 8660 * Failures do not advance the seq counter above. For the 8661 * case of ENOBUFS we will fall out and retry in 1ms with 8662 * the hpts. Everything else will just have to retransmit 8663 * with the timer. 8664 * 8665 * In any case, we do not want to loop around for another 8666 * send without a good reason. 8667 */ 8668 sendalot = 0; 8669 switch (error) { 8670 case EPERM: 8671 tp->t_flags &= ~TF_FORCEDATA; 8672 tp->t_softerror = error; 8673 return (error); 8674 case ENOBUFS: 8675 if (slot == 0) { 8676 /* 8677 * Pace us right away to retry in a some 8678 * time 8679 */ 8680 slot = 1 + rack->rc_enobuf; 8681 if (rack->rc_enobuf < 255) 8682 rack->rc_enobuf++; 8683 if (slot > (rack->rc_rack_rtt / 2)) { 8684 slot = rack->rc_rack_rtt / 2; 8685 } 8686 if (slot < 10) 8687 slot = 10; 8688 } 8689 counter_u64_add(rack_saw_enobuf, 1); 8690 error = 0; 8691 goto enobufs; 8692 case EMSGSIZE: 8693 /* 8694 * For some reason the interface we used initially 8695 * to send segments changed to another or lowered 8696 * its MTU. If TSO was active we either got an 8697 * interface without TSO capabilits or TSO was 8698 * turned off. If we obtained mtu from ip_output() 8699 * then update it and try again. 8700 */ 8701 if (tso) 8702 tp->t_flags &= ~TF_TSO; 8703 if (mtu != 0) { 8704 tcp_mss_update(tp, -1, mtu, NULL, NULL); 8705 goto again; 8706 } 8707 slot = 10; 8708 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8709 tp->t_flags &= ~TF_FORCEDATA; 8710 return (error); 8711 case ENETUNREACH: 8712 counter_u64_add(rack_saw_enetunreach, 1); 8713 case EHOSTDOWN: 8714 case EHOSTUNREACH: 8715 case ENETDOWN: 8716 if (TCPS_HAVERCVDSYN(tp->t_state)) { 8717 tp->t_softerror = error; 8718 } 8719 /* FALLTHROUGH */ 8720 default: 8721 slot = 10; 8722 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); 8723 tp->t_flags &= ~TF_FORCEDATA; 8724 return (error); 8725 } 8726 } else { 8727 rack->rc_enobuf = 0; 8728 } 8729 TCPSTAT_INC(tcps_sndtotal); 8730 8731 /* 8732 * Data sent (as far as we can tell). If this advertises a larger 8733 * window than any other segment, then remember the size of the 8734 * advertised window. Any pending ACK has now been sent. 8735 */ 8736 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 8737 tp->rcv_adv = tp->rcv_nxt + recwin; 8738 tp->last_ack_sent = tp->rcv_nxt; 8739 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 8740 enobufs: 8741 rack->r_tlp_running = 0; 8742 if ((flags & TH_RST) || (would_have_fin == 1)) { 8743 /* 8744 * We don't send again after a RST. We also do *not* send 8745 * again if we would have had a find, but now have 8746 * outstanding data. 
8747 */ 8748 slot = 0; 8749 sendalot = 0; 8750 } 8751 if (slot) { 8752 /* set the rack tcb into the slot N */ 8753 counter_u64_add(rack_paced_segments, 1); 8754 } else if (sendalot) { 8755 if (len) 8756 counter_u64_add(rack_unpaced_segments, 1); 8757 sack_rxmit = 0; 8758 tp->t_flags &= ~TF_FORCEDATA; 8759 goto again; 8760 } else if (len) { 8761 counter_u64_add(rack_unpaced_segments, 1); 8762 } 8763 tp->t_flags &= ~TF_FORCEDATA; 8764 rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); 8765 return (error); 8766 } 8767 8768 /* 8769 * rack_ctloutput() must drop the inpcb lock before performing copyin on 8770 * socket option arguments. When it re-acquires the lock after the copy, it 8771 * has to revalidate that the connection is still valid for the socket 8772 * option. 8773 */ 8774 static int 8775 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 8776 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 8777 { 8778 int32_t error = 0, optval; 8779 8780 switch (sopt->sopt_name) { 8781 case TCP_RACK_PROP_RATE: 8782 case TCP_RACK_PROP: 8783 case TCP_RACK_TLP_REDUCE: 8784 case TCP_RACK_EARLY_RECOV: 8785 case TCP_RACK_PACE_ALWAYS: 8786 case TCP_DELACK: 8787 case TCP_RACK_PACE_REDUCE: 8788 case TCP_RACK_PACE_MAX_SEG: 8789 case TCP_RACK_PRR_SENDALOT: 8790 case TCP_RACK_MIN_TO: 8791 case TCP_RACK_EARLY_SEG: 8792 case TCP_RACK_REORD_THRESH: 8793 case TCP_RACK_REORD_FADE: 8794 case TCP_RACK_TLP_THRESH: 8795 case TCP_RACK_PKT_DELAY: 8796 case TCP_RACK_TLP_USE: 8797 case TCP_RACK_TLP_INC_VAR: 8798 case TCP_RACK_IDLE_REDUCE_HIGH: 8799 case TCP_RACK_MIN_PACE: 8800 case TCP_RACK_MIN_PACE_SEG: 8801 case TCP_BBR_RACK_RTT_USE: 8802 case TCP_DATA_AFTER_CLOSE: 8803 break; 8804 default: 8805 return (tcp_default_ctloutput(so, sopt, inp, tp)); 8806 break; 8807 } 8808 INP_WUNLOCK(inp); 8809 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 8810 if (error) 8811 return (error); 8812 INP_WLOCK(inp); 8813 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 8814 INP_WUNLOCK(inp); 8815 return (ECONNRESET); 8816 } 8817 tp = intotcpcb(inp); 8818 rack = (struct tcp_rack *)tp->t_fb_ptr; 8819 switch (sopt->sopt_name) { 8820 case TCP_RACK_PROP_RATE: 8821 if ((optval <= 0) || (optval >= 100)) { 8822 error = EINVAL; 8823 break; 8824 } 8825 RACK_OPTS_INC(tcp_rack_prop_rate); 8826 rack->r_ctl.rc_prop_rate = optval; 8827 break; 8828 case TCP_RACK_TLP_USE: 8829 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 8830 error = EINVAL; 8831 break; 8832 } 8833 RACK_OPTS_INC(tcp_tlp_use); 8834 rack->rack_tlp_threshold_use = optval; 8835 break; 8836 case TCP_RACK_PROP: 8837 /* RACK proportional rate reduction (bool) */ 8838 RACK_OPTS_INC(tcp_rack_prop); 8839 rack->r_ctl.rc_prop_reduce = optval; 8840 break; 8841 case TCP_RACK_TLP_REDUCE: 8842 /* RACK TLP cwnd reduction (bool) */ 8843 RACK_OPTS_INC(tcp_rack_tlp_reduce); 8844 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 8845 break; 8846 case TCP_RACK_EARLY_RECOV: 8847 /* Should recovery happen early (bool) */ 8848 RACK_OPTS_INC(tcp_rack_early_recov); 8849 rack->r_ctl.rc_early_recovery = optval; 8850 break; 8851 case TCP_RACK_PACE_ALWAYS: 8852 /* Use the always pace method (bool) */ 8853 RACK_OPTS_INC(tcp_rack_pace_always); 8854 if (optval > 0) 8855 rack->rc_always_pace = 1; 8856 else 8857 rack->rc_always_pace = 0; 8858 break; 8859 case TCP_RACK_PACE_REDUCE: 8860 /* RACK Hptsi reduction factor (divisor) */ 8861 RACK_OPTS_INC(tcp_rack_pace_reduce); 8862 if (optval) 8863 /* Must be non-zero */ 8864 rack->rc_pace_reduce = optval; 8865 

static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	int32_t error, optval;

	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	switch (sopt->sopt_name) {
	case TCP_RACK_PROP_RATE:
		optval = rack->r_ctl.rc_prop_rate;
		break;
	case TCP_RACK_PROP:
		/* RACK proportional rate reduction (bool) */
		optval = rack->r_ctl.rc_prop_reduce;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_RACK_EARLY_RECOV:
		/* Should recovery happen early (bool) */
		optval = rack->r_ctl.rc_early_recovery;
		break;
	case TCP_RACK_PACE_REDUCE:
		/* RACK Hptsi reduction factor (divisor) */
		optval = rack->rc_pace_reduce;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_pace_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_EARLY_SEG:
		/* If early recovery max segments */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_TLP_INC_VAR:
		/* Does TLP include rtt variance in t-o */
		optval = rack->r_ctl.rc_prr_inc_var;
		break;
	case TCP_RACK_IDLE_REDUCE_HIGH:
		optval = rack->r_idle_reduce_largest;
		break;
	case TCP_RACK_MIN_PACE:
		optval = rack->r_enforce_min_pace;
		break;
	case TCP_RACK_MIN_PACE_SEG:
		optval = rack->r_min_pace_seg_thresh;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
	INP_WUNLOCK(inp);
	error = sooptcopyout(sopt, &optval, sizeof optval);
	return (error);
}
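
/*
 * Companion sketch (again illustrative, not compiled here): reading a
 * value back goes through rack_get_sockopt() above, which copies the
 * current setting out as a single int.
 *
 *	int val;
 *	socklen_t len = sizeof(val);
 *
 *	if (getsockopt(s, IPPROTO_TCP, TCP_RACK_REORD_THRESH, &val, &len) == 0)
 *		printf("reorder threshold shift: %d\n", val);
 */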

static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		/* Huh? */
		goto out;
	}
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(so, sopt, inp, tp, rack));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(so, sopt, inp, tp, rack));
	}
out:
	INP_WUNLOCK(inp);
	return (error);
}
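
/*
 * The function block below is the table that ties the routines in this
 * file into the TCP stack-switching framework. It is registered under
 * STACKNAME (and optionally STACKALIAS) by tcp_addrack() below; once
 * registered, a connection can typically be handed to this stack
 * per-socket via the TCP_FUNCTION_BLK socket option.
 */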

struct tcp_function_block __tcp_rack = {
	.tfb_tcp_block_name = __XSTRING(STACKNAME),
	.tfb_tcp_output = rack_output,
	.tfb_tcp_do_segment = rack_do_segment,
	.tfb_tcp_ctloutput = rack_ctloutput,
	.tfb_tcp_fb_init = rack_init,
	.tfb_tcp_fb_fini = rack_fini,
	.tfb_tcp_timer_stop_all = rack_stopall,
	.tfb_tcp_timer_activate = rack_timer_activate,
	.tfb_tcp_timer_active = rack_timer_active,
	.tfb_tcp_timer_stop = rack_timer_stop,
	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
	.tfb_tcp_handoff_ok = rack_handoff_ok
};

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

/* UMA zone constructor: hand out zeroed items. */
static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

/* UMA zone destructor: nothing to release. */
static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

/*
 * Module event handler: create the UMA zones and sysctl tree and register
 * the stack name(s) on load; tear everything down again on unload.
 */
static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
		    __XSTRING(STACKNAME),
		    CTLFLAG_RW, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
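
/*
 * Usage note (illustrative, assuming MODNAME and STACKNAME keep their
 * usual build values of "tcp_rack" and "rack"): loading and selecting the
 * stack would normally look something like
 *
 *	kldload tcp_rack
 *	sysctl net.inet.tcp.functions_available
 *	sysctl net.inet.tcp.functions_default=rack
 *
 * The MODULE_DEPEND() above pulls in tcphpts because the pacing and timer
 * paths in this stack schedule themselves through the high-precision
 * timer system (tcp_hpts).
 */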