/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/arb.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/qmath.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/stats.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_rss.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_syncache.h>
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netinet/tcp_ecn.h>
#include <netinet/udp.h>

#include <netipsec/ipsec_support.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

const int tcprexmtthresh = 3;

VNET_DEFINE(int, tcp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_log_in_vain), 0,
    "Log all incoming TCP segments to closed ports");

VNET_DEFINE(int, tcp_bind_all_fibs) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_bind_all_fibs), 0,
    "Bound sockets receive traffic from all FIBs");

VNET_DEFINE(int, blackhole) = 0;
#define	V_blackhole		VNET(blackhole)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(blackhole), 0,
    "Do not send RST on segments to closed ports");

VNET_DEFINE(bool, blackhole_local) = false;
#define	V_blackhole_local	VNET(blackhole_local)
SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(blackhole_local), false,
    "Enforce net.inet.tcp.blackhole for locally originated packets");

VNET_DEFINE(int, tcp_delack_enabled) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_delack_enabled), 0,
    "Delay ACK to try and piggyback it onto a data packet");

VNET_DEFINE(int, drop_synfin) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(drop_synfin), 0,
    "Drop TCP packets with SYN+FIN set");

VNET_DEFINE(int, tcp_do_prr) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_prr), 1,
    "Enable Proportional Rate Reduction per RFC 6937");

VNET_DEFINE(int, tcp_do_newcwv) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_newcwv), 0,
    "Enable New Congestion Window Validation per RFC 7661");

VNET_DEFINE(int, tcp_do_rfc3042) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3042), 0,
    "Enable RFC 3042 (Limited Transmit)");

VNET_DEFINE(int, tcp_do_rfc3390) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3390), 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
    "Slow-start flight size (initial congestion window) in number of segments");

VNET_DEFINE(int, tcp_do_rfc3465) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3465), 0,
    "Enable RFC 3465 (Appropriate Byte Counting)");

VNET_DEFINE(int, tcp_abc_l_var) = 2;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_abc_l_var), 2,
    "Cap the max cwnd increment during slow-start to this number of segments");

VNET_DEFINE(int, tcp_insecure_syn) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_syn), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");

VNET_DEFINE(int, tcp_insecure_rst) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_rst), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");

VNET_DEFINE(int, tcp_insecure_ack) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_ack), 0,
    "Follow RFC793 criteria for validating SEG.ACK");

VNET_DEFINE(int, tcp_recvspace) = 1024*64;
#define	V_tcp_recvspace	VNET(tcp_recvspace)
SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");

VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_autorcvbuf), 0,
    "Enable automatic receive buffer sizing");

VNET_DEFINE(int, tcp_autorcvbuf_max) = 8*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_autorcvbuf_max), 0,
    "Max size of automatic receive buffer");

VNET_DEFINE(struct inpcbinfo, tcbinfo);
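
/*
 * Illustrative usage (not part of the code): the knobs above are regular
 * sysctl(8) variables, e.g. "sysctl net.inet.tcp.blackhole=2" suppresses
 * RSTs for any segment arriving at a closed port, while
 * "sysctl net.inet.tcp.log_in_vain=1" logs connection attempts (SYNs) to
 * closed ports.  All of them are per-VNET.
 */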

/*
 * TCP statistics are stored in an array of counter(9)s, whose size matches
 * that of struct tcpstat.  The running connection count is a regular array.
 */
VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
    tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
    CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
    "TCP connection counts by TCP state");

/*
 * Kernel module interface for updating tcpstat.  The first argument is an
 * index into tcpstat treated as an array.
 */
void
kmod_tcpstat_add(int statnum, int val)
{

	counter_u64_add(VNET(tcpstat)[statnum], val);
}

/*
 * Make sure that we only start a SACK loss recovery when
 * receiving a duplicate ACK with a SACK block, and also
 * complete SACK loss recovery in case the other end
 * reneges.
 */
static bool inline
tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to)
{
	return ((tp->t_flags & TF_SACK_PERMIT) &&
	    ((to->to_flags & TOF_SACK) ||
	    (!TAILQ_EMPTY(&tp->snd_holes))));
}

#ifdef TCP_HHOOK
/*
 * Wrapper for the TCP established input helper hook.
 */
void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	struct tcp_hhook_data hhook_data;

	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
		hhook_data.tp = tp;
		hhook_data.th = th;
		hhook_data.to = to;

		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
		    &tp->t_osd);
	}
}
#endif

/*
 * CC wrapper hook functions
 */
void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
    uint16_t type)
{
#ifdef STATS
	int32_t gput;
#endif

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tp->t_ccv.nsegs = nsegs;
	tp->t_ccv.bytes_this_ack = BYTES_THIS_ACK(tp, th);
	if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
	    (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
	    (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2))))
		tp->t_ccv.flags |= CCF_CWND_LIMITED;
	else
		tp->t_ccv.flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
		if (!IN_RECOVERY(tp->t_flags))
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
			    tp->t_ccv.bytes_this_ack / (tcp_maxseg(tp) * nsegs));
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			/*
			 * Compute goodput in bits per millisecond.
			 */
			gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
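			/*
			 * Worked example (illustrative): 125000 bytes
			 * newly acked over 100 ms of timestamp ticks
			 * gives (125000 << 3) / 100 = 10000 bits/ms,
			 * i.e. roughly 10 Mbit/s of goodput.
			 */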
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an API
			 * to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
		}
#endif /* STATS */
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += tp->t_ccv.bytes_this_ack;
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->t_ccv.flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}

	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
	}
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
}

void
cc_conn_init(struct tcpcb *tp)
{
	struct hc_metrics_lite metrics;
	struct inpcb *inp = tptoinpcb(tp);
	u_int maxseg;
	int rtt;

	INP_WLOCK_ASSERT(inp);

	tcp_hc_get(&inp->inp_inc, &metrics);
	maxseg = tcp_maxseg(tp);

	if (tp->t_srtt == 0 && (rtt = metrics.hc_rtt)) {
		tp->t_srtt = rtt;
		TCPSTAT_INC(tcps_usedrtt);
		if (metrics.hc_rttvar) {
			tp->t_rttvar = metrics.hc_rttvar;
			TCPSTAT_INC(tcps_usedrttvar);
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, tcp_rexmit_max);
	}
	if (metrics.hc_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * maxseg, metrics.hc_ssthresh);
		TCPSTAT_INC(tcps_usedssthresh);
	}

	/*
	 * Set the initial slow-start flight size.
	 *
	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
	 * reduce the initial CWND to one segment as congestion is likely
	 * requiring us to be cautious.
	 */
	if (tp->snd_cwnd == 1)
		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
	else
		tp->snd_cwnd = tcp_compute_initwnd(maxseg);

	if (CC_ALGO(tp)->conn_init != NULL)
		CC_ALGO(tp)->conn_init(&tp->t_ccv);
}

void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif

	switch(type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags) ||
		    /*
		     * Allow ECN reaction on ACK to CWR, if
		     * that data segment was also CE marked.
		     */
		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
			EXIT_CONGRECOVERY(tp->t_flags);
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max + 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		if (tp->t_flags2 & TF2_ECN_PERMIT)
			tp->t_flags2 |= TF2_ECN_SND_CWR;
		break;
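	/*
	 * CC_RTO_ERR: a retransmission timeout was later judged spurious
	 * (e.g. by the timestamp-based bad-retransmit check in the header
	 * prediction path); roll the congestion state back to the values
	 * saved before the RTO fired.
	 */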
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_flags &= ~TF_PREVVALID;
		tp->t_rxtshift = 0;
		tp->t_badrxtwin = 0;
		break;
	}
	if (SEQ_LT(tp->snd_fack, tp->snd_una) ||
	    SEQ_GT(tp->snd_fack, tp->snd_max)) {
		tp->snd_fack = tp->snd_una;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
	}
}

void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->post_recovery != NULL) {
		if (SEQ_LT(tp->snd_fack, th->th_ack) ||
		    SEQ_GT(tp->snd_fack, tp->snd_max)) {
			tp->snd_fack = th->th_ack;
		}
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(&tp->t_ccv);
	}
	EXIT_RECOVERY(tp->t_flags);

	tp->t_bytes_acked = 0;
	tp->sackhint.delivered_data = 0;
	tp->sackhint.prr_delivered = 0;
	tp->sackhint.prr_out = 0;
}

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * the following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that the
 *	  segment size is not larger than the MSS.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))

void inline
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_ccv.flags |= CCF_IPHDR_CE;
			break;
		case IPTOS_ECN_ECT0:
			/* FALLTHROUGH */
		case IPTOS_ECN_ECT1:
			/* FALLTHROUGH */
		case IPTOS_ECN_NOTECT:
			tp->t_ccv.flags &= ~CCF_IPHDR_CE;
			break;
		}

		if (flags & TH_CWR)
			tp->t_ccv.flags |= CCF_TCPHDR_CWR;
		else
			tp->t_ccv.flags &= ~CCF_TCPHDR_CWR;

		CC_ALGO(tp)->ecnpkt_handler(&tp->t_ccv);

		if (tp->t_ccv.flags & CCF_ACKNOW) {
			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
			tp->t_flags |= TF_ACKNOW;
		}
	}
}

void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
	cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos);
}

/*
 * TCP input handling is split into multiple parts:
 *	tcp6_input is a thin wrapper around tcp_input for the extended
 *	ip6_protox[] call format in ip6_input
 *	tcp_input handles primary segment validation, inpcb lookup and
 *	SYN processing on listen sockets
 *	tcp_do_segment processes the ACK and text of the segment for
 *	establishing, established and closing connections
 */
#ifdef INET6
int
tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m;

	m = *mp;
	if (m->m_len < *offp + sizeof(struct tcphdr)) {
		m = m_pullup(m, *offp + sizeof(struct tcphdr));
		if (m == NULL) {
			*mp = m;
			TCPSTAT_INC(tcps_rcvshort);
			return (IPPROTO_DONE);
		}
	}

	*mp = m;
	return (tcp_input_with_port(mp, offp, proto, port));
}

int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{

	return (tcp6_input_with_port(mp, offp, proto, 0));
}
#endif /* INET6 */

int
tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m = *mp;
	struct tcphdr *th = NULL;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int off0;
	int optlen = 0;
#ifdef INET
	int len;
	uint8_t ipttl;
#endif
	int tlen = 0, off;
	int drop_hdrlen;
	int thflags;
	int lookupflag;
	uint8_t iptos;
	struct m_tag *fwd_tag = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#else
	const void *ip6 = NULL;
#endif /* INET6 */
	struct tcpopt to;		/* options in this segment */
	char *s = NULL;			/* address and port logging */
	bool closed_port = false;	/* segment is hitting a closed port */

	NET_EPOCH_ASSERT();

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif

	off0 = *offp;
	m = *mp;
	*mp = NULL;
	to.to_flags = 0;
	TCPSTAT_INC(tcps_rcvtotal);

	m->m_pkthdr.tcp_tun_port = port;
#ifdef INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)((caddr_t)ip6 + off0);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (port)
			goto skip6_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in6_cksum_pseudo(ip6, tlen,
				    IPPROTO_TCP, m->m_pkthdr.csum_data);
			th->th_sum ^= 0xffff;
		} else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) {
			/*
			 * Packet from local host (maybe from a VM).
			 * Checksum not required.
			 */
			th->th_sum = 0;
		} else
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
skip6_csum:
		/*
		 * Be proactive about an unspecified IPv6 address in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 address can be used
		 * to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		KASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst),
		    ("%s: unspecified destination v6 address", __func__));
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			IP6STAT_INC(ip6s_badscope); /* XXX */
			goto drop;
		}
		iptos = IPV6_TRAFFIC_CLASS(ip6);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
689 */ 690 if (off0 > sizeof (struct ip)) { 691 ip_stripoptions(m); 692 off0 = sizeof(struct ip); 693 } 694 if (m->m_len < sizeof (struct tcpiphdr)) { 695 if ((m = m_pullup(m, sizeof (struct tcpiphdr))) 696 == NULL) { 697 TCPSTAT_INC(tcps_rcvshort); 698 return (IPPROTO_DONE); 699 } 700 } 701 ip = mtod(m, struct ip *); 702 th = (struct tcphdr *)((caddr_t)ip + off0); 703 tlen = ntohs(ip->ip_len) - off0; 704 705 iptos = ip->ip_tos; 706 if (port) 707 goto skip_csum; 708 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { 709 if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) 710 th->th_sum = m->m_pkthdr.csum_data; 711 else 712 th->th_sum = in_pseudo(ip->ip_src.s_addr, 713 ip->ip_dst.s_addr, 714 htonl(m->m_pkthdr.csum_data + tlen + 715 IPPROTO_TCP)); 716 th->th_sum ^= 0xffff; 717 } else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) { 718 /* 719 * Packet from local host (maybe from a VM). 720 * Checksum not required. 721 */ 722 th->th_sum = 0; 723 } else { 724 struct ipovly *ipov = (struct ipovly *)ip; 725 726 /* 727 * Checksum extended TCP header and data. 728 */ 729 len = off0 + tlen; 730 ipttl = ip->ip_ttl; 731 bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); 732 ipov->ih_len = htons(tlen); 733 th->th_sum = in_cksum(m, len); 734 /* Reset length for SDT probes. */ 735 ip->ip_len = htons(len); 736 /* Reset TOS bits */ 737 ip->ip_tos = iptos; 738 /* Re-initialization for later version check */ 739 ip->ip_ttl = ipttl; 740 ip->ip_v = IPVERSION; 741 ip->ip_hl = off0 >> 2; 742 } 743 skip_csum: 744 if (th->th_sum && (port == 0)) { 745 TCPSTAT_INC(tcps_rcvbadsum); 746 goto drop; 747 } 748 KASSERT(ip->ip_dst.s_addr != INADDR_ANY, 749 ("%s: unspecified destination v4 address", __func__)); 750 if (__predict_false(ip->ip_src.s_addr == INADDR_ANY)) { 751 IPSTAT_INC(ips_badaddr); 752 goto drop; 753 } 754 } 755 #endif /* INET */ 756 757 /* 758 * Check that TCP offset makes sense, 759 * pull out TCP options and adjust length. XXX 760 */ 761 off = th->th_off << 2; 762 if (off < sizeof (struct tcphdr) || off > tlen) { 763 TCPSTAT_INC(tcps_rcvbadoff); 764 goto drop; 765 } 766 tlen -= off; /* tlen is used instead of ti->ti_len */ 767 if (off > sizeof (struct tcphdr)) { 768 #ifdef INET6 769 if (isipv6) { 770 if (m->m_len < off0 + off) { 771 m = m_pullup(m, off0 + off); 772 if (m == NULL) { 773 TCPSTAT_INC(tcps_rcvshort); 774 return (IPPROTO_DONE); 775 } 776 } 777 ip6 = mtod(m, struct ip6_hdr *); 778 th = (struct tcphdr *)((caddr_t)ip6 + off0); 779 } 780 #endif 781 #if defined(INET) && defined(INET6) 782 else 783 #endif 784 #ifdef INET 785 { 786 if (m->m_len < sizeof(struct ip) + off) { 787 if ((m = m_pullup(m, sizeof (struct ip) + off)) 788 == NULL) { 789 TCPSTAT_INC(tcps_rcvshort); 790 return (IPPROTO_DONE); 791 } 792 ip = mtod(m, struct ip *); 793 th = (struct tcphdr *)((caddr_t)ip + off0); 794 } 795 } 796 #endif 797 optlen = off - sizeof (struct tcphdr); 798 optp = (u_char *)(th + 1); 799 } 800 thflags = tcp_get_flags(th); 801 802 /* 803 * Convert TCP protocol specific fields to host format. 804 */ 805 tcp_fields_to_host(th); 806 807 /* 808 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options. 809 */ 810 drop_hdrlen = off0 + off; 811 812 /* 813 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
814 */ 815 if ( 816 #ifdef INET6 817 (isipv6 && (m->m_flags & M_IP6_NEXTHOP)) 818 #ifdef INET 819 || (!isipv6 && (m->m_flags & M_IP_NEXTHOP)) 820 #endif 821 #endif 822 #if defined(INET) && !defined(INET6) 823 (m->m_flags & M_IP_NEXTHOP) 824 #endif 825 ) 826 fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); 827 828 /* 829 * For initial SYN packets we don't need write lock on matching 830 * PCB, be it a listening one or a synchronized one. The packet 831 * shall not modify its state. 832 */ 833 lookupflag = INPLOOKUP_WILDCARD | 834 ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ? 835 INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) | 836 (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB); 837 findpcb: 838 tp = NULL; 839 #ifdef INET6 840 if (isipv6 && fwd_tag != NULL) { 841 struct sockaddr_in6 *next_hop6; 842 843 next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1); 844 /* 845 * Transparently forwarded. Pretend to be the destination. 846 * Already got one like this? 847 */ 848 inp = in6_pcblookup_mbuf(&V_tcbinfo, 849 &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport, 850 lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m); 851 if (!inp) { 852 /* 853 * It's new. Try to find the ambushing socket. 854 * Because we've rewritten the destination address, 855 * any hardware-generated hash is ignored. 856 */ 857 inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src, 858 th->th_sport, &next_hop6->sin6_addr, 859 next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) : 860 th->th_dport, lookupflag, m->m_pkthdr.rcvif); 861 } 862 } else if (isipv6) { 863 inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src, 864 th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag, 865 m->m_pkthdr.rcvif, m); 866 } 867 #endif /* INET6 */ 868 #if defined(INET6) && defined(INET) 869 else 870 #endif 871 #ifdef INET 872 if (fwd_tag != NULL) { 873 struct sockaddr_in *next_hop; 874 875 next_hop = (struct sockaddr_in *)(fwd_tag+1); 876 /* 877 * Transparently forwarded. Pretend to be the destination. 878 * already got one like this? 879 */ 880 inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport, 881 ip->ip_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD, 882 m->m_pkthdr.rcvif, m); 883 if (!inp) { 884 /* 885 * It's new. Try to find the ambushing socket. 886 * Because we've rewritten the destination address, 887 * any hardware-generated hash is ignored. 888 */ 889 inp = in_pcblookup(&V_tcbinfo, ip->ip_src, 890 th->th_sport, next_hop->sin_addr, 891 next_hop->sin_port ? ntohs(next_hop->sin_port) : 892 th->th_dport, lookupflag, m->m_pkthdr.rcvif); 893 } 894 } else 895 inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, 896 th->th_sport, ip->ip_dst, th->th_dport, lookupflag, 897 m->m_pkthdr.rcvif, m); 898 #endif /* INET */ 899 900 /* 901 * If the INPCB does not exist then all data in the incoming 902 * segment is discarded and an appropriate RST is sent back. 903 * XXX MRT Send RST using which routing table? 904 */ 905 if (inp == NULL) { 906 if ((lookupflag & INPLOOKUP_WILDCARD) == 0) { 907 /* We came here after second (safety) lookup. */ 908 MPASS(!closed_port); 909 } else { 910 /* 911 * Log communication attempts to ports that are not 912 * in use. 
913 */ 914 if (((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) || 915 V_tcp_log_in_vain == 2) && 916 (s = tcp_log_vain(NULL, th, (void *)ip, ip6))) { 917 log(LOG_INFO, "%s; %s: Connection attempt " 918 "to closed port\n", s, __func__); 919 } 920 closed_port = true; 921 } 922 goto dropwithreset; 923 } 924 INP_LOCK_ASSERT(inp); 925 926 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 927 #ifdef INET6 928 if (isipv6 && IPSEC_ENABLED(ipv6) && 929 IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { 930 goto dropunlock; 931 } 932 #ifdef INET 933 else 934 #endif 935 #endif /* INET6 */ 936 #ifdef INET 937 if (IPSEC_ENABLED(ipv4) && 938 IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) { 939 goto dropunlock; 940 } 941 #endif /* INET */ 942 #endif /* IPSEC */ 943 944 /* 945 * Check the minimum TTL for socket. 946 */ 947 if (inp->inp_ip_minttl != 0) { 948 #ifdef INET6 949 if (isipv6) { 950 if (inp->inp_ip_minttl > ip6->ip6_hlim) 951 goto dropunlock; 952 } else 953 #endif 954 if (inp->inp_ip_minttl > ip->ip_ttl) 955 goto dropunlock; 956 } 957 958 tp = intotcpcb(inp); 959 switch (tp->t_state) { 960 case TCPS_TIME_WAIT: 961 /* 962 * A previous connection in TIMEWAIT state is supposed to catch 963 * stray or duplicate segments arriving late. If this segment 964 * was a legitimate new connection attempt, the old INPCB gets 965 * removed and we can try again to find a listening socket. 966 */ 967 tcp_dooptions(&to, optp, optlen, 968 (thflags & TH_SYN) ? TO_SYN : 0); 969 /* 970 * tcp_twcheck unlocks the inp always, and frees the m if fails. 971 */ 972 if (tcp_twcheck(inp, &to, th, m, tlen)) 973 goto findpcb; 974 return (IPPROTO_DONE); 975 case TCPS_CLOSED: 976 /* 977 * The TCPCB may no longer exist if the connection is winding 978 * down or it is in the CLOSED state. Either way we drop the 979 * segment and send an appropriate response. 980 */ 981 closed_port = true; 982 goto dropwithreset; 983 } 984 985 if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) { 986 closed_port = true; 987 goto dropwithreset; 988 } 989 990 #ifdef TCP_OFFLOAD 991 if (tp->t_flags & TF_TOE) { 992 tcp_offload_input(tp, m); 993 m = NULL; /* consumed by the TOE driver */ 994 goto dropunlock; 995 } 996 #endif 997 998 #ifdef MAC 999 if (mac_inpcb_check_deliver(inp, m)) 1000 goto dropunlock; 1001 #endif 1002 so = inp->inp_socket; 1003 KASSERT(so != NULL, ("%s: so == NULL", __func__)); 1004 /* 1005 * When the socket is accepting connections (the INPCB is in LISTEN 1006 * state) we look into the SYN cache if this is a new connection 1007 * attempt or the completion of a previous one. 1008 */ 1009 KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so), 1010 ("%s: so accepting but tp %p not listening", __func__, tp)); 1011 if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) { 1012 struct in_conninfo inc; 1013 1014 bzero(&inc, sizeof(inc)); 1015 #ifdef INET6 1016 if (isipv6) { 1017 inc.inc_flags |= INC_ISIPV6; 1018 if (inp->inp_inc.inc_flags & INC_IPV6MINMTU) 1019 inc.inc_flags |= INC_IPV6MINMTU; 1020 inc.inc6_faddr = ip6->ip6_src; 1021 inc.inc6_laddr = ip6->ip6_dst; 1022 } else 1023 #endif 1024 { 1025 inc.inc_faddr = ip->ip_src; 1026 inc.inc_laddr = ip->ip_dst; 1027 } 1028 inc.inc_fport = th->th_sport; 1029 inc.inc_lport = th->th_dport; 1030 inc.inc_fibnum = so->so_fibnum; 1031 1032 /* 1033 * Check for an existing connection attempt in syncache if 1034 * the flag is only ACK. A successful lookup creates a new 1035 * socket appended to the listen queue in SYN_RECEIVED state. 
1036 */ 1037 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { 1038 int result; 1039 1040 /* 1041 * Parse the TCP options here because 1042 * syncookies need access to the reflected 1043 * timestamp. 1044 */ 1045 tcp_dooptions(&to, optp, optlen, 0); 1046 /* 1047 * NB: syncache_expand() doesn't unlock inp. 1048 */ 1049 result = syncache_expand(&inc, &to, th, &so, m, port); 1050 if (result < 0) { 1051 /* 1052 * A failing TCP MD5 signature comparison 1053 * must result in the segment being dropped 1054 * and must not produce any response back 1055 * to the sender. 1056 */ 1057 goto dropunlock; 1058 } else if (result == 0) { 1059 /* 1060 * No syncache entry, or ACK was not for our 1061 * SYN/ACK. Do our protection against double 1062 * ACK. If peer sent us 2 ACKs, then for the 1063 * first one syncache_expand() successfully 1064 * converted syncache entry into a socket, 1065 * while we were waiting on the inpcb lock. We 1066 * don't want to sent RST for the second ACK, 1067 * so we perform second lookup without wildcard 1068 * match, hoping to find the new socket. If 1069 * the ACK is stray indeed, the missing 1070 * INPLOOKUP_WILDCARD flag in lookupflag would 1071 * hint the above code that the lookup was a 1072 * second attempt. 1073 * 1074 * NB: syncache did its own logging 1075 * of the failure cause. 1076 */ 1077 INP_WUNLOCK(inp); 1078 lookupflag &= ~INPLOOKUP_WILDCARD; 1079 goto findpcb; 1080 } 1081 tfo_socket_result: 1082 if (so == NULL) { 1083 /* 1084 * We completed the 3-way handshake 1085 * but could not allocate a socket 1086 * either due to memory shortage, 1087 * listen queue length limits or 1088 * global socket limits. Send RST 1089 * or wait and have the remote end 1090 * retransmit the ACK for another 1091 * try. 1092 */ 1093 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1094 log(LOG_DEBUG, "%s; %s: Listen socket: " 1095 "Socket allocation failed due to " 1096 "limits or memory shortage, %s\n", 1097 s, __func__, 1098 V_tcp_sc_rst_sock_fail ? 1099 "sending RST" : "try again"); 1100 if (V_tcp_sc_rst_sock_fail) { 1101 goto dropwithreset; 1102 } else 1103 goto dropunlock; 1104 } 1105 /* 1106 * Socket is created in state SYN_RECEIVED. 1107 * Unlock the listen socket, lock the newly 1108 * created socket and update the tp variable. 1109 * If we came here via jump to tfo_socket_result, 1110 * then listening socket is read-locked. 1111 */ 1112 INP_UNLOCK(inp); /* listen socket */ 1113 inp = sotoinpcb(so); 1114 /* 1115 * New connection inpcb is already locked by 1116 * syncache_expand(). 1117 */ 1118 INP_WLOCK_ASSERT(inp); 1119 tp = intotcpcb(inp); 1120 KASSERT(tp->t_state == TCPS_SYN_RECEIVED, 1121 ("%s: ", __func__)); 1122 /* 1123 * Process the segment and the data it 1124 * contains. tcp_do_segment() consumes 1125 * the mbuf chain and unlocks the inpcb. 1126 */ 1127 TCP_PROBE5(receive, NULL, tp, m, tp, th); 1128 tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, 1129 tlen, iptos); 1130 return (IPPROTO_DONE); 1131 } 1132 /* 1133 * Segment flag validation for new connection attempts: 1134 * 1135 * Our (SYN|ACK) response was rejected. 1136 * Check with syncache and remove entry to prevent 1137 * retransmits. 1138 * 1139 * NB: syncache_chkrst does its own logging of failure 1140 * causes. 1141 */ 1142 if (thflags & TH_RST) { 1143 syncache_chkrst(&inc, th, port); 1144 goto dropunlock; 1145 } 1146 /* 1147 * We can't do anything without SYN. 
1148 */ 1149 if ((thflags & TH_SYN) == 0) { 1150 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1151 log(LOG_DEBUG, "%s; %s: Listen socket: " 1152 "SYN is missing, segment ignored\n", 1153 s, __func__); 1154 TCPSTAT_INC(tcps_badsyn); 1155 goto dropunlock; 1156 } 1157 /* 1158 * (SYN|ACK) is bogus on a listen socket. 1159 */ 1160 if (thflags & TH_ACK) { 1161 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1162 log(LOG_DEBUG, "%s; %s: Listen socket: " 1163 "SYN|ACK invalid, segment ignored\n", 1164 s, __func__); 1165 TCPSTAT_INC(tcps_badsyn); 1166 goto dropunlock; 1167 } 1168 /* 1169 * If the drop_synfin option is enabled, drop all 1170 * segments with both the SYN and FIN bits set. 1171 * This prevents e.g. nmap from identifying the 1172 * TCP/IP stack. 1173 * XXX: Poor reasoning. nmap has other methods 1174 * and is constantly refining its stack detection 1175 * strategies. 1176 * XXX: This is a violation of the TCP specification 1177 * and was used by RFC1644. 1178 */ 1179 if ((thflags & TH_FIN) && V_drop_synfin) { 1180 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1181 log(LOG_DEBUG, "%s; %s: Listen socket: " 1182 "SYN|FIN segment ignored (based on " 1183 "sysctl setting)\n", s, __func__); 1184 TCPSTAT_INC(tcps_badsyn); 1185 goto dropunlock; 1186 } 1187 /* 1188 * Segment's flags are (SYN) or (SYN|FIN). 1189 * 1190 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored 1191 * as they do not affect the state of the TCP FSM. 1192 * The data pointed to by TH_URG and th_urp is ignored. 1193 */ 1194 KASSERT((thflags & (TH_RST|TH_ACK)) == 0, 1195 ("%s: Listen socket: TH_RST or TH_ACK set", __func__)); 1196 KASSERT(thflags & (TH_SYN), 1197 ("%s: Listen socket: TH_SYN not set", __func__)); 1198 INP_RLOCK_ASSERT(inp); 1199 #ifdef INET6 1200 /* 1201 * If deprecated address is forbidden, 1202 * we do not accept SYN to deprecated interface 1203 * address to prevent any new inbound connection from 1204 * getting established. 1205 * When we do not accept SYN, we send a TCP RST, 1206 * with deprecated source address (instead of dropping 1207 * it). We compromise it as it is much better for peer 1208 * to send a RST, and RST will be the final packet 1209 * for the exchange. 1210 * 1211 * If we do not forbid deprecated addresses, we accept 1212 * the SYN packet. RFC2462 does not suggest dropping 1213 * SYN in this case. 1214 * If we decipher RFC2462 5.5.4, it says like this: 1215 * 1. use of deprecated addr with existing 1216 * communication is okay - "SHOULD continue to be 1217 * used" 1218 * 2. use of it with new communication: 1219 * (2a) "SHOULD NOT be used if alternate address 1220 * with sufficient scope is available" 1221 * (2b) nothing mentioned otherwise. 1222 * Here we fall into (2b) case as we have no choice in 1223 * our source address selection - we must obey the peer. 1224 * 1225 * The wording in RFC2462 is confusing, and there are 1226 * multiple description text for deprecated address 1227 * handling - worse, they are not exactly the same. 1228 * I believe 5.5.4 is the best one, so we follow 5.5.4. 
1229 */ 1230 if (isipv6 && !V_ip6_use_deprecated) { 1231 struct in6_ifaddr *ia6; 1232 1233 ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false); 1234 if (ia6 != NULL && 1235 (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { 1236 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1237 log(LOG_DEBUG, "%s; %s: Listen socket: " 1238 "Connection attempt to deprecated " 1239 "IPv6 address rejected\n", 1240 s, __func__); 1241 goto dropwithreset; 1242 } 1243 } 1244 #endif /* INET6 */ 1245 /* 1246 * Basic sanity checks on incoming SYN requests: 1247 * Don't respond if the destination is a link layer 1248 * broadcast according to RFC1122 4.2.3.10, p. 104. 1249 * If it is from this socket it must be forged. 1250 * Don't respond if the source or destination is a 1251 * global or subnet broad- or multicast address. 1252 * Note that it is quite possible to receive unicast 1253 * link-layer packets with a broadcast IP address. Use 1254 * in_ifnet_broadcast() to find them. 1255 */ 1256 if (m->m_flags & (M_BCAST|M_MCAST)) { 1257 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1258 log(LOG_DEBUG, "%s; %s: Listen socket: " 1259 "Connection attempt from broad- or multicast " 1260 "link layer address ignored\n", s, __func__); 1261 goto dropunlock; 1262 } 1263 #ifdef INET6 1264 if (isipv6) { 1265 if (th->th_dport == th->th_sport && 1266 IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) { 1267 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1268 log(LOG_DEBUG, "%s; %s: Listen socket: " 1269 "Connection attempt to/from self " 1270 "ignored\n", s, __func__); 1271 goto dropunlock; 1272 } 1273 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 1274 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { 1275 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1276 log(LOG_DEBUG, "%s; %s: Listen socket: " 1277 "Connection attempt from/to multicast " 1278 "address ignored\n", s, __func__); 1279 goto dropunlock; 1280 } 1281 } 1282 #endif 1283 #if defined(INET) && defined(INET6) 1284 else 1285 #endif 1286 #ifdef INET 1287 { 1288 if (th->th_dport == th->th_sport && 1289 ip->ip_dst.s_addr == ip->ip_src.s_addr) { 1290 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1291 log(LOG_DEBUG, "%s; %s: Listen socket: " 1292 "Connection attempt from/to self " 1293 "ignored\n", s, __func__); 1294 goto dropunlock; 1295 } 1296 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 1297 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 1298 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 1299 in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { 1300 if ((s = tcp_log_addrs(&inc, th, NULL, NULL))) 1301 log(LOG_DEBUG, "%s; %s: Listen socket: " 1302 "Connection attempt from/to broad- " 1303 "or multicast address ignored\n", 1304 s, __func__); 1305 goto dropunlock; 1306 } 1307 } 1308 #endif 1309 /* 1310 * SYN appears to be valid. Create compressed TCP state 1311 * for syncache. 1312 */ 1313 TCP_PROBE3(debug__input, tp, th, m); 1314 tcp_dooptions(&to, optp, optlen, TO_SYN); 1315 if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL, 1316 iptos, port)) != NULL) 1317 goto tfo_socket_result; 1318 1319 /* 1320 * Entry added to syncache and mbuf consumed. 1321 * Only the listen socket is unlocked by syncache_add(). 
1322 */ 1323 return (IPPROTO_DONE); 1324 } 1325 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1326 if (tp->t_flags & TF_SIGNATURE) { 1327 tcp_dooptions(&to, optp, optlen, thflags); 1328 if ((to.to_flags & TOF_SIGNATURE) == 0) { 1329 TCPSTAT_INC(tcps_sig_err_nosigopt); 1330 goto dropunlock; 1331 } 1332 if (!TCPMD5_ENABLED() || 1333 TCPMD5_INPUT(m, th, to.to_signature) != 0) 1334 goto dropunlock; 1335 } 1336 #endif 1337 TCP_PROBE5(receive, NULL, tp, m, tp, th); 1338 1339 /* 1340 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later 1341 * state. tcp_do_segment() always consumes the mbuf chain, unlocks 1342 * the inpcb, and unlocks pcbinfo. 1343 * 1344 * XXXGL: in case of a pure SYN arriving on existing connection 1345 * TCP stacks won't need to modify the PCB, they would either drop 1346 * the segment silently, or send a challenge ACK. However, we try 1347 * to upgrade the lock, because calling convention for stacks is 1348 * write-lock on PCB. If upgrade fails, drop the SYN. 1349 */ 1350 if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0) 1351 goto dropunlock; 1352 1353 tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos); 1354 return (IPPROTO_DONE); 1355 1356 dropwithreset: 1357 /* 1358 * When blackholing do not respond with a RST but 1359 * completely ignore the segment and drop it. 1360 */ 1361 if (((!closed_port && V_blackhole == 3) || 1362 (closed_port && 1363 ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) && 1364 (V_blackhole_local || ( 1365 #ifdef INET6 1366 isipv6 ? !in6_localip(&ip6->ip6_src) : 1367 #endif 1368 #ifdef INET 1369 !in_localip(ip->ip_src) 1370 #else 1371 true 1372 #endif 1373 ))) 1374 goto dropunlock; 1375 TCP_PROBE5(receive, NULL, tp, m, tp, th); 1376 tcp_dropwithreset(m, th, tp, tlen); 1377 m = NULL; /* mbuf chain got consumed. */ 1378 1379 dropunlock: 1380 if (m != NULL) 1381 TCP_PROBE5(receive, NULL, tp, m, tp, th); 1382 1383 if (inp != NULL) 1384 INP_UNLOCK(inp); 1385 1386 drop: 1387 if (s != NULL) 1388 free(s, M_TCPLOG); 1389 if (m != NULL) 1390 m_freem(m); 1391 return (IPPROTO_DONE); 1392 } 1393 1394 /* 1395 * Automatic sizing of receive socket buffer. Often the send 1396 * buffer size is not optimally adjusted to the actual network 1397 * conditions at hand (delay bandwidth product). Setting the 1398 * buffer size too small limits throughput on links with high 1399 * bandwidth and high delay (eg. trans-continental/oceanic links). 1400 * 1401 * On the receive side the socket buffer memory is only rarely 1402 * used to any significant extent. This allows us to be much 1403 * more aggressive in scaling the receive socket buffer. For 1404 * the case that the buffer space is actually used to a large 1405 * extent and we run out of kernel memory we can simply drop 1406 * the new segments; TCP on the sender will just retransmit it 1407 * later. Setting the buffer size too big may only consume too 1408 * much kernel memory if the application doesn't read() from 1409 * the socket or packet loss or reordering makes use of the 1410 * reassembly queue. 1411 * 1412 * The criteria to step up the receive buffer one notch are: 1413 * 1. Application has not set receive buffer size with 1414 * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. 1415 * 2. the number of bytes received during 1/2 of an sRTT 1416 * is at least 3/8 of the current socket buffer size. 1417 * 3. 

int
tcp_input(struct mbuf **mp, int *offp, int proto)
{
	return (tcp_input_with_port(mp, offp, proto, 0));
}

static void
tcp_handle_wakeup(struct tcpcb *tp)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (tp->t_flags & TF_WAKESOR) {
		struct socket *so = tptosocket(tp);

		tp->t_flags &= ~TF_WAKESOR;
		SOCK_RECVBUF_LOCK_ASSERT(so);
		sorwakeup_locked(so);
	}
}

void
tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int drop_hdrlen, int tlen, uint8_t iptos)
{
	uint16_t thflags;
	int acked, ourfinisacked, needoutput = 0;
	sackstatus_t sack_changed;
	int todrop, win, incforsyn = 0;
	uint32_t tiwin;
	uint16_t nsegs;
	char *s;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	struct in_conninfo *inc = &inp->inp_inc;
	struct mbuf *mfree;
	struct tcpopt to;
	int tfo_syn;
	u_int maxseg = 0;
	bool no_data;

	no_data = (tlen == 0);
	thflags = tcp_get_flags(th);
	tp->sackhint.last_sack_ack = 0;
	sack_changed = SACK_NOCHANGE;
	nsegs = max(1, m->m_pkthdr.lro_nsegs);

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
	    tlen, NULL, true);

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		goto drop;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state,
	 * check SEQ.ACK first.
	 */
1527 */ 1528 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 1529 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 1530 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1531 goto dropwithreset; 1532 } 1533 1534 /* 1535 * Segment received on connection. 1536 * Reset idle time and keep-alive timer. 1537 * XXX: This should be done after segment 1538 * validation to ignore broken/spoofed segs. 1539 */ 1540 if (tp->t_idle_reduce && 1541 (tp->snd_max == tp->snd_una) && 1542 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 1543 cc_after_idle(tp); 1544 tp->t_rcvtime = ticks; 1545 1546 if (thflags & TH_FIN) 1547 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 1548 /* 1549 * Scale up the window into a 32-bit value. 1550 * For the SYN_SENT state the scale is zero. 1551 */ 1552 tiwin = th->th_win << tp->snd_scale; 1553 #ifdef STATS 1554 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 1555 #endif 1556 1557 /* 1558 * TCP ECN processing. 1559 */ 1560 if (tcp_ecn_input_segment(tp, thflags, tlen, 1561 tcp_packets_this_ack(tp, th->th_ack), 1562 iptos)) 1563 cc_cong_signal(tp, th, CC_ECN); 1564 1565 /* 1566 * Parse options on any incoming segment. 1567 */ 1568 tcp_dooptions(&to, (u_char *)(th + 1), 1569 (th->th_off << 2) - sizeof(struct tcphdr), 1570 (thflags & TH_SYN) ? TO_SYN : 0); 1571 if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { 1572 /* 1573 * We don't look at sack's from the 1574 * peer because the MSS is too small which 1575 * can subject us to an attack. 1576 */ 1577 to.to_flags &= ~TOF_SACK; 1578 } 1579 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 1580 if ((tp->t_flags & TF_SIGNATURE) != 0 && 1581 (to.to_flags & TOF_SIGNATURE) == 0) { 1582 TCPSTAT_INC(tcps_sig_err_sigopt); 1583 /* XXX: should drop? */ 1584 } 1585 #endif 1586 /* 1587 * If echoed timestamp is later than the current time, 1588 * fall back to non RFC1323 RTT calculation. Normalize 1589 * timestamp if syncookies were used when this connection 1590 * was established. 1591 */ 1592 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 1593 to.to_tsecr -= tp->ts_offset; 1594 if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) { 1595 to.to_tsecr = 0; 1596 } 1597 } 1598 /* 1599 * Process options only when we get SYN/ACK back. The SYN case 1600 * for incoming connections is handled in tcp_syncache. 1601 * According to RFC1323 the window field in a SYN (i.e., a <SYN> 1602 * or <SYN,ACK>) segment itself is never scaled. 1603 * XXX this is traditional behavior, may need to be cleaned up. 1604 */ 1605 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 1606 /* Handle parallel SYN for ECN */ 1607 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 1608 if ((to.to_flags & TOF_SCALE) && 1609 (tp->t_flags & TF_REQ_SCALE) && 1610 !(tp->t_flags & TF_NOOPT)) { 1611 tp->t_flags |= TF_RCVD_SCALE; 1612 tp->snd_scale = to.to_wscale; 1613 } else { 1614 tp->t_flags &= ~TF_REQ_SCALE; 1615 } 1616 /* 1617 * Initial send window. It will be updated with 1618 * the next incoming segment to the scaled value. 
1619 */ 1620 tp->snd_wnd = th->th_win; 1621 if ((to.to_flags & TOF_TS) && 1622 (tp->t_flags & TF_REQ_TSTMP) && 1623 !(tp->t_flags & TF_NOOPT)) { 1624 tp->t_flags |= TF_RCVD_TSTMP; 1625 tp->ts_recent = to.to_tsval; 1626 tp->ts_recent_age = tcp_ts_getticks(); 1627 } else { 1628 tp->t_flags &= ~TF_REQ_TSTMP; 1629 } 1630 if (to.to_flags & TOF_MSS) { 1631 tcp_mss(tp, to.to_mss); 1632 } 1633 if ((tp->t_flags & TF_SACK_PERMIT) && 1634 (!(to.to_flags & TOF_SACKPERM) || 1635 (tp->t_flags & TF_NOOPT))) { 1636 tp->t_flags &= ~TF_SACK_PERMIT; 1637 } 1638 if (tp->t_flags & TF_FASTOPEN) { 1639 if ((to.to_flags & TOF_FASTOPEN) && 1640 !(tp->t_flags & TF_NOOPT)) { 1641 uint16_t mss; 1642 1643 if (to.to_flags & TOF_MSS) { 1644 mss = to.to_mss; 1645 } else { 1646 if ((inp->inp_vflag & INP_IPV6) != 0) { 1647 mss = TCP6_MSS; 1648 } else { 1649 mss = TCP_MSS; 1650 } 1651 } 1652 tcp_fastopen_update_cache(tp, mss, 1653 to.to_tfo_len, to.to_tfo_cookie); 1654 } else { 1655 tcp_fastopen_disable_path(tp); 1656 } 1657 } 1658 } 1659 1660 /* 1661 * If timestamps were negotiated during SYN/ACK and a 1662 * segment without a timestamp is received, silently drop 1663 * the segment, unless it is a RST segment or missing timestamps are 1664 * tolerated. 1665 * See section 3.2 of RFC 7323. 1666 */ 1667 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { 1668 if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) { 1669 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1670 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1671 "segment processed normally\n", 1672 s, __func__); 1673 free(s, M_TCPLOG); 1674 } 1675 } else { 1676 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1677 log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1678 "segment silently dropped\n", s, __func__); 1679 free(s, M_TCPLOG); 1680 } 1681 goto drop; 1682 } 1683 } 1684 /* 1685 * If timestamps were not negotiated during SYN/ACK and a 1686 * segment with a timestamp is received, ignore the 1687 * timestamp and process the packet normally. 1688 * See section 3.2 of RFC 7323. 1689 */ 1690 if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { 1691 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1692 log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1693 "segment processed normally\n", s, __func__); 1694 free(s, M_TCPLOG); 1695 } 1696 } 1697 1698 /* 1699 * Header prediction: check for the two common cases 1700 * of a uni-directional data xfer. If the packet has 1701 * no control flags, is in-sequence, the window didn't 1702 * change and we're not retransmitting, it's a 1703 * candidate. If the length is zero and the ack moved 1704 * forward, we're the sender side of the xfer. Just 1705 * free the data acked & wake any higher level process 1706 * that was blocked waiting for space. If the length 1707 * is non-zero and the ack didn't move, we're the 1708 * receiver side. If we're getting packets in-order 1709 * (the reassembly queue is empty), add the data to 1710 * the socket buffer and note that we need a delayed ack. 1711 * Make sure that the hidden state-flags are also off. 1712 * Since we check for TCPS_ESTABLISHED first, it can only 1713 * be TH_NEEDSYN. 
1714 */ 1715 if (tp->t_state == TCPS_ESTABLISHED && 1716 th->th_seq == tp->rcv_nxt && 1717 (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && 1718 tp->snd_nxt == tp->snd_max && 1719 tiwin && tiwin == tp->snd_wnd && 1720 ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && 1721 SEGQ_EMPTY(tp) && 1722 ((to.to_flags & TOF_TS) == 0 || 1723 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { 1724 /* 1725 * If last ACK falls within this segment's sequence numbers, 1726 * record the timestamp. 1727 * NOTE that the test is modified according to the latest 1728 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 1729 */ 1730 if ((to.to_flags & TOF_TS) != 0 && 1731 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 1732 tp->ts_recent_age = tcp_ts_getticks(); 1733 tp->ts_recent = to.to_tsval; 1734 } 1735 1736 if (no_data) { 1737 if (SEQ_GT(th->th_ack, tp->snd_una) && 1738 SEQ_LEQ(th->th_ack, tp->snd_max) && 1739 !IN_RECOVERY(tp->t_flags) && 1740 (to.to_flags & TOF_SACK) == 0 && 1741 TAILQ_EMPTY(&tp->snd_holes)) { 1742 /* 1743 * This is a pure ack for outstanding data. 1744 */ 1745 TCPSTAT_INC(tcps_predack); 1746 1747 /* 1748 * "bad retransmit" recovery. 1749 */ 1750 if (tp->t_rxtshift == 1 && 1751 tp->t_flags & TF_PREVVALID && 1752 tp->t_badrxtwin != 0 && 1753 (((to.to_flags & TOF_TS) != 0 && 1754 to.to_tsecr != 0 && 1755 TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) || 1756 ((to.to_flags & TOF_TS) == 0 && 1757 TSTMP_LT(ticks, tp->t_badrxtwin)))) 1758 cc_cong_signal(tp, th, CC_RTO_ERR); 1759 1760 /* 1761 * Recalculate the transmit timer / rtt. 1762 * 1763 * Some boxes send broken timestamp replies 1764 * during the SYN+ACK phase, ignore 1765 * timestamps of 0 or we could calculate a 1766 * huge RTT and blow up the retransmit timer. 1767 */ 1768 if ((to.to_flags & TOF_TS) != 0 && 1769 to.to_tsecr) { 1770 uint32_t t; 1771 1772 t = tcp_ts_getticks() - to.to_tsecr; 1773 if (!tp->t_rttlow || tp->t_rttlow > t) 1774 tp->t_rttlow = t; 1775 tcp_xmit_timer(tp, 1776 TCP_TS_TO_TICKS(t) + 1); 1777 } else if (tp->t_rtttime && 1778 SEQ_GT(th->th_ack, tp->t_rtseq)) { 1779 if (!tp->t_rttlow || 1780 tp->t_rttlow > ticks - tp->t_rtttime) 1781 tp->t_rttlow = ticks - tp->t_rtttime; 1782 tcp_xmit_timer(tp, 1783 ticks - tp->t_rtttime); 1784 } 1785 acked = BYTES_THIS_ACK(tp, th); 1786 1787 #ifdef TCP_HHOOK 1788 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 1789 hhook_run_tcp_est_in(tp, th, &to); 1790 #endif 1791 1792 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 1793 TCPSTAT_ADD(tcps_rcvackbyte, acked); 1794 sbdrop(&so->so_snd, acked); 1795 if (SEQ_GT(tp->snd_una, tp->snd_recover) && 1796 SEQ_LEQ(th->th_ack, tp->snd_recover)) 1797 tp->snd_recover = th->th_ack - 1; 1798 1799 /* 1800 * Let the congestion control algorithm update 1801 * congestion control related information. This 1802 * typically means increasing the congestion 1803 * window. 1804 */ 1805 cc_ack_received(tp, th, nsegs, CC_ACK); 1806 1807 tp->snd_una = th->th_ack; 1808 /* 1809 * Pull snd_wl2 up to prevent seq wrap relative 1810 * to th_ack. 1811 */ 1812 tp->snd_wl2 = th->th_ack; 1813 tp->t_dupacks = 0; 1814 m_freem(m); 1815 1816 /* 1817 * If all outstanding data are acked, stop 1818 * retransmit timer, otherwise restart timer 1819 * using current (possibly backed-off) value. 1820 * If process is waiting for space, 1821 * wakeup/selwakeup/signal. If data 1822 * are ready to send, let tcp_output 1823 * decide between more output or persist. 
1824 */ 1825 TCP_PROBE3(debug__input, tp, th, m); 1826 /* 1827 * Clear t_acktime if remote side has ACKd 1828 * all data in the socket buffer. 1829 * Otherwise, update t_acktime if we received 1830 * a sufficiently large ACK. 1831 */ 1832 if (sbavail(&so->so_snd) == 0) 1833 tp->t_acktime = 0; 1834 else if (acked > 1) 1835 tp->t_acktime = ticks; 1836 if (tp->snd_una == tp->snd_max) 1837 tcp_timer_activate(tp, TT_REXMT, 0); 1838 else if (!tcp_timer_active(tp, TT_PERSIST)) 1839 tcp_timer_activate(tp, TT_REXMT, 1840 TP_RXTCUR(tp)); 1841 sowwakeup(so); 1842 /* 1843 * Only call tcp_output when there 1844 * is new data available to be sent 1845 * or we need to send an ACK. 1846 */ 1847 if ((tp->t_flags & TF_ACKNOW) || 1848 (sbavail(&so->so_snd) >= 1849 SEQ_SUB(tp->snd_max, tp->snd_una))) { 1850 (void) tcp_output(tp); 1851 } 1852 goto check_delack; 1853 } 1854 } else if (th->th_ack == tp->snd_una && 1855 tlen <= sbspace(&so->so_rcv)) { 1856 int newsize = 0; /* automatic sockbuf scaling */ 1857 1858 /* 1859 * This is a pure, in-sequence data packet with 1860 * nothing on the reassembly queue and we have enough 1861 * buffer space to take it. 1862 */ 1863 /* Clean receiver SACK report if present */ 1864 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1865 tcp_clean_sackreport(tp); 1866 TCPSTAT_INC(tcps_preddat); 1867 tp->rcv_nxt += tlen; 1868 if (tlen && 1869 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 1870 (tp->t_fbyte_in == 0)) { 1871 tp->t_fbyte_in = ticks; 1872 if (tp->t_fbyte_in == 0) 1873 tp->t_fbyte_in = 1; 1874 if (tp->t_fbyte_out && tp->t_fbyte_in) 1875 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 1876 } 1877 /* 1878 * Pull snd_wl1 up to prevent seq wrap relative to 1879 * th_seq. 1880 */ 1881 tp->snd_wl1 = th->th_seq; 1882 /* 1883 * Pull rcv_up up to prevent seq wrap relative to 1884 * rcv_nxt. 1885 */ 1886 tp->rcv_up = tp->rcv_nxt; 1887 TCPSTAT_ADD(tcps_rcvpack, nsegs); 1888 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1889 TCP_PROBE3(debug__input, tp, th, m); 1890 1891 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 1892 1893 /* Add data to socket buffer. */ 1894 SOCK_RECVBUF_LOCK(so); 1895 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1896 m_freem(m); 1897 } else { 1898 /* 1899 * Set new socket buffer size. 1900 * Give up when limit is reached. 1901 */ 1902 if (newsize) 1903 if (!sbreserve_locked(so, SO_RCV, 1904 newsize, NULL)) 1905 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1906 m_adj(m, drop_hdrlen); /* delayed header drop */ 1907 sbappendstream_locked(&so->so_rcv, m, 0); 1908 } 1909 /* NB: sorwakeup_locked() does an implicit unlock. */ 1910 sorwakeup_locked(so); 1911 if (DELAY_ACK(tp, tlen)) { 1912 tp->t_flags |= TF_DELACK; 1913 } else { 1914 tp->t_flags |= TF_ACKNOW; 1915 (void) tcp_output(tp); 1916 } 1917 goto check_delack; 1918 } 1919 } 1920 1921 /* 1922 * Calculate amount of space in receive window, 1923 * and then do TCP input processing. 1924 * Receive window is amount of space in rcv queue, 1925 * but not less than advertised window. 1926 */ 1927 win = sbspace(&so->so_rcv); 1928 if (win < 0) 1929 win = 0; 1930 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1931 1932 switch (tp->t_state) { 1933 /* 1934 * If the state is SYN_RECEIVED: 1935 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1936 */ 1937 case TCPS_SYN_RECEIVED: 1938 if (thflags & TH_RST) { 1939 /* Handle RST segments later. 
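         * The RFC 5961 checks further down decide between
         * tearing the connection down and answering with a
         * challenge ACK.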
*/ 1940 break; 1941 } 1942 if ((thflags & TH_ACK) && 1943 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1944 SEQ_GT(th->th_ack, tp->snd_max))) { 1945 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1946 goto dropwithreset; 1947 } 1948 if (tp->t_flags & TF_FASTOPEN) { 1949 /* 1950 * When a TFO connection is in SYN_RECEIVED, the 1951 * only valid packets are the initial SYN, a 1952 * retransmit/copy of the initial SYN (possibly with 1953 * a subset of the original data), a valid ACK, a 1954 * FIN, or a RST. 1955 */ 1956 if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { 1957 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1958 goto dropwithreset; 1959 } else if (thflags & TH_SYN) { 1960 /* non-initial SYN is ignored */ 1961 if ((tcp_timer_active(tp, TT_DELACK) || 1962 tcp_timer_active(tp, TT_REXMT))) 1963 goto drop; 1964 } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { 1965 goto drop; 1966 } 1967 } 1968 break; 1969 1970 /* 1971 * If the state is SYN_SENT: 1972 * if seg contains a RST with valid ACK (SEQ.ACK has already 1973 * been verified), then drop the connection. 1974 * if seg contains a RST without an ACK, drop the seg. 1975 * if seg does not contain SYN, then drop the seg. 1976 * Otherwise this is an acceptable SYN segment 1977 * initialize tp->rcv_nxt and tp->irs 1978 * if seg contains ack then advance tp->snd_una 1979 * if seg contains an ECE and ECN support is enabled, the stream 1980 * is ECN capable. 1981 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1982 * arrange for segment to be acked (eventually) 1983 * continue processing rest of data/controls, beginning with URG 1984 */ 1985 case TCPS_SYN_SENT: 1986 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 1987 TCP_PROBE5(connect__refused, NULL, tp, 1988 m, tp, th); 1989 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1990 tp = tcp_drop(tp, ECONNREFUSED); 1991 } 1992 if (thflags & TH_RST) 1993 goto drop; 1994 if (!(thflags & TH_SYN)) 1995 goto drop; 1996 1997 tp->irs = th->th_seq; 1998 tcp_rcvseqinit(tp); 1999 if (thflags & TH_ACK) { 2000 int tfo_partial_ack = 0; 2001 2002 TCPSTAT_INC(tcps_connects); 2003 soisconnected(so); 2004 #ifdef MAC 2005 mac_socketpeer_set_from_mbuf(m, so); 2006 #endif 2007 /* Do window scaling on this connection? */ 2008 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2009 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2010 tp->rcv_scale = tp->request_r_scale; 2011 } 2012 tp->rcv_adv += min(tp->rcv_wnd, 2013 TCP_MAXWIN << tp->rcv_scale); 2014 tp->snd_una++; /* SYN is acked */ 2015 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2016 tp->snd_nxt = tp->snd_una; 2017 /* 2018 * If not all the data that was sent in the TFO SYN 2019 * has been acked, resend the remainder right away. 2020 */ 2021 if ((tp->t_flags & TF_FASTOPEN) && 2022 (tp->snd_una != tp->snd_max)) { 2023 tp->snd_nxt = th->th_ack; 2024 tfo_partial_ack = 1; 2025 } 2026 /* 2027 * If there's data, delay ACK; if there's also a FIN 2028 * ACKNOW will be turned on later. 2029 */ 2030 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) 2031 tcp_timer_activate(tp, TT_DELACK, 2032 tcp_delacktime); 2033 else 2034 tp->t_flags |= TF_ACKNOW; 2035 2036 tcp_ecn_input_syn_sent(tp, thflags, iptos); 2037 2038 /* 2039 * Received <SYN,ACK> in SYN_SENT[*] state. 
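             * (A starred state such as SYN_SENT* denotes the variant
             * with a FIN still pending, i.e. TF_NEEDFIN set, as the
             * transitions listed next show.)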
             * Transitions:
             *      SYN_SENT  --> ESTABLISHED
             *      SYN_SENT* --> FIN_WAIT_1
             */
            tp->t_starttime = ticks;
            if (tp->t_flags & TF_NEEDFIN) {
                tp->t_acktime = ticks;
                tcp_state_change(tp, TCPS_FIN_WAIT_1);
                tp->t_flags &= ~TF_NEEDFIN;
                thflags &= ~TH_SYN;
            } else {
                tcp_state_change(tp, TCPS_ESTABLISHED);
                TCP_PROBE5(connect__established, NULL, tp,
                    m, tp, th);
                cc_conn_init(tp);
                tcp_timer_activate(tp, TT_KEEP,
                    TP_KEEPIDLE(tp));
            }
        } else {
            /*
             * Received initial SYN in SYN-SENT[*] state =>
             * simultaneous open.
             * If it succeeds, connection is half-synchronized.
             * Otherwise, do 3-way handshake:
             *        SYN-SENT -> SYN-RECEIVED
             *        SYN-SENT* -> SYN-RECEIVED*
             */
            tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
            tcp_timer_activate(tp, TT_REXMT, 0);
            tcp_state_change(tp, TCPS_SYN_RECEIVED);
        }

        /*
         * Advance th->th_seq to correspond to first data byte.
         * If data, trim to stay within window,
         * dropping FIN if necessary.
         */
        th->th_seq++;
        if (tlen > tp->rcv_wnd) {
            todrop = tlen - tp->rcv_wnd;
            m_adj(m, -todrop);
            tlen = tp->rcv_wnd;
            thflags &= ~TH_FIN;
            TCPSTAT_INC(tcps_rcvpackafterwin);
            TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
        }
        tp->snd_wl1 = th->th_seq - 1;
        tp->rcv_up = th->th_seq;
        /*
         * Client side of transaction: already sent SYN and data.
         * If the remote host used T/TCP to validate the SYN,
         * our data will be ACK'd; if so, enter normal data segment
         * processing in the middle of step 5, ack processing.
         * Otherwise, goto step 6.
         */
        if (thflags & TH_ACK)
            goto process_ACK;

        goto step6;
    }

    /*
     * States other than LISTEN or SYN_SENT.
     * First check the RST flag and sequence number since reset segments
     * are exempt from the timestamp and connection count tests.  This
     * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
     * below which allowed reset segments in half the sequence space
     * to fall through and be processed (which gives forged reset
     * segments with a random sequence number a 50 percent chance of
     * killing a connection).
     * Then check timestamp, if present.
     * Then check the connection count, if present.
     * Then check that at least some bytes of segment are within
     * receive window.  If segment begins before rcv_nxt,
     * drop leading data (and SYN); if nothing left, just ack.
     */
    if (thflags & TH_RST) {
        /*
         * RFC5961 Section 3.2
         *
         * - RST drops connection only if SEG.SEQ == RCV.NXT.
         * - If RST is in window, we send challenge ACK.
         *
         * Note: to take into account delayed ACKs, we should
         *   test against last_ack_sent instead of rcv_nxt.
         * Note 2: we handle the special case of a closed window,
         *   not covered by the RFC.
         */
        if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
            SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
            (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
            KASSERT(tp->t_state != TCPS_SYN_SENT,
                ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
                __func__, th, tp));

            if (V_tcp_insecure_rst ||
                tp->last_ack_sent == th->th_seq) {
                TCPSTAT_INC(tcps_drops);
                /* Drop the connection.
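                 * A hedged illustration of the RFC 5961 distinction
                 * (numbers invented): with last_ack_sent == 5000 and
                 * rcv_wnd == 8192, a RST carrying th_seq == 5000 lands
                 * here and tears the connection down, while th_seq ==
                 * 6000 is merely in-window and only draws a challenge
                 * ACK, unless net.inet.tcp.insecure_rst overrides that.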
*/ 2139 switch (tp->t_state) { 2140 case TCPS_SYN_RECEIVED: 2141 so->so_error = ECONNREFUSED; 2142 goto close; 2143 case TCPS_ESTABLISHED: 2144 case TCPS_FIN_WAIT_1: 2145 case TCPS_FIN_WAIT_2: 2146 case TCPS_CLOSE_WAIT: 2147 case TCPS_CLOSING: 2148 case TCPS_LAST_ACK: 2149 so->so_error = ECONNRESET; 2150 close: 2151 /* FALLTHROUGH */ 2152 default: 2153 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST); 2154 tp = tcp_close(tp); 2155 } 2156 } else { 2157 TCPSTAT_INC(tcps_badrst); 2158 tcp_send_challenge_ack(tp, th, m); 2159 m = NULL; 2160 } 2161 } 2162 goto drop; 2163 } 2164 2165 /* 2166 * RFC5961 Section 4.2 2167 * Send challenge ACK for any SYN in synchronized state. 2168 */ 2169 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && 2170 tp->t_state != TCPS_SYN_RECEIVED) { 2171 TCPSTAT_INC(tcps_badsyn); 2172 if (V_tcp_insecure_syn && 2173 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 2174 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 2175 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 2176 tp = tcp_drop(tp, ECONNRESET); 2177 } else { 2178 tcp_ecn_input_syn_sent(tp, thflags, iptos); 2179 tcp_send_challenge_ack(tp, th, m); 2180 m = NULL; 2181 } 2182 goto drop; 2183 } 2184 2185 /* 2186 * RFC 1323 PAWS: If we have a timestamp reply on this segment 2187 * and it's less than ts_recent, drop it. 2188 */ 2189 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 2190 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 2191 /* Check to see if ts_recent is over 24 days old. */ 2192 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 2193 /* 2194 * Invalidate ts_recent. If this segment updates 2195 * ts_recent, the age will be reset later and ts_recent 2196 * will get a valid value. If it does not, setting 2197 * ts_recent to zero will at least satisfy the 2198 * requirement that zero be placed in the timestamp 2199 * echo reply when ts_recent isn't valid. The 2200 * age isn't reset until we get a valid ts_recent 2201 * because we don't want out-of-order segments to be 2202 * dropped when ts_recent is old. 2203 */ 2204 tp->ts_recent = 0; 2205 } else { 2206 TCPSTAT_INC(tcps_rcvduppack); 2207 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 2208 TCPSTAT_INC(tcps_pawsdrop); 2209 if (tlen) 2210 goto dropafterack; 2211 goto drop; 2212 } 2213 } 2214 2215 /* 2216 * In the SYN-RECEIVED state, validate that the packet belongs to 2217 * this connection before trimming the data to fit the receive 2218 * window. Check the sequence number versus IRS since we know 2219 * the sequence numbers haven't wrapped. This is a partial fix 2220 * for the "LAND" DoS attack. 2221 */ 2222 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 2223 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 2224 goto dropwithreset; 2225 } 2226 2227 todrop = tp->rcv_nxt - th->th_seq; 2228 if (todrop > 0) { 2229 if (thflags & TH_SYN) { 2230 thflags &= ~TH_SYN; 2231 th->th_seq++; 2232 if (th->th_urp > 1) 2233 th->th_urp--; 2234 else 2235 thflags &= ~TH_URG; 2236 todrop--; 2237 } 2238 /* 2239 * Following if statement from Stevens, vol. 2, p. 960. 2240 */ 2241 if (todrop > tlen 2242 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 2243 /* 2244 * Any valid FIN must be to the left of the window. 2245 * At this point the FIN must be a duplicate or out 2246 * of sequence; drop it. 2247 */ 2248 thflags &= ~TH_FIN; 2249 2250 /* 2251 * Send an ACK to resynchronize and drop any data. 2252 * But keep on processing for RST or ACK. 
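             *
             * A hedged numeric sketch of this trimming (numbers
             * invented): with rcv_nxt == 2000, th_seq == 1500 and
             * tlen == 1000, todrop == 500, so the stale first half is
             * skipped via drop_hdrlen and bytes 2000..2499 remain;
             * with tlen == 400 the whole segment is a duplicate and
             * only an ACK (plus a possible DSACK block) results.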
2253 */ 2254 tp->t_flags |= TF_ACKNOW; 2255 todrop = tlen; 2256 TCPSTAT_INC(tcps_rcvduppack); 2257 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 2258 } else { 2259 TCPSTAT_INC(tcps_rcvpartduppack); 2260 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 2261 } 2262 /* 2263 * DSACK - add SACK block for dropped range 2264 */ 2265 if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { 2266 tcp_update_sack_list(tp, th->th_seq, 2267 th->th_seq + todrop); 2268 /* 2269 * ACK now, as the next in-sequence segment 2270 * will clear the DSACK block again 2271 */ 2272 tp->t_flags |= TF_ACKNOW; 2273 } 2274 drop_hdrlen += todrop; /* drop from the top afterwards */ 2275 th->th_seq += todrop; 2276 tlen -= todrop; 2277 if (th->th_urp > todrop) 2278 th->th_urp -= todrop; 2279 else { 2280 thflags &= ~TH_URG; 2281 th->th_urp = 0; 2282 } 2283 } 2284 2285 /* 2286 * If new data are received on a connection after the 2287 * user processes are gone, then RST the other end if 2288 * no FIN has been processed. 2289 */ 2290 if ((tp->t_flags & TF_CLOSED) && tlen > 0 && 2291 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2292 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2293 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " 2294 "after socket was closed, " 2295 "sending RST and removing tcpcb\n", 2296 s, __func__, tcpstates[tp->t_state], tlen); 2297 free(s, M_TCPLOG); 2298 } 2299 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 2300 /* tcp_close will kill the inp pre-log the Reset */ 2301 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 2302 tp = tcp_close(tp); 2303 TCPSTAT_INC(tcps_rcvafterclose); 2304 goto dropwithreset; 2305 } 2306 2307 /* 2308 * If segment ends after window, drop trailing data 2309 * (and PUSH and FIN); if nothing left, just ACK. 2310 */ 2311 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 2312 if (todrop > 0) { 2313 TCPSTAT_INC(tcps_rcvpackafterwin); 2314 if (todrop >= tlen) { 2315 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 2316 /* 2317 * If window is closed can only take segments at 2318 * window edge, and have to drop data and PUSH from 2319 * incoming segments. Continue processing, but 2320 * remember to ack. Otherwise, drop segment 2321 * and ack. 2322 */ 2323 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2324 tp->t_flags |= TF_ACKNOW; 2325 TCPSTAT_INC(tcps_rcvwinprobe); 2326 } else 2327 goto dropafterack; 2328 } else 2329 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 2330 m_adj(m, -todrop); 2331 tlen -= todrop; 2332 thflags &= ~(TH_PUSH|TH_FIN); 2333 } 2334 2335 /* 2336 * If last ACK falls within this segment's sequence numbers, 2337 * record its timestamp. 2338 * NOTE: 2339 * 1) That the test incorporates suggestions from the latest 2340 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2341 * 2) That updating only on newer timestamps interferes with 2342 * our earlier PAWS tests, so this check should be solely 2343 * predicated on the sequence space of this segment. 2344 * 3) That we modify the segment boundary check to be 2345 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 2346 * instead of RFC1323's 2347 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2348 * This modified check allows us to overcome RFC1323's 2349 * limitations as described in Stevens TCP/IP Illustrated 2350 * Vol. 2 p.869. In such cases, we can still calculate the 2351 * RTT correctly when RCV.NXT == Last.ACK.Sent. 
2352 */ 2353 if ((to.to_flags & TOF_TS) != 0 && 2354 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2355 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2356 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 2357 tp->ts_recent_age = tcp_ts_getticks(); 2358 tp->ts_recent = to.to_tsval; 2359 } 2360 2361 /* 2362 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 2363 * flag is on (half-synchronized state), then queue data for 2364 * later processing; else drop segment and return. 2365 */ 2366 if ((thflags & TH_ACK) == 0) { 2367 if (tp->t_state == TCPS_SYN_RECEIVED || 2368 (tp->t_flags & TF_NEEDSYN)) { 2369 if (tp->t_state == TCPS_SYN_RECEIVED && 2370 (tp->t_flags & TF_FASTOPEN)) { 2371 tp->snd_wnd = tiwin; 2372 cc_conn_init(tp); 2373 } 2374 goto step6; 2375 } else if (tp->t_flags & TF_ACKNOW) 2376 goto dropafterack; 2377 else 2378 goto drop; 2379 } 2380 2381 /* 2382 * Ack processing. 2383 */ 2384 if (SEQ_GEQ(tp->snd_una, tp->iss + (TCP_MAXWIN << tp->snd_scale))) { 2385 /* Checking SEG.ACK against ISS is definitely redundant. */ 2386 tp->t_flags2 |= TF2_NO_ISS_CHECK; 2387 } 2388 if (!V_tcp_insecure_ack) { 2389 tcp_seq seq_min; 2390 bool ghost_ack_check; 2391 2392 if (tp->t_flags2 & TF2_NO_ISS_CHECK) { 2393 /* Check for too old ACKs (RFC 5961, Section 5.2). */ 2394 seq_min = tp->snd_una - tp->max_sndwnd; 2395 ghost_ack_check = false; 2396 } else { 2397 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { 2398 /* Checking for ghost ACKs is stricter. */ 2399 seq_min = tp->iss + 1; 2400 ghost_ack_check = true; 2401 } else { 2402 /* 2403 * Checking for too old ACKs (RFC 5961, 2404 * Section 5.2) is stricter. 2405 */ 2406 seq_min = tp->snd_una - tp->max_sndwnd; 2407 ghost_ack_check = false; 2408 } 2409 } 2410 if (SEQ_LT(th->th_ack, seq_min)) { 2411 if (ghost_ack_check) 2412 TCPSTAT_INC(tcps_rcvghostack); 2413 else 2414 TCPSTAT_INC(tcps_rcvacktooold); 2415 tcp_send_challenge_ack(tp, th, m); 2416 m = NULL; 2417 goto drop; 2418 } 2419 } 2420 switch (tp->t_state) { 2421 /* 2422 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 2423 * ESTABLISHED state and continue processing. 2424 * The ACK was checked above. 2425 */ 2426 case TCPS_SYN_RECEIVED: 2427 2428 TCPSTAT_INC(tcps_connects); 2429 if (tp->t_flags & TF_SONOTCONN) { 2430 /* 2431 * Usually SYN_RECEIVED had been created from a LISTEN, 2432 * and solisten_enqueue() has already marked the socket 2433 * layer as connected. If it didn't, which can happen 2434 * only with an accept_filter(9), then the tp is marked 2435 * with TF_SONOTCONN. The other reason for this mark 2436 * to be set is a simultaneous open, a SYN_RECEIVED 2437 * that had been created from SYN_SENT. 2438 */ 2439 tp->t_flags &= ~TF_SONOTCONN; 2440 soisconnected(so); 2441 } 2442 /* Do window scaling? 
*/ 2443 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2444 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2445 tp->rcv_scale = tp->request_r_scale; 2446 } 2447 tp->snd_wnd = tiwin; 2448 /* 2449 * Make transitions: 2450 * SYN-RECEIVED -> ESTABLISHED 2451 * SYN-RECEIVED* -> FIN-WAIT-1 2452 */ 2453 tp->t_starttime = ticks; 2454 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) { 2455 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 2456 tp->t_tfo_pending = NULL; 2457 } 2458 if (tp->t_flags & TF_NEEDFIN) { 2459 tp->t_acktime = ticks; 2460 tcp_state_change(tp, TCPS_FIN_WAIT_1); 2461 tp->t_flags &= ~TF_NEEDFIN; 2462 } else { 2463 tcp_state_change(tp, TCPS_ESTABLISHED); 2464 TCP_PROBE5(accept__established, NULL, tp, 2465 m, tp, th); 2466 /* 2467 * TFO connections call cc_conn_init() during SYN 2468 * processing. Calling it again here for such 2469 * connections is not harmless as it would undo the 2470 * snd_cwnd reduction that occurs when a TFO SYN|ACK 2471 * is retransmitted. 2472 */ 2473 if (!(tp->t_flags & TF_FASTOPEN)) 2474 cc_conn_init(tp); 2475 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 2476 } 2477 /* 2478 * Account for the ACK of our SYN prior to 2479 * regular ACK processing below, except for 2480 * simultaneous SYN, which is handled later. 2481 */ 2482 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 2483 incforsyn = 1; 2484 /* 2485 * If segment contains data or ACK, will call tcp_reass() 2486 * later; if not, do so now to pass queued data to user. 2487 */ 2488 if (tlen == 0 && (thflags & TH_FIN) == 0) { 2489 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 2490 (struct mbuf *)0); 2491 tcp_handle_wakeup(tp); 2492 } 2493 tp->snd_wl1 = th->th_seq - 1; 2494 /* FALLTHROUGH */ 2495 2496 /* 2497 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2498 * ACKs. If the ack is in the range 2499 * tp->snd_una < th->th_ack <= tp->snd_max 2500 * then advance tp->snd_una to th->th_ack and drop 2501 * data from the retransmission queue. If this ACK reflects 2502 * more up to date window information we update our window information. 2503 */ 2504 case TCPS_ESTABLISHED: 2505 case TCPS_FIN_WAIT_1: 2506 case TCPS_FIN_WAIT_2: 2507 case TCPS_CLOSE_WAIT: 2508 case TCPS_CLOSING: 2509 case TCPS_LAST_ACK: 2510 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2511 TCPSTAT_INC(tcps_rcvacktoomuch); 2512 goto dropafterack; 2513 } 2514 if (tcp_is_sack_recovery(tp, &to)) { 2515 sack_changed = tcp_sack_doack(tp, &to, th->th_ack); 2516 if ((sack_changed != SACK_NOCHANGE) && 2517 (tp->t_flags & TF_LRD)) { 2518 tcp_sack_lost_retransmission(tp, th); 2519 } 2520 } else 2521 /* 2522 * Reset the value so that previous (valid) value 2523 * from the last ack with SACK doesn't get used. 2524 */ 2525 tp->sackhint.sacked_bytes = 0; 2526 2527 #ifdef TCP_HHOOK 2528 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2529 hhook_run_tcp_est_in(tp, th, &to); 2530 #endif 2531 2532 if (SEQ_LT(th->th_ack, tp->snd_una)) { 2533 /* This is old ACK information, don't process it. */ 2534 break; 2535 } 2536 if (th->th_ack == tp->snd_una) { 2537 /* Check if this is a duplicate ACK. */ 2538 if ((tp->t_flags & TF_SACK_PERMIT) && 2539 V_tcp_do_newsack) { 2540 /* 2541 * If SEG.ACK == SND.UNA, RFC 6675 requires a 2542 * duplicate ACK to selectively acknowledge 2543 * at least one byte, which was not selectively 2544 * acknowledged before. 
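                 *
                 * Hedged contrast of the two rules: an ACK at snd_una
                 * whose SACK option covers a previously un-SACKed byte
                 * is a duplicate under this RFC 6675 rule, while the
                 * RFC 5681 rule below additionally demands no data and
                 * an unchanged window.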
2545 */ 2546 if (sack_changed == SACK_NOCHANGE) { 2547 break; 2548 } 2549 } else { 2550 /* 2551 * If SEG.ACK == SND.UNA, RFC 5681 requires a 2552 * duplicate ACK to have no data on it and to 2553 * not be a window update. 2554 */ 2555 if (!no_data || tiwin != tp->snd_wnd) { 2556 break; 2557 } 2558 } 2559 /* 2560 * If this is the first time we've seen a 2561 * FIN from the remote, this is not a 2562 * duplicate ACK and it needs to be processed 2563 * normally. 2564 * This happens during a simultaneous close. 2565 */ 2566 if ((thflags & TH_FIN) && 2567 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 2568 tp->t_dupacks = 0; 2569 break; 2570 } 2571 /* Perform duplicate ACK processing. */ 2572 TCPSTAT_INC(tcps_rcvdupack); 2573 maxseg = tcp_maxseg(tp); 2574 if (!tcp_timer_active(tp, TT_REXMT)) { 2575 tp->t_dupacks = 0; 2576 } else if (++tp->t_dupacks > tcprexmtthresh || 2577 IN_FASTRECOVERY(tp->t_flags)) { 2578 cc_ack_received(tp, th, nsegs, CC_DUPACK); 2579 if (V_tcp_do_prr && 2580 IN_FASTRECOVERY(tp->t_flags) && 2581 (tp->t_flags & TF_SACK_PERMIT)) { 2582 tcp_do_prr_ack(tp, th, &to, 2583 sack_changed, &maxseg); 2584 } else if (tcp_is_sack_recovery(tp, &to) && 2585 IN_FASTRECOVERY(tp->t_flags) && 2586 (tp->snd_nxt == tp->snd_max)) { 2587 int awnd; 2588 2589 /* 2590 * Compute the amount of data in flight first. 2591 * We can inject new data into the pipe iff 2592 * we have less than ssthresh 2593 * worth of data in flight. 2594 */ 2595 awnd = tcp_compute_pipe(tp); 2596 if (awnd < tp->snd_ssthresh) { 2597 tp->snd_cwnd += imax(maxseg, 2598 imin(2 * maxseg, 2599 tp->sackhint.delivered_data)); 2600 if (tp->snd_cwnd > tp->snd_ssthresh) 2601 tp->snd_cwnd = tp->snd_ssthresh; 2602 } 2603 } else if (tcp_is_sack_recovery(tp, &to) && 2604 IN_FASTRECOVERY(tp->t_flags) && 2605 SEQ_LT(tp->snd_nxt, tp->snd_max)) { 2606 tp->snd_cwnd += imax(maxseg, 2607 imin(2 * maxseg, 2608 tp->sackhint.delivered_data)); 2609 } else { 2610 tp->snd_cwnd += maxseg; 2611 } 2612 (void) tcp_output(tp); 2613 goto drop; 2614 } else if (tp->t_dupacks == tcprexmtthresh || 2615 (tp->t_flags & TF_SACK_PERMIT && 2616 V_tcp_do_newsack && 2617 tp->sackhint.sacked_bytes > 2618 (tcprexmtthresh - 1) * maxseg)) { 2619 enter_recovery: 2620 /* 2621 * Above is the RFC6675 trigger condition of 2622 * more than (dupthresh-1)*maxseg sacked data. 2623 * If the count of holes in the 2624 * scoreboard is >= dupthresh, we could 2625 * also enter loss recovery, but don't 2626 * have that value readily available. 2627 */ 2628 tp->t_dupacks = tcprexmtthresh; 2629 tcp_seq onxt = tp->snd_nxt; 2630 2631 /* 2632 * If we're doing sack, check to 2633 * see if we're already in sack 2634 * recovery. If we're not doing sack, 2635 * check to see if we're in newreno 2636 * recovery. 2637 */ 2638 if (tcp_is_sack_recovery(tp, &to)) { 2639 if (IN_FASTRECOVERY(tp->t_flags)) { 2640 tp->t_dupacks = 0; 2641 break; 2642 } 2643 } else { 2644 if (SEQ_LEQ(th->th_ack, 2645 tp->snd_recover)) { 2646 tp->t_dupacks = 0; 2647 break; 2648 } 2649 } 2650 /* Congestion signal before ack. */ 2651 cc_cong_signal(tp, th, CC_NDUPACK); 2652 cc_ack_received(tp, th, nsegs, CC_DUPACK); 2653 tcp_timer_activate(tp, TT_REXMT, 0); 2654 tp->t_rtttime = 0; 2655 if (V_tcp_do_prr) { 2656 /* 2657 * snd_ssthresh and snd_recover are 2658 * already updated by cc_cong_signal. 
2659 */ 2660 if (tcp_is_sack_recovery(tp, &to)) { 2661 /* 2662 * Include Limited Transmit 2663 * segments here 2664 */ 2665 tp->sackhint.prr_delivered = 2666 imin(tp->snd_max - th->th_ack, 2667 (tp->snd_limited + 1) * maxseg); 2668 } else { 2669 tp->sackhint.prr_delivered = 2670 maxseg; 2671 } 2672 tp->sackhint.recover_fs = max(1, 2673 tp->snd_nxt - tp->snd_una); 2674 } 2675 tp->snd_limited = 0; 2676 if (tcp_is_sack_recovery(tp, &to)) { 2677 TCPSTAT_INC(tcps_sack_recovery_episode); 2678 /* 2679 * When entering LR after RTO due to 2680 * Duplicate ACKs, retransmit existing 2681 * holes from the scoreboard. 2682 */ 2683 tcp_resend_sackholes(tp); 2684 /* Avoid inflating cwnd in tcp_output */ 2685 tp->snd_nxt = tp->snd_max; 2686 tp->snd_cwnd = tcp_compute_pipe(tp) + 2687 maxseg; 2688 (void) tcp_output(tp); 2689 /* Set cwnd to the expected flightsize */ 2690 tp->snd_cwnd = tp->snd_ssthresh; 2691 goto drop; 2692 } 2693 tp->snd_nxt = th->th_ack; 2694 tp->snd_cwnd = maxseg; 2695 (void) tcp_output(tp); 2696 KASSERT(tp->snd_limited <= 2, 2697 ("%s: tp->snd_limited too big", 2698 __func__)); 2699 tp->snd_cwnd = tp->snd_ssthresh + 2700 maxseg * 2701 (tp->t_dupacks - tp->snd_limited); 2702 if (SEQ_GT(onxt, tp->snd_nxt)) 2703 tp->snd_nxt = onxt; 2704 goto drop; 2705 } else if (V_tcp_do_rfc3042) { 2706 /* 2707 * Process first and second duplicate 2708 * ACKs. Each indicates a segment 2709 * leaving the network, creating room 2710 * for more. Make sure we can send a 2711 * packet on reception of each duplicate 2712 * ACK by increasing snd_cwnd by one 2713 * segment. Restore the original 2714 * snd_cwnd after packet transmission. 2715 */ 2716 cc_ack_received(tp, th, nsegs, CC_DUPACK); 2717 uint32_t oldcwnd = tp->snd_cwnd; 2718 tcp_seq oldsndmax = tp->snd_max; 2719 u_int sent; 2720 int avail; 2721 2722 KASSERT(tp->t_dupacks == 1 || 2723 tp->t_dupacks == 2, 2724 ("%s: dupacks not 1 or 2", 2725 __func__)); 2726 if (tp->t_dupacks == 1) 2727 tp->snd_limited = 0; 2728 if ((tp->snd_nxt == tp->snd_max) && 2729 (tp->t_rxtshift == 0)) 2730 tp->snd_cwnd = 2731 SEQ_SUB(tp->snd_nxt, tp->snd_una); 2732 tp->snd_cwnd += 2733 (tp->t_dupacks - tp->snd_limited) * maxseg; 2734 tp->snd_cwnd -= tcp_sack_adjust(tp); 2735 /* 2736 * Only call tcp_output when there 2737 * is new data available to be sent 2738 * or we need to send an ACK. 2739 */ 2740 SOCK_SENDBUF_LOCK(so); 2741 avail = sbavail(&so->so_snd); 2742 SOCK_SENDBUF_UNLOCK(so); 2743 if (tp->t_flags & TF_ACKNOW || 2744 (avail >= 2745 SEQ_SUB(tp->snd_nxt, tp->snd_una))) { 2746 (void) tcp_output(tp); 2747 } 2748 sent = SEQ_SUB(tp->snd_max, oldsndmax); 2749 if (sent > maxseg) { 2750 KASSERT((tp->t_dupacks == 2 && 2751 tp->snd_limited == 0) || 2752 (sent == maxseg + 1 && 2753 tp->t_flags & TF_SENTFIN) || 2754 (sent < 2 * maxseg && 2755 tp->t_flags & TF_NODELAY), 2756 ("%s: sent too much: %u>%u", 2757 __func__, sent, maxseg)); 2758 tp->snd_limited = 2; 2759 } else if (sent > 0) { 2760 ++tp->snd_limited; 2761 } 2762 tp->snd_cwnd = oldcwnd; 2763 goto drop; 2764 } 2765 break; 2766 } 2767 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 2768 ("%s: SEQ_LEQ(th_ack, snd_una)", __func__)); 2769 /* 2770 * This ack is advancing the left edge, reset the 2771 * counter. 2772 */ 2773 tp->t_dupacks = 0; 2774 /* 2775 * If this ack also has new SACK info, increment the 2776 * t_dupacks as per RFC 6675. 
         * The variable sack_changed tracks all changes to the SACK
         * scoreboard, including when partial ACKs without SACK options
         * are received and the scoreboard is cleared from the left
         * side.  Such partial ACKs should not be counted as dupacks
         * here.
         */
        if (V_tcp_do_newsack &&
            tcp_is_sack_recovery(tp, &to) &&
            (((tp->t_rxtshift == 0) && (sack_changed != SACK_NOCHANGE)) ||
             ((tp->t_rxtshift > 0) && (sack_changed == SACK_NEWLOSS))) &&
            (tp->snd_nxt == tp->snd_max)) {
            tp->t_dupacks++;
            /* limit overhead by setting maxseg last */
            if (!IN_FASTRECOVERY(tp->t_flags) &&
                (tp->sackhint.sacked_bytes >
                (tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp)))) {
                goto enter_recovery;
            }
        }
        /*
         * If the congestion window was inflated to account
         * for the other side's cached packets, retract it.
         */
        if (SEQ_LT(th->th_ack, tp->snd_recover)) {
            if (IN_FASTRECOVERY(tp->t_flags)) {
                if (tp->t_flags & TF_SACK_PERMIT) {
                    if (V_tcp_do_prr &&
                        (to.to_flags & TOF_SACK)) {
                        tcp_timer_activate(tp,
                            TT_REXMT, 0);
                        tp->t_rtttime = 0;
                        tcp_do_prr_ack(tp, th, &to,
                            sack_changed, &maxseg);
                        tp->t_flags |= TF_ACKNOW;
                        (void) tcp_output(tp);
                    } else {
                        tcp_sack_partialack(tp, th,
                            &maxseg);
                    }
                } else {
                    tcp_newreno_partial_ack(tp, th);
                }
            } else if (IN_CONGRECOVERY(tp->t_flags) &&
                (V_tcp_do_prr)) {
                tp->sackhint.delivered_data =
                    BYTES_THIS_ACK(tp, th);
                tp->snd_fack = th->th_ack;
                /*
                 * During ECN cwnd reduction
                 * always use PRR-SSRB
                 */
                tcp_do_prr_ack(tp, th, &to, SACK_CHANGE,
                    &maxseg);
                (void) tcp_output(tp);
            }
        }
        /*
         * If we reach this point, ACK is not a duplicate,
         * i.e., it ACKs something we sent.
         */
        if (tp->t_flags & TF_NEEDSYN) {
            /*
             * T/TCP: Connection was half-synchronized, and our
             * SYN has been ACK'd (so connection is now fully
             * synchronized).  Go to non-starred state,
             * increment snd_una for ACK of SYN, and check if
             * we can do window scaling.
             */
            tp->t_flags &= ~TF_NEEDSYN;
            tp->snd_una++;
            /* Do window scaling? */
            if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
                (TF_RCVD_SCALE|TF_REQ_SCALE)) {
                tp->rcv_scale = tp->request_r_scale;
                /* Send window already scaled. */
            }
        }

process_ACK:
        INP_WLOCK_ASSERT(inp);

        /*
         * Adjust for the SYN bit in sequence space,
         * but don't account for it in cwnd calculations.
         * This is for the SYN_RECEIVED, non-simultaneous
         * SYN case.  SYN_SENT and simultaneous SYN are
         * treated elsewhere.
         */
        if (incforsyn)
            tp->snd_una++;
        acked = BYTES_THIS_ACK(tp, th);
        KASSERT(acked >= 0, ("%s: acked unexpectedly negative "
            "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
            tp->snd_una, th->th_ack, tp, m));
        TCPSTAT_ADD(tcps_rcvackpack, nsegs);
        TCPSTAT_ADD(tcps_rcvackbyte, acked);

        /*
         * If we just performed our first retransmit, and the ACK
         * arrives within our recovery window, then it was a mistake
         * to do the retransmit in the first place.  Recover our
         * original cwnd and ssthresh, and proceed to transmit where
         * we left off.
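         *
         * A hedged reading of the check below: it only opens after the
         * first retransmit (t_rxtshift == 1 with TF_PREVVALID), and an
         * ACK whose timestamp echo predates t_badrxtwin must answer
         * the original transmission, so the retransmit was spurious
         * and CC_RTO_ERR lets the congestion module roll back
         * cwnd/ssthresh.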
2880 */ 2881 if (tp->t_rxtshift == 1 && 2882 tp->t_flags & TF_PREVVALID && 2883 tp->t_badrxtwin != 0 && 2884 to.to_flags & TOF_TS && 2885 to.to_tsecr != 0 && 2886 TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) 2887 cc_cong_signal(tp, th, CC_RTO_ERR); 2888 2889 /* 2890 * If we have a timestamp reply, update smoothed 2891 * round trip time. If no timestamp is present but 2892 * transmit timer is running and timed sequence 2893 * number was acked, update smoothed round trip time. 2894 * Since we now have an rtt measurement, cancel the 2895 * timer backoff (cf., Phil Karn's retransmit alg.). 2896 * Recompute the initial retransmit timer. 2897 * 2898 * Some boxes send broken timestamp replies 2899 * during the SYN+ACK phase, ignore 2900 * timestamps of 0 or we could calculate a 2901 * huge RTT and blow up the retransmit timer. 2902 */ 2903 if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { 2904 uint32_t t; 2905 2906 t = tcp_ts_getticks() - to.to_tsecr; 2907 if (!tp->t_rttlow || tp->t_rttlow > t) 2908 tp->t_rttlow = t; 2909 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 2910 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2911 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2912 tp->t_rttlow = ticks - tp->t_rtttime; 2913 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2914 } 2915 2916 SOCK_SENDBUF_LOCK(so); 2917 /* 2918 * Clear t_acktime if remote side has ACKd all data in the 2919 * socket buffer and FIN (if applicable). 2920 * Otherwise, update t_acktime if we received a sufficiently 2921 * large ACK. 2922 */ 2923 if ((tp->t_state <= TCPS_CLOSE_WAIT && 2924 acked == sbavail(&so->so_snd)) || 2925 acked > sbavail(&so->so_snd)) 2926 tp->t_acktime = 0; 2927 else if (acked > 1) 2928 tp->t_acktime = ticks; 2929 2930 /* 2931 * If all outstanding data is acked, stop retransmit 2932 * timer and remember to restart (more output or persist). 2933 * If there is more data to be acked, restart retransmit 2934 * timer, using current (possibly backed-off) value. 2935 */ 2936 if (th->th_ack == tp->snd_max) { 2937 tcp_timer_activate(tp, TT_REXMT, 0); 2938 needoutput = 1; 2939 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2940 tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); 2941 2942 /* 2943 * If no data (only SYN) was ACK'd, 2944 * skip rest of ACK processing. 2945 */ 2946 if (acked == 0) { 2947 SOCK_SENDBUF_UNLOCK(so); 2948 goto step6; 2949 } 2950 2951 /* 2952 * Let the congestion control algorithm update congestion 2953 * control related information. This typically means increasing 2954 * the congestion window. 2955 */ 2956 cc_ack_received(tp, th, nsegs, CC_ACK); 2957 2958 if (acked > sbavail(&so->so_snd)) { 2959 if (tp->snd_wnd >= sbavail(&so->so_snd)) 2960 tp->snd_wnd -= sbavail(&so->so_snd); 2961 else 2962 tp->snd_wnd = 0; 2963 mfree = sbcut_locked(&so->so_snd, 2964 (int)sbavail(&so->so_snd)); 2965 ourfinisacked = 1; 2966 } else { 2967 mfree = sbcut_locked(&so->so_snd, acked); 2968 if (tp->snd_wnd >= (uint32_t) acked) 2969 tp->snd_wnd -= acked; 2970 else 2971 tp->snd_wnd = 0; 2972 ourfinisacked = 0; 2973 } 2974 /* NB: sowwakeup_locked() does an implicit unlock. */ 2975 sowwakeup_locked(so); 2976 m_freem(mfree); 2977 /* Detect una wraparound. 
*/ 2978 if (!IN_RECOVERY(tp->t_flags) && 2979 SEQ_GT(tp->snd_una, tp->snd_recover) && 2980 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2981 tp->snd_recover = th->th_ack - 1; 2982 tp->snd_una = th->th_ack; 2983 if (IN_RECOVERY(tp->t_flags) && 2984 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 2985 cc_post_recovery(tp, th); 2986 } 2987 if (SEQ_GT(tp->snd_una, tp->snd_recover)) { 2988 tp->snd_recover = tp->snd_una; 2989 } 2990 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2991 tp->snd_nxt = tp->snd_una; 2992 2993 switch (tp->t_state) { 2994 /* 2995 * In FIN_WAIT_1 STATE in addition to the processing 2996 * for the ESTABLISHED state if our FIN is now acknowledged 2997 * then enter FIN_WAIT_2. 2998 */ 2999 case TCPS_FIN_WAIT_1: 3000 if (ourfinisacked) { 3001 /* 3002 * If we can't receive any more 3003 * data, then closing user can proceed. 3004 * Starting the timer is contrary to the 3005 * specification, but if we don't get a FIN 3006 * we'll hang forever. 3007 */ 3008 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3009 tcp_free_sackholes(tp); 3010 soisdisconnected(so); 3011 tcp_timer_activate(tp, TT_2MSL, 3012 (tcp_fast_finwait2_recycle ? 3013 tcp_finwait2_timeout : 3014 TP_MAXIDLE(tp))); 3015 } 3016 tcp_state_change(tp, TCPS_FIN_WAIT_2); 3017 } 3018 break; 3019 3020 /* 3021 * In CLOSING STATE in addition to the processing for 3022 * the ESTABLISHED state if the ACK acknowledges our FIN 3023 * then enter the TIME-WAIT state, otherwise ignore 3024 * the segment. 3025 */ 3026 case TCPS_CLOSING: 3027 if (ourfinisacked) { 3028 tcp_twstart(tp); 3029 m_freem(m); 3030 return; 3031 } 3032 break; 3033 3034 /* 3035 * In LAST_ACK, we may still be waiting for data to drain 3036 * and/or to be acked, as well as for the ack of our FIN. 3037 * If our FIN is now acknowledged, delete the TCB, 3038 * enter the closed state and return. 3039 */ 3040 case TCPS_LAST_ACK: 3041 if (ourfinisacked) { 3042 tp = tcp_close(tp); 3043 goto drop; 3044 } 3045 break; 3046 } 3047 } 3048 3049 step6: 3050 INP_WLOCK_ASSERT(inp); 3051 3052 /* 3053 * Update window information. 3054 * Don't look at window if no ACK: TAC's send garbage on first SYN. 3055 */ 3056 if ((thflags & TH_ACK) && 3057 (SEQ_LT(tp->snd_wl1, th->th_seq) || 3058 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 3059 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 3060 /* keep track of pure window updates */ 3061 if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 3062 TCPSTAT_INC(tcps_rcvwinupd); 3063 tp->snd_wnd = tiwin; 3064 tp->snd_wl1 = th->th_seq; 3065 tp->snd_wl2 = th->th_ack; 3066 if (tp->snd_wnd > tp->max_sndwnd) 3067 tp->max_sndwnd = tp->snd_wnd; 3068 needoutput = 1; 3069 } 3070 3071 /* 3072 * Process segments with URG. 3073 */ 3074 if ((thflags & TH_URG) && th->th_urp && 3075 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 3076 /* 3077 * This is a kludge, but if we receive and accept 3078 * random urgent pointers, we'll crash in 3079 * soreceive. It's hard to imagine someone 3080 * actually wanting to send this much urgent data. 3081 */ 3082 SOCK_RECVBUF_LOCK(so); 3083 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 3084 th->th_urp = 0; /* XXX */ 3085 thflags &= ~TH_URG; /* XXX */ 3086 SOCK_RECVBUF_UNLOCK(so); /* XXX */ 3087 goto dodata; /* XXX */ 3088 } 3089 /* 3090 * If this segment advances the known urgent pointer, 3091 * then mark the data stream. This should not happen 3092 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 3093 * a FIN has been received from the remote side. 3094 * In these states we ignore the URG. 
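         * A hedged oobmark example (numbers invented): with 100 bytes
         * already queued and rcv_up == rcv_nxt + 3 after the update
         * below, so_oobmark lands at offset 102; an oobmark of 0 means
         * the very next byte is the mark (SBS_RCVATMARK).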
3095 * 3096 * According to RFC961 (Assigned Protocols), 3097 * the urgent pointer points to the last octet 3098 * of urgent data. We continue, however, 3099 * to consider it to indicate the first octet 3100 * of data past the urgent section as the original 3101 * spec states (in one of two places). 3102 */ 3103 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 3104 tp->rcv_up = th->th_seq + th->th_urp; 3105 so->so_oobmark = sbavail(&so->so_rcv) + 3106 (tp->rcv_up - tp->rcv_nxt) - 1; 3107 if (so->so_oobmark == 0) 3108 so->so_rcv.sb_state |= SBS_RCVATMARK; 3109 sohasoutofband(so); 3110 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 3111 } 3112 SOCK_RECVBUF_UNLOCK(so); 3113 /* 3114 * Remove out of band data so doesn't get presented to user. 3115 * This can happen independent of advancing the URG pointer, 3116 * but if two URG's are pending at once, some out-of-band 3117 * data may creep in... ick. 3118 */ 3119 if (th->th_urp <= (uint32_t)tlen && 3120 !(so->so_options & SO_OOBINLINE)) { 3121 /* hdr drop is delayed */ 3122 tcp_pulloutofband(so, th, m, drop_hdrlen); 3123 } 3124 } else { 3125 /* 3126 * If no out of band data is expected, 3127 * pull receive urgent pointer along 3128 * with the receive window. 3129 */ 3130 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 3131 tp->rcv_up = tp->rcv_nxt; 3132 } 3133 dodata: /* XXX */ 3134 INP_WLOCK_ASSERT(inp); 3135 3136 /* 3137 * Process the segment text, merging it into the TCP sequencing queue, 3138 * and arranging for acknowledgment of receipt if necessary. 3139 * This process logically involves adjusting tp->rcv_wnd as data 3140 * is presented to the user (this happens in tcp_usrreq.c, 3141 * case PRU_RCVD). If a FIN has already been received on this 3142 * connection then we just ignore the text. 3143 */ 3144 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 3145 (tp->t_flags & TF_FASTOPEN)); 3146 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 3147 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 3148 tcp_seq save_start = th->th_seq; 3149 tcp_seq save_rnxt = tp->rcv_nxt; 3150 int save_tlen = tlen; 3151 m_adj(m, drop_hdrlen); /* delayed header drop */ 3152 /* 3153 * Insert segment which includes th into TCP reassembly queue 3154 * with control block tp. Set thflags to whether reassembly now 3155 * includes a segment with FIN. This handles the common case 3156 * inline (segment is the next to be received on an established 3157 * connection, and the queue is empty), avoiding linkage into 3158 * and removal from the queue and repetition of various 3159 * conversions. 3160 * Set DELACK for segments received in order, but ack 3161 * immediately when segments are out of order (so 3162 * fast retransmit can work). 
3163 */ 3164 if (th->th_seq == tp->rcv_nxt && 3165 SEGQ_EMPTY(tp) && 3166 (TCPS_HAVEESTABLISHED(tp->t_state) || 3167 tfo_syn)) { 3168 if (DELAY_ACK(tp, tlen) || tfo_syn) 3169 tp->t_flags |= TF_DELACK; 3170 else 3171 tp->t_flags |= TF_ACKNOW; 3172 tp->rcv_nxt += tlen; 3173 if (tlen && 3174 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 3175 (tp->t_fbyte_in == 0)) { 3176 tp->t_fbyte_in = ticks; 3177 if (tp->t_fbyte_in == 0) 3178 tp->t_fbyte_in = 1; 3179 if (tp->t_fbyte_out && tp->t_fbyte_in) 3180 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 3181 } 3182 thflags = tcp_get_flags(th) & TH_FIN; 3183 TCPSTAT_INC(tcps_rcvpack); 3184 TCPSTAT_ADD(tcps_rcvbyte, tlen); 3185 SOCK_RECVBUF_LOCK(so); 3186 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 3187 m_freem(m); 3188 else 3189 sbappendstream_locked(&so->so_rcv, m, 0); 3190 tp->t_flags |= TF_WAKESOR; 3191 } else { 3192 /* 3193 * XXX: Due to the header drop above "th" is 3194 * theoretically invalid by now. Fortunately 3195 * m_adj() doesn't actually frees any mbufs 3196 * when trimming from the head. 3197 */ 3198 tcp_seq temp = save_start; 3199 3200 thflags = tcp_reass(tp, th, &temp, &tlen, m); 3201 tp->t_flags |= TF_ACKNOW; 3202 } 3203 if ((tp->t_flags & TF_SACK_PERMIT) && 3204 (save_tlen > 0) && 3205 TCPS_HAVEESTABLISHED(tp->t_state)) { 3206 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 3207 /* 3208 * DSACK actually handled in the fastpath 3209 * above. 3210 */ 3211 tcp_update_sack_list(tp, save_start, 3212 save_start + save_tlen); 3213 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 3214 if ((tp->rcv_numsacks >= 1) && 3215 (tp->sackblks[0].end == save_start)) { 3216 /* 3217 * Partial overlap, recorded at todrop 3218 * above. 3219 */ 3220 tcp_update_sack_list(tp, 3221 tp->sackblks[0].start, 3222 tp->sackblks[0].end); 3223 } else { 3224 tcp_update_dsack_list(tp, save_start, 3225 save_start + save_tlen); 3226 } 3227 } else if (tlen >= save_tlen) { 3228 /* Update of sackblks. */ 3229 tcp_update_dsack_list(tp, save_start, 3230 save_start + save_tlen); 3231 } else if (tlen > 0) { 3232 tcp_update_dsack_list(tp, save_start, 3233 save_start + tlen); 3234 } 3235 } 3236 tcp_handle_wakeup(tp); 3237 #if 0 3238 /* 3239 * Note the amount of data that peer has sent into 3240 * our window, in order to estimate the sender's 3241 * buffer size. 3242 * XXX: Unused. 3243 */ 3244 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 3245 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 3246 else 3247 len = so->so_rcv.sb_hiwat; 3248 #endif 3249 } else { 3250 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 3251 if (tlen > 0) { 3252 if ((thflags & TH_FIN) != 0) { 3253 log(LOG_DEBUG, "%s; %s: %s: " 3254 "Received %d bytes of data and FIN " 3255 "after having received a FIN, " 3256 "just dropping both\n", 3257 s, __func__, 3258 tcpstates[tp->t_state], tlen); 3259 } else { 3260 log(LOG_DEBUG, "%s; %s: %s: " 3261 "Received %d bytes of data " 3262 "after having received a FIN, " 3263 "just dropping it\n", 3264 s, __func__, 3265 tcpstates[tp->t_state], tlen); 3266 } 3267 } else { 3268 if ((thflags & TH_FIN) != 0) { 3269 log(LOG_DEBUG, "%s; %s: %s: " 3270 "Received FIN " 3271 "after having received a FIN, " 3272 "just dropping it\n", 3273 s, __func__, 3274 tcpstates[tp->t_state]); 3275 } 3276 } 3277 free(s, M_TCPLOG); 3278 } 3279 m_freem(m); 3280 thflags &= ~TH_FIN; 3281 } 3282 3283 /* 3284 * If FIN is received ACK the FIN and let the user know 3285 * that the connection is closing. 
3286 */ 3287 if (thflags & TH_FIN) { 3288 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 3289 /* The socket upcall is handled by socantrcvmore. */ 3290 socantrcvmore(so); 3291 /* 3292 * If connection is half-synchronized 3293 * (ie NEEDSYN flag on) then delay ACK, 3294 * so it may be piggybacked when SYN is sent. 3295 * Otherwise, since we received a FIN then no 3296 * more input can be expected, send ACK now. 3297 */ 3298 if (tp->t_flags & TF_NEEDSYN) 3299 tp->t_flags |= TF_DELACK; 3300 else 3301 tp->t_flags |= TF_ACKNOW; 3302 tp->rcv_nxt++; 3303 } 3304 switch (tp->t_state) { 3305 /* 3306 * In SYN_RECEIVED and ESTABLISHED STATES 3307 * enter the CLOSE_WAIT state. 3308 */ 3309 case TCPS_SYN_RECEIVED: 3310 tp->t_starttime = ticks; 3311 /* FALLTHROUGH */ 3312 case TCPS_ESTABLISHED: 3313 tcp_state_change(tp, TCPS_CLOSE_WAIT); 3314 break; 3315 3316 /* 3317 * If still in FIN_WAIT_1 STATE FIN has not been acked so 3318 * enter the CLOSING state. 3319 */ 3320 case TCPS_FIN_WAIT_1: 3321 tcp_state_change(tp, TCPS_CLOSING); 3322 break; 3323 3324 /* 3325 * In FIN_WAIT_2 state enter the TIME_WAIT state, 3326 * starting the time-wait timer, turning off the other 3327 * standard timers. 3328 */ 3329 case TCPS_FIN_WAIT_2: 3330 tcp_twstart(tp); 3331 return; 3332 } 3333 } 3334 TCP_PROBE3(debug__input, tp, th, m); 3335 3336 /* 3337 * Return any desired output. 3338 */ 3339 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 3340 (void) tcp_output(tp); 3341 } 3342 check_delack: 3343 INP_WLOCK_ASSERT(inp); 3344 3345 if (tp->t_flags & TF_DELACK) { 3346 tp->t_flags &= ~TF_DELACK; 3347 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 3348 } 3349 INP_WUNLOCK(inp); 3350 return; 3351 3352 dropafterack: 3353 /* 3354 * Generate an ACK dropping incoming segment if it occupies 3355 * sequence space, where the ACK reflects our state. 3356 * 3357 * We can now skip the test for the RST flag since all 3358 * paths to this code happen after packets containing 3359 * RST have been dropped. 3360 * 3361 * In the SYN-RECEIVED state, don't send an ACK unless the 3362 * segment we received passes the SYN-RECEIVED ACK test. 3363 * If it fails send a RST. This breaks the loop in the 3364 * "LAND" DoS attack, and also prevents an ACK storm 3365 * between two listening ports that have been sent forged 3366 * SYN segments, each with the source address of the other. 3367 */ 3368 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 3369 (SEQ_GT(tp->snd_una, th->th_ack) || 3370 SEQ_GT(th->th_ack, tp->snd_max)) ) { 3371 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 3372 goto dropwithreset; 3373 } 3374 TCP_PROBE3(debug__input, tp, th, m); 3375 tp->t_flags |= TF_ACKNOW; 3376 (void) tcp_output(tp); 3377 INP_WUNLOCK(inp); 3378 m_freem(m); 3379 return; 3380 3381 dropwithreset: 3382 tcp_dropwithreset(m, th, tp, tlen); 3383 if (tp != NULL) { 3384 INP_WUNLOCK(inp); 3385 } 3386 return; 3387 3388 drop: 3389 /* 3390 * Drop space held by incoming segment and return. 3391 */ 3392 TCP_PROBE3(debug__input, tp, th, m); 3393 if (tp != NULL) { 3394 INP_WUNLOCK(inp); 3395 } 3396 m_freem(m); 3397 } 3398 3399 /* 3400 * Issue RST and make ACK acceptable to originator of segment. 3401 * The mbuf must still include the original packet header. 3402 * tp may be NULL. 
3403 */ 3404 void 3405 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen) 3406 { 3407 #ifdef INET 3408 struct ip *ip; 3409 #endif 3410 #ifdef INET6 3411 struct ip6_hdr *ip6; 3412 #endif 3413 3414 if (tp != NULL) { 3415 INP_LOCK_ASSERT(tptoinpcb(tp)); 3416 } 3417 3418 /* Don't bother if destination was broadcast/multicast. */ 3419 if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 3420 goto drop; 3421 #ifdef INET6 3422 if (mtod(m, struct ip *)->ip_v == 6) { 3423 ip6 = mtod(m, struct ip6_hdr *); 3424 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 3425 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 3426 goto drop; 3427 /* IPv6 anycast check is done at tcp6_input() */ 3428 } 3429 #endif 3430 #if defined(INET) && defined(INET6) 3431 else 3432 #endif 3433 #ifdef INET 3434 { 3435 ip = mtod(m, struct ip *); 3436 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 3437 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 3438 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 3439 in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 3440 goto drop; 3441 } 3442 #endif 3443 3444 /* Perform bandwidth limiting. */ 3445 if (badport_bandlim(BANDLIM_TCP_RST) < 0) 3446 goto drop; 3447 3448 /* tcp_respond consumes the mbuf chain. */ 3449 if (tcp_get_flags(th) & TH_ACK) { 3450 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 3451 th->th_ack, TH_RST); 3452 } else { 3453 if (tcp_get_flags(th) & TH_SYN) 3454 tlen++; 3455 if (tcp_get_flags(th) & TH_FIN) 3456 tlen++; 3457 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 3458 (tcp_seq)0, TH_RST|TH_ACK); 3459 } 3460 return; 3461 drop: 3462 m_freem(m); 3463 } 3464 3465 /* 3466 * Parse TCP options and place in tcpopt. 3467 */ 3468 void 3469 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 3470 { 3471 int opt, optlen; 3472 3473 to->to_flags = 0; 3474 for (; cnt > 0; cnt -= optlen, cp += optlen) { 3475 opt = cp[0]; 3476 if (opt == TCPOPT_EOL) 3477 break; 3478 if (opt == TCPOPT_NOP) 3479 optlen = 1; 3480 else { 3481 if (cnt < 2) 3482 break; 3483 optlen = cp[1]; 3484 if (optlen < 2 || optlen > cnt) 3485 break; 3486 } 3487 switch (opt) { 3488 case TCPOPT_MAXSEG: 3489 if (optlen != TCPOLEN_MAXSEG) 3490 continue; 3491 if (!(flags & TO_SYN)) 3492 continue; 3493 to->to_flags |= TOF_MSS; 3494 bcopy((char *)cp + 2, 3495 (char *)&to->to_mss, sizeof(to->to_mss)); 3496 to->to_mss = ntohs(to->to_mss); 3497 break; 3498 case TCPOPT_WINDOW: 3499 if (optlen != TCPOLEN_WINDOW) 3500 continue; 3501 if (!(flags & TO_SYN)) 3502 continue; 3503 to->to_flags |= TOF_SCALE; 3504 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 3505 break; 3506 case TCPOPT_TIMESTAMP: 3507 if (optlen != TCPOLEN_TIMESTAMP) 3508 continue; 3509 to->to_flags |= TOF_TS; 3510 bcopy((char *)cp + 2, 3511 (char *)&to->to_tsval, sizeof(to->to_tsval)); 3512 to->to_tsval = ntohl(to->to_tsval); 3513 bcopy((char *)cp + 6, 3514 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 3515 to->to_tsecr = ntohl(to->to_tsecr); 3516 break; 3517 case TCPOPT_SIGNATURE: 3518 /* 3519 * In order to reply to a host which has set the 3520 * TCP_SIGNATURE option in its initial SYN, we have 3521 * to record the fact that the option was observed 3522 * here for the syncache code to perform the correct 3523 * response. 
3524 */ 3525 if (optlen != TCPOLEN_SIGNATURE) 3526 continue; 3527 to->to_flags |= TOF_SIGNATURE; 3528 to->to_signature = cp + 2; 3529 break; 3530 case TCPOPT_SACK_PERMITTED: 3531 if (optlen != TCPOLEN_SACK_PERMITTED) 3532 continue; 3533 if (!(flags & TO_SYN)) 3534 continue; 3535 if (!V_tcp_do_sack) 3536 continue; 3537 to->to_flags |= TOF_SACKPERM; 3538 break; 3539 case TCPOPT_SACK: 3540 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 3541 continue; 3542 if (flags & TO_SYN) 3543 continue; 3544 to->to_flags |= TOF_SACK; 3545 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 3546 to->to_sacks = cp + 2; 3547 TCPSTAT_INC(tcps_sack_rcv_blocks); 3548 break; 3549 case TCPOPT_FAST_OPEN: 3550 /* 3551 * Cookie length validation is performed by the 3552 * server side cookie checking code or the client 3553 * side cookie cache update code. 3554 */ 3555 if (!(flags & TO_SYN)) 3556 continue; 3557 if (!V_tcp_fastopen_client_enable && 3558 !V_tcp_fastopen_server_enable) 3559 continue; 3560 to->to_flags |= TOF_FASTOPEN; 3561 to->to_tfo_len = optlen - 2; 3562 to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; 3563 break; 3564 default: 3565 continue; 3566 } 3567 } 3568 } 3569 3570 /* 3571 * Pull out of band byte out of a segment so 3572 * it doesn't appear in the user's data queue. 3573 * It is still reflected in the segment length for 3574 * sequencing purposes. 3575 */ 3576 void 3577 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 3578 int off) 3579 { 3580 int cnt = off + th->th_urp - 1; 3581 3582 while (cnt >= 0) { 3583 if (m->m_len > cnt) { 3584 char *cp = mtod(m, caddr_t) + cnt; 3585 struct tcpcb *tp = sototcpcb(so); 3586 3587 INP_WLOCK_ASSERT(tptoinpcb(tp)); 3588 3589 tp->t_iobc = *cp; 3590 tp->t_oobflags |= TCPOOB_HAVEDATA; 3591 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 3592 m->m_len--; 3593 if (m->m_flags & M_PKTHDR) 3594 m->m_pkthdr.len--; 3595 return; 3596 } 3597 cnt -= m->m_len; 3598 m = m->m_next; 3599 if (m == NULL) 3600 break; 3601 } 3602 panic("tcp_pulloutofband"); 3603 } 3604 3605 /* 3606 * Collect new round-trip time estimate 3607 * and update averages and current timeout. 3608 */ 3609 void 3610 tcp_xmit_timer(struct tcpcb *tp, int rtt) 3611 { 3612 int delta; 3613 3614 INP_WLOCK_ASSERT(tptoinpcb(tp)); 3615 3616 TCPSTAT_INC(tcps_rttupdated); 3617 if (tp->t_rttupdated < UCHAR_MAX) 3618 tp->t_rttupdated++; 3619 #ifdef STATS 3620 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, 3621 imax(0, rtt * 1000 / hz)); 3622 #endif 3623 if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { 3624 /* 3625 * srtt is stored as fixed point with 5 bits after the 3626 * binary point (i.e., scaled by 8). The following magic 3627 * is equivalent to the smoothing algorithm in rfc793 with 3628 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 3629 * point). Adjust rtt to origin 0. 3630 */ 3631 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3632 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3633 3634 if ((tp->t_srtt += delta) <= 0) 3635 tp->t_srtt = 1; 3636 3637 /* 3638 * We accumulate a smoothed rtt variance (actually, a 3639 * smoothed mean difference), then set the retransmit 3640 * timer to smoothed rtt + 4 times the smoothed variance. 3641 * rttvar is stored as fixed point with 4 bits after the 3642 * binary point (scaled by 16). The following is 3643 * equivalent to rfc793 smoothing with an alpha of .75 3644 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 3645 * rfc793's wired-in beta. 
3646 */ 3647 if (delta < 0) 3648 delta = -delta; 3649 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3650 if ((tp->t_rttvar += delta) <= 0) 3651 tp->t_rttvar = 1; 3652 } else { 3653 /* 3654 * No rtt measurement yet - use the unsmoothed rtt. 3655 * Set the variance to half the rtt (so our first 3656 * retransmit happens at 3*rtt). 3657 */ 3658 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3659 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3660 } 3661 tp->t_rtttime = 0; 3662 tp->t_rxtshift = 0; 3663 3664 /* 3665 * the retransmit should happen at rtt + 4 * rttvar. 3666 * Because of the way we do the smoothing, srtt and rttvar 3667 * will each average +1/2 tick of bias. When we compute 3668 * the retransmit timer, we want 1/2 tick of rounding and 3669 * 1 extra tick because of +-1/2 tick uncertainty in the 3670 * firing of the timer. The bias will give us exactly the 3671 * 1.5 tick we need. But, because the bias is 3672 * statistical, we have to test that we don't drop below 3673 * the minimum feasible timer (which is 2 ticks). 3674 */ 3675 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3676 max(tp->t_rttmin, rtt + 2), tcp_rexmit_max); 3677 3678 /* 3679 * We received an ack for a packet that wasn't retransmitted; 3680 * it is probably safe to discard any error indications we've 3681 * received recently. This isn't quite right, but close enough 3682 * for now (a route might have failed after we sent a segment, 3683 * and the return path might not be symmetrical). 3684 */ 3685 tp->t_softerror = 0; 3686 } 3687 3688 /* 3689 * Determine a reasonable value for maxseg size. 3690 * If the route is known, check route for mtu. 3691 * If none, use an mss that can be handled on the outgoing interface 3692 * without forcing IP to fragment. If no route is found, route has no mtu, 3693 * or the destination isn't local, use a default, hopefully conservative 3694 * size (usually 512 or the default IP max size, but no more than the mtu 3695 * of the interface), as we can't discover anything about intervening 3696 * gateways or networks. We also initialize the congestion/slow start 3697 * window to be a single segment if the destination isn't local. 3698 * While looking at the routing entry, we also initialize other path-dependent 3699 * parameters from pre-set or cached values in the routing entry. 3700 * 3701 * NOTE that resulting t_maxseg doesn't include space for TCP options or 3702 * IP options, e.g. IPSEC data, since length of this data may vary, and 3703 * thus it is calculated for every segment separately in tcp_output(). 3704 * 3705 * NOTE that this routine is only called when we process an incoming 3706 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS 3707 * settings are handled in tcp_mssopt(). 3708 */ 3709 void 3710 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, 3711 struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) 3712 { 3713 int mss = 0; 3714 uint32_t maxmtu = 0; 3715 struct inpcb *inp = tptoinpcb(tp); 3716 struct hc_metrics_lite metrics; 3717 #ifdef INET6 3718 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 3719 size_t min_protoh = isipv6 ? 

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing interface
 * without forcing IP to fragment.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * NOTE that the resulting t_maxseg doesn't include space for TCP options or
 * IP options, e.g. IPSEC data, since the length of such data may vary, and
 * thus it is calculated for every segment separately in tcp_output().
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, or an ICMP "fragmentation needed" datagram.  Outgoing SYN/ACK
 * MSS settings are handled in tcp_mssopt().
 */
void
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
    struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
{
	int mss = 0;
	uint32_t maxmtu = 0;
	struct inpcb *inp = tptoinpcb(tp);
	struct hc_metrics_lite metrics;
#ifdef INET6
	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	size_t min_protoh = isipv6 ?
	    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
	    sizeof (struct tcpiphdr);
#else
	size_t min_protoh = sizeof(struct tcpiphdr);
#endif

	INP_WLOCK_ASSERT(inp);

	if (tp->t_port)
		min_protoh += V_tcp_udp_tunneling_overhead;
	if (mtuoffer != -1) {
		KASSERT(offer == -1, ("%s: conflict", __func__));
		offer = mtuoffer - min_protoh;
	}

	/* Initialize. */
#ifdef INET6
	if (isipv6) {
		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_v6mssdflt;
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_mssdflt;
	}
#endif

	/*
	 * No route to sender, stay with default mss and return.
	 */
	if (maxmtu == 0) {
		/*
		 * In case we return early we need to initialize metrics
		 * to a defined state as tcp_hc_get() would do for us
		 * if there was no cache hit.
		 */
		if (metricptr != NULL)
			bzero(metricptr, sizeof(struct hc_metrics_lite));
		return;
	}

	/* What have we got? */
	switch (offer) {
	case 0:
		/*
		 * Offer == 0 means that there was no MSS on the SYN
		 * segment; in this case we use tcp_mssdflt as
		 * already assigned to t_maxseg above.
		 */
		offer = tp->t_maxseg;
		break;

	case -1:
		/*
		 * Offer == -1 means that we didn't receive a SYN yet.
		 */
		/* FALLTHROUGH */

	default:
		/*
		 * Prevent DoS attack with too small MSS.  Round up
		 * to at least minmss.
		 */
		offer = max(offer, V_tcp_minmss);
	}

	if (metricptr == NULL)
		metricptr = &metrics;
	tcp_hc_get(&inp->inp_inc, metricptr);

	/*
	 * If there's a discovered mtu in the tcp hostcache, use it.
	 * Else, use the link mtu.
	 */
	if (metricptr->hc_mtu)
		mss = min(metricptr->hc_mtu, maxmtu) - min_protoh;
	else {
#ifdef INET6
		if (isipv6) {
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in6_localaddr(&inp->in6p_faddr))
				mss = min(mss, V_tcp_v6mssdflt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in_localaddr(inp->inp_faddr))
				mss = min(mss, V_tcp_mssdflt);
		}
#endif
		/*
		 * XXX - The above conditional (mss = maxmtu - min_protoh)
		 * probably violates the TCP spec.
		 * The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
	}
	mss = min(mss, offer);

	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	mss = max(mss, 64);

	tp->t_maxseg = mss;
	if (tp->t_maxseg < V_tcp_mssdflt) {
		/*
		 * The MSS is so small we should not process incoming
		 * SACKs since we are subject to attack in such a
		 * case.
		 */
		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
	} else {
		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
	}
}

void
tcp_mss(struct tcpcb *tp, int offer)
{
	int mss;
	uint32_t bufsize;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so;
	struct hc_metrics_lite metrics;
	struct tcp_ifcap cap;

	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));

	bzero(&cap, sizeof(cap));
	tcp_mss_update(tp, offer, -1, &metrics, &cap);

	mss = tp->t_maxseg;

	/*
	 * If there's a pipesize, change the socket buffer to that size;
	 * don't change it if sb_hiwat differs from the default (then it
	 * has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
	so = inp->inp_socket;
	SOCK_SENDBUF_LOCK(so);
	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.hc_sendpipe)
		bufsize = metrics.hc_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(so, SO_SND, bufsize, NULL);
	}
	SOCK_SENDBUF_UNLOCK(so);
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	tp->t_maxseg = max(mss, 64);
	if (tp->t_maxseg < V_tcp_mssdflt) {
		/*
		 * The MSS is so small we should not process incoming
		 * SACKs since we are subject to attack in such a
		 * case.
		 */
		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
	} else {
		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
	}

	SOCK_RECVBUF_LOCK(so);
	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.hc_recvpipe)
		bufsize = metrics.hc_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(so, SO_RCV, bufsize, NULL);
	}
	SOCK_RECVBUF_UNLOCK(so);

	/* Check the interface for TSO capabilities. */
	if (cap.ifcap & CSUM_TSO) {
		tp->t_flags |= TF_TSO;
		tp->t_tsomax = cap.tsomax;
		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
		if (cap.ipsec_tso)
			tp->t_flags2 |= TF2_IPSEC_TSO;
	}
}
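
/*
 * Illustrative numbers for the computations above: over plain Ethernet
 * (MTU 1500), min_protoh is 40 bytes for IPv4 (sizeof(struct tcpiphdr))
 * and 60 bytes for IPv6, yielding an mss of 1460 and 1440 respectively;
 * UDP-tunneled connections shrink it further by
 * V_tcp_udp_tunneling_overhead.  A send buffer of e.g. 65536 bytes is
 * then rounded up to the next multiple of the mss, 65700 (45 * 1460),
 * before being clamped to sb_max.
 */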

/*
 * Determine the MSS option to send on an outgoing SYN.
 */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;
	uint32_t thcmtu = 0;
	uint32_t maxmtu = 0;
	size_t min_protoh;

	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		mss = V_tcp_v6mssdflt;
		maxmtu = tcp_maxmtu6(inc, NULL);
		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		mss = V_tcp_mssdflt;
		maxmtu = tcp_maxmtu(inc, NULL);
		min_protoh = sizeof(struct tcpiphdr);
	}
#endif
#if defined(INET6) || defined(INET)
	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
#endif

	if (maxmtu && thcmtu)
		mss = min(maxmtu, thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		mss = max(maxmtu, thcmtu) - min_protoh;

	return (mss);
}

void
tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
    sackstatus_t sack_changed, u_int *maxsegp)
{
	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
	u_int maxseg;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (*maxsegp == 0) {
		*maxsegp = tcp_maxseg(tp);
	}
	maxseg = *maxsegp;
	/*
	 * Compute the amount of data that this ACK is indicating
	 * (del_data) and an estimate of how many bytes are in the
	 * network.
	 */
	if (tcp_is_sack_recovery(tp, to) ||
	    (IN_CONGRECOVERY(tp->t_flags) &&
	    !IN_FASTRECOVERY(tp->t_flags))) {
		del_data = tp->sackhint.delivered_data;
		pipe = tcp_compute_pipe(tp);
	} else {
		if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg +
		    tp->snd_recover - tp->snd_una)) {
			del_data = maxseg;
		}
		pipe = imax(0, tp->snd_max - tp->snd_una -
		    imin(INT_MAX / 65536, tp->t_dupacks) * maxseg);
	}
	tp->sackhint.prr_delivered += del_data;
	/*
	 * Proportional Rate Reduction
	 */
	if (pipe >= tp->snd_ssthresh) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs =
			    imax(1, tp->snd_nxt - tp->snd_una);
		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
		    tp->snd_ssthresh, tp->sackhint.recover_fs) -
		    tp->sackhint.prr_out + maxseg - 1;
	} else {
		/*
		 * PRR 6937bis heuristic:
		 * - A partial ack without a SACK block beneath snd_recover
		 *   indicates further loss.
		 * - A SACK scoreboard update adding a new hole indicates
		 *   further loss, so be conservative and send at most one
		 *   segment.
		 * - Prevent ACK splitting attacks by being conservative
		 *   when no new data is acked.
		 */
		if ((sack_changed == SACK_NEWLOSS) || (del_data == 0)) {
			limit = tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out;
		} else {
			limit = imax(tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out, del_data) +
			    maxseg;
		}
		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
	}
	snd_cnt = imax(snd_cnt, 0) / maxseg;
	/*
	 * Send snd_cnt new data into the network in response to this ack.
	 * If there is going to be a SACK retransmission, adjust snd_cwnd
	 * accordingly.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tcp_is_sack_recovery(tp, to)) {
			tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
		} else {
			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
			    (snd_cnt * maxseg);
		}
	} else if (IN_CONGRECOVERY(tp->t_flags)) {
		tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
	}
	tp->snd_cwnd = imax(maxseg, tp->snd_cwnd);
}
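
/*
 * A worked example of the proportional phase above, with illustrative
 * round numbers: take maxseg = 1000, snd_ssthresh = 10000 and
 * recover_fs = 20000.  An ACK that brings prr_delivered to 4000 while
 * prr_out is 1000 and pipe = 15000 (>= ssthresh) yields
 *
 *	snd_cnt = howmany(4000 * 10000, 20000) - 1000 + 999 = 1999
 *	        -> 1999 / 1000 = 1 segment
 *
 * i.e. new data is sent at roughly ssthresh/recover_fs (here one half)
 * of the delivery rate, so the amount in flight decays toward ssthresh.
 */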

/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces the retransmission timer
 * to be started again.
 */
void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
	tcp_seq onxt = tp->snd_nxt;
	uint32_t ocwnd = tp->snd_cwnd;
	u_int maxseg = tcp_maxseg(tp);

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	if (IN_FASTRECOVERY(tp->t_flags)) {
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond the acknowledged
		 * offset.  (tp->snd_una has not yet been updated when
		 * this function is called.)
		 */
		tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
		tp->t_flags |= TF_ACKNOW;
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
	}
	/*
	 * Partial window deflation.  Relies on the fact that
	 * tp->snd_una has not been updated yet.
	 */
	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += maxseg;
}

int
tcp_compute_pipe(struct tcpcb *tp)
{
	int pipe;

	if (tp->t_fb->tfb_compute_pipe != NULL) {
		pipe = (*tp->t_fb->tfb_compute_pipe)(tp);
	} else if (V_tcp_do_newsack) {
		pipe = tp->snd_max - tp->snd_una +
		    tp->sackhint.sack_bytes_rexmit -
		    tp->sackhint.sacked_bytes -
		    tp->sackhint.lost_bytes;
	} else {
		pipe = tp->snd_nxt - tp->snd_fack +
		    tp->sackhint.sack_bytes_rexmit;
	}
	return (imax(pipe, 0));
}

uint32_t
tcp_compute_initwnd(uint32_t maxseg)
{
	/*
	 * Calculate the Initial Window, also used as Restart Window.
	 *
	 * RFC 5681 Section 3.1 specifies the default conservative values.
	 * RFC 3390 specifies slightly more aggressive values.
	 * RFC 6928 increases it to ten segments.
	 * A user-specified value for the initial flight size is also
	 * supported.
	 */
	if (V_tcp_initcwnd_segments)
		return min(V_tcp_initcwnd_segments * maxseg,
		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
	else if (V_tcp_do_rfc3390)
		return min(4 * maxseg, max(2 * maxseg, 4380));
	else {
		/* Per RFC 5681 Section 3.1 */
		if (maxseg > 2190)
			return (2 * maxseg);
		else if (maxseg > 1095)
			return (3 * maxseg);
		else
			return (4 * maxseg);
	}
}
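
/*
 * For example, with maxseg = 1460: the RFC 6928 path with
 * V_tcp_initcwnd_segments = 10 yields min(14600, max(2920, 14600)) =
 * 14600 bytes (ten segments); the RFC 3390 path yields
 * min(5840, max(2920, 4380)) = 4380 bytes (three segments); and the
 * RFC 5681 default, since 1095 < 1460 <= 2190, is likewise
 * 3 * 1460 = 4380 bytes.
 */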