/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/arb.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/qmath.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/stats.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_rss.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_syncache.h>
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netinet/tcp_ecn.h>
#include <netinet/udp.h>

#include <netipsec/ipsec_support.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

const int tcprexmtthresh = 3;

VNET_DEFINE(int, tcp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_log_in_vain), 0,
    "Log all incoming TCP segments to closed ports");

VNET_DEFINE(int, tcp_bind_all_fibs) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_bind_all_fibs), 0,
    "Bound sockets receive traffic from all FIBs");

VNET_DEFINE(int, blackhole) = 0;
#define	V_blackhole		VNET(blackhole)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(blackhole), 0,
    "Do not send RST on segments to closed ports");

VNET_DEFINE(bool, blackhole_local) = false;
#define	V_blackhole_local	VNET(blackhole_local)
SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(blackhole_local), false,
    "Enforce net.inet.tcp.blackhole for locally originated packets");

VNET_DEFINE(int, tcp_delack_enabled) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_delack_enabled), 0,
    "Delay ACK to try and piggyback it onto a data packet");

VNET_DEFINE(int, drop_synfin) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(drop_synfin), 0,
    "Drop TCP packets with SYN+FIN set");

VNET_DEFINE(int, tcp_do_prr) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_prr), 1,
    "Enable Proportional Rate Reduction per RFC 6937");

VNET_DEFINE(int, tcp_do_newcwv) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_newcwv), 0,
    "Enable New Congestion Window Validation per RFC7661");

VNET_DEFINE(int, tcp_do_rfc3042) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3042), 0,
    "Enable RFC 3042 (Limited Transmit)");

VNET_DEFINE(int, tcp_do_rfc3390) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3390), 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
    "Slow-start flight size (initial congestion window) in number of segments");

VNET_DEFINE(int, tcp_do_rfc3465) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3465), 0,
    "Enable RFC 3465 (Appropriate Byte Counting)");

VNET_DEFINE(int, tcp_abc_l_var) = 2;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_abc_l_var), 2,
    "Cap the max cwnd increment during slow-start to this number of segments");

VNET_DEFINE(int, tcp_insecure_syn) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_syn), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");

VNET_DEFINE(int, tcp_insecure_rst) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_rst), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");

VNET_DEFINE(int, tcp_insecure_ack) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_ack), 0,
    "Follow RFC793 criteria for validating SEG.ACK");

VNET_DEFINE(int, tcp_recvspace) = 1024*64;
#define	V_tcp_recvspace	VNET(tcp_recvspace)
SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");

VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_autorcvbuf), 0,
    "Enable automatic receive buffer sizing");

VNET_DEFINE(int, tcp_autorcvbuf_max) = 8*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_autorcvbuf_max), 0,
    "Max size of automatic receive buffer");

VNET_DEFINE(struct inpcbinfo, tcbinfo);

/*
 * TCP statistics are stored in an array of counter(9)s, whose size matches
 * the size of struct tcpstat.  The running connection counts are kept in a
 * regular array.
 */
VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
    tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
    CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
    "TCP connection counts by TCP state");

/*
 * Kernel module interface for updating tcpstat.  The first argument is an
 * index into tcpstat treated as an array.
 */
void
kmod_tcpstat_add(int statnum, int val)
{

	counter_u64_add(VNET(tcpstat)[statnum], val);
}
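
/*
 * Illustrative usage sketch (not part of this file's build): a kernel TCP
 * module typically wraps kmod_tcpstat_add() in a macro that derives the
 * array index from the field's offset within struct tcpstat, in the spirit
 * of KMOD_TCPSTAT_INC() from netinet/tcp_var.h:
 *
 *	#define KMOD_TCPSTAT_INC(name)					\
 *		kmod_tcpstat_add(offsetof(struct tcpstat, name) /	\
 *		    sizeof(uint64_t), 1)
 *
 *	KMOD_TCPSTAT_INC(tcps_rcvtotal);	// count one received segment
 */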

/*
 * Make sure that we only start a SACK loss recovery when
 * receiving a duplicate ACK with a SACK block, and also
 * complete SACK loss recovery in case the other end
 * reneges.
 */
static bool inline
tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to)
{
	return ((tp->t_flags & TF_SACK_PERMIT) &&
	    ((to->to_flags & TOF_SACK) ||
	    (!TAILQ_EMPTY(&tp->snd_holes))));
}

#ifdef TCP_HHOOK
/*
 * Wrapper for the TCP established input helper hook.
 */
void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	struct tcp_hhook_data hhook_data;

	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
		hhook_data.tp = tp;
		hhook_data.th = th;
		hhook_data.to = to;

		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
		    &tp->t_osd);
	}
}
#endif

/*
 * CC wrapper hook functions
 */
void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
    uint16_t type)
{
#ifdef STATS
	int32_t gput;
#endif

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tp->t_ccv.nsegs = nsegs;
	tp->t_ccv.bytes_this_ack = BYTES_THIS_ACK(tp, th);
	if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
	    (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
	    (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2))))
		tp->t_ccv.flags |= CCF_CWND_LIMITED;
	else
		tp->t_ccv.flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
		if (!IN_RECOVERY(tp->t_flags))
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
			    tp->t_ccv.bytes_this_ack / (tcp_maxseg(tp) * nsegs));
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			/*
			 * Compute goodput in bits per millisecond.
			 */
			gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an API
			 * to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
		}
#endif /* STATS */
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += tp->t_ccv.bytes_this_ack;
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->t_ccv.flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}

	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
	}
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
}
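
/*
 * Worked example for the goodput computation above (illustrative numbers,
 * not from the source): if 125000 bytes were newly acked since gput_seq and
 * 100 ms ticks elapsed since gput_ts, then
 *
 *	gput = (125000 << 3) / 100 = 10000 bits/ms = 10 Mbit/s
 *
 * The << 3 converts bytes to bits, and the max(1, ...) guard avoids a
 * division by zero when the measurement window rounds down to zero ticks.
 */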

void
cc_conn_init(struct tcpcb *tp)
{
	struct hc_metrics_lite metrics;
	struct inpcb *inp = tptoinpcb(tp);
	u_int maxseg;
	int rtt;

	INP_WLOCK_ASSERT(inp);

	tcp_hc_get(&inp->inp_inc, &metrics);
	maxseg = tcp_maxseg(tp);

	if (tp->t_srtt == 0 && (rtt = metrics.hc_rtt)) {
		tp->t_srtt = rtt;
		TCPSTAT_INC(tcps_usedrtt);
		if (metrics.hc_rttvar) {
			tp->t_rttvar = metrics.hc_rttvar;
			TCPSTAT_INC(tcps_usedrttvar);
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, tcp_rexmit_max);
	}
	if (metrics.hc_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * maxseg, metrics.hc_ssthresh);
		TCPSTAT_INC(tcps_usedssthresh);
	}

	/*
	 * Set the initial slow-start flight size.
	 *
	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
	 * reduce the initial CWND to one segment as congestion is likely
	 * requiring us to be cautious.
	 */
	if (tp->snd_cwnd == 1)
		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
	else
		tp->snd_cwnd = tcp_compute_initwnd(maxseg);

	if (CC_ALGO(tp)->conn_init != NULL)
		CC_ALGO(tp)->conn_init(&tp->t_ccv);
}
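
/*
 * Illustrative initial-window arithmetic (a sketch; tcp_compute_initwnd()
 * also honors RFC 3390 when initcwnd_segments is disabled): with the
 * default net.inet.tcp.initcwnd_segments=10 and an MSS of 1460, the initial
 * cwnd comes out at roughly 10 * 1460 = 14600 bytes, whereas a
 * retransmitted SYN collapses it to a single segment (1460 bytes) above.
 */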

void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif

	switch(type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags) ||
		    /*
		     * Allow ECN reaction on ACK to CWR, if
		     * that data segment was also CE marked.
		     */
		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
			EXIT_CONGRECOVERY(tp->t_flags);
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max + 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		if (tp->t_flags2 & TF2_ECN_PERMIT)
			tp->t_flags2 |= TF2_ECN_SND_CWR;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_flags &= ~TF_PREVVALID;
		tp->t_rxtshift = 0;
		tp->t_badrxtwin = 0;
		break;
	}
	if (SEQ_LT(tp->snd_fack, tp->snd_una) ||
	    SEQ_GT(tp->snd_fack, tp->snd_max)) {
		tp->snd_fack = tp->snd_una;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
	}
}

void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->post_recovery != NULL) {
		if (SEQ_LT(tp->snd_fack, th->th_ack) ||
		    SEQ_GT(tp->snd_fack, tp->snd_max)) {
			tp->snd_fack = th->th_ack;
		}
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(&tp->t_ccv);
	}
	EXIT_RECOVERY(tp->t_flags);

	tp->t_bytes_acked = 0;
	tp->sackhint.delivered_data = 0;
	tp->sackhint.prr_delivered = 0;
	tp->sackhint.prr_out = 0;
}

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that the
 *	  segment size is not larger than the MSS.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
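
/*
 * Worked example of the DELAY_ACK() conditions (illustrative, not from the
 * source): a 1448-byte in-order segment on a connection with t_maxseg 1460,
 * no TT_DELACK timer pending, no zero-window ACK sent last, and
 * net.inet.tcp.delayed_ack=1 evaluates to true, so the ACK may be held
 * until the delayed-ACK timer fires.  An 8688-byte LRO aggregate
 * (tlen > t_maxseg) fails the third clause and is ACKed immediately.
 */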

void inline
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_ccv.flags |= CCF_IPHDR_CE;
			break;
		case IPTOS_ECN_ECT0:
			/* FALLTHROUGH */
		case IPTOS_ECN_ECT1:
			/* FALLTHROUGH */
		case IPTOS_ECN_NOTECT:
			tp->t_ccv.flags &= ~CCF_IPHDR_CE;
			break;
		}

		if (flags & TH_CWR)
			tp->t_ccv.flags |= CCF_TCPHDR_CWR;
		else
			tp->t_ccv.flags &= ~CCF_TCPHDR_CWR;

		CC_ALGO(tp)->ecnpkt_handler(&tp->t_ccv);

		if (tp->t_ccv.flags & CCF_ACKNOW) {
			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
			tp->t_flags |= TF_ACKNOW;
		}
	}
}

void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
	cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos);
}
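
/*
 * For reference, the RFC 3168 ECN codepoints tested above live in the two
 * low-order bits of the IP TOS / traffic class byte:
 *
 *	IPTOS_ECN_NOTECT  00  not ECN-capable transport
 *	IPTOS_ECN_ECT1    01  ECN-capable transport (1)
 *	IPTOS_ECN_ECT0    10  ECN-capable transport (0)
 *	IPTOS_ECN_CE      11  congestion experienced
 *
 * Only CE sets CCF_IPHDR_CE for the congestion control module; the other
 * three codepoints clear it.
 */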

/*
 * TCP input handling is split into multiple parts:
 *   tcp6_input is a thin wrapper around tcp_input for the extended
 *	ip6_protox[] call format in ip6_input
 *   tcp_input handles primary segment validation, inpcb lookup and
 *	SYN processing on listen sockets
 *   tcp_do_segment processes the ACK and text of the segment for
 *	establishing, established and closing connections
 */
#ifdef INET6
int
tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m;

	m = *mp;
	if (m->m_len < *offp + sizeof(struct tcphdr)) {
		m = m_pullup(m, *offp + sizeof(struct tcphdr));
		if (m == NULL) {
			*mp = m;
			TCPSTAT_INC(tcps_rcvshort);
			return (IPPROTO_DONE);
		}
	}

	*mp = m;
	return (tcp6_input_with_port(mp, offp, proto, port) == 0 ?
	    IPPROTO_DONE : IPPROTO_DONE);
}

int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{

	return (tcp6_input_with_port(mp, offp, proto, 0));
}
#endif /* INET6 */

int
tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m = *mp;
	struct tcphdr *th = NULL;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int off0;
	int optlen = 0;
#ifdef INET
	int len;
	uint8_t ipttl;
#endif
	int tlen = 0, off;
	int drop_hdrlen;
	int thflags;
	int lookupflag;
	uint8_t iptos;
	struct m_tag *fwd_tag = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#else
	const void *ip6 = NULL;
#endif /* INET6 */
	struct tcpopt to;		/* options in this segment */
	char *s = NULL;			/* address and port logging */
	bool closed_port = false;	/* segment is hitting a closed port */

	NET_EPOCH_ASSERT();

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif

	off0 = *offp;
	m = *mp;
	*mp = NULL;
	to.to_flags = 0;
	TCPSTAT_INC(tcps_rcvtotal);

	m->m_pkthdr.tcp_tun_port = port;
#ifdef INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)((caddr_t)ip6 + off0);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (port)
			goto skip6_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in6_cksum_pseudo(ip6, tlen,
				    IPPROTO_TCP, m->m_pkthdr.csum_data);
			th->th_sum ^= 0xffff;
		} else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) {
			/*
			 * Packet from local host (maybe from a VM).
			 * Checksum not required.
			 */
			th->th_sum = 0;
		} else
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
	skip6_csum:
		/*
		 * Be proactive about unspecified IPv6 address in source.
		 * As we use all-zero to indicate unbound/unconnected pcb,
		 * unspecified IPv6 address can be used to confuse us.
		 *
		 * Note that packets with unspecified IPv6 destination are
		 * already dropped in ip6_input.
		 */
		KASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst),
		    ("%s: unspecified destination v6 address", __func__));
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			IP6STAT_INC(ip6s_badscope); /* XXX */
			goto drop;
		}
		iptos = IPV6_TRAFFIC_CLASS(ip6);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof (struct ip)) {
			ip_stripoptions(m);
			off0 = sizeof(struct ip);
		}
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
			    == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				return (IPPROTO_DONE);
			}
		}
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)((caddr_t)ip + off0);
		tlen = ntohs(ip->ip_len) - off0;

		iptos = ip->ip_tos;
		if (port)
			goto skip_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htonl(m->m_pkthdr.csum_data + tlen +
				    IPPROTO_TCP));
			th->th_sum ^= 0xffff;
		} else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) {
			/*
			 * Packet from local host (maybe from a VM).
			 * Checksum not required.
			 */
			th->th_sum = 0;
		} else {
			struct ipovly *ipov = (struct ipovly *)ip;

			/*
			 * Checksum extended TCP header and data.
			 */
			len = off0 + tlen;
			ipttl = ip->ip_ttl;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = htons(tlen);
			th->th_sum = in_cksum(m, len);
			/* Reset length for SDT probes. */
			ip->ip_len = htons(len);
			/* Reset TOS bits */
			ip->ip_tos = iptos;
			/* Re-initialization for later version check */
			ip->ip_ttl = ipttl;
			ip->ip_v = IPVERSION;
			ip->ip_hl = off0 >> 2;
		}
	skip_csum:
		if (th->th_sum && (port == 0)) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
		KASSERT(ip->ip_dst.s_addr != INADDR_ANY,
		    ("%s: unspecified destination v4 address", __func__));
		if (__predict_false(ip->ip_src.s_addr == INADDR_ANY)) {
			IPSTAT_INC(ips_badaddr);
			goto drop;
		}
	}
#endif /* INET */
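
	/*
	 * A note on the hardware-assist path above (illustrative): with
	 * CSUM_DATA_VALID but no CSUM_PSEUDO_HDR, csum_data holds the one's
	 * complement sum of the TCP header and payload only, so the
	 * pseudo-header (addresses, length, protocol) is folded in via
	 * in_pseudo()/in6_cksum_pseudo().  A valid segment then yields
	 * 0xffff, and the final "th->th_sum ^= 0xffff" turns that into the
	 * zero that the "if (th->th_sum)" error checks test for.
	 */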

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		TCPSTAT_INC(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;	/* tlen is used instead of ti->ti_len */
	if (off > sizeof (struct tcphdr)) {
#ifdef INET6
		if (isipv6) {
			if (m->m_len < off0 + off) {
				m = m_pullup(m, off0 + off);
				if (m == NULL) {
					TCPSTAT_INC(tcps_rcvshort);
					return (IPPROTO_DONE);
				}
			}
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)((caddr_t)ip6 + off0);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			if (m->m_len < sizeof(struct ip) + off) {
				if ((m = m_pullup(m, sizeof (struct ip) + off))
				    == NULL) {
					TCPSTAT_INC(tcps_rcvshort);
					return (IPPROTO_DONE);
				}
				ip = mtod(m, struct ip *);
				th = (struct tcphdr *)((caddr_t)ip + off0);
			}
		}
#endif
		optlen = off - sizeof (struct tcphdr);
		optp = (u_char *)(th + 1);
	}
	thflags = tcp_get_flags(th);

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	tcp_fields_to_host(th);

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
	 */
	drop_hdrlen = off0 + off;

	/*
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	if (
#ifdef INET6
	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
#ifdef INET
	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
#endif
#endif
#if defined(INET) && !defined(INET6)
	    (m->m_flags & M_IP_NEXTHOP)
#endif
	    )
		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);

	/*
	 * For initial SYN packets we don't need write lock on matching
	 * PCB, be it a listening one or a synchronized one.  The packet
	 * shall not modify its state.
	 */
	lookupflag = INPLOOKUP_WILDCARD |
	    ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) |
	    (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB);
findpcb:
	tp = NULL;
#ifdef INET6
	if (isipv6 && fwd_tag != NULL) {
		struct sockaddr_in6 *next_hop6;

		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in6_pcblookup_mbuf(&V_tcbinfo,
		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
		    lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
			    th->th_sport, &next_hop6->sin6_addr,
			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
		}
	} else if (isipv6) {
		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
		    th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag,
		    m->m_pkthdr.rcvif, m);
	}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	if (fwd_tag != NULL) {
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag+1);
		/*
		 * Transparently forwarded. Pretend to be the destination.
		 * already got one like this?
		 */
		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD,
		    m->m_pkthdr.rcvif, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
			    th->th_sport, next_hop->sin_addr,
			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
		}
	} else
		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
		    th->th_sport, ip->ip_dst, th->th_dport, lookupflag,
		    m->m_pkthdr.rcvif, m);
#endif /* INET */

	/*
	 * If the INPCB does not exist then all data in the incoming
	 * segment is discarded and an appropriate RST is sent back.
	 * XXX MRT Send RST using which routing table?
	 */
	if (inp == NULL) {
		if ((lookupflag & INPLOOKUP_WILDCARD) == 0) {
			/* We came here after second (safety) lookup. */
			MPASS(!closed_port);
		} else {
			/*
			 * Log communication attempts to ports that are not
			 * in use.
			 */
			if (((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
			    V_tcp_log_in_vain == 2) &&
			    (s = tcp_log_vain(NULL, th, (void *)ip, ip6))) {
				log(LOG_INFO, "%s; %s: Connection attempt "
				    "to closed port\n", s, __func__);
			}
			closed_port = true;
		}
		goto dropwithreset;
	}
	INP_LOCK_ASSERT(inp);

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET6
	if (isipv6 && IPSEC_ENABLED(ipv6) &&
	    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
		goto dropunlock;
	}
#ifdef INET
	else
#endif
#endif /* INET6 */
#ifdef INET
	if (IPSEC_ENABLED(ipv4) &&
	    IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) {
		goto dropunlock;
	}
#endif /* INET */
#endif /* IPSEC */

	/*
	 * Check the minimum TTL for socket.
	 */
	if (inp->inp_ip_minttl != 0) {
#ifdef INET6
		if (isipv6) {
			if (inp->inp_ip_minttl > ip6->ip6_hlim)
				goto dropunlock;
		} else
#endif
		if (inp->inp_ip_minttl > ip->ip_ttl)
			goto dropunlock;
	}

	tp = intotcpcb(inp);
	switch (tp->t_state) {
	case TCPS_TIME_WAIT:
		/*
		 * A previous connection in TIMEWAIT state is supposed to catch
		 * stray or duplicate segments arriving late.  If this segment
		 * was a legitimate new connection attempt, the old INPCB gets
		 * removed and we can try again to find a listening socket.
		 */
		tcp_dooptions(&to, optp, optlen,
		    (thflags & TH_SYN) ? TO_SYN : 0);
		/*
		 * tcp_twcheck always unlocks the inp and frees the mbuf on
		 * failure.
		 */
		if (tcp_twcheck(inp, &to, th, m, tlen))
			goto findpcb;
		return (IPPROTO_DONE);
	case TCPS_CLOSED:
		/*
		 * The TCPCB may no longer exist if the connection is winding
		 * down or it is in the CLOSED state.  Either way we drop the
		 * segment and send an appropriate response.
		 */
		closed_port = true;
		goto dropwithreset;
	}

	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
		closed_port = true;
		goto dropwithreset;
	}

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		tcp_offload_input(tp, m);
		m = NULL;	/* consumed by the TOE driver */
		goto dropunlock;
	}
#endif

#ifdef MAC
	if (mac_inpcb_check_deliver(inp, m))
		goto dropunlock;
#endif
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: so == NULL", __func__));
	/*
	 * When the socket is accepting connections (the INPCB is in LISTEN
	 * state) we look into the SYN cache if this is a new connection
	 * attempt or the completion of a previous one.
	 */
	KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so),
	    ("%s: so accepting but tp %p not listening", __func__, tp));
	if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) {
		struct in_conninfo inc;

		bzero(&inc, sizeof(inc));
#ifdef INET6
		if (isipv6) {
			inc.inc_flags |= INC_ISIPV6;
			if (inp->inp_inc.inc_flags & INC_IPV6MINMTU)
				inc.inc_flags |= INC_IPV6MINMTU;
			inc.inc6_faddr = ip6->ip6_src;
			inc.inc6_laddr = ip6->ip6_dst;
		} else
#endif
		{
			inc.inc_faddr = ip->ip_src;
			inc.inc_laddr = ip->ip_dst;
		}
		inc.inc_fport = th->th_sport;
		inc.inc_lport = th->th_dport;
		inc.inc_fibnum = so->so_fibnum;

		/*
		 * Check for an existing connection attempt in the syncache
		 * if the only flag set is ACK.  A successful lookup creates
		 * a new socket appended to the listen queue in SYN_RECEIVED
		 * state.
		 */
		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
			int result;

			/*
			 * Parse the TCP options here because
			 * syncookies need access to the reflected
			 * timestamp.
			 */
			tcp_dooptions(&to, optp, optlen, 0);
			/*
			 * NB: syncache_expand() doesn't unlock inp.
			 */
			result = syncache_expand(&inc, &to, th, &so, m, port);
			if (result < 0) {
				/*
				 * A failing TCP MD5 signature comparison
				 * must result in the segment being dropped
				 * and must not produce any response back
				 * to the sender.
				 */
				goto dropunlock;
			} else if (result == 0) {
				/*
				 * No syncache entry, or ACK was not for our
				 * SYN/ACK.  Do our protection against double
				 * ACK.  If peer sent us 2 ACKs, then for the
				 * first one syncache_expand() successfully
				 * converted syncache entry into a socket,
				 * while we were waiting on the inpcb lock.  We
				 * don't want to send RST for the second ACK,
				 * so we perform a second lookup without
				 * wildcard match, hoping to find the new
				 * socket.  If the ACK is indeed stray, the
				 * missing INPLOOKUP_WILDCARD flag in
				 * lookupflag would hint the above code that
				 * the lookup was a second attempt.
				 *
				 * NB: syncache did its own logging
				 * of the failure cause.
				 */
				INP_WUNLOCK(inp);
				lookupflag &= ~INPLOOKUP_WILDCARD;
				goto findpcb;
			}
tfo_socket_result:
			if (so == NULL) {
				/*
				 * We completed the 3-way handshake
				 * but could not allocate a socket
				 * either due to memory shortage,
				 * listen queue length limits or
				 * global socket limits.  Send RST
				 * or wait and have the remote end
				 * retransmit the ACK for another
				 * try.
				 */
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Socket allocation failed due to "
					    "limits or memory shortage, %s\n",
					    s, __func__,
					    V_tcp_sc_rst_sock_fail ?
					    "sending RST" : "try again");
				if (V_tcp_sc_rst_sock_fail) {
					goto dropwithreset;
				} else
					goto dropunlock;
			}
			/*
			 * Socket is created in state SYN_RECEIVED.
			 * Unlock the listen socket, lock the newly
			 * created socket and update the tp variable.
			 * If we came here via jump to tfo_socket_result,
			 * then the listening socket is read-locked.
			 */
			INP_UNLOCK(inp);	/* listen socket */
			inp = sotoinpcb(so);
			/*
			 * New connection inpcb is already locked by
			 * syncache_expand().
			 */
			INP_WLOCK_ASSERT(inp);
			tp = intotcpcb(inp);
			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
			    ("%s: ", __func__));
			/*
			 * Process the segment and the data it
			 * contains.  tcp_do_segment() consumes
			 * the mbuf chain and unlocks the inpcb.
			 */
			TCP_PROBE5(receive, NULL, tp, m, tp, th);
			tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen,
			    tlen, iptos);
			return (IPPROTO_DONE);
		}
		/*
		 * Segment flag validation for new connection attempts:
		 *
		 * Our (SYN|ACK) response was rejected.
		 * Check with syncache and remove the entry to prevent
		 * retransmits.
		 *
		 * NB: syncache_chkrst does its own logging of failure
		 * causes.
		 */
		if (thflags & TH_RST) {
			syncache_chkrst(&inc, th, port);
			goto dropunlock;
		}
		/*
		 * We can't do anything without SYN.
		 */
		if ((thflags & TH_SYN) == 0) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN is missing, segment ignored\n",
				    s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * (SYN|ACK) is bogus on a listen socket.
		 */
		if (thflags & TH_ACK) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|ACK invalid, segment ignored\n",
				    s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * If the drop_synfin option is enabled, drop all
		 * segments with both the SYN and FIN bits set.
		 * This prevents e.g. nmap from identifying the
		 * TCP/IP stack.
		 * XXX: Poor reasoning.  nmap has other methods
		 * and is constantly refining its stack detection
		 * strategies.
		 * XXX: This is a violation of the TCP specification
		 * and was used by RFC1644.
		 */
		if ((thflags & TH_FIN) && V_drop_synfin) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|FIN segment ignored (based on "
				    "sysctl setting)\n", s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * Segment's flags are (SYN) or (SYN|FIN).
		 *
		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
		 * as they do not affect the state of the TCP FSM.
		 * The data pointed to by TH_URG and th_urp is ignored.
		 */
		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
		KASSERT(thflags & (TH_SYN),
		    ("%s: Listen socket: TH_SYN not set", __func__));
		INP_RLOCK_ASSERT(inp);
#ifdef INET6
		/*
		 * If deprecated addresses are forbidden,
		 * we do not accept SYNs to a deprecated interface
		 * address, to prevent any new inbound connection from
		 * getting established.
		 * When we do not accept SYN, we send a TCP RST,
		 * with deprecated source address (instead of dropping
		 * it).  We compromise because it is much better for the
		 * peer to send a RST, and the RST will be the final
		 * packet for the exchange.
		 *
		 * If we do not forbid deprecated addresses, we accept
		 * the SYN packet.  RFC2462 does not suggest dropping
		 * SYN in this case.
		 * If we decipher RFC2462 5.5.4, it says like this:
		 * 1. use of deprecated addr with existing
		 *    communication is okay - "SHOULD continue to be
		 *    used"
		 * 2. use of it with new communication:
		 *   (2a) "SHOULD NOT be used if alternate address
		 *        with sufficient scope is available"
		 *   (2b) nothing mentioned otherwise.
		 * Here we fall into (2b) case as we have no choice in
		 * our source address selection - we must obey the peer.
		 *
		 * The wording in RFC2462 is confusing, and there are
		 * multiple descriptions of deprecated address
		 * handling - worse, they are not exactly the same.
		 * I believe 5.5.4 is the best one, so we follow 5.5.4.
		 */
		if (isipv6 && !V_ip6_use_deprecated) {
			struct in6_ifaddr *ia6;

			ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
			if (ia6 != NULL &&
			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt to deprecated "
					    "IPv6 address rejected\n",
					    s, __func__);
				goto dropwithreset;
			}
		}
#endif /* INET6 */
		/*
		 * Basic sanity checks on incoming SYN requests:
		 *   Don't respond if the destination is a link layer
		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
		 *   If it is from this socket it must be forged.
		 *   Don't respond if the source or destination is a
		 *	global or subnet broad- or multicast address.
		 *   Note that it is quite possible to receive unicast
		 *	link-layer packets with a broadcast IP address.  Use
		 *	in_ifnet_broadcast() to find them.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST)) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "Connection attempt from broad- or multicast "
				    "link layer address ignored\n", s, __func__);
			goto dropunlock;
		}
#ifdef INET6
		if (isipv6) {
			if (th->th_dport == th->th_sport &&
			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt to/from self "
					    "ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt from/to multicast "
					    "address ignored\n", s, __func__);
				goto dropunlock;
			}
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			if (th->th_dport == th->th_sport &&
			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt from/to self "
					    "ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
			    in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt from/to broad- "
					    "or multicast address ignored\n",
					    s, __func__);
				goto dropunlock;
			}
		}
#endif
		/*
		 * SYN appears to be valid.  Create compressed TCP state
		 * for syncache.
		 */
		TCP_PROBE3(debug__input, tp, th, m);
		tcp_dooptions(&to, optp, optlen, TO_SYN);
		if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL,
		    iptos, port)) != NULL)
			goto tfo_socket_result;

		/*
		 * Entry added to syncache and mbuf consumed.
		 * Only the listen socket is unlocked by syncache_add().
		 */
		return (IPPROTO_DONE);
	}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if (tp->t_flags & TF_SIGNATURE) {
		tcp_dooptions(&to, optp, optlen, thflags);
		if ((to.to_flags & TOF_SIGNATURE) == 0) {
			TCPSTAT_INC(tcps_sig_err_nosigopt);
			goto dropunlock;
		}
		if (!TCPMD5_ENABLED() ||
		    TCPMD5_INPUT(m, th, to.to_signature) != 0)
			goto dropunlock;
	}
#endif
	TCP_PROBE5(receive, NULL, tp, m, tp, th);

	/*
	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
	 * the inpcb, and unlocks pcbinfo.
	 *
	 * XXXGL: in case of a pure SYN arriving on existing connection
	 * TCP stacks won't need to modify the PCB, they would either drop
	 * the segment silently, or send a challenge ACK.  However, we try
	 * to upgrade the lock, because calling convention for stacks is
	 * write-lock on PCB.  If upgrade fails, drop the SYN.
	 */
	if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0)
		goto dropunlock;

	tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos);
	return (IPPROTO_DONE);

dropwithreset:
	/*
	 * When blackholing, do not respond with a RST but completely ignore
	 * the segment and drop it.
	 */
	if (((!closed_port && V_blackhole == 3) ||
	    (closed_port &&
	    ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
	    (V_blackhole_local || (
#ifdef INET6
	    isipv6 ? !in6_localip(&ip6->ip6_src) :
#endif
#ifdef INET
	    !in_localip(ip->ip_src)
#else
	    true
#endif
	    )))
		goto dropunlock;
	TCP_PROBE5(receive, NULL, tp, m, tp, th);
	tcp_dropwithreset(m, th, tp, tlen);
	m = NULL;	/* mbuf chain got consumed. */

dropunlock:
	if (m != NULL)
		TCP_PROBE5(receive, NULL, tp, m, tp, th);

	if (inp != NULL)
		INP_UNLOCK(inp);

drop:
	if (s != NULL)
		free(s, M_TCPLOG);
	if (m != NULL)
		m_freem(m);
	return (IPPROTO_DONE);
}

/*
 * Automatic sizing of receive socket buffer.  Often the send
 * buffer size is not optimally adjusted to the actual network
 * conditions at hand (delay bandwidth product).  Setting the
 * buffer size too small limits throughput on links with high
 * bandwidth and high delay (e.g. trans-continental/oceanic links).
 *
 * On the receive side the socket buffer memory is only rarely
 * used to any significant extent.  This allows us to be much
 * more aggressive in scaling the receive socket buffer.  For
 * the case that the buffer space is actually used to a large
 * extent and we run out of kernel memory we can simply drop
 * the new segments; TCP on the sender will just retransmit it
 * later.  Setting the buffer size too big may only consume too
 * much kernel memory if the application doesn't read() from
 * the socket or packet loss or reordering makes use of the
 * reassembly queue.
 *
 * The criteria to step up the receive buffer one notch are:
 *  1. Application has not set receive buffer size with
 *     SO_RCVBUF.  Setting SO_RCVBUF clears SB_AUTOSIZE.
 *  2. the number of bytes received during 1/2 of an sRTT
 *     is at least 3/8 of the current socket buffer size.
 *  3. receive buffer size has not hit maximal automatic size;
 *
 * If all of the criteria are met, we increase the socket buffer
 * by 1/2 (bounded by the max).  This allows us to keep ahead
 * of slow-start, and it also keeps our peer from getting limited
 * by our rwnd, which we would then have to open up, causing a
 * burst.
 *
 * This algorithm does two steps per RTT at most and only if
 * we receive a bulk stream w/o packet losses or reorderings.
 * Shrinking the buffer during idle times is not necessary as
 * it doesn't consume any memory when idle.
 *
 * TODO: Only step up if the application is actually serving
 * the buffer to better manage the socket buffer resources.
 */
int
tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int tlen)
{
	int newsize = 0;

	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
	    ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) {
		if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) &&
		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
			newsize = min((so->so_rcv.sb_hiwat +
			    (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max);
		}
		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);

		/* Start over with next RTT. */
		tp->rfbuf_ts = 0;
		tp->rfbuf_cnt = 0;
	} else {
		tp->rfbuf_cnt += tlen;	/* add up */
	}
	return (newsize);
}
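
/*
 * Worked example for the step-up logic above (illustrative numbers): with
 * sb_hiwat = 65536 the threshold is (65536 / 2) / 4 * 3 = 24576 bytes.  If
 * more than that arrives within half an sRTT, the buffer grows to
 * min(65536 + 32768, tcp_autorcvbuf_max) = 98304 bytes, i.e. a 1.5x step,
 * and the byte/timestamp counters are reset so the next RTT starts a fresh
 * measurement.
 */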

int
tcp_input(struct mbuf **mp, int *offp, int proto)
{
	return (tcp_input_with_port(mp, offp, proto, 0));
}

static void
tcp_handle_wakeup(struct tcpcb *tp)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (tp->t_flags & TF_WAKESOR) {
		struct socket *so = tptosocket(tp);

		tp->t_flags &= ~TF_WAKESOR;
		SOCK_RECVBUF_LOCK_ASSERT(so);
		sorwakeup_locked(so);
	}
}

void
tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int drop_hdrlen, int tlen, uint8_t iptos)
{
	uint16_t thflags;
	int acked, ourfinisacked, needoutput = 0;
	sackstatus_t sack_changed;
	int todrop, win, incforsyn = 0;
	uint32_t tiwin;
	uint16_t nsegs;
	char *s;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	struct in_conninfo *inc = &inp->inp_inc;
	struct mbuf *mfree;
	struct tcpopt to;
	int tfo_syn;
	u_int maxseg = 0;
	bool no_data;

	no_data = (tlen == 0);
	thflags = tcp_get_flags(th);
	tp->sackhint.last_sack_ack = 0;
	sack_changed = SACK_NOCHANGE;
	nsegs = max(1, m->m_pkthdr.lro_nsegs);

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
	    tlen, NULL, true);

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		goto drop;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state,
	 * check SEQ.ACK first.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
		goto dropwithreset;
	}

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	if (tp->t_idle_reduce &&
	    (tp->snd_max == tp->snd_una) &&
	    ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
		cc_after_idle(tp);
	tp->t_rcvtime = ticks;

	if (thflags & TH_FIN)
		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
	/*
	 * Scale up the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
#endif
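
	/*
	 * Worked example of the window scaling above (illustrative): a raw
	 * th_win of 1024 with a negotiated snd_scale of 7 yields
	 * tiwin = 1024 << 7 = 131072 bytes.  RFC 7323 caps the shift at 14,
	 * so the scaled window never exceeds 65535 << 14 (about 1 GB).
	 */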

	/*
	 * TCP ECN processing.
	 */
	if (tcp_ecn_input_segment(tp, thflags, tlen,
	    tcp_packets_this_ack(tp, th->th_ack),
	    iptos))
		cc_cong_signal(tp, th, CC_ECN);

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);
	if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) {
		/*
		 * We don't look at SACKs from the peer because the MSS
		 * is too small, which could subject us to an attack.
		 */
		to.to_flags &= ~TOF_SACK;
	}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if ((tp->t_flags & TF_SIGNATURE) != 0 &&
	    (to.to_flags & TOF_SIGNATURE) == 0) {
		TCPSTAT_INC(tcps_sig_err_sigopt);
		/* XXX: should drop? */
	}
#endif
	/*
	 * If the echoed timestamp is later than the current time,
	 * fall back to non-RFC1323 RTT calculation.  Normalize the
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) {
			to.to_tsecr = 0;
		}
	}
	/*
	 * Process options only when we get SYN/ACK back. The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
	 * or <SYN,ACK>) segment itself is never scaled.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		/* Handle parallel SYN for ECN */
		tcp_ecn_input_parallel_syn(tp, thflags, iptos);
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE) &&
		    !(tp->t_flags & TF_NOOPT)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
		} else {
			tp->t_flags &= ~TF_REQ_SCALE;
		}
		/*
		 * Initial send window.  It will be updated with
		 * the next incoming segment to the scaled value.
		 */
		tp->snd_wnd = th->th_win;
		if ((to.to_flags & TOF_TS) &&
		    (tp->t_flags & TF_REQ_TSTMP) &&
		    !(tp->t_flags & TF_NOOPT)) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = tcp_ts_getticks();
		} else {
			tp->t_flags &= ~TF_REQ_TSTMP;
		}
		if (to.to_flags & TOF_MSS) {
			tcp_mss(tp, to.to_mss);
		}
		if ((tp->t_flags & TF_SACK_PERMIT) &&
		    (!(to.to_flags & TOF_SACKPERM) ||
		    (tp->t_flags & TF_NOOPT))) {
			tp->t_flags &= ~TF_SACK_PERMIT;
		}
		if (tp->t_flags & TF_FASTOPEN) {
			if ((to.to_flags & TOF_FASTOPEN) &&
			    !(tp->t_flags & TF_NOOPT)) {
				uint16_t mss;

				if (to.to_flags & TOF_MSS) {
					mss = to.to_mss;
				} else {
					if ((inp->inp_vflag & INP_IPV6) != 0) {
						mss = TCP6_MSS;
					} else {
						mss = TCP_MSS;
					}
				}
				tcp_fastopen_update_cache(tp, mss,
				    to.to_tfo_len, to.to_tfo_cookie);
			} else {
				tcp_fastopen_disable_path(tp);
			}
		}
	}

	/*
	 * If timestamps were negotiated during SYN/ACK and a
	 * segment without a timestamp is received, silently drop
	 * the segment, unless it is a RST segment or missing timestamps are
	 * tolerated.
	 * See section 3.2 of RFC 7323.
	 */
	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
		if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) {
			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
				    "segment processed normally\n",
				    s, __func__);
				free(s, M_TCPLOG);
			}
		} else {
			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
				    "segment silently dropped\n", s, __func__);
				free(s, M_TCPLOG);
			}
			goto drop;
		}
	}
	/*
	 * If timestamps were not negotiated during SYN/ACK and a
	 * segment with a timestamp is received, ignore the
	 * timestamp and process the packet normally.
	 * See section 3.2 of RFC 7323.
	 */
	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
			    "segment processed normally\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TH_NEEDSYN.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    th->th_seq == tp->rcv_nxt &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    tp->snd_nxt == tp->snd_max &&
	    tiwin && tiwin == tp->snd_wnd &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    SEGQ_EMPTY(tp) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * NOTE that the test is modified according to the latest
		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
		 */
		if ((to.to_flags & TOF_TS) != 0 &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_ts_getticks();
			tp->ts_recent = to.to_tsval;
		}

		if (no_data) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    !IN_RECOVERY(tp->t_flags) &&
			    (to.to_flags & TOF_SACK) == 0 &&
			    TAILQ_EMPTY(&tp->snd_holes)) {
				/*
				 * This is a pure ack for outstanding data.
				 */
				TCPSTAT_INC(tcps_predack);

				/*
				 * "bad retransmit" recovery.
				 */
				if (tp->t_rxtshift == 1 &&
				    tp->t_flags & TF_PREVVALID &&
				    tp->t_badrxtwin != 0 &&
				    (((to.to_flags & TOF_TS) != 0 &&
				    to.to_tsecr != 0 &&
				    TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) ||
				    ((to.to_flags & TOF_TS) == 0 &&
				    TSTMP_LT(ticks, tp->t_badrxtwin))))
					cc_cong_signal(tp, th, CC_RTO_ERR);

				/*
				 * Recalculate the transmit timer / rtt.
				 *
				 * Some boxes send broken timestamp replies
				 * during the SYN+ACK phase; ignore
				 * timestamps of 0, or we could calculate a
				 * huge RTT and blow up the retransmit timer.
				 */
				if ((to.to_flags & TOF_TS) != 0 &&
				    to.to_tsecr) {
					uint32_t t;

					t = tcp_ts_getticks() - to.to_tsecr;
					if (!tp->t_rttlow || tp->t_rttlow > t)
						tp->t_rttlow = t;
					tcp_xmit_timer(tp,
					    TCP_TS_TO_TICKS(t) + 1);
				} else if (tp->t_rtttime &&
				    SEQ_GT(th->th_ack, tp->t_rtseq)) {
					if (!tp->t_rttlow ||
					    tp->t_rttlow > ticks - tp->t_rtttime)
						tp->t_rttlow = ticks - tp->t_rtttime;
					tcp_xmit_timer(tp,
					    ticks - tp->t_rtttime);
				}
				acked = BYTES_THIS_ACK(tp, th);

#ifdef TCP_HHOOK
				/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
				hhook_run_tcp_est_in(tp, th, &to);
#endif

				TCPSTAT_ADD(tcps_rcvackpack, nsegs);
				TCPSTAT_ADD(tcps_rcvackbyte, acked);
				sbdrop(&so->so_snd, acked);
				if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
				    SEQ_LEQ(th->th_ack, tp->snd_recover))
					tp->snd_recover = th->th_ack - 1;

				/*
				 * Let the congestion control algorithm update
				 * congestion control related information. This
				 * typically means increasing the congestion
				 * window.
				 */
				cc_ack_received(tp, th, nsegs, CC_ACK);

				tp->snd_una = th->th_ack;
				/*
				 * Pull snd_wl2 up to prevent seq wrap relative
				 * to th_ack.
				 */
				tp->snd_wl2 = th->th_ack;
				tp->t_dupacks = 0;
				m_freem(m);

				/*
				 * If all outstanding data are acked, stop
				 * retransmit timer, otherwise restart timer
				 * using current (possibly backed-off) value.
				 * If process is waiting for space,
				 * wakeup/selwakeup/signal.  If data
				 * are ready to send, let tcp_output
				 * decide between more output or persist.
				 */
				TCP_PROBE3(debug__input, tp, th, m);
				/*
				 * Clear t_acktime if remote side has ACKd
				 * all data in the socket buffer.
				 * Otherwise, update t_acktime if we received
				 * a sufficiently large ACK.
				 */
				if (sbavail(&so->so_snd) == 0)
					tp->t_acktime = 0;
				else if (acked > 1)
					tp->t_acktime = ticks;
				if (tp->snd_una == tp->snd_max)
					tcp_timer_activate(tp, TT_REXMT, 0);
				else if (!tcp_timer_active(tp, TT_PERSIST))
					tcp_timer_activate(tp, TT_REXMT,
					    TP_RXTCUR(tp));
				sowwakeup(so);
				/*
				 * Only call tcp_output when there
				 * is new data available to be sent
				 * or we need to send an ACK.
				 */
				if ((tp->t_flags & TF_ACKNOW) ||
				    (sbavail(&so->so_snd) >=
				    SEQ_SUB(tp->snd_max, tp->snd_una))) {
					(void) tcp_output(tp);
				}
				goto check_delack;
			}
		} else if (th->th_ack == tp->snd_una &&
		    tlen <= sbspace(&so->so_rcv)) {
			int newsize = 0;	/* automatic sockbuf scaling */

			/*
			 * This is a pure, in-sequence data packet with
			 * nothing on the reassembly queue and we have enough
			 * buffer space to take it.
			 */
			/* Clean receiver SACK report if present */
			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
				tcp_clean_sackreport(tp);
			TCPSTAT_INC(tcps_preddat);
			tp->rcv_nxt += tlen;
			if (tlen &&
			    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
			    (tp->t_fbyte_in == 0)) {
				tp->t_fbyte_in = ticks;
				if (tp->t_fbyte_in == 0)
					tp->t_fbyte_in = 1;
				if (tp->t_fbyte_out && tp->t_fbyte_in)
					tp->t_flags2 |= TF2_FBYTES_COMPLETE;
			}
			/*
			 * Pull snd_wl1 up to prevent seq wrap relative to
			 * th_seq.
			 */
			tp->snd_wl1 = th->th_seq;
			/*
			 * Pull rcv_up up to prevent seq wrap relative to
			 * rcv_nxt.
			 */
			tp->rcv_up = tp->rcv_nxt;
			TCPSTAT_ADD(tcps_rcvpack, nsegs);
			TCPSTAT_ADD(tcps_rcvbyte, tlen);
			TCP_PROBE3(debug__input, tp, th, m);

			newsize = tcp_autorcvbuf(m, th, so, tp, tlen);

			/* Add data to socket buffer. */
			SOCK_RECVBUF_LOCK(so);
			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
				m_freem(m);
			} else {
				/*
				 * Set new socket buffer size.
				 * Give up when limit is reached.
				 */
				if (newsize)
					if (!sbreserve_locked(so, SO_RCV,
					    newsize, NULL))
						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
				m_adj(m, drop_hdrlen);	/* delayed header drop */
				sbappendstream_locked(&so->so_rcv, m, 0);
			}
			/* NB: sorwakeup_locked() does an implicit unlock. */
			sorwakeup_locked(so);
			if (DELAY_ACK(tp, tlen)) {
				tp->t_flags |= TF_DELACK;
			} else {
				tp->t_flags |= TF_ACKNOW;
				(void) tcp_output(tp);
			}
			goto check_delack;
		}
	}
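
	/*
	 * Worked example of the timestamp-based RTT sample in the fast path
	 * above (illustrative numbers): if tcp_ts_getticks() reads 100500
	 * and the echoed to.to_tsecr is 100460, then t covers 40 ms worth
	 * of timestamp ticks and tcp_xmit_timer() is fed
	 * TCP_TS_TO_TICKS(40) + 1; the +1 keeps a sample that rounds down
	 * to zero from being mistaken for "no measurement".
	 */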
1824 */ 1825 TCP_PROBE3(debug__input, tp, th, m); 1826 /* 1827 * Clear t_acktime if remote side has ACKd 1828 * all data in the socket buffer. 1829 * Otherwise, update t_acktime if we received 1830 * a sufficiently large ACK. 1831 */ 1832 if (sbavail(&so->so_snd) == 0) 1833 tp->t_acktime = 0; 1834 else if (acked > 1) 1835 tp->t_acktime = ticks; 1836 if (tp->snd_una == tp->snd_max) 1837 tcp_timer_activate(tp, TT_REXMT, 0); 1838 else if (!tcp_timer_active(tp, TT_PERSIST)) 1839 tcp_timer_activate(tp, TT_REXMT, 1840 TP_RXTCUR(tp)); 1841 sowwakeup(so); 1842 /* 1843 * Only call tcp_output when there 1844 * is new data available to be sent 1845 * or we need to send an ACK. 1846 */ 1847 if ((tp->t_flags & TF_ACKNOW) || 1848 (sbavail(&so->so_snd) >= 1849 SEQ_SUB(tp->snd_max, tp->snd_una))) { 1850 (void) tcp_output(tp); 1851 } 1852 goto check_delack; 1853 } 1854 } else if (th->th_ack == tp->snd_una && 1855 tlen <= sbspace(&so->so_rcv)) { 1856 int newsize = 0; /* automatic sockbuf scaling */ 1857 1858 /* 1859 * This is a pure, in-sequence data packet with 1860 * nothing on the reassembly queue and we have enough 1861 * buffer space to take it. 1862 */ 1863 /* Clean receiver SACK report if present */ 1864 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) 1865 tcp_clean_sackreport(tp); 1866 TCPSTAT_INC(tcps_preddat); 1867 tp->rcv_nxt += tlen; 1868 if (tlen && 1869 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 1870 (tp->t_fbyte_in == 0)) { 1871 tp->t_fbyte_in = ticks; 1872 if (tp->t_fbyte_in == 0) 1873 tp->t_fbyte_in = 1; 1874 if (tp->t_fbyte_out && tp->t_fbyte_in) 1875 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 1876 } 1877 /* 1878 * Pull snd_wl1 up to prevent seq wrap relative to 1879 * th_seq. 1880 */ 1881 tp->snd_wl1 = th->th_seq; 1882 /* 1883 * Pull rcv_up up to prevent seq wrap relative to 1884 * rcv_nxt. 1885 */ 1886 tp->rcv_up = tp->rcv_nxt; 1887 TCPSTAT_ADD(tcps_rcvpack, nsegs); 1888 TCPSTAT_ADD(tcps_rcvbyte, tlen); 1889 TCP_PROBE3(debug__input, tp, th, m); 1890 1891 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 1892 1893 /* Add data to socket buffer. */ 1894 SOCK_RECVBUF_LOCK(so); 1895 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 1896 m_freem(m); 1897 } else { 1898 /* 1899 * Set new socket buffer size. 1900 * Give up when limit is reached. 1901 */ 1902 if (newsize) 1903 if (!sbreserve_locked(so, SO_RCV, 1904 newsize, NULL)) 1905 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 1906 m_adj(m, drop_hdrlen); /* delayed header drop */ 1907 sbappendstream_locked(&so->so_rcv, m, 0); 1908 } 1909 /* NB: sorwakeup_locked() does an implicit unlock. */ 1910 sorwakeup_locked(so); 1911 if (DELAY_ACK(tp, tlen)) { 1912 tp->t_flags |= TF_DELACK; 1913 } else { 1914 tp->t_flags |= TF_ACKNOW; 1915 (void) tcp_output(tp); 1916 } 1917 goto check_delack; 1918 } 1919 } 1920 1921 /* 1922 * Calculate amount of space in receive window, 1923 * and then do TCP input processing. 1924 * Receive window is amount of space in rcv queue, 1925 * but not less than advertised window. 1926 */ 1927 win = sbspace(&so->so_rcv); 1928 if (win < 0) 1929 win = 0; 1930 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); 1931 1932 switch (tp->t_state) { 1933 /* 1934 * If the state is SYN_RECEIVED: 1935 * if seg contains an ACK, but not for our SYN/ACK, send a RST. 1936 */ 1937 case TCPS_SYN_RECEIVED: 1938 if (thflags & TH_RST) { 1939 /* Handle RST segments later. 
*/ 1940 break; 1941 } 1942 if ((thflags & TH_ACK) && 1943 (SEQ_LEQ(th->th_ack, tp->snd_una) || 1944 SEQ_GT(th->th_ack, tp->snd_max))) { 1945 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1946 goto dropwithreset; 1947 } 1948 if (tp->t_flags & TF_FASTOPEN) { 1949 /* 1950 * When a TFO connection is in SYN_RECEIVED, the 1951 * only valid packets are the initial SYN, a 1952 * retransmit/copy of the initial SYN (possibly with 1953 * a subset of the original data), a valid ACK, a 1954 * FIN, or a RST. 1955 */ 1956 if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) { 1957 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1958 goto dropwithreset; 1959 } else if (thflags & TH_SYN) { 1960 /* non-initial SYN is ignored */ 1961 if ((tcp_timer_active(tp, TT_DELACK) || 1962 tcp_timer_active(tp, TT_REXMT))) 1963 goto drop; 1964 } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { 1965 goto drop; 1966 } 1967 } 1968 break; 1969 1970 /* 1971 * If the state is SYN_SENT: 1972 * if seg contains a RST with valid ACK (SEQ.ACK has already 1973 * been verified), then drop the connection. 1974 * if seg contains a RST without an ACK, drop the seg. 1975 * if seg does not contain SYN, then drop the seg. 1976 * Otherwise this is an acceptable SYN segment 1977 * initialize tp->rcv_nxt and tp->irs 1978 * if seg contains ack then advance tp->snd_una 1979 * if seg contains an ECE and ECN support is enabled, the stream 1980 * is ECN capable. 1981 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state 1982 * arrange for segment to be acked (eventually) 1983 * continue processing rest of data/controls, beginning with URG 1984 */ 1985 case TCPS_SYN_SENT: 1986 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { 1987 TCP_PROBE5(connect__refused, NULL, tp, 1988 m, tp, th); 1989 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 1990 tp = tcp_drop(tp, ECONNREFUSED); 1991 } 1992 if (thflags & TH_RST) 1993 goto drop; 1994 if (!(thflags & TH_SYN)) 1995 goto drop; 1996 1997 tp->irs = th->th_seq; 1998 tcp_rcvseqinit(tp); 1999 if (thflags & TH_ACK) { 2000 int tfo_partial_ack = 0; 2001 2002 TCPSTAT_INC(tcps_connects); 2003 soisconnected(so); 2004 #ifdef MAC 2005 mac_socketpeer_set_from_mbuf(m, so); 2006 #endif 2007 /* Do window scaling on this connection? */ 2008 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2009 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2010 tp->rcv_scale = tp->request_r_scale; 2011 } 2012 tp->rcv_adv += min(tp->rcv_wnd, 2013 TCP_MAXWIN << tp->rcv_scale); 2014 tp->snd_una++; /* SYN is acked */ 2015 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2016 tp->snd_nxt = tp->snd_una; 2017 /* 2018 * If not all the data that was sent in the TFO SYN 2019 * has been acked, resend the remainder right away. 2020 */ 2021 if ((tp->t_flags & TF_FASTOPEN) && 2022 (tp->snd_una != tp->snd_max)) { 2023 tp->snd_nxt = th->th_ack; 2024 tfo_partial_ack = 1; 2025 } 2026 /* 2027 * If there's data, delay ACK; if there's also a FIN 2028 * ACKNOW will be turned on later. 2029 */ 2030 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) 2031 tcp_timer_activate(tp, TT_DELACK, 2032 tcp_delacktime); 2033 else 2034 tp->t_flags |= TF_ACKNOW; 2035 2036 tcp_ecn_input_syn_sent(tp, thflags, iptos); 2037 2038 /* 2039 * Received <SYN,ACK> in SYN_SENT[*] state. 
2040 * Transitions: 2041 * SYN_SENT --> ESTABLISHED 2042 * SYN_SENT* --> FIN_WAIT_1 2043 */ 2044 tp->t_starttime = ticks; 2045 if (tp->t_flags & TF_NEEDFIN) { 2046 tp->t_acktime = ticks; 2047 tcp_state_change(tp, TCPS_FIN_WAIT_1); 2048 tp->t_flags &= ~TF_NEEDFIN; 2049 thflags &= ~TH_SYN; 2050 } else { 2051 tcp_state_change(tp, TCPS_ESTABLISHED); 2052 TCP_PROBE5(connect__established, NULL, tp, 2053 m, tp, th); 2054 cc_conn_init(tp); 2055 tcp_timer_activate(tp, TT_KEEP, 2056 TP_KEEPIDLE(tp)); 2057 } 2058 } else { 2059 /* 2060 * Received initial SYN in SYN-SENT[*] state => 2061 * simultaneous open. 2062 * If it succeeds, connection is half-synchronized. 2063 * Otherwise, do 3-way handshake: 2064 * SYN-SENT -> SYN-RECEIVED 2065 * SYN-SENT* -> SYN-RECEIVED* 2066 */ 2067 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 2068 tcp_timer_activate(tp, TT_REXMT, 0); 2069 tcp_state_change(tp, TCPS_SYN_RECEIVED); 2070 } 2071 2072 /* 2073 * Advance th->th_seq to correspond to first data byte. 2074 * If data, trim to stay within window, 2075 * dropping FIN if necessary. 2076 */ 2077 th->th_seq++; 2078 if (tlen > tp->rcv_wnd) { 2079 todrop = tlen - tp->rcv_wnd; 2080 m_adj(m, -todrop); 2081 tlen = tp->rcv_wnd; 2082 thflags &= ~TH_FIN; 2083 TCPSTAT_INC(tcps_rcvpackafterwin); 2084 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 2085 } 2086 tp->snd_wl1 = th->th_seq - 1; 2087 tp->rcv_up = th->th_seq; 2088 /* 2089 * Client side of transaction: already sent SYN and data. 2090 * If the remote host used T/TCP to validate the SYN, 2091 * our data will be ACK'd; if so, enter normal data segment 2092 * processing in the middle of step 5, ack processing. 2093 * Otherwise, goto step 6. 2094 */ 2095 if (thflags & TH_ACK) 2096 goto process_ACK; 2097 2098 goto step6; 2099 } 2100 2101 /* 2102 * States other than LISTEN or SYN_SENT. 2103 * First check the RST flag and sequence number since reset segments 2104 * are exempt from the timestamp and connection count tests. This 2105 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix 2106 * below which allowed reset segments in half the sequence space 2107 * to fall through and be processed (which gives forged reset 2108 * segments with a random sequence number a 50 percent chance of 2109 * killing a connection). 2110 * Then check timestamp, if present. 2111 * Then check the connection count, if present. 2112 * Then check that at least some bytes of segment are within 2113 * receive window. If segment begins before rcv_nxt, 2114 * drop leading data (and SYN); if nothing left, just ack. 2115 */ 2116 if (thflags & TH_RST) { 2117 /* 2118 * RFC5961 Section 3.2 2119 * 2120 * - RST drops connection only if SEG.SEQ == RCV.NXT. 2121 * - If RST is in window, we send challenge ACK. 2122 * 2123 * Note: to take into account delayed ACKs, we should 2124 * test against last_ack_sent instead of rcv_nxt. 2125 * Note 2: we handle the special case of a closed window, not 2126 * covered by the RFC. 2127 */ 2128 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 2129 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 2130 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { 2131 KASSERT(tp->t_state != TCPS_SYN_SENT, 2132 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", 2133 __func__, th, tp)); 2134 2135 if (V_tcp_insecure_rst || 2136 tp->last_ack_sent == th->th_seq) { 2137 TCPSTAT_INC(tcps_drops); 2138 /* Drop the connection.
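 * The errno visible to the user depends on the state: a RST
 * in SYN-RECEIVED maps to ECONNREFUSED, while a RST on a
 * synchronized connection maps to ECONNRESET (see the switch
 * below).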
*/ 2139 switch (tp->t_state) { 2140 case TCPS_SYN_RECEIVED: 2141 so->so_error = ECONNREFUSED; 2142 goto close; 2143 case TCPS_ESTABLISHED: 2144 case TCPS_FIN_WAIT_1: 2145 case TCPS_FIN_WAIT_2: 2146 case TCPS_CLOSE_WAIT: 2147 case TCPS_CLOSING: 2148 case TCPS_LAST_ACK: 2149 so->so_error = ECONNRESET; 2150 close: 2151 /* FALLTHROUGH */ 2152 default: 2153 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST); 2154 tp = tcp_close(tp); 2155 } 2156 } else { 2157 TCPSTAT_INC(tcps_badrst); 2158 tcp_send_challenge_ack(tp, th, m); 2159 m = NULL; 2160 } 2161 } 2162 goto drop; 2163 } 2164 2165 /* 2166 * RFC5961 Section 4.2 2167 * Send challenge ACK for any SYN in synchronized state. 2168 */ 2169 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && 2170 tp->t_state != TCPS_SYN_RECEIVED) { 2171 TCPSTAT_INC(tcps_badsyn); 2172 if (V_tcp_insecure_syn && 2173 SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 2174 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { 2175 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 2176 tp = tcp_drop(tp, ECONNRESET); 2177 } else { 2178 tcp_ecn_input_syn_sent(tp, thflags, iptos); 2179 tcp_send_challenge_ack(tp, th, m); 2180 m = NULL; 2181 } 2182 goto drop; 2183 } 2184 2185 /* 2186 * RFC 1323 PAWS: If we have a timestamp reply on this segment 2187 * and it's less than ts_recent, drop it. 2188 */ 2189 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && 2190 TSTMP_LT(to.to_tsval, tp->ts_recent)) { 2191 /* Check to see if ts_recent is over 24 days old. */ 2192 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { 2193 /* 2194 * Invalidate ts_recent. If this segment updates 2195 * ts_recent, the age will be reset later and ts_recent 2196 * will get a valid value. If it does not, setting 2197 * ts_recent to zero will at least satisfy the 2198 * requirement that zero be placed in the timestamp 2199 * echo reply when ts_recent isn't valid. The 2200 * age isn't reset until we get a valid ts_recent 2201 * because we don't want out-of-order segments to be 2202 * dropped when ts_recent is old. 2203 */ 2204 tp->ts_recent = 0; 2205 } else { 2206 TCPSTAT_INC(tcps_rcvduppack); 2207 TCPSTAT_ADD(tcps_rcvdupbyte, tlen); 2208 TCPSTAT_INC(tcps_pawsdrop); 2209 if (tlen) 2210 goto dropafterack; 2211 goto drop; 2212 } 2213 } 2214 2215 /* 2216 * In the SYN-RECEIVED state, validate that the packet belongs to 2217 * this connection before trimming the data to fit the receive 2218 * window. Check the sequence number versus IRS since we know 2219 * the sequence numbers haven't wrapped. This is a partial fix 2220 * for the "LAND" DoS attack. 2221 */ 2222 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { 2223 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 2224 goto dropwithreset; 2225 } 2226 2227 todrop = tp->rcv_nxt - th->th_seq; 2228 if (todrop > 0) { 2229 if (thflags & TH_SYN) { 2230 thflags &= ~TH_SYN; 2231 th->th_seq++; 2232 if (th->th_urp > 1) 2233 th->th_urp--; 2234 else 2235 thflags &= ~TH_URG; 2236 todrop--; 2237 } 2238 /* 2239 * Following if statement from Stevens, vol. 2, p. 960. 2240 */ 2241 if (todrop > tlen 2242 || (todrop == tlen && (thflags & TH_FIN) == 0)) { 2243 /* 2244 * Any valid FIN must be to the left of the window. 2245 * At this point the FIN must be a duplicate or out 2246 * of sequence; drop it. 2247 */ 2248 thflags &= ~TH_FIN; 2249 2250 /* 2251 * Send an ACK to resynchronize and drop any data. 2252 * But keep on processing for RST or ACK. 
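 *
 * Illustrative example (hypothetical numbers): with rcv_nxt =
 * 1000, a segment with seq 900 and tlen 100 gives todrop ==
 * tlen and carries no FIN, so it is a complete duplicate: all
 * of its data is dropped and an ACK is scheduled to
 * resynchronize the peer.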
2253 */ 2254 tp->t_flags |= TF_ACKNOW; 2255 todrop = tlen; 2256 TCPSTAT_INC(tcps_rcvduppack); 2257 TCPSTAT_ADD(tcps_rcvdupbyte, todrop); 2258 } else { 2259 TCPSTAT_INC(tcps_rcvpartduppack); 2260 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); 2261 } 2262 /* 2263 * DSACK - add SACK block for dropped range 2264 */ 2265 if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { 2266 tcp_update_sack_list(tp, th->th_seq, 2267 th->th_seq + todrop); 2268 /* 2269 * ACK now, as the next in-sequence segment 2270 * will clear the DSACK block again 2271 */ 2272 tp->t_flags |= TF_ACKNOW; 2273 } 2274 drop_hdrlen += todrop; /* drop from the top afterwards */ 2275 th->th_seq += todrop; 2276 tlen -= todrop; 2277 if (th->th_urp > todrop) 2278 th->th_urp -= todrop; 2279 else { 2280 thflags &= ~TH_URG; 2281 th->th_urp = 0; 2282 } 2283 } 2284 2285 /* 2286 * If new data are received on a connection after the 2287 * user processes are gone, then RST the other end if 2288 * no FIN has been processed. 2289 */ 2290 if ((tp->t_flags & TF_CLOSED) && tlen > 0 && 2291 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 2292 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 2293 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " 2294 "after socket was closed, " 2295 "sending RST and removing tcpcb\n", 2296 s, __func__, tcpstates[tp->t_state], tlen); 2297 free(s, M_TCPLOG); 2298 } 2299 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 2300 /* tcp_close will kill the inp pre-log the Reset */ 2301 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 2302 tp = tcp_close(tp); 2303 TCPSTAT_INC(tcps_rcvafterclose); 2304 goto dropwithreset; 2305 } 2306 2307 /* 2308 * If segment ends after window, drop trailing data 2309 * (and PUSH and FIN); if nothing left, just ACK. 2310 */ 2311 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); 2312 if (todrop > 0) { 2313 TCPSTAT_INC(tcps_rcvpackafterwin); 2314 if (todrop >= tlen) { 2315 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); 2316 /* 2317 * If window is closed can only take segments at 2318 * window edge, and have to drop data and PUSH from 2319 * incoming segments. Continue processing, but 2320 * remember to ack. Otherwise, drop segment 2321 * and ack. 2322 */ 2323 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { 2324 tp->t_flags |= TF_ACKNOW; 2325 TCPSTAT_INC(tcps_rcvwinprobe); 2326 } else 2327 goto dropafterack; 2328 } else 2329 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 2330 m_adj(m, -todrop); 2331 tlen -= todrop; 2332 thflags &= ~(TH_PUSH|TH_FIN); 2333 } 2334 2335 /* 2336 * If last ACK falls within this segment's sequence numbers, 2337 * record its timestamp. 2338 * NOTE: 2339 * 1) That the test incorporates suggestions from the latest 2340 * proposal of the tcplw@cray.com list (Braden 1993/04/26). 2341 * 2) That updating only on newer timestamps interferes with 2342 * our earlier PAWS tests, so this check should be solely 2343 * predicated on the sequence space of this segment. 2344 * 3) That we modify the segment boundary check to be 2345 * Last.ACK.Sent <= SEG.SEQ + SEG.Len 2346 * instead of RFC1323's 2347 * Last.ACK.Sent < SEG.SEQ + SEG.Len, 2348 * This modified check allows us to overcome RFC1323's 2349 * limitations as described in Stevens TCP/IP Illustrated 2350 * Vol. 2 p.869. In such cases, we can still calculate the 2351 * RTT correctly when RCV.NXT == Last.ACK.Sent. 
2352 */ 2353 if ((to.to_flags & TOF_TS) != 0 && 2354 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 2355 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 2356 ((thflags & (TH_SYN|TH_FIN)) != 0))) { 2357 tp->ts_recent_age = tcp_ts_getticks(); 2358 tp->ts_recent = to.to_tsval; 2359 } 2360 2361 /* 2362 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN 2363 * flag is on (half-synchronized state), then queue data for 2364 * later processing; else drop segment and return. 2365 */ 2366 if ((thflags & TH_ACK) == 0) { 2367 if (tp->t_state == TCPS_SYN_RECEIVED || 2368 (tp->t_flags & TF_NEEDSYN)) { 2369 if (tp->t_state == TCPS_SYN_RECEIVED && 2370 (tp->t_flags & TF_FASTOPEN)) { 2371 tp->snd_wnd = tiwin; 2372 cc_conn_init(tp); 2373 } 2374 goto step6; 2375 } else if (tp->t_flags & TF_ACKNOW) 2376 goto dropafterack; 2377 else 2378 goto drop; 2379 } 2380 2381 /* 2382 * Ack processing. 2383 */ 2384 if (SEQ_GEQ(tp->snd_una, tp->iss + (TCP_MAXWIN << tp->snd_scale))) { 2385 /* Checking SEG.ACK against ISS is definitely redundant. */ 2386 tp->t_flags2 |= TF2_NO_ISS_CHECK; 2387 } 2388 if (!V_tcp_insecure_ack) { 2389 tcp_seq seq_min; 2390 bool ghost_ack_check; 2391 2392 if (tp->t_flags2 & TF2_NO_ISS_CHECK) { 2393 /* Check for too old ACKs (RFC 5961, Section 5.2). */ 2394 seq_min = tp->snd_una - tp->max_sndwnd; 2395 ghost_ack_check = false; 2396 } else { 2397 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { 2398 /* Checking for ghost ACKs is stricter. */ 2399 seq_min = tp->iss + 1; 2400 ghost_ack_check = true; 2401 } else { 2402 /* 2403 * Checking for too old ACKs (RFC 5961, 2404 * Section 5.2) is stricter. 2405 */ 2406 seq_min = tp->snd_una - tp->max_sndwnd; 2407 ghost_ack_check = false; 2408 } 2409 } 2410 if (SEQ_LT(th->th_ack, seq_min)) { 2411 if (ghost_ack_check) 2412 TCPSTAT_INC(tcps_rcvghostack); 2413 else 2414 TCPSTAT_INC(tcps_rcvacktooold); 2415 tcp_send_challenge_ack(tp, th, m); 2416 m = NULL; 2417 goto drop; 2418 } 2419 } 2420 switch (tp->t_state) { 2421 /* 2422 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter 2423 * ESTABLISHED state and continue processing. 2424 * The ACK was checked above. 2425 */ 2426 case TCPS_SYN_RECEIVED: 2427 2428 TCPSTAT_INC(tcps_connects); 2429 if (tp->t_flags & TF_SONOTCONN) { 2430 /* 2431 * Usually SYN_RECEIVED had been created from a LISTEN, 2432 * and solisten_enqueue() has already marked the socket 2433 * layer as connected. If it didn't, which can happen 2434 * only with an accept_filter(9), then the tp is marked 2435 * with TF_SONOTCONN. The other reason for this mark 2436 * to be set is a simultaneous open, a SYN_RECEIVED 2437 * that had been created from SYN_SENT. 2438 */ 2439 tp->t_flags &= ~TF_SONOTCONN; 2440 soisconnected(so); 2441 } 2442 /* Do window scaling? 
*/ 2443 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2444 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2445 tp->rcv_scale = tp->request_r_scale; 2446 } 2447 tp->snd_wnd = tiwin; 2448 /* 2449 * Make transitions: 2450 * SYN-RECEIVED -> ESTABLISHED 2451 * SYN-RECEIVED* -> FIN-WAIT-1 2452 */ 2453 tp->t_starttime = ticks; 2454 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) { 2455 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 2456 tp->t_tfo_pending = NULL; 2457 } 2458 if (tp->t_flags & TF_NEEDFIN) { 2459 tp->t_acktime = ticks; 2460 tcp_state_change(tp, TCPS_FIN_WAIT_1); 2461 tp->t_flags &= ~TF_NEEDFIN; 2462 } else { 2463 tcp_state_change(tp, TCPS_ESTABLISHED); 2464 TCP_PROBE5(accept__established, NULL, tp, 2465 m, tp, th); 2466 /* 2467 * TFO connections call cc_conn_init() during SYN 2468 * processing. Calling it again here for such 2469 * connections is not harmless as it would undo the 2470 * snd_cwnd reduction that occurs when a TFO SYN|ACK 2471 * is retransmitted. 2472 */ 2473 if (!(tp->t_flags & TF_FASTOPEN)) 2474 cc_conn_init(tp); 2475 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 2476 } 2477 /* 2478 * Account for the ACK of our SYN prior to 2479 * regular ACK processing below, except for 2480 * simultaneous SYN, which is handled later. 2481 */ 2482 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 2483 incforsyn = 1; 2484 /* 2485 * If segment contains data or ACK, will call tcp_reass() 2486 * later; if not, do so now to pass queued data to user. 2487 */ 2488 if (tlen == 0 && (thflags & TH_FIN) == 0) { 2489 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 2490 (struct mbuf *)0); 2491 tcp_handle_wakeup(tp); 2492 } 2493 tp->snd_wl1 = th->th_seq - 1; 2494 /* FALLTHROUGH */ 2495 2496 /* 2497 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range 2498 * ACKs. If the ack is in the range 2499 * tp->snd_una < th->th_ack <= tp->snd_max 2500 * then advance tp->snd_una to th->th_ack and drop 2501 * data from the retransmission queue. If this ACK reflects 2502 * more up to date window information we update our window information. 2503 */ 2504 case TCPS_ESTABLISHED: 2505 case TCPS_FIN_WAIT_1: 2506 case TCPS_FIN_WAIT_2: 2507 case TCPS_CLOSE_WAIT: 2508 case TCPS_CLOSING: 2509 case TCPS_LAST_ACK: 2510 if (SEQ_GT(th->th_ack, tp->snd_max)) { 2511 TCPSTAT_INC(tcps_rcvacktoomuch); 2512 goto dropafterack; 2513 } 2514 if (tcp_is_sack_recovery(tp, &to)) { 2515 sack_changed = tcp_sack_doack(tp, &to, th->th_ack); 2516 if ((sack_changed != SACK_NOCHANGE) && 2517 (tp->t_flags & TF_LRD)) { 2518 tcp_sack_lost_retransmission(tp, th); 2519 } 2520 } else 2521 /* 2522 * Reset the value so that previous (valid) value 2523 * from the last ack with SACK doesn't get used. 2524 */ 2525 tp->sackhint.sacked_bytes = 0; 2526 2527 #ifdef TCP_HHOOK 2528 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 2529 hhook_run_tcp_est_in(tp, th, &to); 2530 #endif 2531 2532 if (SEQ_LT(th->th_ack, tp->snd_una)) { 2533 /* This is old ACK information, don't process it. */ 2534 break; 2535 } 2536 if (th->th_ack == tp->snd_una) { 2537 /* Check if this is a duplicate ACK. */ 2538 if (tp->t_flags & TF_SACK_PERMIT) { 2539 /* 2540 * If SEG.ACK == SND.UNA, RFC 6675 requires a 2541 * duplicate ACK to selectively acknowledge 2542 * at least one byte, which was not selectively 2543 * acknowledged before. 2544 */ 2545 if (sack_changed == SACK_NOCHANGE) { 2546 break; 2547 } 2548 } else { 2549 /* 2550 * If SEG.ACK == SND.UNA, RFC 5681 requires a 2551 * duplicate ACK to have no data on it and to 2552 * not be a window update. 
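 * In other words, for the non-SACK case below a segment only
 * counts as a duplicate ACK if it carries no data and leaves
 * the advertised window unchanged.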
2553 */ 2554 if (!no_data || tiwin != tp->snd_wnd) { 2555 break; 2556 } 2557 } 2558 /* 2559 * If this is the first time we've seen a 2560 * FIN from the remote, this is not a 2561 * duplicate ACK and it needs to be processed 2562 * normally. 2563 * This happens during a simultaneous close. 2564 */ 2565 if ((thflags & TH_FIN) && 2566 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { 2567 tp->t_dupacks = 0; 2568 break; 2569 } 2570 /* Perform duplicate ACK processing. */ 2571 TCPSTAT_INC(tcps_rcvdupack); 2572 maxseg = tcp_maxseg(tp); 2573 if (!tcp_timer_active(tp, TT_REXMT)) { 2574 tp->t_dupacks = 0; 2575 } else if (++tp->t_dupacks > tcprexmtthresh || 2576 IN_FASTRECOVERY(tp->t_flags)) { 2577 cc_ack_received(tp, th, nsegs, CC_DUPACK); 2578 if (V_tcp_do_prr && 2579 IN_FASTRECOVERY(tp->t_flags) && 2580 (tp->t_flags & TF_SACK_PERMIT)) { 2581 tcp_do_prr_ack(tp, th, &to, 2582 sack_changed, &maxseg); 2583 } else if (tcp_is_sack_recovery(tp, &to) && 2584 IN_FASTRECOVERY(tp->t_flags) && 2585 (tp->snd_nxt == tp->snd_max)) { 2586 int awnd; 2587 2588 /* 2589 * Compute the amount of data in flight first. 2590 * We can inject new data into the pipe iff 2591 * we have less than ssthresh 2592 * worth of data in flight. 2593 */ 2594 awnd = tcp_compute_pipe(tp); 2595 if (awnd < tp->snd_ssthresh) { 2596 tp->snd_cwnd += imax(maxseg, 2597 imin(2 * maxseg, 2598 tp->sackhint.delivered_data)); 2599 if (tp->snd_cwnd > tp->snd_ssthresh) 2600 tp->snd_cwnd = tp->snd_ssthresh; 2601 } 2602 } else if (tcp_is_sack_recovery(tp, &to) && 2603 IN_FASTRECOVERY(tp->t_flags) && 2604 SEQ_LT(tp->snd_nxt, tp->snd_max)) { 2605 tp->snd_cwnd += imax(maxseg, 2606 imin(2 * maxseg, 2607 tp->sackhint.delivered_data)); 2608 } else { 2609 tp->snd_cwnd += maxseg; 2610 } 2611 (void) tcp_output(tp); 2612 goto drop; 2613 } else if (tp->t_dupacks == tcprexmtthresh || 2614 (tp->t_flags & TF_SACK_PERMIT && 2615 tp->sackhint.sacked_bytes > 2616 (tcprexmtthresh - 1) * maxseg)) { 2617 enter_recovery: 2618 /* 2619 * Above is the RFC6675 trigger condition of 2620 * more than (dupthresh-1)*maxseg sacked data. 2621 * If the count of holes in the 2622 * scoreboard is >= dupthresh, we could 2623 * also enter loss recovery, but don't 2624 * have that value readily available. 2625 */ 2626 tp->t_dupacks = tcprexmtthresh; 2627 tcp_seq onxt = tp->snd_nxt; 2628 2629 /* 2630 * If we're doing sack, check to 2631 * see if we're already in sack 2632 * recovery. If we're not doing sack, 2633 * check to see if we're in newreno 2634 * recovery. 2635 */ 2636 if (tcp_is_sack_recovery(tp, &to)) { 2637 if (IN_FASTRECOVERY(tp->t_flags)) { 2638 tp->t_dupacks = 0; 2639 break; 2640 } 2641 } else { 2642 if (SEQ_LEQ(th->th_ack, 2643 tp->snd_recover)) { 2644 tp->t_dupacks = 0; 2645 break; 2646 } 2647 } 2648 /* Congestion signal before ack. */ 2649 cc_cong_signal(tp, th, CC_NDUPACK); 2650 cc_ack_received(tp, th, nsegs, CC_DUPACK); 2651 tcp_timer_activate(tp, TT_REXMT, 0); 2652 tp->t_rtttime = 0; 2653 if (V_tcp_do_prr) { 2654 /* 2655 * snd_ssthresh and snd_recover are 2656 * already updated by cc_cong_signal. 
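 * Illustrative example (hypothetical numbers): with maxseg =
 * 1460, snd_limited = 1 and 10000 bytes outstanding beyond
 * th_ack, prr_delivered starts at min(10000, 2 * 1460) = 2920
 * bytes, crediting the segments sent by Limited Transmit.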
2657 */ 2658 if (tcp_is_sack_recovery(tp, &to)) { 2659 /* 2660 * Include Limited Transmit 2661 * segments here 2662 */ 2663 tp->sackhint.prr_delivered = 2664 imin(tp->snd_max - th->th_ack, 2665 (tp->snd_limited + 1) * maxseg); 2666 } else { 2667 tp->sackhint.prr_delivered = 2668 maxseg; 2669 } 2670 tp->sackhint.recover_fs = max(1, 2671 tp->snd_nxt - tp->snd_una); 2672 } 2673 tp->snd_limited = 0; 2674 if (tcp_is_sack_recovery(tp, &to)) { 2675 TCPSTAT_INC(tcps_sack_recovery_episode); 2676 /* 2677 * When entering LR after RTO due to 2678 * Duplicate ACKs, retransmit existing 2679 * holes from the scoreboard. 2680 */ 2681 tcp_resend_sackholes(tp); 2682 /* Avoid inflating cwnd in tcp_output */ 2683 tp->snd_nxt = tp->snd_max; 2684 tp->snd_cwnd = tcp_compute_pipe(tp) + 2685 maxseg; 2686 (void) tcp_output(tp); 2687 /* Set cwnd to the expected flightsize */ 2688 tp->snd_cwnd = tp->snd_ssthresh; 2689 goto drop; 2690 } 2691 tp->snd_nxt = th->th_ack; 2692 tp->snd_cwnd = maxseg; 2693 (void) tcp_output(tp); 2694 KASSERT(tp->snd_limited <= 2, 2695 ("%s: tp->snd_limited too big", 2696 __func__)); 2697 tp->snd_cwnd = tp->snd_ssthresh + 2698 maxseg * 2699 (tp->t_dupacks - tp->snd_limited); 2700 if (SEQ_GT(onxt, tp->snd_nxt)) 2701 tp->snd_nxt = onxt; 2702 goto drop; 2703 } else if (V_tcp_do_rfc3042) { 2704 /* 2705 * Process first and second duplicate 2706 * ACKs. Each indicates a segment 2707 * leaving the network, creating room 2708 * for more. Make sure we can send a 2709 * packet on reception of each duplicate 2710 * ACK by increasing snd_cwnd by one 2711 * segment. Restore the original 2712 * snd_cwnd after packet transmission. 2713 */ 2714 cc_ack_received(tp, th, nsegs, CC_DUPACK); 2715 uint32_t oldcwnd = tp->snd_cwnd; 2716 tcp_seq oldsndmax = tp->snd_max; 2717 u_int sent; 2718 int avail; 2719 2720 KASSERT(tp->t_dupacks == 1 || 2721 tp->t_dupacks == 2, 2722 ("%s: dupacks not 1 or 2", 2723 __func__)); 2724 if (tp->t_dupacks == 1) 2725 tp->snd_limited = 0; 2726 if ((tp->snd_nxt == tp->snd_max) && 2727 (tp->t_rxtshift == 0)) 2728 tp->snd_cwnd = 2729 SEQ_SUB(tp->snd_nxt, tp->snd_una); 2730 tp->snd_cwnd += 2731 (tp->t_dupacks - tp->snd_limited) * maxseg; 2732 tp->snd_cwnd -= tcp_sack_adjust(tp); 2733 /* 2734 * Only call tcp_output when there 2735 * is new data available to be sent 2736 * or we need to send an ACK. 2737 */ 2738 SOCK_SENDBUF_LOCK(so); 2739 avail = sbavail(&so->so_snd); 2740 SOCK_SENDBUF_UNLOCK(so); 2741 if (tp->t_flags & TF_ACKNOW || 2742 (avail >= 2743 SEQ_SUB(tp->snd_nxt, tp->snd_una))) { 2744 (void) tcp_output(tp); 2745 } 2746 sent = SEQ_SUB(tp->snd_max, oldsndmax); 2747 if (sent > maxseg) { 2748 KASSERT((tp->t_dupacks == 2 && 2749 tp->snd_limited == 0) || 2750 (sent == maxseg + 1 && 2751 tp->t_flags & TF_SENTFIN) || 2752 (sent < 2 * maxseg && 2753 tp->t_flags & TF_NODELAY), 2754 ("%s: sent too much: %u>%u", 2755 __func__, sent, maxseg)); 2756 tp->snd_limited = 2; 2757 } else if (sent > 0) { 2758 ++tp->snd_limited; 2759 } 2760 tp->snd_cwnd = oldcwnd; 2761 goto drop; 2762 } 2763 break; 2764 } 2765 KASSERT(SEQ_GT(th->th_ack, tp->snd_una), 2766 ("%s: SEQ_LEQ(th_ack, snd_una)", __func__)); 2767 /* 2768 * This ack is advancing the left edge, reset the 2769 * counter. 2770 */ 2771 tp->t_dupacks = 0; 2772 /* 2773 * If this ack also has new SACK info, increment the 2774 * t_dupacks as per RFC 6675. 
The variable 2775 * sack_changed tracks all changes to the SACK 2776 * scoreboard, including when partial ACKs without 2777 * SACK options are received that clear the scoreboard 2778 * from the left side. Such partial ACKs should not be 2779 * counted as dupacks here. 2780 */ 2781 if (tcp_is_sack_recovery(tp, &to) && 2782 (((tp->t_rxtshift == 0) && (sack_changed != SACK_NOCHANGE)) || 2783 ((tp->t_rxtshift > 0) && (sack_changed == SACK_NEWLOSS))) && 2784 (tp->snd_nxt == tp->snd_max)) { 2785 tp->t_dupacks++; 2786 /* limit overhead by setting maxseg last */ 2787 if (!IN_FASTRECOVERY(tp->t_flags) && 2788 (tp->sackhint.sacked_bytes > 2789 (tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp)))) { 2790 goto enter_recovery; 2791 } 2792 } 2793 /* 2794 * If the congestion window was inflated to account 2795 * for the other side's cached packets, retract it. 2796 */ 2797 if (SEQ_LT(th->th_ack, tp->snd_recover)) { 2798 if (IN_FASTRECOVERY(tp->t_flags)) { 2799 if (tp->t_flags & TF_SACK_PERMIT) { 2800 if (V_tcp_do_prr && 2801 (to.to_flags & TOF_SACK)) { 2802 tcp_timer_activate(tp, 2803 TT_REXMT, 0); 2804 tp->t_rtttime = 0; 2805 tcp_do_prr_ack(tp, th, &to, 2806 sack_changed, &maxseg); 2807 tp->t_flags |= TF_ACKNOW; 2808 (void) tcp_output(tp); 2809 } else { 2810 tcp_sack_partialack(tp, th, 2811 &maxseg); 2812 } 2813 } else { 2814 tcp_newreno_partial_ack(tp, th); 2815 } 2816 } else if (IN_CONGRECOVERY(tp->t_flags) && 2817 (V_tcp_do_prr)) { 2818 tp->sackhint.delivered_data = 2819 BYTES_THIS_ACK(tp, th); 2820 tp->snd_fack = th->th_ack; 2821 /* 2822 * During ECN cwnd reduction 2823 * always use PRR-SSRB 2824 */ 2825 tcp_do_prr_ack(tp, th, &to, SACK_CHANGE, 2826 &maxseg); 2827 (void) tcp_output(tp); 2828 } 2829 } 2830 /* 2831 * If we reach this point, ACK is not a duplicate, 2832 * i.e., it ACKs something we sent. 2833 */ 2834 if (tp->t_flags & TF_NEEDSYN) { 2835 /* 2836 * T/TCP: Connection was half-synchronized, and our 2837 * SYN has been ACK'd (so connection is now fully 2838 * synchronized). Go to non-starred state, 2839 * increment snd_una for ACK of SYN, and check if 2840 * we can do window scaling. 2841 */ 2842 tp->t_flags &= ~TF_NEEDSYN; 2843 tp->snd_una++; 2844 /* Do window scaling? */ 2845 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == 2846 (TF_RCVD_SCALE|TF_REQ_SCALE)) { 2847 tp->rcv_scale = tp->request_r_scale; 2848 /* Send window already scaled. */ 2849 } 2850 } 2851 2852 process_ACK: 2853 INP_WLOCK_ASSERT(inp); 2854 2855 /* 2856 * Adjust for the SYN bit in sequence space, 2857 * but don't account for it in cwnd calculations. 2858 * This is for the SYN_RECEIVED, non-simultaneous 2859 * SYN case. SYN_SENT and simultaneous SYN are 2860 * treated elsewhere. 2861 */ 2862 if (incforsyn) 2863 tp->snd_una++; 2864 acked = BYTES_THIS_ACK(tp, th); 2865 KASSERT(acked >= 0, ("%s: acked unexpectedly negative " 2866 "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__, 2867 tp->snd_una, th->th_ack, tp, m)); 2868 TCPSTAT_ADD(tcps_rcvackpack, nsegs); 2869 TCPSTAT_ADD(tcps_rcvackbyte, acked); 2870 2871 /* 2872 * If we just performed our first retransmit, and the ACK 2873 * arrives within our recovery window, then it was a mistake 2874 * to do the retransmit in the first place. Recover our 2875 * original cwnd and ssthresh, and proceed to transmit where 2876 * we left off.
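 *
 * For illustration: if the echoed timestamp (tsecr) predates
 * t_badrxtwin, the ACK was triggered by the original
 * transmission rather than by the retransmission, so the
 * retransmit is considered spurious and CC_RTO_ERR is
 * signalled below.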
2877 */ 2878 if (tp->t_rxtshift == 1 && 2879 tp->t_flags & TF_PREVVALID && 2880 tp->t_badrxtwin != 0 && 2881 to.to_flags & TOF_TS && 2882 to.to_tsecr != 0 && 2883 TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) 2884 cc_cong_signal(tp, th, CC_RTO_ERR); 2885 2886 /* 2887 * If we have a timestamp reply, update smoothed 2888 * round trip time. If no timestamp is present but 2889 * transmit timer is running and timed sequence 2890 * number was acked, update smoothed round trip time. 2891 * Since we now have an rtt measurement, cancel the 2892 * timer backoff (cf., Phil Karn's retransmit alg.). 2893 * Recompute the initial retransmit timer. 2894 * 2895 * Some boxes send broken timestamp replies 2896 * during the SYN+ACK phase, ignore 2897 * timestamps of 0 or we could calculate a 2898 * huge RTT and blow up the retransmit timer. 2899 */ 2900 if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { 2901 uint32_t t; 2902 2903 t = tcp_ts_getticks() - to.to_tsecr; 2904 if (!tp->t_rttlow || tp->t_rttlow > t) 2905 tp->t_rttlow = t; 2906 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); 2907 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { 2908 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) 2909 tp->t_rttlow = ticks - tp->t_rtttime; 2910 tcp_xmit_timer(tp, ticks - tp->t_rtttime); 2911 } 2912 2913 SOCK_SENDBUF_LOCK(so); 2914 /* 2915 * Clear t_acktime if remote side has ACKd all data in the 2916 * socket buffer and FIN (if applicable). 2917 * Otherwise, update t_acktime if we received a sufficiently 2918 * large ACK. 2919 */ 2920 if ((tp->t_state <= TCPS_CLOSE_WAIT && 2921 acked == sbavail(&so->so_snd)) || 2922 acked > sbavail(&so->so_snd)) 2923 tp->t_acktime = 0; 2924 else if (acked > 1) 2925 tp->t_acktime = ticks; 2926 2927 /* 2928 * If all outstanding data is acked, stop retransmit 2929 * timer and remember to restart (more output or persist). 2930 * If there is more data to be acked, restart retransmit 2931 * timer, using current (possibly backed-off) value. 2932 */ 2933 if (th->th_ack == tp->snd_max) { 2934 tcp_timer_activate(tp, TT_REXMT, 0); 2935 needoutput = 1; 2936 } else if (!tcp_timer_active(tp, TT_PERSIST)) 2937 tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); 2938 2939 /* 2940 * If no data (only SYN) was ACK'd, 2941 * skip rest of ACK processing. 2942 */ 2943 if (acked == 0) { 2944 SOCK_SENDBUF_UNLOCK(so); 2945 goto step6; 2946 } 2947 2948 /* 2949 * Let the congestion control algorithm update congestion 2950 * control related information. This typically means increasing 2951 * the congestion window. 2952 */ 2953 cc_ack_received(tp, th, nsegs, CC_ACK); 2954 2955 if (acked > sbavail(&so->so_snd)) { 2956 if (tp->snd_wnd >= sbavail(&so->so_snd)) 2957 tp->snd_wnd -= sbavail(&so->so_snd); 2958 else 2959 tp->snd_wnd = 0; 2960 mfree = sbcut_locked(&so->so_snd, 2961 (int)sbavail(&so->so_snd)); 2962 ourfinisacked = 1; 2963 } else { 2964 mfree = sbcut_locked(&so->so_snd, acked); 2965 if (tp->snd_wnd >= (uint32_t) acked) 2966 tp->snd_wnd -= acked; 2967 else 2968 tp->snd_wnd = 0; 2969 ourfinisacked = 0; 2970 } 2971 /* NB: sowwakeup_locked() does an implicit unlock. */ 2972 sowwakeup_locked(so); 2973 m_freem(mfree); 2974 /* Detect una wraparound. 
*/ 2975 if (!IN_RECOVERY(tp->t_flags) && 2976 SEQ_GT(tp->snd_una, tp->snd_recover) && 2977 SEQ_LEQ(th->th_ack, tp->snd_recover)) 2978 tp->snd_recover = th->th_ack - 1; 2979 tp->snd_una = th->th_ack; 2980 if (IN_RECOVERY(tp->t_flags) && 2981 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 2982 cc_post_recovery(tp, th); 2983 } 2984 if (SEQ_GT(tp->snd_una, tp->snd_recover)) { 2985 tp->snd_recover = tp->snd_una; 2986 } 2987 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 2988 tp->snd_nxt = tp->snd_una; 2989 2990 switch (tp->t_state) { 2991 /* 2992 * In FIN_WAIT_1 STATE in addition to the processing 2993 * for the ESTABLISHED state if our FIN is now acknowledged 2994 * then enter FIN_WAIT_2. 2995 */ 2996 case TCPS_FIN_WAIT_1: 2997 if (ourfinisacked) { 2998 /* 2999 * If we can't receive any more 3000 * data, then closing user can proceed. 3001 * Starting the timer is contrary to the 3002 * specification, but if we don't get a FIN 3003 * we'll hang forever. 3004 */ 3005 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 3006 tcp_free_sackholes(tp); 3007 soisdisconnected(so); 3008 tcp_timer_activate(tp, TT_2MSL, 3009 (tcp_fast_finwait2_recycle ? 3010 tcp_finwait2_timeout : 3011 TP_MAXIDLE(tp))); 3012 } 3013 tcp_state_change(tp, TCPS_FIN_WAIT_2); 3014 } 3015 break; 3016 3017 /* 3018 * In CLOSING STATE in addition to the processing for 3019 * the ESTABLISHED state if the ACK acknowledges our FIN 3020 * then enter the TIME-WAIT state, otherwise ignore 3021 * the segment. 3022 */ 3023 case TCPS_CLOSING: 3024 if (ourfinisacked) { 3025 tcp_twstart(tp); 3026 m_freem(m); 3027 return; 3028 } 3029 break; 3030 3031 /* 3032 * In LAST_ACK, we may still be waiting for data to drain 3033 * and/or to be acked, as well as for the ack of our FIN. 3034 * If our FIN is now acknowledged, delete the TCB, 3035 * enter the closed state and return. 3036 */ 3037 case TCPS_LAST_ACK: 3038 if (ourfinisacked) { 3039 tp = tcp_close(tp); 3040 goto drop; 3041 } 3042 break; 3043 } 3044 } 3045 3046 step6: 3047 INP_WLOCK_ASSERT(inp); 3048 3049 /* 3050 * Update window information. 3051 * Don't look at window if no ACK: TAC's send garbage on first SYN. 3052 */ 3053 if ((thflags & TH_ACK) && 3054 (SEQ_LT(tp->snd_wl1, th->th_seq) || 3055 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 3056 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 3057 /* keep track of pure window updates */ 3058 if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 3059 TCPSTAT_INC(tcps_rcvwinupd); 3060 tp->snd_wnd = tiwin; 3061 tp->snd_wl1 = th->th_seq; 3062 tp->snd_wl2 = th->th_ack; 3063 if (tp->snd_wnd > tp->max_sndwnd) 3064 tp->max_sndwnd = tp->snd_wnd; 3065 needoutput = 1; 3066 } 3067 3068 /* 3069 * Process segments with URG. 3070 */ 3071 if ((thflags & TH_URG) && th->th_urp && 3072 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 3073 /* 3074 * This is a kludge, but if we receive and accept 3075 * random urgent pointers, we'll crash in 3076 * soreceive. It's hard to imagine someone 3077 * actually wanting to send this much urgent data. 3078 */ 3079 SOCK_RECVBUF_LOCK(so); 3080 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { 3081 th->th_urp = 0; /* XXX */ 3082 thflags &= ~TH_URG; /* XXX */ 3083 SOCK_RECVBUF_UNLOCK(so); /* XXX */ 3084 goto dodata; /* XXX */ 3085 } 3086 /* 3087 * If this segment advances the known urgent pointer, 3088 * then mark the data stream. This should not happen 3089 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since 3090 * a FIN has been received from the remote side. 3091 * In these states we ignore the URG. 
* 3093 * According to RFC961 (Assigned Protocols), 3094 * the urgent pointer points to the last octet 3095 * of urgent data. We continue, however, 3096 * to consider it to indicate the first octet 3097 * of data past the urgent section as the original 3098 * spec states (in one of two places). 3099 */ 3100 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { 3101 tp->rcv_up = th->th_seq + th->th_urp; 3102 so->so_oobmark = sbavail(&so->so_rcv) + 3103 (tp->rcv_up - tp->rcv_nxt) - 1; 3104 if (so->so_oobmark == 0) 3105 so->so_rcv.sb_state |= SBS_RCVATMARK; 3106 sohasoutofband(so); 3107 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); 3108 } 3109 SOCK_RECVBUF_UNLOCK(so); 3110 /* 3111 * Remove out of band data so it doesn't get presented to the user. 3112 * This can happen independent of advancing the URG pointer, 3113 * but if two URGs are pending at once, some out-of-band 3114 * data may creep in... ick. 3115 */ 3116 if (th->th_urp <= (uint32_t)tlen && 3117 !(so->so_options & SO_OOBINLINE)) { 3118 /* hdr drop is delayed */ 3119 tcp_pulloutofband(so, th, m, drop_hdrlen); 3120 } 3121 } else { 3122 /* 3123 * If no out of band data is expected, 3124 * pull receive urgent pointer along 3125 * with the receive window. 3126 */ 3127 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) 3128 tp->rcv_up = tp->rcv_nxt; 3129 } 3130 dodata: /* XXX */ 3131 INP_WLOCK_ASSERT(inp); 3132 3133 /* 3134 * Process the segment text, merging it into the TCP sequencing queue, 3135 * and arranging for acknowledgment of receipt if necessary. 3136 * This process logically involves adjusting tp->rcv_wnd as data 3137 * is presented to the user (this happens in tcp_usrreq.c, 3138 * case PRU_RCVD). If a FIN has already been received on this 3139 * connection then we just ignore the text. 3140 */ 3141 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 3142 (tp->t_flags & TF_FASTOPEN)); 3143 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 3144 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 3145 tcp_seq save_start = th->th_seq; 3146 tcp_seq save_rnxt = tp->rcv_nxt; 3147 int save_tlen = tlen; 3148 m_adj(m, drop_hdrlen); /* delayed header drop */ 3149 /* 3150 * Insert segment which includes th into TCP reassembly queue 3151 * with control block tp. Set thflags to whether reassembly now 3152 * includes a segment with FIN. This handles the common case 3153 * inline (segment is the next to be received on an established 3154 * connection, and the queue is empty), avoiding linkage into 3155 * and removal from the queue and repetition of various 3156 * conversions. 3157 * Set DELACK for segments received in order, but ack 3158 * immediately when segments are out of order (so 3159 * fast retransmit can work).
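 *
 * Illustrative example (hypothetical numbers): with rcv_nxt =
 * 1000, an in-order segment 1000:2460 is appended directly to
 * the socket buffer and its ACK may be delayed, while an
 * out-of-order segment 2460:3920 goes through tcp_reass() and
 * forces TF_ACKNOW so the sender sees duplicate ACKs promptly.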
3160 */ 3161 if (th->th_seq == tp->rcv_nxt && 3162 SEGQ_EMPTY(tp) && 3163 (TCPS_HAVEESTABLISHED(tp->t_state) || 3164 tfo_syn)) { 3165 if (DELAY_ACK(tp, tlen) || tfo_syn) 3166 tp->t_flags |= TF_DELACK; 3167 else 3168 tp->t_flags |= TF_ACKNOW; 3169 tp->rcv_nxt += tlen; 3170 if (tlen && 3171 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 3172 (tp->t_fbyte_in == 0)) { 3173 tp->t_fbyte_in = ticks; 3174 if (tp->t_fbyte_in == 0) 3175 tp->t_fbyte_in = 1; 3176 if (tp->t_fbyte_out && tp->t_fbyte_in) 3177 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 3178 } 3179 thflags = tcp_get_flags(th) & TH_FIN; 3180 TCPSTAT_INC(tcps_rcvpack); 3181 TCPSTAT_ADD(tcps_rcvbyte, tlen); 3182 SOCK_RECVBUF_LOCK(so); 3183 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) 3184 m_freem(m); 3185 else 3186 sbappendstream_locked(&so->so_rcv, m, 0); 3187 tp->t_flags |= TF_WAKESOR; 3188 } else { 3189 /* 3190 * XXX: Due to the header drop above "th" is 3191 * theoretically invalid by now. Fortunately 3192 * m_adj() doesn't actually frees any mbufs 3193 * when trimming from the head. 3194 */ 3195 tcp_seq temp = save_start; 3196 3197 thflags = tcp_reass(tp, th, &temp, &tlen, m); 3198 tp->t_flags |= TF_ACKNOW; 3199 } 3200 if ((tp->t_flags & TF_SACK_PERMIT) && 3201 (save_tlen > 0) && 3202 TCPS_HAVEESTABLISHED(tp->t_state)) { 3203 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 3204 /* 3205 * DSACK actually handled in the fastpath 3206 * above. 3207 */ 3208 tcp_update_sack_list(tp, save_start, 3209 save_start + save_tlen); 3210 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 3211 if ((tp->rcv_numsacks >= 1) && 3212 (tp->sackblks[0].end == save_start)) { 3213 /* 3214 * Partial overlap, recorded at todrop 3215 * above. 3216 */ 3217 tcp_update_sack_list(tp, 3218 tp->sackblks[0].start, 3219 tp->sackblks[0].end); 3220 } else { 3221 tcp_update_dsack_list(tp, save_start, 3222 save_start + save_tlen); 3223 } 3224 } else if (tlen >= save_tlen) { 3225 /* Update of sackblks. */ 3226 tcp_update_dsack_list(tp, save_start, 3227 save_start + save_tlen); 3228 } else if (tlen > 0) { 3229 tcp_update_dsack_list(tp, save_start, 3230 save_start + tlen); 3231 } 3232 } 3233 tcp_handle_wakeup(tp); 3234 #if 0 3235 /* 3236 * Note the amount of data that peer has sent into 3237 * our window, in order to estimate the sender's 3238 * buffer size. 3239 * XXX: Unused. 3240 */ 3241 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) 3242 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); 3243 else 3244 len = so->so_rcv.sb_hiwat; 3245 #endif 3246 } else { 3247 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 3248 if (tlen > 0) { 3249 if ((thflags & TH_FIN) != 0) { 3250 log(LOG_DEBUG, "%s; %s: %s: " 3251 "Received %d bytes of data and FIN " 3252 "after having received a FIN, " 3253 "just dropping both\n", 3254 s, __func__, 3255 tcpstates[tp->t_state], tlen); 3256 } else { 3257 log(LOG_DEBUG, "%s; %s: %s: " 3258 "Received %d bytes of data " 3259 "after having received a FIN, " 3260 "just dropping it\n", 3261 s, __func__, 3262 tcpstates[tp->t_state], tlen); 3263 } 3264 } else { 3265 if ((thflags & TH_FIN) != 0) { 3266 log(LOG_DEBUG, "%s; %s: %s: " 3267 "Received FIN " 3268 "after having received a FIN, " 3269 "just dropping it\n", 3270 s, __func__, 3271 tcpstates[tp->t_state]); 3272 } 3273 } 3274 free(s, M_TCPLOG); 3275 } 3276 m_freem(m); 3277 thflags &= ~TH_FIN; 3278 } 3279 3280 /* 3281 * If FIN is received ACK the FIN and let the user know 3282 * that the connection is closing. 
3283 */ 3284 if (thflags & TH_FIN) { 3285 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 3286 /* The socket upcall is handled by socantrcvmore. */ 3287 socantrcvmore(so); 3288 /* 3289 * If connection is half-synchronized 3290 * (ie NEEDSYN flag on) then delay ACK, 3291 * so it may be piggybacked when SYN is sent. 3292 * Otherwise, since we received a FIN then no 3293 * more input can be expected, send ACK now. 3294 */ 3295 if (tp->t_flags & TF_NEEDSYN) 3296 tp->t_flags |= TF_DELACK; 3297 else 3298 tp->t_flags |= TF_ACKNOW; 3299 tp->rcv_nxt++; 3300 } 3301 switch (tp->t_state) { 3302 /* 3303 * In SYN_RECEIVED and ESTABLISHED STATES 3304 * enter the CLOSE_WAIT state. 3305 */ 3306 case TCPS_SYN_RECEIVED: 3307 tp->t_starttime = ticks; 3308 /* FALLTHROUGH */ 3309 case TCPS_ESTABLISHED: 3310 tcp_state_change(tp, TCPS_CLOSE_WAIT); 3311 break; 3312 3313 /* 3314 * If still in FIN_WAIT_1 STATE FIN has not been acked so 3315 * enter the CLOSING state. 3316 */ 3317 case TCPS_FIN_WAIT_1: 3318 tcp_state_change(tp, TCPS_CLOSING); 3319 break; 3320 3321 /* 3322 * In FIN_WAIT_2 state enter the TIME_WAIT state, 3323 * starting the time-wait timer, turning off the other 3324 * standard timers. 3325 */ 3326 case TCPS_FIN_WAIT_2: 3327 tcp_twstart(tp); 3328 return; 3329 } 3330 } 3331 TCP_PROBE3(debug__input, tp, th, m); 3332 3333 /* 3334 * Return any desired output. 3335 */ 3336 if (needoutput || (tp->t_flags & TF_ACKNOW)) { 3337 (void) tcp_output(tp); 3338 } 3339 check_delack: 3340 INP_WLOCK_ASSERT(inp); 3341 3342 if (tp->t_flags & TF_DELACK) { 3343 tp->t_flags &= ~TF_DELACK; 3344 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); 3345 } 3346 INP_WUNLOCK(inp); 3347 return; 3348 3349 dropafterack: 3350 /* 3351 * Generate an ACK dropping incoming segment if it occupies 3352 * sequence space, where the ACK reflects our state. 3353 * 3354 * We can now skip the test for the RST flag since all 3355 * paths to this code happen after packets containing 3356 * RST have been dropped. 3357 * 3358 * In the SYN-RECEIVED state, don't send an ACK unless the 3359 * segment we received passes the SYN-RECEIVED ACK test. 3360 * If it fails send a RST. This breaks the loop in the 3361 * "LAND" DoS attack, and also prevents an ACK storm 3362 * between two listening ports that have been sent forged 3363 * SYN segments, each with the source address of the other. 3364 */ 3365 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && 3366 (SEQ_GT(tp->snd_una, th->th_ack) || 3367 SEQ_GT(th->th_ack, tp->snd_max)) ) { 3368 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 3369 goto dropwithreset; 3370 } 3371 TCP_PROBE3(debug__input, tp, th, m); 3372 tp->t_flags |= TF_ACKNOW; 3373 (void) tcp_output(tp); 3374 INP_WUNLOCK(inp); 3375 m_freem(m); 3376 return; 3377 3378 dropwithreset: 3379 tcp_dropwithreset(m, th, tp, tlen); 3380 if (tp != NULL) { 3381 INP_WUNLOCK(inp); 3382 } 3383 return; 3384 3385 drop: 3386 /* 3387 * Drop space held by incoming segment and return. 3388 */ 3389 TCP_PROBE3(debug__input, tp, th, m); 3390 if (tp != NULL) { 3391 INP_WUNLOCK(inp); 3392 } 3393 m_freem(m); 3394 } 3395 3396 /* 3397 * Issue RST and make ACK acceptable to originator of segment. 3398 * The mbuf must still include the original packet header. 3399 * tp may be NULL. 
3400 */ 3401 void 3402 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen) 3403 { 3404 #ifdef INET 3405 struct ip *ip; 3406 #endif 3407 #ifdef INET6 3408 struct ip6_hdr *ip6; 3409 #endif 3410 3411 if (tp != NULL) { 3412 INP_LOCK_ASSERT(tptoinpcb(tp)); 3413 } 3414 3415 /* Don't bother if destination was broadcast/multicast. */ 3416 if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST)) 3417 goto drop; 3418 #ifdef INET6 3419 if (mtod(m, struct ip *)->ip_v == 6) { 3420 ip6 = mtod(m, struct ip6_hdr *); 3421 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || 3422 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) 3423 goto drop; 3424 /* IPv6 anycast check is done at tcp6_input() */ 3425 } 3426 #endif 3427 #if defined(INET) && defined(INET6) 3428 else 3429 #endif 3430 #ifdef INET 3431 { 3432 ip = mtod(m, struct ip *); 3433 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || 3434 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || 3435 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || 3436 in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 3437 goto drop; 3438 } 3439 #endif 3440 3441 /* Perform bandwidth limiting. */ 3442 if (badport_bandlim(BANDLIM_TCP_RST) < 0) 3443 goto drop; 3444 3445 /* tcp_respond consumes the mbuf chain. */ 3446 if (tcp_get_flags(th) & TH_ACK) { 3447 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, 3448 th->th_ack, TH_RST); 3449 } else { 3450 if (tcp_get_flags(th) & TH_SYN) 3451 tlen++; 3452 if (tcp_get_flags(th) & TH_FIN) 3453 tlen++; 3454 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, 3455 (tcp_seq)0, TH_RST|TH_ACK); 3456 } 3457 return; 3458 drop: 3459 m_freem(m); 3460 } 3461 3462 /* 3463 * Parse TCP options and place in tcpopt. 3464 */ 3465 void 3466 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) 3467 { 3468 int opt, optlen; 3469 3470 to->to_flags = 0; 3471 for (; cnt > 0; cnt -= optlen, cp += optlen) { 3472 opt = cp[0]; 3473 if (opt == TCPOPT_EOL) 3474 break; 3475 if (opt == TCPOPT_NOP) 3476 optlen = 1; 3477 else { 3478 if (cnt < 2) 3479 break; 3480 optlen = cp[1]; 3481 if (optlen < 2 || optlen > cnt) 3482 break; 3483 } 3484 switch (opt) { 3485 case TCPOPT_MAXSEG: 3486 if (optlen != TCPOLEN_MAXSEG) 3487 continue; 3488 if (!(flags & TO_SYN)) 3489 continue; 3490 to->to_flags |= TOF_MSS; 3491 bcopy((char *)cp + 2, 3492 (char *)&to->to_mss, sizeof(to->to_mss)); 3493 to->to_mss = ntohs(to->to_mss); 3494 break; 3495 case TCPOPT_WINDOW: 3496 if (optlen != TCPOLEN_WINDOW) 3497 continue; 3498 if (!(flags & TO_SYN)) 3499 continue; 3500 to->to_flags |= TOF_SCALE; 3501 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT); 3502 break; 3503 case TCPOPT_TIMESTAMP: 3504 if (optlen != TCPOLEN_TIMESTAMP) 3505 continue; 3506 to->to_flags |= TOF_TS; 3507 bcopy((char *)cp + 2, 3508 (char *)&to->to_tsval, sizeof(to->to_tsval)); 3509 to->to_tsval = ntohl(to->to_tsval); 3510 bcopy((char *)cp + 6, 3511 (char *)&to->to_tsecr, sizeof(to->to_tsecr)); 3512 to->to_tsecr = ntohl(to->to_tsecr); 3513 break; 3514 case TCPOPT_SIGNATURE: 3515 /* 3516 * In order to reply to a host which has set the 3517 * TCP_SIGNATURE option in its initial SYN, we have 3518 * to record the fact that the option was observed 3519 * here for the syncache code to perform the correct 3520 * response. 
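 * Only the presence and location of the option are recorded
 * here; verification of the signature itself is done
 * elsewhere.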
3521 */ 3522 if (optlen != TCPOLEN_SIGNATURE) 3523 continue; 3524 to->to_flags |= TOF_SIGNATURE; 3525 to->to_signature = cp + 2; 3526 break; 3527 case TCPOPT_SACK_PERMITTED: 3528 if (optlen != TCPOLEN_SACK_PERMITTED) 3529 continue; 3530 if (!(flags & TO_SYN)) 3531 continue; 3532 if (!V_tcp_do_sack) 3533 continue; 3534 to->to_flags |= TOF_SACKPERM; 3535 break; 3536 case TCPOPT_SACK: 3537 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) 3538 continue; 3539 if (flags & TO_SYN) 3540 continue; 3541 to->to_flags |= TOF_SACK; 3542 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; 3543 to->to_sacks = cp + 2; 3544 TCPSTAT_INC(tcps_sack_rcv_blocks); 3545 break; 3546 case TCPOPT_FAST_OPEN: 3547 /* 3548 * Cookie length validation is performed by the 3549 * server side cookie checking code or the client 3550 * side cookie cache update code. 3551 */ 3552 if (!(flags & TO_SYN)) 3553 continue; 3554 if (!V_tcp_fastopen_client_enable && 3555 !V_tcp_fastopen_server_enable) 3556 continue; 3557 to->to_flags |= TOF_FASTOPEN; 3558 to->to_tfo_len = optlen - 2; 3559 to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL; 3560 break; 3561 default: 3562 continue; 3563 } 3564 } 3565 } 3566 3567 /* 3568 * Pull out of band byte out of a segment so 3569 * it doesn't appear in the user's data queue. 3570 * It is still reflected in the segment length for 3571 * sequencing purposes. 3572 */ 3573 void 3574 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, 3575 int off) 3576 { 3577 int cnt = off + th->th_urp - 1; 3578 3579 while (cnt >= 0) { 3580 if (m->m_len > cnt) { 3581 char *cp = mtod(m, caddr_t) + cnt; 3582 struct tcpcb *tp = sototcpcb(so); 3583 3584 INP_WLOCK_ASSERT(tptoinpcb(tp)); 3585 3586 tp->t_iobc = *cp; 3587 tp->t_oobflags |= TCPOOB_HAVEDATA; 3588 bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1)); 3589 m->m_len--; 3590 if (m->m_flags & M_PKTHDR) 3591 m->m_pkthdr.len--; 3592 return; 3593 } 3594 cnt -= m->m_len; 3595 m = m->m_next; 3596 if (m == NULL) 3597 break; 3598 } 3599 panic("tcp_pulloutofband"); 3600 } 3601 3602 /* 3603 * Collect new round-trip time estimate 3604 * and update averages and current timeout. 3605 */ 3606 void 3607 tcp_xmit_timer(struct tcpcb *tp, int rtt) 3608 { 3609 int delta; 3610 3611 INP_WLOCK_ASSERT(tptoinpcb(tp)); 3612 3613 TCPSTAT_INC(tcps_rttupdated); 3614 if (tp->t_rttupdated < UCHAR_MAX) 3615 tp->t_rttupdated++; 3616 #ifdef STATS 3617 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, 3618 imax(0, rtt * 1000 / hz)); 3619 #endif 3620 if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { 3621 /* 3622 * srtt is stored as fixed point with 5 bits after the 3623 * binary point (i.e., scaled by 8). The following magic 3624 * is equivalent to the smoothing algorithm in rfc793 with 3625 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed 3626 * point). Adjust rtt to origin 0. 3627 */ 3628 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 3629 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 3630 3631 if ((tp->t_srtt += delta) <= 0) 3632 tp->t_srtt = 1; 3633 3634 /* 3635 * We accumulate a smoothed rtt variance (actually, a 3636 * smoothed mean difference), then set the retransmit 3637 * timer to smoothed rtt + 4 times the smoothed variance. 3638 * rttvar is stored as fixed point with 4 bits after the 3639 * binary point (scaled by 16). The following is 3640 * equivalent to rfc793 smoothing with an alpha of .75 3641 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces 3642 * rfc793's wired-in beta. 
3643 */ 3644 if (delta < 0) 3645 delta = -delta; 3646 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 3647 if ((tp->t_rttvar += delta) <= 0) 3648 tp->t_rttvar = 1; 3649 } else { 3650 /* 3651 * No rtt measurement yet - use the unsmoothed rtt. 3652 * Set the variance to half the rtt (so our first 3653 * retransmit happens at 3*rtt). 3654 */ 3655 tp->t_srtt = rtt << TCP_RTT_SHIFT; 3656 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 3657 } 3658 tp->t_rtttime = 0; 3659 tp->t_rxtshift = 0; 3660 3661 /* 3662 * the retransmit should happen at rtt + 4 * rttvar. 3663 * Because of the way we do the smoothing, srtt and rttvar 3664 * will each average +1/2 tick of bias. When we compute 3665 * the retransmit timer, we want 1/2 tick of rounding and 3666 * 1 extra tick because of +-1/2 tick uncertainty in the 3667 * firing of the timer. The bias will give us exactly the 3668 * 1.5 tick we need. But, because the bias is 3669 * statistical, we have to test that we don't drop below 3670 * the minimum feasible timer (which is 2 ticks). 3671 */ 3672 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 3673 max(tp->t_rttmin, rtt + 2), tcp_rexmit_max); 3674 3675 /* 3676 * We received an ack for a packet that wasn't retransmitted; 3677 * it is probably safe to discard any error indications we've 3678 * received recently. This isn't quite right, but close enough 3679 * for now (a route might have failed after we sent a segment, 3680 * and the return path might not be symmetrical). 3681 */ 3682 tp->t_softerror = 0; 3683 } 3684 3685 /* 3686 * Determine a reasonable value for maxseg size. 3687 * If the route is known, check route for mtu. 3688 * If none, use an mss that can be handled on the outgoing interface 3689 * without forcing IP to fragment. If no route is found, route has no mtu, 3690 * or the destination isn't local, use a default, hopefully conservative 3691 * size (usually 512 or the default IP max size, but no more than the mtu 3692 * of the interface), as we can't discover anything about intervening 3693 * gateways or networks. We also initialize the congestion/slow start 3694 * window to be a single segment if the destination isn't local. 3695 * While looking at the routing entry, we also initialize other path-dependent 3696 * parameters from pre-set or cached values in the routing entry. 3697 * 3698 * NOTE that resulting t_maxseg doesn't include space for TCP options or 3699 * IP options, e.g. IPSEC data, since length of this data may vary, and 3700 * thus it is calculated for every segment separately in tcp_output(). 3701 * 3702 * NOTE that this routine is only called when we process an incoming 3703 * segment, or an ICMP need fragmentation datagram. Outgoing SYN/ACK MSS 3704 * settings are handled in tcp_mssopt(). 3705 */ 3706 void 3707 tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer, 3708 struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap) 3709 { 3710 int mss = 0; 3711 uint32_t maxmtu = 0; 3712 struct inpcb *inp = tptoinpcb(tp); 3713 struct hc_metrics_lite metrics; 3714 #ifdef INET6 3715 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; 3716 size_t min_protoh = isipv6 ? 

void
tcp_mss(struct tcpcb *tp, int offer)
{
	int mss;
	uint32_t bufsize;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so;
	struct hc_metrics_lite metrics;
	struct tcp_ifcap cap;

	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));

	bzero(&cap, sizeof(cap));
	tcp_mss_update(tp, offer, -1, &metrics, &cap);

	mss = tp->t_maxseg;

	/*
	 * If there's a pipesize, change the socket buffer to that size;
	 * don't change it if sb_hiwat is different from the default (then
	 * it has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
	so = inp->inp_socket;
	SOCK_SENDBUF_LOCK(so);
	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.hc_sendpipe)
		bufsize = metrics.hc_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(so, SO_SND, bufsize, NULL);
	}
	SOCK_SENDBUF_UNLOCK(so);
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	tp->t_maxseg = max(mss, 64);
	if (tp->t_maxseg < V_tcp_mssdflt) {
		/*
		 * The MSS is so small we should not process incoming
		 * SACKs since we are subject to attack in such a
		 * case.
		 */
		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
	} else {
		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
	}

	SOCK_RECVBUF_LOCK(so);
	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.hc_recvpipe)
		bufsize = metrics.hc_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(so, SO_RCV, bufsize, NULL);
	}
	SOCK_RECVBUF_UNLOCK(so);

	/* Check the interface for TSO capabilities. */
	if (cap.ifcap & CSUM_TSO) {
		tp->t_flags |= TF_TSO;
		tp->t_tsomax = cap.tsomax;
		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
		if (cap.ipsec_tso)
			tp->t_flags2 |= TF2_IPSEC_TSO;
	}
}
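
/*
 * The socket-buffer sizing in tcp_mss() rounds the buffer up to a whole
 * number of segments.  A minimal sketch with assumed values, kept out of
 * the build: a 32768-byte buffer and a 1460-byte MSS round up to 23
 * segments, i.e. 33580 bytes, subject to an sb_max-style cap (the cap
 * value here is an assumption local to the example).
 */
#if 0
#include <stdio.h>

#define	EX_ROUNDUP(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

int
main(void)
{
	unsigned int bufsize = 32768, mss = 1460;
	unsigned int ex_sb_max = 2U * 1024 * 1024;	/* assumed cap */

	if (bufsize < mss)
		mss = bufsize;	/* buffer smaller than one segment */
	else {
		bufsize = EX_ROUNDUP(bufsize, mss);	/* 23 * 1460 */
		if (bufsize > ex_sb_max)
			bufsize = ex_sb_max;
	}
	printf("bufsize=%u mss=%u\n", bufsize, mss);	/* 33580 1460 */
	return (0);
}
#endif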

/*
 * Determine the MSS option to send on an outgoing SYN.
 */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;
	uint32_t thcmtu = 0;
	uint32_t maxmtu = 0;
	size_t min_protoh;

	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		mss = V_tcp_v6mssdflt;
		maxmtu = tcp_maxmtu6(inc, NULL);
		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		mss = V_tcp_mssdflt;
		maxmtu = tcp_maxmtu(inc, NULL);
		min_protoh = sizeof(struct tcpiphdr);
	}
#endif
#if defined(INET6) || defined(INET)
	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
#endif

	if (maxmtu && thcmtu)
		mss = min(maxmtu, thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		mss = max(maxmtu, thcmtu) - min_protoh;

	return (mss);
}
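
/*
 * The min()/max() pair in tcp_mssopt() selects the smaller of the two MTU
 * estimates when both are known, and whichever one is non-zero otherwise.
 * A minimal sketch with assumed IPv4 numbers, kept out of the build: a
 * 9000-byte interface MTU and a 1500-byte hostcache MTU yield a 1460-byte
 * MSS option.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned int maxmtu = 9000;	/* interface/route MTU */
	unsigned int thcmtu = 1500;	/* hostcache-discovered MTU */
	unsigned int min_protoh = 40;	/* IPv4 + TCP headers */
	unsigned int mss = 536;		/* assumed fallback default */

	if (maxmtu && thcmtu)
		mss = (maxmtu < thcmtu ? maxmtu : thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		mss = (maxmtu > thcmtu ? maxmtu : thcmtu) - min_protoh;
	printf("%u\n", mss);		/* prints 1460 */
	return (0);
}
#endif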

void
tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
    sackstatus_t sack_changed, u_int *maxsegp)
{
	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
	u_int maxseg;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (*maxsegp == 0) {
		*maxsegp = tcp_maxseg(tp);
	}
	maxseg = *maxsegp;
	/*
	 * Compute the amount of data that this ACK is indicating
	 * (del_data) and an estimate of how many bytes are in the
	 * network.
	 */
	if (tcp_is_sack_recovery(tp, to) ||
	    (IN_CONGRECOVERY(tp->t_flags) &&
	    !IN_FASTRECOVERY(tp->t_flags))) {
		del_data = tp->sackhint.delivered_data;
		pipe = tcp_compute_pipe(tp);
	} else {
		if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg +
		    tp->snd_recover - tp->snd_una)) {
			del_data = maxseg;
		}
		pipe = imax(0, tp->snd_max - tp->snd_una -
		    imin(INT_MAX / 65536, tp->t_dupacks) * maxseg);
	}
	tp->sackhint.prr_delivered += del_data;
	/*
	 * Proportional Rate Reduction
	 */
	if (pipe >= tp->snd_ssthresh) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs =
			    imax(1, tp->snd_nxt - tp->snd_una);
		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
		    tp->snd_ssthresh, tp->sackhint.recover_fs) -
		    tp->sackhint.prr_out + maxseg - 1;
	} else {
		/*
		 * PRR 6937bis heuristic:
		 * - A partial ack without a SACK block beneath snd_recover
		 *   indicates further loss.
		 * - A SACK scoreboard update adding a new hole indicates
		 *   further loss, so be conservative and send at most one
		 *   segment.
		 * - Prevent ACK splitting attacks by being conservative
		 *   when no new data is acked.
		 */
		if ((sack_changed == SACK_NEWLOSS) || (del_data == 0)) {
			limit = tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out;
		} else {
			limit = imax(tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out, del_data) +
			    maxseg;
		}
		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
	}
	snd_cnt = imax(snd_cnt, 0) / maxseg;
	/*
	 * Send snd_cnt new data into the network in response to this ack.
	 * If there is going to be a SACK retransmission, adjust snd_cwnd
	 * accordingly.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tcp_is_sack_recovery(tp, to)) {
			tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
		} else {
			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
			    (snd_cnt * maxseg);
		}
	} else if (IN_CONGRECOVERY(tp->t_flags)) {
		tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
	}
	tp->snd_cwnd = imax(maxseg, tp->snd_cwnd);
}
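
/*
 * A worked example of the proportional branch of PRR above, as a minimal
 * sketch kept out of the build, with assumed values: maxseg = 1000 bytes,
 * a flight of 20000 bytes recorded in recover_fs at the start of
 * recovery, ssthresh = 10000 bytes, and 2000 bytes newly delivered with
 * nothing yet sent in recovery.  One segment is sent for every two
 * delivered, pacing cwnd down to ssthresh over about one round trip.
 * Names are local to the example.
 */
#if 0
#include <stdio.h>

#define	EX_HOWMANY(x, y) (((x) + ((y) - 1)) / (y))

int
main(void)
{
	long maxseg = 1000, ssthresh = 10000, recover_fs = 20000;
	long prr_delivered = 2000, prr_out = 0;
	long snd_cnt;

	snd_cnt = EX_HOWMANY(prr_delivered * ssthresh, recover_fs) -
	    prr_out + maxseg - 1;		/* 1000 - 0 + 999 */
	if (snd_cnt < 0)
		snd_cnt = 0;
	snd_cnt /= maxseg;			/* -> 1 segment */
	printf("send %ld segment(s)\n", snd_cnt);
	return (0);
}
#endif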

/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces the retransmission timer
 * to be started again.
 */
void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
	tcp_seq onxt = tp->snd_nxt;
	uint32_t ocwnd = tp->snd_cwnd;
	u_int maxseg = tcp_maxseg(tp);

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	if (IN_FASTRECOVERY(tp->t_flags)) {
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond the acknowledged
		 * offset.  (tp->snd_una has not yet been updated when
		 * this function is called.)
		 */
		tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
		tp->t_flags |= TF_ACKNOW;
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
	}
	/*
	 * Partial window deflation.  Relies on the fact that
	 * tp->snd_una has not been updated yet.
	 */
	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += maxseg;
}

int
tcp_compute_pipe(struct tcpcb *tp)
{
	int pipe;

	if (tp->t_fb->tfb_compute_pipe != NULL) {
		pipe = (*tp->t_fb->tfb_compute_pipe)(tp);
	} else {
		pipe = tp->snd_max - tp->snd_una +
		    tp->sackhint.sack_bytes_rexmit -
		    tp->sackhint.sacked_bytes -
		    tp->sackhint.lost_bytes;
	}
	return (imax(pipe, 0));
}

uint32_t
tcp_compute_initwnd(uint32_t maxseg)
{
	/*
	 * Calculate the Initial Window, also used as the Restart Window.
	 *
	 * RFC5681 Section 3.1 specifies the default conservative values.
	 * RFC3390 specifies slightly more aggressive values.
	 * RFC6928 increases it to ten segments.
	 * A user-specified value for the initial flight size is also
	 * supported.
	 */
	if (V_tcp_initcwnd_segments)
		return (min(V_tcp_initcwnd_segments * maxseg,
		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460)));
	else if (V_tcp_do_rfc3390)
		return (min(4 * maxseg, max(2 * maxseg, 4380)));
	else {
		/* Per RFC5681 Section 3.1 */
		if (maxseg > 2190)
			return (2 * maxseg);
		else if (maxseg > 1095)
			return (3 * maxseg);
		else
			return (4 * maxseg);
	}
}
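
/*
 * Worked numbers for tcp_compute_initwnd() with a 1460-byte MSS, as a
 * minimal stand-alone sketch kept out of the build that mirrors the three
 * policies above: ten segments yields 14600 bytes, RFC3390 yields
 * min(5840, max(2920, 4380)) = 4380 bytes, and the RFC5681 table yields
 * 3 * 1460 = 4380 bytes.  Names are local to the example.
 */
#if 0
#include <stdio.h>

static unsigned int
ex_min(unsigned int a, unsigned int b) { return (a < b ? a : b); }
static unsigned int
ex_max(unsigned int a, unsigned int b) { return (a > b ? a : b); }

static unsigned int
ex_initwnd(unsigned int maxseg, unsigned int initcwnd_segments,
    int do_rfc3390)
{
	if (initcwnd_segments)
		return (ex_min(initcwnd_segments * maxseg,
		    ex_max(2 * maxseg, initcwnd_segments * 1460)));
	else if (do_rfc3390)
		return (ex_min(4 * maxseg, ex_max(2 * maxseg, 4380)));
	else if (maxseg > 2190)		/* per RFC5681 Section 3.1 */
		return (2 * maxseg);
	else if (maxseg > 1095)
		return (3 * maxseg);
	else
		return (4 * maxseg);
}

int
main(void)
{
	printf("%u %u %u\n", ex_initwnd(1460, 10, 0),
	    ex_initwnd(1460, 0, 1), ex_initwnd(1460, 0, 0));
	/* prints: 14600 4380 4380 */
	return (0);
}
#endif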