/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2007-2008,2010
 *	Swinburne University of Technology, Melbourne, Australia.
 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
 * Copyright (c) 2010 The FreeBSD Foundation
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Portions of this software were developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
 * James Healy and David Hayes, made possible in part by a grant from the Cisco
 * University Research Program Fund at Community Foundation Silicon Valley.
 *
 * Portions of this software were developed at the Centre for Advanced
 * Internet Architectures, Swinburne University of Technology, Melbourne,
 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/arb.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/qmath.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/stats.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_rss.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_syncache.h>
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <netinet/tcp_ecn.h>
#include <netinet/udp.h>

#include <netipsec/ipsec_support.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

const int tcprexmtthresh = 3;

VNET_DEFINE(int, tcp_log_in_vain) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_log_in_vain), 0,
    "Log all incoming TCP segments to closed ports");

VNET_DEFINE(int, tcp_bind_all_fibs) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, bind_all_fibs, CTLFLAG_VNET | CTLFLAG_RDTUN,
    &VNET_NAME(tcp_bind_all_fibs), 0,
    "Bound sockets receive traffic from all FIBs");

VNET_DEFINE(int, blackhole) = 0;
#define	V_blackhole		VNET(blackhole)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(blackhole), 0,
    "Do not send RST on segments to closed ports");

VNET_DEFINE(bool, blackhole_local) = false;
#define	V_blackhole_local	VNET(blackhole_local)
SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
    CTLFLAG_RW, &VNET_NAME(blackhole_local), false,
    "Enforce net.inet.tcp.blackhole for locally originated packets");

VNET_DEFINE(int, tcp_delack_enabled) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_delack_enabled), 0,
    "Delay ACK to try and piggyback it onto a data packet");

VNET_DEFINE(int, drop_synfin) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(drop_synfin), 0,
    "Drop TCP packets with SYN+FIN set");

VNET_DEFINE(int, tcp_do_prr) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_prr), 1,
    "Enable Proportional Rate Reduction per RFC 6937");

VNET_DEFINE(int, tcp_do_newcwv) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newcwv, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_newcwv), 0,
    "Enable New Congestion Window Validation per RFC7661");

VNET_DEFINE(int, tcp_do_rfc3042) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3042), 0,
    "Enable RFC 3042 (Limited Transmit)");

VNET_DEFINE(int, tcp_do_rfc3390) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3390), 0,
    "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");

VNET_DEFINE(int, tcp_initcwnd_segments) = 10;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, initcwnd_segments,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_initcwnd_segments), 0,
    "Slow-start flight size (initial congestion window) in number of segments");

VNET_DEFINE(int, tcp_do_rfc3465) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_rfc3465), 0,
    "Enable RFC 3465 (Appropriate Byte Counting)");

VNET_DEFINE(int, tcp_abc_l_var) = 2;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_abc_l_var), 2,
    "Cap the max cwnd increment during slow-start to this number of segments");

VNET_DEFINE(int, tcp_insecure_syn) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_syn), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting SYN packets");

VNET_DEFINE(int, tcp_insecure_rst) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_rst), 0,
    "Follow RFC793 instead of RFC5961 criteria for accepting RST packets");

VNET_DEFINE(int, tcp_insecure_ack) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_ack, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_insecure_ack), 0,
    "Follow RFC793 criteria for validating SEG.ACK");

VNET_DEFINE(int, tcp_recvspace) = 1024*64;
#define	V_tcp_recvspace	VNET(tcp_recvspace)
SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_recvspace), 0, "Initial receive socket buffer size");

VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_autorcvbuf), 0,
    "Enable automatic receive buffer sizing");

VNET_DEFINE(int, tcp_autorcvbuf_max) = 8*1024*1024;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_autorcvbuf_max), 0,
    "Max size of automatic receive buffer");

VNET_DEFINE(struct inpcbinfo, tcbinfo);
/*
 * TCP statistics are stored in an array of counter(9)s, whose size matches
 * the size of struct tcpstat.  The running TCP connection count is a
 * regular array.
 */
VNET_PCPUSTAT_DEFINE(struct tcpstat, tcpstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_tcp, TCPCTL_STATS, stats, struct tcpstat,
    tcpstat, "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
VNET_DEFINE(counter_u64_t, tcps_states[TCP_NSTATES]);
SYSCTL_COUNTER_U64_ARRAY(_net_inet_tcp, TCPCTL_STATES, states, CTLFLAG_RD |
    CTLFLAG_VNET, &VNET_NAME(tcps_states)[0], TCP_NSTATES,
    "TCP connection counts by TCP state");

/*
 * Kernel module interface for updating tcpstat.  The first argument is an
 * index into tcpstat treated as an array.
 */
void
kmod_tcpstat_add(int statnum, int val)
{

	counter_u64_add(VNET(tcpstat)[statnum], val);
}
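
/*
 * Illustrative sketch (not part of this file): callers normally reach
 * kmod_tcpstat_add() through wrapper macros in netinet/tcp_var.h that
 * translate a struct tcpstat field name into an array index, roughly:
 *
 *	#define KMOD_TCPSTAT_INC(name) \
 *	    kmod_tcpstat_add(offsetof(struct tcpstat, name) / \
 *		sizeof(uint64_t), 1)
 *
 * The exact macro definition lives in tcp_var.h; the form above is an
 * approximation for illustration only.
 */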

/*
 * Make sure that we only start a SACK loss recovery when
 * receiving a duplicate ACK with a SACK block, and also
 * complete SACK loss recovery in case the other end
 * reneges.
 */
static bool inline
tcp_is_sack_recovery(struct tcpcb *tp, struct tcpopt *to)
{
	return ((tp->t_flags & TF_SACK_PERMIT) &&
	    ((to->to_flags & TOF_SACK) ||
	    (!TAILQ_EMPTY(&tp->snd_holes))));
}

#ifdef TCP_HHOOK
/*
 * Wrapper for the TCP established input helper hook.
 */
void
hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
{
	struct tcp_hhook_data hhook_data;

	if (V_tcp_hhh[HHOOK_TCP_EST_IN]->hhh_nhooks > 0) {
		hhook_data.tp = tp;
		hhook_data.th = th;
		hhook_data.to = to;

		hhook_run_hooks(V_tcp_hhh[HHOOK_TCP_EST_IN], &hhook_data,
		    &tp->t_osd);
	}
}
#endif

/*
 * CC wrapper hook functions
 */
void
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
    uint16_t type)
{
#ifdef STATS
	int32_t gput;
#endif

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tp->t_ccv.nsegs = nsegs;
	tp->t_ccv.bytes_this_ack = BYTES_THIS_ACK(tp, th);
	if ((!V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd)) ||
	    (V_tcp_do_newcwv && (tp->snd_cwnd <= tp->snd_wnd) &&
	    (tp->snd_cwnd < (tcp_compute_pipe(tp) * 2))))
		tp->t_ccv.flags |= CCF_CWND_LIMITED;
	else
		tp->t_ccv.flags &= ~CCF_CWND_LIMITED;

	if (type == CC_ACK) {
#ifdef STATS
		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
		if (!IN_RECOVERY(tp->t_flags))
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
			    tp->t_ccv.bytes_this_ack / (tcp_maxseg(tp) * nsegs));
		if ((tp->t_flags & TF_GPUTINPROG) &&
		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
			/*
			 * Compute goodput in bits per millisecond.
			 */
			gput = (((int64_t)SEQ_SUB(th->th_ack, tp->gput_seq)) << 3) /
			    max(1, tcp_ts_getticks() - tp->gput_ts);
			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
			    gput);
			/*
			 * XXXLAS: This is a temporary hack, and should be
			 * chained off VOI_TCP_GPUT when stats(9) grows an API
			 * to deal with chained VOIs.
			 */
			if (tp->t_stats_gput_prev > 0)
				stats_voi_update_abs_s32(tp->t_stats,
				    VOI_TCP_GPUT_ND,
				    ((gput - tp->t_stats_gput_prev) * 100) /
				    tp->t_stats_gput_prev);
			tp->t_flags &= ~TF_GPUTINPROG;
			tp->t_stats_gput_prev = gput;
		}
#endif /* STATS */
		if (tp->snd_cwnd > tp->snd_ssthresh) {
			tp->t_bytes_acked += tp->t_ccv.bytes_this_ack;
			if (tp->t_bytes_acked >= tp->snd_cwnd) {
				tp->t_bytes_acked -= tp->snd_cwnd;
				tp->t_ccv.flags |= CCF_ABC_SENTAWND;
			}
		} else {
			tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
			tp->t_bytes_acked = 0;
		}
	}

	if (CC_ALGO(tp)->ack_received != NULL) {
		/* XXXLAS: Find a way to live without this */
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
	}
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
#endif
}
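
/*
 * Worked example for the goodput computation above (illustrative numbers,
 * not taken from this file): if 125,000 bytes are newly acked over a 10 ms
 * measurement window, gput = (125000 << 3) / 10 = 100,000 bits/ms, i.e.
 * roughly 100 Mbit/s.  tcp_ts_getticks() supplies the millisecond
 * timestamps used for the divisor.
 */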

void
cc_conn_init(struct tcpcb *tp)
{
	struct hc_metrics_lite metrics;
	struct inpcb *inp = tptoinpcb(tp);
	u_int maxseg;
	int rtt;

	INP_WLOCK_ASSERT(inp);

	tcp_hc_get(&inp->inp_inc, &metrics);
	maxseg = tcp_maxseg(tp);

	if (tp->t_srtt == 0 && (rtt = metrics.hc_rtt)) {
		tp->t_srtt = rtt;
		TCPSTAT_INC(tcps_usedrtt);
		if (metrics.hc_rttvar) {
			tp->t_rttvar = metrics.hc_rttvar;
			TCPSTAT_INC(tcps_usedrttvar);
		} else {
			/* default variation is +- 1 rtt */
			tp->t_rttvar =
			    tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
		}
		TCPT_RANGESET(tp->t_rxtcur,
		    ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
		    tp->t_rttmin, tcp_rexmit_max);
	}
	if (metrics.hc_ssthresh) {
		/*
		 * There's some sort of gateway or interface
		 * buffer limit on the path.  Use this to set
		 * the slow start threshold, but set the
		 * threshold to no less than 2*mss.
		 */
		tp->snd_ssthresh = max(2 * maxseg, metrics.hc_ssthresh);
		TCPSTAT_INC(tcps_usedssthresh);
	}

	/*
	 * Set the initial slow-start flight size.
	 *
	 * If a SYN or SYN/ACK was lost and retransmitted, we have to
	 * reduce the initial CWND to one segment as congestion is likely
	 * requiring us to be cautious.
	 */
	if (tp->snd_cwnd == 1)
		tp->snd_cwnd = maxseg;		/* SYN(-ACK) lost */
	else
		tp->snd_cwnd = tcp_compute_initwnd(maxseg);

	if (CC_ALGO(tp)->conn_init != NULL)
		CC_ALGO(tp)->conn_init(&tp->t_ccv);
}
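
/*
 * Illustrative example (assumed defaults, not asserted by this file): with
 * net.inet.tcp.initcwnd_segments = 10 and a 1460-byte maxseg,
 * tcp_compute_initwnd() yields an initial cwnd on the order of
 * 10 * 1460 = 14600 bytes, while a retransmitted SYN collapses the initial
 * window to a single segment as handled above.
 */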

void inline
cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
#endif

	switch(type) {
	case CC_NDUPACK:
		if (!IN_FASTRECOVERY(tp->t_flags)) {
			tp->snd_recover = tp->snd_max;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_ECN:
		if (!IN_CONGRECOVERY(tp->t_flags) ||
		    /*
		     * Allow ECN reaction on ACK to CWR, if
		     * that data segment was also CE marked.
		     */
		    SEQ_GEQ(th->th_ack, tp->snd_recover)) {
			EXIT_CONGRECOVERY(tp->t_flags);
			TCPSTAT_INC(tcps_ecn_rcwnd);
			tp->snd_recover = tp->snd_max + 1;
			if (tp->t_flags2 & TF2_ECN_PERMIT)
				tp->t_flags2 |= TF2_ECN_SND_CWR;
		}
		break;
	case CC_RTO:
		tp->t_dupacks = 0;
		tp->t_bytes_acked = 0;
		EXIT_RECOVERY(tp->t_flags);
		if (tp->t_flags2 & TF2_ECN_PERMIT)
			tp->t_flags2 |= TF2_ECN_SND_CWR;
		break;
	case CC_RTO_ERR:
		TCPSTAT_INC(tcps_sndrexmitbad);
		/* RTO was unnecessary, so reset everything. */
		tp->snd_cwnd = tp->snd_cwnd_prev;
		tp->snd_ssthresh = tp->snd_ssthresh_prev;
		tp->snd_recover = tp->snd_recover_prev;
		if (tp->t_flags & TF_WASFRECOVERY)
			ENTER_FASTRECOVERY(tp->t_flags);
		if (tp->t_flags & TF_WASCRECOVERY)
			ENTER_CONGRECOVERY(tp->t_flags);
		tp->snd_nxt = tp->snd_max;
		tp->t_flags &= ~TF_PREVVALID;
		tp->t_rxtshift = 0;
		tp->t_badrxtwin = 0;
		break;
	}
	if (SEQ_LT(tp->snd_fack, tp->snd_una) ||
	    SEQ_GT(tp->snd_fack, tp->snd_max)) {
		tp->snd_fack = tp->snd_una;
	}

	if (CC_ALGO(tp)->cong_signal != NULL) {
		if (th != NULL)
			tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
	}
}

void inline
cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->post_recovery != NULL) {
		if (SEQ_LT(tp->snd_fack, th->th_ack) ||
		    SEQ_GT(tp->snd_fack, tp->snd_max)) {
			tp->snd_fack = th->th_ack;
		}
		tp->t_ccv.curack = th->th_ack;
		CC_ALGO(tp)->post_recovery(&tp->t_ccv);
	}
	EXIT_RECOVERY(tp->t_flags);

	tp->t_bytes_acked = 0;
	tp->sackhint.delivered_data = 0;
	tp->sackhint.prr_delivered = 0;
	tp->sackhint.prr_out = 0;
}

/*
 * Indicate whether this ack should be delayed.  We can delay the ack if
 * the following conditions are met:
 *	- There is no delayed ack timer in progress.
 *	- Our last ack wasn't a 0-sized window.  We never want to delay
 *	  the ack that opens up a 0-sized window.
 *	- LRO wasn't used for this segment.  We make sure by checking that
 *	  the segment size is not larger than the MSS.
 */
#define DELAY_ACK(tp, tlen)						\
	((!tcp_timer_active(tp, TT_DELACK) &&				\
	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
	    (tlen <= tp->t_maxseg) &&					\
	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
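
/*
 * Usage sketch (condensed from the established-input path later in this
 * file): the predicate only decides; the caller sets the flags, e.g.
 *
 *	if (DELAY_ACK(tp, tlen))
 *		tp->t_flags |= TF_DELACK;
 *	else
 *		tp->t_flags |= TF_ACKNOW;
 *
 * TF_DELACK lets the delayed-ACK timer coalesce the ACK with later data.
 */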

void inline
cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos)
{
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (CC_ALGO(tp)->ecnpkt_handler != NULL) {
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_ccv.flags |= CCF_IPHDR_CE;
			break;
		case IPTOS_ECN_ECT0:
			/* FALLTHROUGH */
		case IPTOS_ECN_ECT1:
			/* FALLTHROUGH */
		case IPTOS_ECN_NOTECT:
			tp->t_ccv.flags &= ~CCF_IPHDR_CE;
			break;
		}

		if (flags & TH_CWR)
			tp->t_ccv.flags |= CCF_TCPHDR_CWR;
		else
			tp->t_ccv.flags &= ~CCF_TCPHDR_CWR;

		CC_ALGO(tp)->ecnpkt_handler(&tp->t_ccv);

		if (tp->t_ccv.flags & CCF_ACKNOW) {
			tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
			tp->t_flags |= TF_ACKNOW;
		}
	}
}

void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
{
	cc_ecnpkt_handler_flags(tp, tcp_get_flags(th), iptos);
}

/*
 * TCP input handling is split into multiple parts:
 *   tcp6_input is a thin wrapper around tcp_input for the extended
 *	ip6_protox[] call format in ip6_input
 *   tcp_input handles primary segment validation, inpcb lookup and
 *	SYN processing on listen sockets
 *   tcp_do_segment processes the ACK and text of the segment for
 *	establishing, established and closing connections
 */
#ifdef INET6
int
tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m;

	m = *mp;
	if (m->m_len < *offp + sizeof(struct tcphdr)) {
		m = m_pullup(m, *offp + sizeof(struct tcphdr));
		if (m == NULL) {
			*mp = m;
			TCPSTAT_INC(tcps_rcvshort);
			return (IPPROTO_DONE);
		}
	}

	*mp = m;
	return (tcp_input_with_port(mp, offp, proto, port));
}

int
tcp6_input(struct mbuf **mp, int *offp, int proto)
{

	return(tcp6_input_with_port(mp, offp, proto, 0));
}
#endif /* INET6 */

int
tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
	struct mbuf *m = *mp;
	struct tcphdr *th = NULL;
	struct ip *ip = NULL;
	struct inpcb *inp = NULL;
	struct tcpcb *tp = NULL;
	struct socket *so = NULL;
	u_char *optp = NULL;
	int off0;
	int optlen = 0;
#ifdef INET
	int len;
	uint8_t ipttl;
#endif
	int tlen = 0, off;
	int drop_hdrlen;
	int thflags;
	int lookupflag;
	uint8_t iptos;
	struct m_tag *fwd_tag = NULL;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6;
#else
	const void *ip6 = NULL;
#endif /* INET6 */
	struct tcpopt to;		/* options in this segment */
	char *s = NULL;			/* address and port logging */
	bool closed_port = false;	/* segment is hitting a closed port */

	NET_EPOCH_ASSERT();

#ifdef INET6
	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif

	off0 = *offp;
	m = *mp;
	*mp = NULL;
	to.to_flags = 0;
	TCPSTAT_INC(tcps_rcvtotal);

	m->m_pkthdr.tcp_tun_port = port;
#ifdef INET6
	if (isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)((caddr_t)ip6 + off0);
		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
		if (port)
			goto skip6_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in6_cksum_pseudo(ip6, tlen,
				    IPPROTO_TCP, m->m_pkthdr.csum_data);
			th->th_sum ^= 0xffff;
		} else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) {
			/*
			 * Packet from local host (maybe from a VM).
			 * Checksum not required.
			 */
			th->th_sum = 0;
		} else
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen);
		if (th->th_sum) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
	skip6_csum:
		/*
		 * Be proactive about unspecified IPv6 addresses in the
		 * source.  As we use all-zero to indicate an unbound or
		 * unconnected pcb, an unspecified IPv6 address can be used
		 * to confuse us.
		 *
		 * Note that packets with an unspecified IPv6 destination
		 * are already dropped in ip6_input.
		 */
		KASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst),
		    ("%s: unspecified destination v6 address", __func__));
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
			IP6STAT_INC(ip6s_badscope); /* XXX */
			goto drop;
		}
		iptos = IPV6_TRAFFIC_CLASS(ip6);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		/*
		 * Get IP and TCP header together in first mbuf.
		 * Note: IP leaves IP header in first mbuf.
		 */
		if (off0 > sizeof (struct ip)) {
			ip_stripoptions(m);
			off0 = sizeof(struct ip);
		}
		if (m->m_len < sizeof (struct tcpiphdr)) {
			if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
			    == NULL) {
				TCPSTAT_INC(tcps_rcvshort);
				return (IPPROTO_DONE);
			}
		}
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)((caddr_t)ip + off0);
		tlen = ntohs(ip->ip_len) - off0;

		iptos = ip->ip_tos;
		if (port)
			goto skip_csum;
		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
				th->th_sum = m->m_pkthdr.csum_data;
			else
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    htonl(m->m_pkthdr.csum_data + tlen +
				    IPPROTO_TCP));
			th->th_sum ^= 0xffff;
		} else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) {
			/*
			 * Packet from local host (maybe from a VM).
			 * Checksum not required.
			 */
			th->th_sum = 0;
		} else {
			struct ipovly *ipov = (struct ipovly *)ip;

			/*
			 * Checksum extended TCP header and data.
			 */
			len = off0 + tlen;
			ipttl = ip->ip_ttl;
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = htons(tlen);
			th->th_sum = in_cksum(m, len);
			/* Reset length for SDT probes. */
			ip->ip_len = htons(len);
			/* Reset TOS bits */
			ip->ip_tos = iptos;
			/* Re-initialization for later version check */
			ip->ip_ttl = ipttl;
			ip->ip_v = IPVERSION;
			ip->ip_hl = off0 >> 2;
		}
	skip_csum:
		if (th->th_sum && (port == 0)) {
			TCPSTAT_INC(tcps_rcvbadsum);
			goto drop;
		}
		KASSERT(ip->ip_dst.s_addr != INADDR_ANY,
		    ("%s: unspecified destination v4 address", __func__));
		if (__predict_false(ip->ip_src.s_addr == INADDR_ANY)) {
			IPSTAT_INC(ips_badaddr);
			goto drop;
		}
	}
#endif /* INET */

	/*
	 * Check that TCP offset makes sense,
	 * pull out TCP options and adjust length.		XXX
	 */
	off = th->th_off << 2;
	if (off < sizeof (struct tcphdr) || off > tlen) {
		TCPSTAT_INC(tcps_rcvbadoff);
		goto drop;
	}
	tlen -= off;	/* tlen is used instead of ti->ti_len */
	if (off > sizeof (struct tcphdr)) {
#ifdef INET6
		if (isipv6) {
			if (m->m_len < off0 + off) {
				m = m_pullup(m, off0 + off);
				if (m == NULL) {
					TCPSTAT_INC(tcps_rcvshort);
					return (IPPROTO_DONE);
				}
			}
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)((caddr_t)ip6 + off0);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			if (m->m_len < sizeof(struct ip) + off) {
				if ((m = m_pullup(m, sizeof (struct ip) + off))
				    == NULL) {
					TCPSTAT_INC(tcps_rcvshort);
					return (IPPROTO_DONE);
				}
				ip = mtod(m, struct ip *);
				th = (struct tcphdr *)((caddr_t)ip + off0);
			}
		}
#endif
		optlen = off - sizeof (struct tcphdr);
		optp = (u_char *)(th + 1);
	}
	thflags = tcp_get_flags(th);

	/*
	 * Convert TCP protocol specific fields to host format.
	 */
	tcp_fields_to_host(th);

	/*
	 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
	 */
	drop_hdrlen = off0 + off;

	/*
	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
	 */
	if (
#ifdef INET6
	    (isipv6 && (m->m_flags & M_IP6_NEXTHOP))
#ifdef INET
	    || (!isipv6 && (m->m_flags & M_IP_NEXTHOP))
#endif
#endif
#if defined(INET) && !defined(INET6)
	    (m->m_flags & M_IP_NEXTHOP)
#endif
	    )
		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);

	/*
	 * For initial SYN packets we don't need write lock on matching
	 * PCB, be it a listening one or a synchronized one.  The packet
	 * shall not modify its state.
	 */
	lookupflag = INPLOOKUP_WILDCARD |
	    ((thflags & (TH_ACK|TH_SYN)) == TH_SYN ?
	    INPLOOKUP_RLOCKPCB : INPLOOKUP_WLOCKPCB) |
	    (V_tcp_bind_all_fibs ? 0 : INPLOOKUP_FIB);
findpcb:
	tp = NULL;
#ifdef INET6
	if (isipv6 && fwd_tag != NULL) {
		struct sockaddr_in6 *next_hop6;

		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
		/*
		 * Transparently forwarded.  Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in6_pcblookup_mbuf(&V_tcbinfo,
		    &ip6->ip6_src, th->th_sport, &ip6->ip6_dst, th->th_dport,
		    lookupflag & ~INPLOOKUP_WILDCARD, m->m_pkthdr.rcvif, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_src,
			    th->th_sport, &next_hop6->sin6_addr,
			    next_hop6->sin6_port ? ntohs(next_hop6->sin6_port) :
			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
		}
	} else if (isipv6) {
		inp = in6_pcblookup_mbuf(&V_tcbinfo, &ip6->ip6_src,
		    th->th_sport, &ip6->ip6_dst, th->th_dport, lookupflag,
		    m->m_pkthdr.rcvif, m);
	}
#endif /* INET6 */
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	if (fwd_tag != NULL) {
		struct sockaddr_in *next_hop;

		next_hop = (struct sockaddr_in *)(fwd_tag+1);
		/*
		 * Transparently forwarded.  Pretend to be the destination.
		 * Already got one like this?
		 */
		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src, th->th_sport,
		    ip->ip_dst, th->th_dport, lookupflag & ~INPLOOKUP_WILDCARD,
		    m->m_pkthdr.rcvif, m);
		if (!inp) {
			/*
			 * It's new.  Try to find the ambushing socket.
			 * Because we've rewritten the destination address,
			 * any hardware-generated hash is ignored.
			 */
			inp = in_pcblookup(&V_tcbinfo, ip->ip_src,
			    th->th_sport, next_hop->sin_addr,
			    next_hop->sin_port ? ntohs(next_hop->sin_port) :
			    th->th_dport, lookupflag, m->m_pkthdr.rcvif);
		}
	} else
		inp = in_pcblookup_mbuf(&V_tcbinfo, ip->ip_src,
		    th->th_sport, ip->ip_dst, th->th_dport, lookupflag,
		    m->m_pkthdr.rcvif, m);
#endif /* INET */

	/*
	 * If the INPCB does not exist then all data in the incoming
	 * segment is discarded and an appropriate RST is sent back.
	 * XXX MRT Send RST using which routing table?
	 */
	if (inp == NULL) {
		if ((lookupflag & INPLOOKUP_WILDCARD) == 0) {
			/* We came here after second (safety) lookup. */
			MPASS(!closed_port);
		} else {
			/*
			 * Log communication attempts to ports that are not
			 * in use.
			 */
			if (((V_tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
			    V_tcp_log_in_vain == 2) &&
			    (s = tcp_log_vain(NULL, th, (void *)ip, ip6))) {
				log(LOG_INFO, "%s; %s: Connection attempt "
				    "to closed port\n", s, __func__);
			}
			closed_port = true;
		}
		goto dropwithreset;
	}
	INP_LOCK_ASSERT(inp);

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#ifdef INET6
	if (isipv6 && IPSEC_ENABLED(ipv6) &&
	    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
		goto dropunlock;
	}
#ifdef INET
	else
#endif
#endif /* INET6 */
#ifdef INET
	if (IPSEC_ENABLED(ipv4) &&
	    IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) {
		goto dropunlock;
	}
#endif /* INET */
#endif /* IPSEC */

	/*
	 * Check the minimum TTL for socket.
	 */
	if (inp->inp_ip_minttl != 0) {
#ifdef INET6
		if (isipv6) {
			if (inp->inp_ip_minttl > ip6->ip6_hlim)
				goto dropunlock;
		} else
#endif
		if (inp->inp_ip_minttl > ip->ip_ttl)
			goto dropunlock;
	}

	tp = intotcpcb(inp);
	switch (tp->t_state) {
	case TCPS_TIME_WAIT:
		/*
		 * A previous connection in TIMEWAIT state is supposed to catch
		 * stray or duplicate segments arriving late.  If this segment
		 * was a legitimate new connection attempt, the old INPCB gets
		 * removed and we can try again to find a listening socket.
		 */
		tcp_dooptions(&to, optp, optlen,
		    (thflags & TH_SYN) ? TO_SYN : 0);
		/*
		 * tcp_twcheck unlocks the inp always, and frees the m if fails.
		 */
		if (tcp_twcheck(inp, &to, th, m, tlen))
			goto findpcb;
		return (IPPROTO_DONE);
	case TCPS_CLOSED:
		/*
		 * The TCPCB may no longer exist if the connection is winding
		 * down or it is in the CLOSED state.  Either way we drop the
		 * segment and send an appropriate response.
		 */
		closed_port = true;
		goto dropwithreset;
	}

	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
		closed_port = true;
		goto dropwithreset;
	}

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		tcp_offload_input(tp, m);
		m = NULL;	/* consumed by the TOE driver */
		goto dropunlock;
	}
#endif

#ifdef MAC
	if (mac_inpcb_check_deliver(inp, m))
		goto dropunlock;
#endif
	so = inp->inp_socket;
	KASSERT(so != NULL, ("%s: so == NULL", __func__));
	/*
	 * When the socket is accepting connections (the INPCB is in LISTEN
	 * state) we look into the SYN cache if this is a new connection
	 * attempt or the completion of a previous one.
	 */
	KASSERT(tp->t_state == TCPS_LISTEN || !SOLISTENING(so),
	    ("%s: so accepting but tp %p not listening", __func__, tp));
	if (tp->t_state == TCPS_LISTEN && SOLISTENING(so)) {
		struct in_conninfo inc;

		bzero(&inc, sizeof(inc));
#ifdef INET6
		if (isipv6) {
			inc.inc_flags |= INC_ISIPV6;
			if (inp->inp_inc.inc_flags & INC_IPV6MINMTU)
				inc.inc_flags |= INC_IPV6MINMTU;
			inc.inc6_faddr = ip6->ip6_src;
			inc.inc6_laddr = ip6->ip6_dst;
		} else
#endif
		{
			inc.inc_faddr = ip->ip_src;
			inc.inc_laddr = ip->ip_dst;
		}
		inc.inc_fport = th->th_sport;
		inc.inc_lport = th->th_dport;
		inc.inc_fibnum = so->so_fibnum;

		/*
		 * Check for an existing connection attempt in syncache if
		 * the flag is only ACK.  A successful lookup creates a new
		 * socket appended to the listen queue in SYN_RECEIVED state.
		 */
		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
			int result;

			/*
			 * Parse the TCP options here because
			 * syncookies need access to the reflected
			 * timestamp.
			 */
			tcp_dooptions(&to, optp, optlen, 0);
			/*
			 * NB: syncache_expand() doesn't unlock inp.
			 */
			result = syncache_expand(&inc, &to, th, &so, m, port);
			if (result < 0) {
				/*
				 * A failing TCP MD5 signature comparison
				 * must result in the segment being dropped
				 * and must not produce any response back
				 * to the sender.
				 */
				goto dropunlock;
			} else if (result == 0) {
				/*
				 * No syncache entry, or ACK was not for our
				 * SYN/ACK.  Do our protection against double
				 * ACK.  If the peer sent us 2 ACKs, then for
				 * the first one syncache_expand() successfully
				 * converted the syncache entry into a socket,
				 * while we were waiting on the inpcb lock.  We
				 * don't want to send a RST for the second ACK,
				 * so we perform a second lookup without a
				 * wildcard match, hoping to find the new
				 * socket.  If the ACK is stray indeed, the
				 * missing INPLOOKUP_WILDCARD flag in lookupflag
				 * would hint the above code that the lookup was
				 * a second attempt.
				 *
				 * NB: syncache did its own logging
				 * of the failure cause.
				 */
				INP_WUNLOCK(inp);
				lookupflag &= ~INPLOOKUP_WILDCARD;
				goto findpcb;
			}
tfo_socket_result:
			if (so == NULL) {
				/*
				 * We completed the 3-way handshake
				 * but could not allocate a socket
				 * either due to memory shortage,
				 * listen queue length limits or
				 * global socket limits.  Send RST
				 * or wait and have the remote end
				 * retransmit the ACK for another
				 * try.
				 */
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Socket allocation failed due to "
					    "limits or memory shortage, %s\n",
					    s, __func__,
					    V_tcp_sc_rst_sock_fail ?
					    "sending RST" : "try again");
				if (V_tcp_sc_rst_sock_fail) {
					goto dropwithreset;
				} else
					goto dropunlock;
			}
			/*
			 * Socket is created in state SYN_RECEIVED.
			 * Unlock the listen socket, lock the newly
			 * created socket and update the tp variable.
			 * If we came here via jump to tfo_socket_result,
			 * then listening socket is read-locked.
			 */
			INP_UNLOCK(inp);	/* listen socket */
			inp = sotoinpcb(so);
			/*
			 * New connection inpcb is already locked by
			 * syncache_expand().
			 */
			INP_WLOCK_ASSERT(inp);
			tp = intotcpcb(inp);
			KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
			    ("%s: new socket not in SYN_RECEIVED state",
			    __func__));
			/*
			 * Process the segment and the data it
			 * contains.  tcp_do_segment() consumes
			 * the mbuf chain and unlocks the inpcb.
			 */
			TCP_PROBE5(receive, NULL, tp, m, tp, th);
			tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen,
			    tlen, iptos);
			return (IPPROTO_DONE);
		}
		/*
		 * Segment flag validation for new connection attempts:
		 *
		 * Our (SYN|ACK) response was rejected.
		 * Check with syncache and remove entry to prevent
		 * retransmits.
		 *
		 * NB: syncache_chkrst does its own logging of failure
		 * causes.
		 */
		if (thflags & TH_RST) {
			syncache_chkrst(&inc, th, port);
			goto dropunlock;
		}
		/*
		 * We can't do anything without SYN.
		 */
		if ((thflags & TH_SYN) == 0) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN is missing, segment ignored\n",
				    s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * (SYN|ACK) is bogus on a listen socket.
		 */
		if (thflags & TH_ACK) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|ACK invalid, segment ignored\n",
				    s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * If the drop_synfin option is enabled, drop all
		 * segments with both the SYN and FIN bits set.
		 * This prevents e.g. nmap from identifying the
		 * TCP/IP stack.
		 * XXX: Poor reasoning.  nmap has other methods
		 * and is constantly refining its stack detection
		 * strategies.
		 * XXX: This is a violation of the TCP specification
		 * and was used by RFC1644.
		 */
		if ((thflags & TH_FIN) && V_drop_synfin) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "SYN|FIN segment ignored (based on "
				    "sysctl setting)\n", s, __func__);
			TCPSTAT_INC(tcps_badsyn);
			goto dropunlock;
		}
		/*
		 * Segment's flags are (SYN) or (SYN|FIN).
		 *
		 * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
		 * as they do not affect the state of the TCP FSM.
		 * The data pointed to by TH_URG and th_urp is ignored.
		 */
		KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
		    ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
		KASSERT(thflags & (TH_SYN),
		    ("%s: Listen socket: TH_SYN not set", __func__));
		INP_RLOCK_ASSERT(inp);
#ifdef INET6
		/*
		 * If deprecated addresses are forbidden, we do not accept
		 * a SYN to a deprecated interface address, to prevent any
		 * new inbound connection from getting established.
		 * When we do not accept the SYN, we send a TCP RST with
		 * the deprecated source address (instead of dropping the
		 * segment).  We compromise because it is much better for
		 * the peer to receive a RST, and the RST will be the final
		 * packet of the exchange.
		 *
		 * If we do not forbid deprecated addresses, we accept
		 * the SYN packet.  RFC 2462 does not suggest dropping a
		 * SYN in this case.
		 * RFC 2462 section 5.5.4 essentially says:
		 * 1. use of a deprecated addr with existing
		 *    communication is okay - "SHOULD continue to be
		 *    used"
		 * 2. use of it with new communication:
		 *   (2a) "SHOULD NOT be used if alternate address
		 *        with sufficient scope is available"
		 *   (2b) nothing mentioned otherwise.
		 * Here we fall into the (2b) case as we have no choice in
		 * our source address selection - we must obey the peer.
		 *
		 * The wording in RFC 2462 is confusing, and there are
		 * multiple descriptions of deprecated address handling -
		 * worse, they are not exactly the same.  I believe 5.5.4
		 * is the best one, so we follow 5.5.4.
		 */
		if (isipv6 && !V_ip6_use_deprecated) {
			struct in6_ifaddr *ia6;

			ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
			if (ia6 != NULL &&
			    (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt to deprecated "
					    "IPv6 address rejected\n",
					    s, __func__);
				goto dropwithreset;
			}
		}
#endif /* INET6 */
		/*
		 * Basic sanity checks on incoming SYN requests:
		 *   Don't respond if the destination is a link layer
		 *	broadcast according to RFC1122 4.2.3.10, p. 104.
		 *   If it is from this socket it must be forged.
		 *   Don't respond if the source or destination is a
		 *	global or subnet broad- or multicast address.
		 *   Note that it is quite possible to receive unicast
		 *	link-layer packets with a broadcast IP address.  Use
		 *	in_ifnet_broadcast() to find them.
		 */
		if (m->m_flags & (M_BCAST|M_MCAST)) {
			if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
				log(LOG_DEBUG, "%s; %s: Listen socket: "
				    "Connection attempt from broad- or multicast "
				    "link layer address ignored\n", s, __func__);
			goto dropunlock;
		}
#ifdef INET6
		if (isipv6) {
			if (th->th_dport == th->th_sport &&
			    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt to/from self "
					    "ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
			    IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt from/to multicast "
					    "address ignored\n", s, __func__);
				goto dropunlock;
			}
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			if (th->th_dport == th->th_sport &&
			    ip->ip_dst.s_addr == ip->ip_src.s_addr) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt from/to self "
					    "ignored\n", s, __func__);
				goto dropunlock;
			}
			if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
			    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
			    ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
			    in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
				if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
					log(LOG_DEBUG, "%s; %s: Listen socket: "
					    "Connection attempt from/to broad- "
					    "or multicast address ignored\n",
					    s, __func__);
				goto dropunlock;
			}
		}
#endif
		/*
		 * SYN appears to be valid.  Create compressed TCP state
		 * for syncache.
		 */
		TCP_PROBE3(debug__input, tp, th, m);
		tcp_dooptions(&to, optp, optlen, TO_SYN);
		if ((so = syncache_add(&inc, &to, th, inp, so, m, NULL, NULL,
		    iptos, port)) != NULL)
			goto tfo_socket_result;

		/*
		 * Entry added to syncache and mbuf consumed.
		 * Only the listen socket is unlocked by syncache_add().
		 */
		return (IPPROTO_DONE);
	}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if (tp->t_flags & TF_SIGNATURE) {
		tcp_dooptions(&to, optp, optlen, thflags);
		if ((to.to_flags & TOF_SIGNATURE) == 0) {
			TCPSTAT_INC(tcps_sig_err_nosigopt);
			goto dropunlock;
		}
		if (!TCPMD5_ENABLED() ||
		    TCPMD5_INPUT(m, th, to.to_signature) != 0)
			goto dropunlock;
	}
#endif
	TCP_PROBE5(receive, NULL, tp, m, tp, th);

	/*
	 * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
	 * the inpcb, and unlocks pcbinfo.
	 *
	 * XXXGL: in case of a pure SYN arriving on existing connection
	 * TCP stacks won't need to modify the PCB, they would either drop
	 * the segment silently, or send a challenge ACK.  However, we try
	 * to upgrade the lock, because calling convention for stacks is
	 * write-lock on PCB.  If upgrade fails, drop the SYN.
	 */
	if ((lookupflag & INPLOOKUP_RLOCKPCB) && INP_TRY_UPGRADE(inp) == 0)
		goto dropunlock;

	tp->t_fb->tfb_tcp_do_segment(tp, m, th, drop_hdrlen, tlen, iptos);
	return (IPPROTO_DONE);

dropwithreset:
	/*
	 * When blackholing, do not respond with a RST but completely
	 * ignore the segment and drop it.
	 */
	if (((!closed_port && V_blackhole == 3) ||
	    (closed_port &&
	    ((V_blackhole == 1 && (thflags & TH_SYN)) || V_blackhole > 1))) &&
	    (V_blackhole_local || (
#ifdef INET6
	    isipv6 ? !in6_localip(&ip6->ip6_src) :
#endif
#ifdef INET
	    !in_localip(ip->ip_src)
#else
	    true
#endif
	    )))
		goto dropunlock;
	TCP_PROBE5(receive, NULL, tp, m, tp, th);
	tcp_dropwithreset(m, th, tp, tlen);
	m = NULL;	/* mbuf chain got consumed. */

dropunlock:
	if (m != NULL)
		TCP_PROBE5(receive, NULL, tp, m, tp, th);

	if (inp != NULL)
		INP_UNLOCK(inp);

drop:
	if (s != NULL)
		free(s, M_TCPLOG);
	if (m != NULL)
		m_freem(m);
	return (IPPROTO_DONE);
}

/*
 * Automatic sizing of receive socket buffer.  Often the send
 * buffer size is not optimally adjusted to the actual network
 * conditions at hand (delay bandwidth product).  Setting the
 * buffer size too small limits throughput on links with high
 * bandwidth and high delay (e.g. transcontinental/oceanic links).
 *
 * On the receive side the socket buffer memory is only rarely
 * used to any significant extent.  This allows us to be much
 * more aggressive in scaling the receive socket buffer.  For
 * the case that the buffer space is actually used to a large
 * extent and we run out of kernel memory we can simply drop
 * the new segments; the sender will just retransmit them
 * later.  Setting the buffer size too big may only consume too
 * much kernel memory if the application doesn't read() from
 * the socket or packet loss or reordering makes use of the
 * reassembly queue.
 *
 * The criteria to step up the receive buffer one notch are:
 *  1. Application has not set receive buffer size with
 *     SO_RCVBUF.  Setting SO_RCVBUF clears SB_AUTOSIZE.
 *  2. the number of bytes received during 1/2 of an sRTT
 *     is at least 3/8 of the current socket buffer size.
 *  3. receive buffer size has not hit maximal automatic size;
 *
 * If all of the criteria are met, we increase the socket buffer
 * by half (bounded by the max).  This allows us to keep ahead
 * of slow-start while also ensuring our peer is never limited
 * by our rwnd, which we would then have to open up suddenly,
 * causing a burst.
 *
 * This algorithm does two steps per RTT at most and only if
 * we receive a bulk stream w/o packet losses or reorderings.
 * Shrinking the buffer during idle times is not necessary as
 * it doesn't consume any memory when idle.
 *
 * TODO: Only step up if the application is actually serving
 * the buffer to better manage the socket buffer resources.
 */
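/*
 * Worked example of the step-up criteria (illustrative numbers, not taken
 * from this file): with sb_hiwat = 64 KB, the step-up fires once at least
 * 3/8 * 64 KB = 24 KB arrive within half of an sRTT; the buffer then grows
 * by half to min(96 KB, net.inet.tcp.recvbuf_max).
 */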
int
tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int tlen)
{
	int newsize = 0;

	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
	    ((tp->t_srtt >> TCP_RTT_SHIFT)/2)) {
		if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 2)/ 4 * 3) &&
		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
			newsize = min((so->so_rcv.sb_hiwat +
			    (so->so_rcv.sb_hiwat/2)), V_tcp_autorcvbuf_max);
		}
		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);

		/* Start over with next RTT. */
		tp->rfbuf_ts = 0;
		tp->rfbuf_cnt = 0;
	} else {
		tp->rfbuf_cnt += tlen;	/* add up */
	}
	return (newsize);
}
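
/*
 * Usage sketch (condensed and slightly simplified from the data path later
 * in this file): the caller applies a non-zero return value to the receive
 * buffer and disables auto-sizing if the reservation fails, e.g.
 *
 *	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
 *	if (newsize != 0 && !sbreserve_locked(so, SO_RCV, newsize))
 *		so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 *
 * The sbreserve_locked() form above is an approximation of the actual call.
 */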

int
tcp_input(struct mbuf **mp, int *offp, int proto)
{
	return(tcp_input_with_port(mp, offp, proto, 0));
}

static void
tcp_handle_wakeup(struct tcpcb *tp)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (tp->t_flags & TF_WAKESOR) {
		struct socket *so = tptosocket(tp);

		tp->t_flags &= ~TF_WAKESOR;
		SOCK_RECVBUF_LOCK_ASSERT(so);
		sorwakeup_locked(so);
	}
}

void
tcp_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int drop_hdrlen, int tlen, uint8_t iptos)
{
	uint16_t thflags;
	int acked, ourfinisacked, needoutput = 0;
	sackstatus_t sack_changed;
	int todrop, win, incforsyn = 0;
	uint32_t tiwin;
	uint16_t nsegs;
	char *s;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	struct in_conninfo *inc = &inp->inp_inc;
	struct mbuf *mfree;
	struct tcpopt to;
	int tfo_syn;
	u_int maxseg = 0;
	bool no_data;

	no_data = (tlen == 0);
	thflags = tcp_get_flags(th);
	tp->sackhint.last_sack_ack = 0;
	sack_changed = SACK_NOCHANGE;
	nsegs = max(1, m->m_pkthdr.lro_nsegs);

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
	    tlen, NULL, true);

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: "
			    "SYN|FIN segment ignored (based on "
			    "sysctl setting)\n", s, __func__);
			free(s, M_TCPLOG);
		}
		goto drop;
	}

	/*
	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
	 * check SEQ.ACK first.
	 */
	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
		goto dropwithreset;
	}

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	if (tp->t_idle_reduce &&
	    (tp->snd_max == tp->snd_una) &&
	    ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
		cc_after_idle(tp);
	tp->t_rcvtime = ticks;

	if (thflags & TH_FIN)
		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
	/*
	 * Scale up the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;
#ifdef STATS
	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
#endif

	/*
	 * TCP ECN processing.
	 */
	if (tcp_ecn_input_segment(tp, thflags, tlen,
	    tcp_packets_this_ack(tp, th->th_ack),
	    iptos))
		cc_cong_signal(tp, th, CC_ECN);

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);
	if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) {
		/*
		 * We don't process SACKs from the peer because the MSS
		 * is too small, which can subject us to an attack.
		 */
		to.to_flags &= ~TOF_SACK;
	}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
	if ((tp->t_flags & TF_SIGNATURE) != 0 &&
	    (to.to_flags & TOF_SIGNATURE) == 0) {
		TCPSTAT_INC(tcps_sig_err_sigopt);
		/* XXX: should drop? */
	}
#endif
	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) {
			to.to_tsecr = 0;
		}
	}
	/*
	 * Process options only when we get SYN/ACK back.  The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
	 * or <SYN,ACK>) segment itself is never scaled.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		/* Handle parallel SYN for ECN */
		tcp_ecn_input_parallel_syn(tp, thflags, iptos);
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE) &&
		    !(tp->t_flags & TF_NOOPT)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
		} else {
			tp->t_flags &= ~TF_REQ_SCALE;
		}
		/*
		 * Initial send window.  It will be updated with
		 * the next incoming segment to the scaled value.
		 */
		tp->snd_wnd = th->th_win;
		if ((to.to_flags & TOF_TS) &&
		    (tp->t_flags & TF_REQ_TSTMP) &&
		    !(tp->t_flags & TF_NOOPT)) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = tcp_ts_getticks();
		} else {
			tp->t_flags &= ~TF_REQ_TSTMP;
		}
		if (to.to_flags & TOF_MSS) {
			tcp_mss(tp, to.to_mss);
		}
		if ((tp->t_flags & TF_SACK_PERMIT) &&
		    (!(to.to_flags & TOF_SACKPERM) ||
		    (tp->t_flags & TF_NOOPT))) {
			tp->t_flags &= ~TF_SACK_PERMIT;
		}
		if (tp->t_flags & TF_FASTOPEN) {
			if ((to.to_flags & TOF_FASTOPEN) &&
			    !(tp->t_flags & TF_NOOPT)) {
				uint16_t mss;

				if (to.to_flags & TOF_MSS) {
					mss = to.to_mss;
				} else {
					if ((inp->inp_vflag & INP_IPV6) != 0) {
						mss = TCP6_MSS;
					} else {
						mss = TCP_MSS;
					}
				}
				tcp_fastopen_update_cache(tp, mss,
				    to.to_tfo_len, to.to_tfo_cookie);
			} else {
				tcp_fastopen_disable_path(tp);
			}
		}
	}

	/*
	 * If timestamps were negotiated during SYN/ACK and a
	 * segment without a timestamp is received, silently drop
	 * the segment, unless it is a RST segment or missing timestamps are
	 * tolerated.
	 * See section 3.2 of RFC 7323.
	 */
	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
		if (((thflags & TH_RST) != 0) || V_tcp_tolerate_missing_ts) {
			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
				    "segment processed normally\n",
				    s, __func__);
				free(s, M_TCPLOG);
			}
		} else {
			if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
				log(LOG_DEBUG, "%s; %s: Timestamp missing, "
				    "segment silently dropped\n", s, __func__);
				free(s, M_TCPLOG);
			}
			goto drop;
		}
	}
	/*
	 * If timestamps were not negotiated during SYN/ACK and a
	 * segment with a timestamp is received, ignore the
	 * timestamp and process the packet normally.
	 * See section 3.2 of RFC 7323.
	 */
	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
			    "segment processed normally\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}

	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TH_NEEDSYN.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	    th->th_seq == tp->rcv_nxt &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    tp->snd_nxt == tp->snd_max &&
	    tiwin && tiwin == tp->snd_wnd &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    SEGQ_EMPTY(tp) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
		/*
		 * If last ACK falls within this segment's sequence numbers,
		 * record the timestamp.
		 * NOTE that the test is modified according to the latest
		 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
		 */
		if ((to.to_flags & TOF_TS) != 0 &&
		    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
			tp->ts_recent_age = tcp_ts_getticks();
			tp->ts_recent = to.to_tsval;
		}

		if (no_data) {
			if (SEQ_GT(th->th_ack, tp->snd_una) &&
			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
			    !IN_RECOVERY(tp->t_flags) &&
			    (to.to_flags & TOF_SACK) == 0 &&
			    TAILQ_EMPTY(&tp->snd_holes)) {
				/*
				 * This is a pure ack for outstanding data.
				 */
				TCPSTAT_INC(tcps_predack);

				/*
				 * "bad retransmit" recovery.
				 */
				if (tp->t_rxtshift == 1 &&
				    tp->t_flags & TF_PREVVALID &&
				    tp->t_badrxtwin != 0 &&
				    (((to.to_flags & TOF_TS) != 0 &&
				    to.to_tsecr != 0 &&
				    TSTMP_LT(to.to_tsecr, tp->t_badrxtwin)) ||
				    ((to.to_flags & TOF_TS) == 0 &&
				    TSTMP_LT(ticks, tp->t_badrxtwin))))
					cc_cong_signal(tp, th, CC_RTO_ERR);

1760 /*
1761 * Recalculate the transmit timer / rtt.
1762 *
1763 * Some boxes send broken timestamp replies
1764 * during the SYN+ACK phase, ignore
1765 * timestamps of 0 or we could calculate a
1766 * huge RTT and blow up the retransmit timer.
1767 */
1768 if ((to.to_flags & TOF_TS) != 0 &&
1769 to.to_tsecr) {
1770 uint32_t t;
1771
1772 t = tcp_ts_getticks() - to.to_tsecr;
1773 if (!tp->t_rttlow || tp->t_rttlow > t)
1774 tp->t_rttlow = t;
1775 tcp_xmit_timer(tp,
1776 TCP_TS_TO_TICKS(t) + 1);
1777 } else if (tp->t_rtttime &&
1778 SEQ_GT(th->th_ack, tp->t_rtseq)) {
1779 if (!tp->t_rttlow ||
1780 tp->t_rttlow > ticks - tp->t_rtttime)
1781 tp->t_rttlow = ticks - tp->t_rtttime;
1782 tcp_xmit_timer(tp,
1783 ticks - tp->t_rtttime);
1784 }
1785 acked = BYTES_THIS_ACK(tp, th);
1786
1787 #ifdef TCP_HHOOK
1788 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
1789 hhook_run_tcp_est_in(tp, th, &to);
1790 #endif
1791
1792 TCPSTAT_ADD(tcps_rcvackpack, nsegs);
1793 TCPSTAT_ADD(tcps_rcvackbyte, acked);
1794 sbdrop(&so->so_snd, acked);
1795 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
1796 SEQ_LEQ(th->th_ack, tp->snd_recover))
1797 tp->snd_recover = th->th_ack - 1;
1798
1799 /*
1800 * Let the congestion control algorithm update
1801 * congestion control related information. This
1802 * typically means increasing the congestion
1803 * window.
1804 */
1805 cc_ack_received(tp, th, nsegs, CC_ACK);
1806
1807 tp->snd_una = th->th_ack;
1808 /*
1809 * Pull snd_wl2 up to prevent seq wrap relative
1810 * to th_ack.
1811 */
1812 tp->snd_wl2 = th->th_ack;
1813 tp->t_dupacks = 0;
1814 m_freem(m);
1815
1816 /*
1817 * If all outstanding data are acked, stop
1818 * retransmit timer, otherwise restart timer
1819 * using current (possibly backed-off) value.
1820 * If process is waiting for space,
1821 * wakeup/selwakeup/signal. If data
1822 * are ready to send, let tcp_output
1823 * decide between more output or persist.
1824 */
1825 TCP_PROBE3(debug__input, tp, th, m);
1826 /*
1827 * Clear t_acktime if remote side has ACKd
1828 * all data in the socket buffer.
1829 * Otherwise, update t_acktime if we received
1830 * a sufficiently large ACK.
1831 */
1832 if (sbavail(&so->so_snd) == 0)
1833 tp->t_acktime = 0;
1834 else if (acked > 1)
1835 tp->t_acktime = ticks;
1836 if (tp->snd_una == tp->snd_max)
1837 tcp_timer_activate(tp, TT_REXMT, 0);
1838 else if (!tcp_timer_active(tp, TT_PERSIST))
1839 tcp_timer_activate(tp, TT_REXMT,
1840 TP_RXTCUR(tp));
1841 sowwakeup(so);
1842 /*
1843 * Only call tcp_output when there
1844 * is new data available to be sent
1845 * or we need to send an ACK.
1846 */
1847 if ((tp->t_flags & TF_ACKNOW) ||
1848 (sbavail(&so->so_snd) >=
1849 SEQ_SUB(tp->snd_max, tp->snd_una))) {
1850 (void) tcp_output(tp);
1851 }
1852 goto check_delack;
1853 }
1854 } else if (th->th_ack == tp->snd_una &&
1855 tlen <= sbspace(&so->so_rcv)) {
1856 int newsize = 0; /* automatic sockbuf scaling */
1857
1858 /*
1859 * This is a pure, in-sequence data packet with
1860 * nothing on the reassembly queue and we have enough
1861 * buffer space to take it.
1862 */
1863 /* Clean receiver SACK report if present */
1864 if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
1865 tcp_clean_sackreport(tp);
1866 TCPSTAT_INC(tcps_preddat);
1867 tp->rcv_nxt += tlen;
1868 if (tlen &&
1869 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
1870 (tp->t_fbyte_in == 0)) {
1871 tp->t_fbyte_in = ticks;
1872 if (tp->t_fbyte_in == 0)
1873 tp->t_fbyte_in = 1;
1874 if (tp->t_fbyte_out && tp->t_fbyte_in)
1875 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
1876 }
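			/*
			 * Editorial note: ticks can legitimately be 0, so
			 * t_fbyte_in is nudged to 1 above; the value 0 is
			 * reserved to mean "first data byte not seen yet".
			 */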
1877 /*
1878 * Pull snd_wl1 up to prevent seq wrap relative to
1879 * th_seq.
1880 */
1881 tp->snd_wl1 = th->th_seq;
1882 /*
1883 * Pull rcv_up up to prevent seq wrap relative to
1884 * rcv_nxt.
1885 */
1886 tp->rcv_up = tp->rcv_nxt;
1887 TCPSTAT_ADD(tcps_rcvpack, nsegs);
1888 TCPSTAT_ADD(tcps_rcvbyte, tlen);
1889 TCP_PROBE3(debug__input, tp, th, m);
1890
1891 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
1892
1893 /* Add data to socket buffer. */
1894 SOCK_RECVBUF_LOCK(so);
1895 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
1896 m_freem(m);
1897 } else {
1898 /*
1899 * Set new socket buffer size.
1900 * Give up when limit is reached.
1901 */
1902 if (newsize)
1903 if (!sbreserve_locked(so, SO_RCV,
1904 newsize, NULL))
1905 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
1906 m_adj(m, drop_hdrlen); /* delayed header drop */
1907 sbappendstream_locked(&so->so_rcv, m, 0);
1908 }
1909 /* NB: sorwakeup_locked() does an implicit unlock. */
1910 sorwakeup_locked(so);
1911 if (DELAY_ACK(tp, tlen)) {
1912 tp->t_flags |= TF_DELACK;
1913 } else {
1914 tp->t_flags |= TF_ACKNOW;
1915 (void) tcp_output(tp);
1916 }
1917 goto check_delack;
1918 }
1919 }
1920
1921 /*
1922 * Calculate amount of space in receive window,
1923 * and then do TCP input processing.
1924 * Receive window is amount of space in rcv queue,
1925 * but not less than advertised window.
1926 */
1927 win = sbspace(&so->so_rcv);
1928 if (win < 0)
1929 win = 0;
1930 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1931
1932 switch (tp->t_state) {
1933 /*
1934 * If the state is SYN_RECEIVED:
1935 * if seg contains an ACK, but not for our SYN/ACK, send a RST.
1936 */
1937 case TCPS_SYN_RECEIVED:
1938 if (thflags & TH_RST) {
1939 /* Handle RST segments later. */
1940 break;
1941 }
1942 if ((thflags & TH_ACK) &&
1943 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
1944 SEQ_GT(th->th_ack, tp->snd_max))) {
1945 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
1946 goto dropwithreset;
1947 }
1948 if (tp->t_flags & TF_FASTOPEN) {
1949 /*
1950 * When a TFO connection is in SYN_RECEIVED, the
1951 * only valid packets are the initial SYN, a
1952 * retransmit/copy of the initial SYN (possibly with
1953 * a subset of the original data), a valid ACK, a
1954 * FIN, or a RST.
1955 */
1956 if ((thflags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) {
1957 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
1958 goto dropwithreset;
1959 } else if (thflags & TH_SYN) {
1960 /* non-initial SYN is ignored */
1961 if ((tcp_timer_active(tp, TT_DELACK) ||
1962 tcp_timer_active(tp, TT_REXMT)))
1963 goto drop;
1964 } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) {
1965 goto drop;
1966 }
1967 }
1968 break;
1969
1970 /*
1971 * If the state is SYN_SENT:
1972 * if seg contains a RST with valid ACK (SEQ.ACK has already
1973 * been verified), then drop the connection.
1974 * if seg contains a RST without an ACK, drop the seg.
1975 * if seg does not contain SYN, then drop the seg.
1976 * Otherwise this is an acceptable SYN segment
1977 * initialize tp->rcv_nxt and tp->irs
1978 * if seg contains ack then advance tp->snd_una
1979 * if seg contains an ECE and ECN support is enabled, the stream
1980 * is ECN capable.
1981 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1982 * arrange for segment to be acked (eventually)
1983 * continue processing rest of data/controls, beginning with URG
1984 */
1985 case TCPS_SYN_SENT:
1986 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
1987 TCP_PROBE5(connect__refused, NULL, tp,
1988 m, tp, th);
1989 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
1990 tp = tcp_drop(tp, ECONNREFUSED);
1991 }
1992 if (thflags & TH_RST)
1993 goto drop;
1994 if (!(thflags & TH_SYN))
1995 goto drop;
1996
1997 tp->irs = th->th_seq;
1998 tcp_rcvseqinit(tp);
1999 if (thflags & TH_ACK) {
2000 int tfo_partial_ack = 0;
2001
2002 TCPSTAT_INC(tcps_connects);
2003 soisconnected(so);
2004 #ifdef MAC
2005 mac_socketpeer_set_from_mbuf(m, so);
2006 #endif
2007 /* Do window scaling on this connection? */
2008 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2009 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2010 tp->rcv_scale = tp->request_r_scale;
2011 }
2012 tp->rcv_adv += min(tp->rcv_wnd,
2013 TCP_MAXWIN << tp->rcv_scale);
2014 tp->snd_una++; /* SYN is acked */
2015 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2016 tp->snd_nxt = tp->snd_una;
2017 /*
2018 * If not all the data that was sent in the TFO SYN
2019 * has been acked, resend the remainder right away.
2020 */
2021 if ((tp->t_flags & TF_FASTOPEN) &&
2022 (tp->snd_una != tp->snd_max)) {
2023 tp->snd_nxt = th->th_ack;
2024 tfo_partial_ack = 1;
2025 }
2026 /*
2027 * If there's data, delay ACK; if there's also a FIN
2028 * ACKNOW will be turned on later.
2029 */
2030 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
2031 tcp_timer_activate(tp, TT_DELACK,
2032 tcp_delacktime);
2033 else
2034 tp->t_flags |= TF_ACKNOW;
2035
2036 tcp_ecn_input_syn_sent(tp, thflags, iptos);
2037
2038 /*
2039 * Received <SYN,ACK> in SYN_SENT[*] state.
2040 * Transitions:
2041 * SYN_SENT --> ESTABLISHED
2042 * SYN_SENT* --> FIN_WAIT_1
2043 */
2044 tp->t_starttime = ticks;
2045 if (tp->t_flags & TF_NEEDFIN) {
2046 tp->t_acktime = ticks;
2047 tcp_state_change(tp, TCPS_FIN_WAIT_1);
2048 tp->t_flags &= ~TF_NEEDFIN;
2049 thflags &= ~TH_SYN;
2050 } else {
2051 tcp_state_change(tp, TCPS_ESTABLISHED);
2052 TCP_PROBE5(connect__established, NULL, tp,
2053 m, tp, th);
2054 cc_conn_init(tp);
2055 tcp_timer_activate(tp, TT_KEEP,
2056 TP_KEEPIDLE(tp));
2057 }
2058 } else {
2059 /*
2060 * Received initial SYN in SYN-SENT[*] state =>
2061 * simultaneous open.
2062 * If it succeeds, connection is half-synchronized.
2063 * Otherwise, do 3-way handshake:
2064 * SYN-SENT -> SYN-RECEIVED
2065 * SYN-SENT* -> SYN-RECEIVED*
2066 */
2067 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
2068 tcp_timer_activate(tp, TT_REXMT, 0);
2069 tcp_state_change(tp, TCPS_SYN_RECEIVED);
2070 }
2071
2072 /*
2073 * Advance th->th_seq to correspond to first data byte.
2074 * If data, trim to stay within window,
2075 * dropping FIN if necessary.
2076 */
2077 th->th_seq++;
2078 if (tlen > tp->rcv_wnd) {
2079 todrop = tlen - tp->rcv_wnd;
2080 m_adj(m, -todrop);
2081 tlen = tp->rcv_wnd;
2082 thflags &= ~TH_FIN;
2083 TCPSTAT_INC(tcps_rcvpackafterwin);
2084 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
2085 }
2086 tp->snd_wl1 = th->th_seq - 1;
2087 tp->rcv_up = th->th_seq;
2088 /*
2089 * Client side of transaction: already sent SYN and data.
2090 * If the remote host used T/TCP to validate the SYN,
2091 * our data will be ACK'd; if so, enter normal data segment
2092 * processing in the middle of step 5, ack processing.
2093 * Otherwise, goto step 6.
2094 */
2095 if (thflags & TH_ACK)
2096 goto process_ACK;
2097
2098 goto step6;
2099 }
2100
2101 /*
2102 * States other than LISTEN or SYN_SENT.
2103 * First check the RST flag and sequence number since reset segments
2104 * are exempt from the timestamp and connection count tests. This
2105 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
2106 * below which allowed reset segments in half the sequence space
2107 * to fall through and be processed (which gives forged reset
2108 * segments with a random sequence number a 50 percent chance of
2109 * killing a connection).
2110 * Then check timestamp, if present.
2111 * Then check the connection count, if present.
2112 * Then check that at least some bytes of segment are within
2113 * receive window. If segment begins before rcv_nxt,
2114 * drop leading data (and SYN); if nothing left, just ack.
2115 */
2116 if (thflags & TH_RST) {
2117 /*
2118 * RFC5961 Section 3.2
2119 *
2120 * - RST drops connection only if SEG.SEQ == RCV.NXT.
2121 * - If RST is in window, we send challenge ACK.
2122 *
2123 * Note: to take into account delayed ACKs, we should
2124 * test against last_ack_sent instead of rcv_nxt.
2125 * Note 2: we handle special case of closed window, not
2126 * covered by the RFC.
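	 *
	 * Example (illustrative): with last_ack_sent = 1000 and rcv_wnd =
	 * 500, a RST with SEG.SEQ = 1000 tears the connection down, one
	 * with SEG.SEQ in (1000, 1500) only elicits a challenge ACK
	 * (unless net.inet.tcp.insecure_rst is set), and anything outside
	 * that window is dropped without a reply.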
2127 */
2128 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2129 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
2130 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
2131 KASSERT(tp->t_state != TCPS_SYN_SENT,
2132 ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
2133 __func__, th, tp));
2134
2135 if (V_tcp_insecure_rst ||
2136 tp->last_ack_sent == th->th_seq) {
2137 TCPSTAT_INC(tcps_drops);
2138 /* Drop the connection. */
2139 switch (tp->t_state) {
2140 case TCPS_SYN_RECEIVED:
2141 so->so_error = ECONNREFUSED;
2142 goto close;
2143 case TCPS_ESTABLISHED:
2144 case TCPS_FIN_WAIT_1:
2145 case TCPS_FIN_WAIT_2:
2146 case TCPS_CLOSE_WAIT:
2147 case TCPS_CLOSING:
2148 case TCPS_LAST_ACK:
2149 so->so_error = ECONNRESET;
2150 close:
2151 /* FALLTHROUGH */
2152 default:
2153 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST);
2154 tp = tcp_close(tp);
2155 }
2156 } else {
2157 TCPSTAT_INC(tcps_badrst);
2158 tcp_send_challenge_ack(tp, th, m);
2159 m = NULL;
2160 }
2161 }
2162 goto drop;
2163 }
2164
2165 /*
2166 * RFC5961 Section 4.2
2167 * Send challenge ACK for any SYN in synchronized state.
2168 */
2169 if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
2170 tp->t_state != TCPS_SYN_RECEIVED) {
2171 TCPSTAT_INC(tcps_badsyn);
2172 if (V_tcp_insecure_syn &&
2173 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2174 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2175 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
2176 tp = tcp_drop(tp, ECONNRESET);
2177 } else {
2178 tcp_ecn_input_syn_sent(tp, thflags, iptos);
2179 tcp_send_challenge_ack(tp, th, m);
2180 m = NULL;
2181 }
2182 goto drop;
2183 }
2184
2185 /*
2186 * RFC 1323 PAWS: If we have a timestamp reply on this segment
2187 * and it's less than ts_recent, drop it.
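	 * Example (illustrative): with ts_recent = 100, an old duplicate
	 * carrying tsval 80 fails PAWS and is dropped below (after forcing
	 * an ACK if it carried data), unless ts_recent itself is more than
	 * TCP_PAWS_IDLE (about 24 days) old and is therefore invalidated.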
2188 */
2189 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
2190 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
2191 /* Check to see if ts_recent is over 24 days old. */
2192 if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
2193 /*
2194 * Invalidate ts_recent. If this segment updates
2195 * ts_recent, the age will be reset later and ts_recent
2196 * will get a valid value. If it does not, setting
2197 * ts_recent to zero will at least satisfy the
2198 * requirement that zero be placed in the timestamp
2199 * echo reply when ts_recent isn't valid. The
2200 * age isn't reset until we get a valid ts_recent
2201 * because we don't want out-of-order segments to be
2202 * dropped when ts_recent is old.
2203 */
2204 tp->ts_recent = 0;
2205 } else {
2206 TCPSTAT_INC(tcps_rcvduppack);
2207 TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
2208 TCPSTAT_INC(tcps_pawsdrop);
2209 if (tlen)
2210 goto dropafterack;
2211 goto drop;
2212 }
2213 }
2214
2215 /*
2216 * In the SYN-RECEIVED state, validate that the packet belongs to
2217 * this connection before trimming the data to fit the receive
2218 * window. Check the sequence number versus IRS since we know
2219 * the sequence numbers haven't wrapped. This is a partial fix
2220 * for the "LAND" DoS attack.
2221 */
2222 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
2223 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
2224 goto dropwithreset;
2225 }
2226
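	/*
	 * Worked example (illustrative): if rcv_nxt = 2000 and a segment
	 * arrives with SEG.SEQ = 1500 and tlen = 1000, todrop = 500; the
	 * block below drops the 500 duplicate bytes, advancing SEG.SEQ to
	 * 2000 and shrinking tlen to 500 so only new data is processed.
	 */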
2227 todrop = tp->rcv_nxt - th->th_seq;
2228 if (todrop > 0) {
2229 if (thflags & TH_SYN) {
2230 thflags &= ~TH_SYN;
2231 th->th_seq++;
2232 if (th->th_urp > 1)
2233 th->th_urp--;
2234 else
2235 thflags &= ~TH_URG;
2236 todrop--;
2237 }
2238 /*
2239 * Following if statement from Stevens, vol. 2, p. 960.
2240 */
2241 if (todrop > tlen
2242 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
2243 /*
2244 * Any valid FIN must be to the left of the window.
2245 * At this point the FIN must be a duplicate or out
2246 * of sequence; drop it.
2247 */
2248 thflags &= ~TH_FIN;
2249
2250 /*
2251 * Send an ACK to resynchronize and drop any data.
2252 * But keep on processing for RST or ACK.
2253 */
2254 tp->t_flags |= TF_ACKNOW;
2255 todrop = tlen;
2256 TCPSTAT_INC(tcps_rcvduppack);
2257 TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
2258 } else {
2259 TCPSTAT_INC(tcps_rcvpartduppack);
2260 TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
2261 }
2262 /*
2263 * DSACK - add SACK block for dropped range
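		 * (RFC 2883).  Continuing the example above, the 500
		 * dropped duplicate bytes would be advertised as the
		 * D-SACK block [1500, 2000) so the peer can detect its
		 * spurious retransmission.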
2264 */
2265 if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
2266 tcp_update_sack_list(tp, th->th_seq,
2267 th->th_seq + todrop);
2268 /*
2269 * ACK now, as the next in-sequence segment
2270 * will clear the DSACK block again
2271 */
2272 tp->t_flags |= TF_ACKNOW;
2273 }
2274 drop_hdrlen += todrop; /* drop from the top afterwards */
2275 th->th_seq += todrop;
2276 tlen -= todrop;
2277 if (th->th_urp > todrop)
2278 th->th_urp -= todrop;
2279 else {
2280 thflags &= ~TH_URG;
2281 th->th_urp = 0;
2282 }
2283 }
2284
2285 /*
2286 * If new data are received on a connection after the
2287 * user processes are gone, then RST the other end if
2288 * no FIN has been processed.
2289 */
2290 if ((tp->t_flags & TF_CLOSED) && tlen > 0 &&
2291 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2292 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
2293 log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
2294 "after socket was closed, "
2295 "sending RST and removing tcpcb\n",
2296 s, __func__, tcpstates[tp->t_state], tlen);
2297 free(s, M_TCPLOG);
2298 }
2299 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
2300 /* tcp_close() will kill the inp, so pre-log the reset. */
2301 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
2302 tp = tcp_close(tp);
2303 TCPSTAT_INC(tcps_rcvafterclose);
2304 goto dropwithreset;
2305 }
2306
2307 /*
2308 * If segment ends after window, drop trailing data
2309 * (and PUSH and FIN); if nothing left, just ACK.
2310 */
2311 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
2312 if (todrop > 0) {
2313 TCPSTAT_INC(tcps_rcvpackafterwin);
2314 if (todrop >= tlen) {
2315 TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
2316 /*
2317 * If window is closed can only take segments at
2318 * window edge, and have to drop data and PUSH from
2319 * incoming segments. Continue processing, but
2320 * remember to ack. Otherwise, drop segment
2321 * and ack.
2322 */
2323 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
2324 tp->t_flags |= TF_ACKNOW;
2325 TCPSTAT_INC(tcps_rcvwinprobe);
2326 } else
2327 goto dropafterack;
2328 } else
2329 TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
2330 m_adj(m, -todrop);
2331 tlen -= todrop;
2332 thflags &= ~(TH_PUSH|TH_FIN);
2333 }
2334
2335 /*
2336 * If last ACK falls within this segment's sequence numbers,
2337 * record its timestamp.
2338 * NOTE:
2339 * 1) That the test incorporates suggestions from the latest
2340 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
2341 * 2) That updating only on newer timestamps interferes with
2342 * our earlier PAWS tests, so this check should be solely
2343 * predicated on the sequence space of this segment.
2344 * 3) That we modify the segment boundary check to be
2345 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
2346 * instead of RFC1323's
2347 * Last.ACK.Sent < SEG.SEQ + SEG.Len.
2348 * This modified check allows us to overcome RFC1323's
2349 * limitations as described in Stevens TCP/IP Illustrated
2350 * Vol. 2 p.869. In such cases, we can still calculate the
2351 * RTT correctly when RCV.NXT == Last.ACK.Sent.
2352 */
2353 if ((to.to_flags & TOF_TS) != 0 &&
2354 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
2355 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
2356 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
2357 tp->ts_recent_age = tcp_ts_getticks();
2358 tp->ts_recent = to.to_tsval;
2359 }
2360
2361 /*
2362 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
2363 * flag is on (half-synchronized state), then queue data for
2364 * later processing; else drop segment and return.
2365 */
2366 if ((thflags & TH_ACK) == 0) {
2367 if (tp->t_state == TCPS_SYN_RECEIVED ||
2368 (tp->t_flags & TF_NEEDSYN)) {
2369 if (tp->t_state == TCPS_SYN_RECEIVED &&
2370 (tp->t_flags & TF_FASTOPEN)) {
2371 tp->snd_wnd = tiwin;
2372 cc_conn_init(tp);
2373 }
2374 goto step6;
2375 } else if (tp->t_flags & TF_ACKNOW)
2376 goto dropafterack;
2377 else
2378 goto drop;
2379 }
2380
2381 /*
2382 * Ack processing.
2383 */
2384 if (SEQ_GEQ(tp->snd_una, tp->iss + (TCP_MAXWIN << tp->snd_scale))) {
2385 /* Checking SEG.ACK against ISS is definitely redundant. */
2386 tp->t_flags2 |= TF2_NO_ISS_CHECK;
2387 }
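	/*
	 * Example (illustrative): with snd_una = 50000 and max_sndwnd =
	 * 10000, an ACK below seq_min = 40000 is answered with a challenge
	 * ACK rather than processed (RFC 5961, Section 5.2); early in a
	 * connection the stricter iss + 1 bound catches "ghost" ACKs for
	 * data that was never sent.
	 */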
2388 if (!V_tcp_insecure_ack) {
2389 tcp_seq seq_min;
2390 bool ghost_ack_check;
2391
2392 if (tp->t_flags2 & TF2_NO_ISS_CHECK) {
2393 /* Check for too old ACKs (RFC 5961, Section 5.2). */
2394 seq_min = tp->snd_una - tp->max_sndwnd;
2395 ghost_ack_check = false;
2396 } else {
2397 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) {
2398 /* Checking for ghost ACKs is stricter. */
2399 seq_min = tp->iss + 1;
2400 ghost_ack_check = true;
2401 } else {
2402 /*
2403 * Checking for too old ACKs (RFC 5961,
2404 * Section 5.2) is stricter.
2405 */
2406 seq_min = tp->snd_una - tp->max_sndwnd;
2407 ghost_ack_check = false;
2408 }
2409 }
2410 if (SEQ_LT(th->th_ack, seq_min)) {
2411 if (ghost_ack_check)
2412 TCPSTAT_INC(tcps_rcvghostack);
2413 else
2414 TCPSTAT_INC(tcps_rcvacktooold);
2415 tcp_send_challenge_ack(tp, th, m);
2416 m = NULL;
2417 goto drop;
2418 }
2419 }
2420 switch (tp->t_state) {
2421 /*
2422 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
2423 * ESTABLISHED state and continue processing.
2424 * The ACK was checked above.
2425 */
2426 case TCPS_SYN_RECEIVED:
2427
2428 TCPSTAT_INC(tcps_connects);
2429 if (tp->t_flags & TF_SONOTCONN) {
2430 /*
2431 * Usually SYN_RECEIVED had been created from a LISTEN,
2432 * and solisten_enqueue() has already marked the socket
2433 * layer as connected. If it didn't, which can happen
2434 * only with an accept_filter(9), then the tp is marked
2435 * with TF_SONOTCONN. The other reason for this mark
2436 * to be set is a simultaneous open, a SYN_RECEIVED
2437 * that had been created from SYN_SENT.
2438 */
2439 tp->t_flags &= ~TF_SONOTCONN;
2440 soisconnected(so);
2441 }
2442 /* Do window scaling? */
2443 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2444 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2445 tp->rcv_scale = tp->request_r_scale;
2446 }
2447 tp->snd_wnd = tiwin;
2448 /*
2449 * Make transitions:
2450 * SYN-RECEIVED -> ESTABLISHED
2451 * SYN-RECEIVED* -> FIN-WAIT-1
2452 */
2453 tp->t_starttime = ticks;
2454 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) {
2455 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
2456 tp->t_tfo_pending = NULL;
2457 }
2458 if (tp->t_flags & TF_NEEDFIN) {
2459 tp->t_acktime = ticks;
2460 tcp_state_change(tp, TCPS_FIN_WAIT_1);
2461 tp->t_flags &= ~TF_NEEDFIN;
2462 } else {
2463 tcp_state_change(tp, TCPS_ESTABLISHED);
2464 TCP_PROBE5(accept__established, NULL, tp,
2465 m, tp, th);
2466 /*
2467 * TFO connections call cc_conn_init() during SYN
2468 * processing. Calling it again here for such
2469 * connections is not harmless as it would undo the
2470 * snd_cwnd reduction that occurs when a TFO SYN|ACK
2471 * is retransmitted.
2472 */
2473 if (!(tp->t_flags & TF_FASTOPEN))
2474 cc_conn_init(tp);
2475 tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));
2476 }
2477 /*
2478 * Account for the ACK of our SYN prior to
2479 * regular ACK processing below, except for
2480 * simultaneous SYN, which is handled later.
2481 */
2482 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
2483 incforsyn = 1;
2484 /*
2485 * If segment contains data or ACK, will call tcp_reass()
2486 * later; if not, do so now to pass queued data to user.
2487 */
2488 if (tlen == 0 && (thflags & TH_FIN) == 0) {
2489 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
2490 (struct mbuf *)0);
2491 tcp_handle_wakeup(tp);
2492 }
2493 tp->snd_wl1 = th->th_seq - 1;
2494 /* FALLTHROUGH */
2495
2496 /*
2497 * In ESTABLISHED state: drop duplicate ACKs; respond with an ACK
2498 * to out-of-range ACKs. If the ack is in the range
2499 * tp->snd_una < th->th_ack <= tp->snd_max
2500 * then advance tp->snd_una to th->th_ack and drop
2501 * data from the retransmission queue. If this ACK reflects
2502 * more up to date window information we update our window information.
2503 */
2504 case TCPS_ESTABLISHED:
2505 case TCPS_FIN_WAIT_1:
2506 case TCPS_FIN_WAIT_2:
2507 case TCPS_CLOSE_WAIT:
2508 case TCPS_CLOSING:
2509 case TCPS_LAST_ACK:
2510 if (SEQ_GT(th->th_ack, tp->snd_max)) {
2511 TCPSTAT_INC(tcps_rcvacktoomuch);
2512 goto dropafterack;
2513 }
2514 if (tcp_is_sack_recovery(tp, &to)) {
2515 sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
2516 if ((sack_changed != SACK_NOCHANGE) &&
2517 (tp->t_flags & TF_LRD)) {
2518 tcp_sack_lost_retransmission(tp, th);
2519 }
2520 } else
2521 /*
2522 * Reset the value so that previous (valid) value
2523 * from the last ack with SACK doesn't get used.
2524 */
2525 tp->sackhint.sacked_bytes = 0;
2526
2527 #ifdef TCP_HHOOK
2528 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
2529 hhook_run_tcp_est_in(tp, th, &to);
2530 #endif
2531
2532 if (SEQ_LT(th->th_ack, tp->snd_una)) {
2533 /* This is old ACK information, don't process it. */
2534 break;
2535 }
2536 if (th->th_ack == tp->snd_una) {
2537 /* Check if this is a duplicate ACK. */
2538 if ((tp->t_flags & TF_SACK_PERMIT) &&
2539 V_tcp_do_newsack) {
2540 /*
2541 * If SEG.ACK == SND.UNA, RFC 6675 requires a
2542 * duplicate ACK to selectively acknowledge
2543 * at least one byte, which was not selectively
2544 * acknowledged before.
2545 */
2546 if (sack_changed == SACK_NOCHANGE) {
2547 break;
2548 }
2549 } else {
2550 /*
2551 * If SEG.ACK == SND.UNA, RFC 5681 requires a
2552 * duplicate ACK to have no data on it and to
2553 * not be a window update.
2554 */
2555 if (!no_data || tiwin != tp->snd_wnd) {
2556 break;
2557 }
2558 }
2559 /*
2560 * If this is the first time we've seen a
2561 * FIN from the remote, this is not a
2562 * duplicate ACK and it needs to be processed
2563 * normally.
2564 * This happens during a simultaneous close.
2565 */
2566 if ((thflags & TH_FIN) &&
2567 (TCPS_HAVERCVDFIN(tp->t_state) == 0)) {
2568 tp->t_dupacks = 0;
2569 break;
2570 }
2571 /* Perform duplicate ACK processing. */
2572 TCPSTAT_INC(tcps_rcvdupack);
2573 maxseg = tcp_maxseg(tp);
2574 if (!tcp_timer_active(tp, TT_REXMT)) {
2575 tp->t_dupacks = 0;
2576 } else if (++tp->t_dupacks > tcprexmtthresh ||
2577 IN_FASTRECOVERY(tp->t_flags)) {
2578 cc_ack_received(tp, th, nsegs, CC_DUPACK);
2579 if (V_tcp_do_prr &&
2580 IN_FASTRECOVERY(tp->t_flags) &&
2581 (tp->t_flags & TF_SACK_PERMIT)) {
2582 tcp_do_prr_ack(tp, th, &to,
2583 sack_changed, &maxseg);
2584 } else if (tcp_is_sack_recovery(tp, &to) &&
2585 IN_FASTRECOVERY(tp->t_flags) &&
2586 (tp->snd_nxt == tp->snd_max)) {
2587 int awnd;
2588
2589 /*
2590 * Compute the amount of data in flight first.
2591 * We can inject new data into the pipe iff
2592 * we have less than ssthresh
2593 * worth of data in flight.
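					 * Worked example (illustrative):
					 * with maxseg = 1460 and
					 * delivered_data = 4000, cwnd grows
					 * below by imax(1460,
					 * imin(2920, 4000)) = 2920 bytes,
					 * and is then capped at ssthresh.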
2594 */
2595 awnd = tcp_compute_pipe(tp);
2596 if (awnd < tp->snd_ssthresh) {
2597 tp->snd_cwnd += imax(maxseg,
2598 imin(2 * maxseg,
2599 tp->sackhint.delivered_data));
2600 if (tp->snd_cwnd > tp->snd_ssthresh)
2601 tp->snd_cwnd = tp->snd_ssthresh;
2602 }
2603 } else if (tcp_is_sack_recovery(tp, &to) &&
2604 IN_FASTRECOVERY(tp->t_flags) &&
2605 SEQ_LT(tp->snd_nxt, tp->snd_max)) {
2606 tp->snd_cwnd += imax(maxseg,
2607 imin(2 * maxseg,
2608 tp->sackhint.delivered_data));
2609 } else {
2610 tp->snd_cwnd += maxseg;
2611 }
2612 (void) tcp_output(tp);
2613 goto drop;
2614 } else if (tp->t_dupacks == tcprexmtthresh ||
2615 (tp->t_flags & TF_SACK_PERMIT &&
2616 V_tcp_do_newsack &&
2617 tp->sackhint.sacked_bytes >
2618 (tcprexmtthresh - 1) * maxseg)) {
2619 enter_recovery:
2620 /*
2621 * Above is the RFC6675 trigger condition of
2622 * more than (dupthresh-1)*maxseg sacked data.
2623 * If the count of holes in the
2624 * scoreboard is >= dupthresh, we could
2625 * also enter loss recovery, but don't
2626 * have that value readily available.
2627 */
2628 tp->t_dupacks = tcprexmtthresh;
2629 tcp_seq onxt = tp->snd_nxt;
2630
2631 /*
2632 * If we're doing sack, check to
2633 * see if we're already in sack
2634 * recovery. If we're not doing sack,
2635 * check to see if we're in newreno
2636 * recovery.
2637 */
2638 if (tcp_is_sack_recovery(tp, &to)) {
2639 if (IN_FASTRECOVERY(tp->t_flags)) {
2640 tp->t_dupacks = 0;
2641 break;
2642 }
2643 } else {
2644 if (SEQ_LEQ(th->th_ack,
2645 tp->snd_recover)) {
2646 tp->t_dupacks = 0;
2647 break;
2648 }
2649 }
2650 /* Congestion signal before ack. */
2651 cc_cong_signal(tp, th, CC_NDUPACK);
2652 cc_ack_received(tp, th, nsegs, CC_DUPACK);
2653 tcp_timer_activate(tp, TT_REXMT, 0);
2654 tp->t_rtttime = 0;
2655 if (V_tcp_do_prr) {
2656 /*
2657 * snd_ssthresh and snd_recover are
2658 * already updated by cc_cong_signal.
2659 */
2660 if (tcp_is_sack_recovery(tp, &to)) {
2661 /*
2662 * Include Limited Transmit
2663 * segments here
2664 */
2665 tp->sackhint.prr_delivered =
2666 imin(tp->snd_max - th->th_ack,
2667 (tp->snd_limited + 1) * maxseg);
2668 } else {
2669 tp->sackhint.prr_delivered =
2670 maxseg;
2671 }
2672 tp->sackhint.recover_fs = max(1,
2673 tp->snd_nxt - tp->snd_una);
2674 }
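				/*
				 * Illustrative note: prr_delivered seeds PRR
				 * with data already delivered (including
				 * Limited Transmit segments) and recover_fs
				 * pins the flightsize at the start of
				 * recovery; tcp_do_prr_ack() later uses
				 * these per RFC 6937 to pace retransmissions.
				 */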
2675 tp->snd_limited = 0;
2676 if (tcp_is_sack_recovery(tp, &to)) {
2677 TCPSTAT_INC(tcps_sack_recovery_episode);
2678 /*
2679 * When entering loss recovery after an
2680 * RTO due to duplicate ACKs, retransmit
2681 * existing holes from the scoreboard.
2682 */
2683 tcp_resend_sackholes(tp);
2684 /* Avoid inflating cwnd in tcp_output */
2685 tp->snd_nxt = tp->snd_max;
2686 tp->snd_cwnd = tcp_compute_pipe(tp) +
2687 maxseg;
2688 (void) tcp_output(tp);
2689 /* Set cwnd to the expected flightsize */
2690 tp->snd_cwnd = tp->snd_ssthresh;
2691 goto drop;
2692 }
2693 tp->snd_nxt = th->th_ack;
2694 tp->snd_cwnd = maxseg;
2695 (void) tcp_output(tp);
2696 KASSERT(tp->snd_limited <= 2,
2697 ("%s: tp->snd_limited too big",
2698 __func__));
2699 tp->snd_cwnd = tp->snd_ssthresh +
2700 maxseg *
2701 (tp->t_dupacks - tp->snd_limited);
2702 if (SEQ_GT(onxt, tp->snd_nxt))
2703 tp->snd_nxt = onxt;
2704 goto drop;
2705 } else if (V_tcp_do_rfc3042) {
2706 /*
2707 * Process first and second duplicate
2708 * ACKs. Each indicates a segment
2709 * leaving the network, creating room
2710 * for more. Make sure we can send a
2711 * packet on reception of each duplicate
2712 * ACK by increasing snd_cwnd by one
2713 * segment. Restore the original
2714 * snd_cwnd after packet transmission.
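				 *
				 * Worked example (illustrative): on the
				 * first duplicate ACK with maxseg = 1460 and
				 * nothing retransmitted, cwnd is pinned to
				 * the current flightsize plus 1460 so that
				 * exactly one new segment can leave;
				 * snd_limited then records how many such
				 * limited transmits are outstanding (at
				 * most 2).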
2715 */
2716 cc_ack_received(tp, th, nsegs, CC_DUPACK);
2717 uint32_t oldcwnd = tp->snd_cwnd;
2718 tcp_seq oldsndmax = tp->snd_max;
2719 u_int sent;
2720 int avail;
2721
2722 KASSERT(tp->t_dupacks == 1 ||
2723 tp->t_dupacks == 2,
2724 ("%s: dupacks not 1 or 2",
2725 __func__));
2726 if (tp->t_dupacks == 1)
2727 tp->snd_limited = 0;
2728 if ((tp->snd_nxt == tp->snd_max) &&
2729 (tp->t_rxtshift == 0))
2730 tp->snd_cwnd =
2731 SEQ_SUB(tp->snd_nxt, tp->snd_una);
2732 tp->snd_cwnd +=
2733 (tp->t_dupacks - tp->snd_limited) * maxseg;
2734 tp->snd_cwnd -= tcp_sack_adjust(tp);
2735 /*
2736 * Only call tcp_output when there
2737 * is new data available to be sent
2738 * or we need to send an ACK.
2739 */
2740 SOCK_SENDBUF_LOCK(so);
2741 avail = sbavail(&so->so_snd);
2742 SOCK_SENDBUF_UNLOCK(so);
2743 if (tp->t_flags & TF_ACKNOW ||
2744 (avail >=
2745 SEQ_SUB(tp->snd_nxt, tp->snd_una))) {
2746 (void) tcp_output(tp);
2747 }
2748 sent = SEQ_SUB(tp->snd_max, oldsndmax);
2749 if (sent > maxseg) {
2750 KASSERT((tp->t_dupacks == 2 &&
2751 tp->snd_limited == 0) ||
2752 (sent == maxseg + 1 &&
2753 tp->t_flags & TF_SENTFIN) ||
2754 (sent < 2 * maxseg &&
2755 tp->t_flags & TF_NODELAY),
2756 ("%s: sent too much: %u>%u",
2757 __func__, sent, maxseg));
2758 tp->snd_limited = 2;
2759 } else if (sent > 0) {
2760 ++tp->snd_limited;
2761 }
2762 tp->snd_cwnd = oldcwnd;
2763 goto drop;
2764 }
2765 break;
2766 }
2767 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
2768 ("%s: SEQ_LEQ(th_ack, snd_una)", __func__));
2769 /*
2770 * This ack is advancing the left edge, reset the
2771 * counter.
2772 */
2773 tp->t_dupacks = 0;
2774 /*
2775 * If this ack also has new SACK info, increment the
2776 * t_dupacks as per RFC 6675. The variable
2777 * sack_changed tracks all changes to the SACK
2778 * scoreboard, including when partial ACKs without
2779 * SACK options are received, and clear the scoreboard
2780 * from the left side. Such partial ACKs should not be
2781 * counted as dupacks here.
2782 */
2783 if (V_tcp_do_newsack &&
2784 tcp_is_sack_recovery(tp, &to) &&
2785 (((tp->t_rxtshift == 0) && (sack_changed != SACK_NOCHANGE)) ||
2786 ((tp->t_rxtshift > 0) && (sack_changed == SACK_NEWLOSS))) &&
2787 (tp->snd_nxt == tp->snd_max)) {
2788 tp->t_dupacks++;
2789 /* limit overhead by setting maxseg last */
2790 if (!IN_FASTRECOVERY(tp->t_flags) &&
2791 (tp->sackhint.sacked_bytes >
2792 (tcprexmtthresh - 1) * (maxseg = tcp_maxseg(tp)))) {
2793 goto enter_recovery;
2794 }
2795 }
2796 /*
2797 * If the congestion window was inflated to account
2798 * for the other side's cached packets, retract it.
2799 */
2800 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
2801 if (IN_FASTRECOVERY(tp->t_flags)) {
2802 if (tp->t_flags & TF_SACK_PERMIT) {
2803 if (V_tcp_do_prr &&
2804 (to.to_flags & TOF_SACK)) {
2805 tcp_timer_activate(tp,
2806 TT_REXMT, 0);
2807 tp->t_rtttime = 0;
2808 tcp_do_prr_ack(tp, th, &to,
2809 sack_changed, &maxseg);
2810 tp->t_flags |= TF_ACKNOW;
2811 (void) tcp_output(tp);
2812 } else {
2813 tcp_sack_partialack(tp, th,
2814 &maxseg);
2815 }
2816 } else {
2817 tcp_newreno_partial_ack(tp, th);
2818 }
2819 } else if (IN_CONGRECOVERY(tp->t_flags) &&
2820 (V_tcp_do_prr)) {
2821 tp->sackhint.delivered_data =
2822 BYTES_THIS_ACK(tp, th);
2823 tp->snd_fack = th->th_ack;
2824 /*
2825 * During ECN cwnd reduction
2826 * always use PRR-SSRB
2827 */
2828 tcp_do_prr_ack(tp, th, &to, SACK_CHANGE,
2829 &maxseg);
2830 (void) tcp_output(tp);
2831 }
2832 }
2833 /*
2834 * If we reach this point, ACK is not a duplicate,
2835 * i.e., it ACKs something we sent.
2836 */
2837 if (tp->t_flags & TF_NEEDSYN) {
2838 /*
2839 * T/TCP: Connection was half-synchronized, and our
2840 * SYN has been ACK'd (so connection is now fully
2841 * synchronized). Go to non-starred state,
2842 * increment snd_una for ACK of SYN, and check if
2843 * we can do window scaling.
2844 */
2845 tp->t_flags &= ~TF_NEEDSYN;
2846 tp->snd_una++;
2847 /* Do window scaling? */
2848 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
2849 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
2850 tp->rcv_scale = tp->request_r_scale;
2851 /* Send window already scaled. */
2852 }
2853 }
2854
2855 process_ACK:
2856 INP_WLOCK_ASSERT(inp);
2857
2858 /*
2859 * Adjust for the SYN bit in sequence space,
2860 * but don't account for it in cwnd calculations.
2861 * This is for the SYN_RECEIVED, non-simultaneous
2862 * SYN case. SYN_SENT and simultaneous SYN are
2863 * treated elsewhere.
2864 */
2865 if (incforsyn)
2866 tp->snd_una++;
2867 acked = BYTES_THIS_ACK(tp, th);
2868 KASSERT(acked >= 0, ("%s: acked unexpectedly negative "
2869 "(tp->snd_una=%u, th->th_ack=%u, tp=%p, m=%p)", __func__,
2870 tp->snd_una, th->th_ack, tp, m));
2871 TCPSTAT_ADD(tcps_rcvackpack, nsegs);
2872 TCPSTAT_ADD(tcps_rcvackbyte, acked);
2873
2874 /*
2875 * If we just performed our first retransmit, and the ACK
2876 * arrives within our recovery window, then it was a mistake
2877 * to do the retransmit in the first place. Recover our
2878 * original cwnd and ssthresh, and proceed to transmit where
2879 * we left off.
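		 * Worked example (illustrative): if the echoed timestamp
		 * (to.to_tsecr) predates t_badrxtwin, the ACK must have been
		 * generated by the original transmission rather than the
		 * retransmit, so CC_RTO_ERR rolls cwnd and ssthresh back.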
2880 */
2881 if (tp->t_rxtshift == 1 &&
2882 tp->t_flags & TF_PREVVALID &&
2883 tp->t_badrxtwin != 0 &&
2884 to.to_flags & TOF_TS &&
2885 to.to_tsecr != 0 &&
2886 TSTMP_LT(to.to_tsecr, tp->t_badrxtwin))
2887 cc_cong_signal(tp, th, CC_RTO_ERR);
2888
2889 /*
2890 * If we have a timestamp reply, update smoothed
2891 * round trip time. If no timestamp is present but
2892 * transmit timer is running and timed sequence
2893 * number was acked, update smoothed round trip time.
2894 * Since we now have an rtt measurement, cancel the
2895 * timer backoff (cf., Phil Karn's retransmit alg.).
2896 * Recompute the initial retransmit timer.
2897 *
2898 * Some boxes send broken timestamp replies
2899 * during the SYN+ACK phase, ignore
2900 * timestamps of 0 or we could calculate a
2901 * huge RTT and blow up the retransmit timer.
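		 *
		 * Worked example (illustrative): if tcp_ts_getticks()
		 * returns 1025 while the peer echoes tsecr = 1000, the raw
		 * sample is 25 timestamp ticks and TCP_TS_TO_TICKS(25) + 1
		 * is passed to tcp_xmit_timer(); the +1 keeps a fast ACK
		 * from yielding a zero RTT.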
2902 */
2903 if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
2904 uint32_t t;
2905
2906 t = tcp_ts_getticks() - to.to_tsecr;
2907 if (!tp->t_rttlow || tp->t_rttlow > t)
2908 tp->t_rttlow = t;
2909 tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
2910 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
2911 if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
2912 tp->t_rttlow = ticks - tp->t_rtttime;
2913 tcp_xmit_timer(tp, ticks - tp->t_rtttime);
2914 }
2915
2916 SOCK_SENDBUF_LOCK(so);
2917 /*
2918 * Clear t_acktime if remote side has ACKd all data in the
2919 * socket buffer and FIN (if applicable).
2920 * Otherwise, update t_acktime if we received a sufficiently
2921 * large ACK.
2922 */
2923 if ((tp->t_state <= TCPS_CLOSE_WAIT &&
2924 acked == sbavail(&so->so_snd)) ||
2925 acked > sbavail(&so->so_snd))
2926 tp->t_acktime = 0;
2927 else if (acked > 1)
2928 tp->t_acktime = ticks;
2929
2930 /*
2931 * If all outstanding data is acked, stop retransmit
2932 * timer and remember to restart (more output or persist).
2933 * If there is more data to be acked, restart retransmit
2934 * timer, using current (possibly backed-off) value.
2935 */
2936 if (th->th_ack == tp->snd_max) {
2937 tcp_timer_activate(tp, TT_REXMT, 0);
2938 needoutput = 1;
2939 } else if (!tcp_timer_active(tp, TT_PERSIST))
2940 tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp));
2941
2942 /*
2943 * If no data (only SYN) was ACK'd,
2944 * skip rest of ACK processing.
2945 */
2946 if (acked == 0) {
2947 SOCK_SENDBUF_UNLOCK(so);
2948 goto step6;
2949 }
2950
2951 /*
2952 * Let the congestion control algorithm update congestion
2953 * control related information. This typically means increasing
2954 * the congestion window.
2955 */
2956 cc_ack_received(tp, th, nsegs, CC_ACK);
2957
2958 if (acked > sbavail(&so->so_snd)) {
2959 if (tp->snd_wnd >= sbavail(&so->so_snd))
2960 tp->snd_wnd -= sbavail(&so->so_snd);
2961 else
2962 tp->snd_wnd = 0;
2963 mfree = sbcut_locked(&so->so_snd,
2964 (int)sbavail(&so->so_snd));
2965 ourfinisacked = 1;
2966 } else {
2967 mfree = sbcut_locked(&so->so_snd, acked);
2968 if (tp->snd_wnd >= (uint32_t) acked)
2969 tp->snd_wnd -= acked;
2970 else
2971 tp->snd_wnd = 0;
2972 ourfinisacked = 0;
2973 }
2974 /* NB: sowwakeup_locked() does an implicit unlock. */
2975 sowwakeup_locked(so);
2976 m_freem(mfree);
2977 /* Detect una wraparound. */
2978 if (!IN_RECOVERY(tp->t_flags) &&
2979 SEQ_GT(tp->snd_una, tp->snd_recover) &&
2980 SEQ_LEQ(th->th_ack, tp->snd_recover))
2981 tp->snd_recover = th->th_ack - 1;
2982 tp->snd_una = th->th_ack;
2983 if (IN_RECOVERY(tp->t_flags) &&
2984 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
2985 cc_post_recovery(tp, th);
2986 }
2987 if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
2988 tp->snd_recover = tp->snd_una;
2989 }
2990 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
2991 tp->snd_nxt = tp->snd_una;
2992
2993 switch (tp->t_state) {
2994 /*
2995 * In FIN_WAIT_1 STATE in addition to the processing
2996 * for the ESTABLISHED state if our FIN is now acknowledged
2997 * then enter FIN_WAIT_2.
2998 */
2999 case TCPS_FIN_WAIT_1:
3000 if (ourfinisacked) {
3001 /*
3002 * If we can't receive any more
3003 * data, then closing user can proceed.
3004 * Starting the timer is contrary to the
3005 * specification, but if we don't get a FIN
3006 * we'll hang forever.
3007 */
3008 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
3009 tcp_free_sackholes(tp);
3010 soisdisconnected(so);
3011 tcp_timer_activate(tp, TT_2MSL,
3012 (tcp_fast_finwait2_recycle ?
3013 tcp_finwait2_timeout :
3014 TP_MAXIDLE(tp)));
3015 }
3016 tcp_state_change(tp, TCPS_FIN_WAIT_2);
3017 }
3018 break;
3019
3020 /*
3021 * In CLOSING STATE in addition to the processing for
3022 * the ESTABLISHED state if the ACK acknowledges our FIN
3023 * then enter the TIME-WAIT state, otherwise ignore
3024 * the segment.
3025 */
3026 case TCPS_CLOSING:
3027 if (ourfinisacked) {
3028 tcp_twstart(tp);
3029 m_freem(m);
3030 return;
3031 }
3032 break;
3033
3034 /*
3035 * In LAST_ACK, we may still be waiting for data to drain
3036 * and/or to be acked, as well as for the ack of our FIN.
3037 * If our FIN is now acknowledged, delete the TCB,
3038 * enter the closed state and return.
3039 */
3040 case TCPS_LAST_ACK:
3041 if (ourfinisacked) {
3042 tp = tcp_close(tp);
3043 goto drop;
3044 }
3045 break;
3046 }
3047 }
3048
3049 step6:
3050 INP_WLOCK_ASSERT(inp);
3051
3052 /*
3053 * Update window information.
3054 * Don't look at window if no ACK: TACs send garbage on first SYN.
3055 */
3056 if ((thflags & TH_ACK) &&
3057 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
3058 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
3059 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
3060 /* keep track of pure window updates */
3061 if (no_data && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
3062 TCPSTAT_INC(tcps_rcvwinupd);
3063 tp->snd_wnd = tiwin;
3064 tp->snd_wl1 = th->th_seq;
3065 tp->snd_wl2 = th->th_ack;
3066 if (tp->snd_wnd > tp->max_sndwnd)
3067 tp->max_sndwnd = tp->snd_wnd;
3068 needoutput = 1;
3069 }
3070
3071 /*
3072 * Process segments with URG.
3073 */
3074 if ((thflags & TH_URG) && th->th_urp &&
3075 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3076 /*
3077 * This is a kludge, but if we receive and accept
3078 * random urgent pointers, we'll crash in
3079 * soreceive. It's hard to imagine someone
3080 * actually wanting to send this much urgent data.
3081 */
3082 SOCK_RECVBUF_LOCK(so);
3083 if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
3084 th->th_urp = 0; /* XXX */
3085 thflags &= ~TH_URG; /* XXX */
3086 SOCK_RECVBUF_UNLOCK(so); /* XXX */
3087 goto dodata; /* XXX */
3088 }
3089 /*
3090 * If this segment advances the known urgent pointer,
3091 * then mark the data stream. This should not happen
3092 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
3093 * a FIN has been received from the remote side.
3094 * In these states we ignore the URG.
3095 *
3096 * According to RFC961 (Assigned Protocols),
3097 * the urgent pointer points to the last octet
3098 * of urgent data. We continue, however,
3099 * to consider it to indicate the first octet
3100 * of data past the urgent section as the original
3101 * spec states (in one of two places).
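		 *
		 * Worked example (illustrative): with 100 bytes already
		 * queued in so_rcv, rcv_nxt = 5000 and rcv_up advancing to
		 * 5010, so_oobmark below becomes 100 + (5010 - 5000) - 1 =
		 * 109, the stream offset of the out-of-band byte.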
3102 */
3103 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
3104 tp->rcv_up = th->th_seq + th->th_urp;
3105 so->so_oobmark = sbavail(&so->so_rcv) +
3106 (tp->rcv_up - tp->rcv_nxt) - 1;
3107 if (so->so_oobmark == 0)
3108 so->so_rcv.sb_state |= SBS_RCVATMARK;
3109 sohasoutofband(so);
3110 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
3111 }
3112 SOCK_RECVBUF_UNLOCK(so);
3113 /*
3114 * Remove out of band data so doesn't get presented to user.
3115 * This can happen independent of advancing the URG pointer,
3116 * but if two URG's are pending at once, some out-of-band
3117 * data may creep in... ick.
3118 */
3119 if (th->th_urp <= (uint32_t)tlen &&
3120 !(so->so_options & SO_OOBINLINE)) {
3121 /* hdr drop is delayed */
3122 tcp_pulloutofband(so, th, m, drop_hdrlen);
3123 }
3124 } else {
3125 /*
3126 * If no out of band data is expected,
3127 * pull receive urgent pointer along
3128 * with the receive window.
3129 */
3130 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
3131 tp->rcv_up = tp->rcv_nxt;
3132 }
3133 dodata: /* XXX */
3134 INP_WLOCK_ASSERT(inp);
3135
3136 /*
3137 * Process the segment text, merging it into the TCP sequencing queue,
3138 * and arranging for acknowledgment of receipt if necessary.
3139 * This process logically involves adjusting tp->rcv_wnd as data
3140 * is presented to the user (this happens in tcp_usrreq.c,
3141 * case PRU_RCVD). If a FIN has already been received on this
3142 * connection then we just ignore the text.
3143 */
3144 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
3145 (tp->t_flags & TF_FASTOPEN));
3146 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
3147 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3148 tcp_seq save_start = th->th_seq;
3149 tcp_seq save_rnxt = tp->rcv_nxt;
3150 int save_tlen = tlen;
3151 m_adj(m, drop_hdrlen); /* delayed header drop */
3152 /*
3153 * Insert segment which includes th into TCP reassembly queue
3154 * with control block tp. Set thflags to whether reassembly now
3155 * includes a segment with FIN. This handles the common case
3156 * inline (segment is the next to be received on an established
3157 * connection, and the queue is empty), avoiding linkage into
3158 * and removal from the queue and repetition of various
3159 * conversions.
3160 * Set DELACK for segments received in order, but ack
3161 * immediately when segments are out of order (so
3162 * fast retransmit can work).
3163 */
3164 if (th->th_seq == tp->rcv_nxt &&
3165 SEGQ_EMPTY(tp) &&
3166 (TCPS_HAVEESTABLISHED(tp->t_state) ||
3167 tfo_syn)) {
3168 if (DELAY_ACK(tp, tlen) || tfo_syn)
3169 tp->t_flags |= TF_DELACK;
3170 else
3171 tp->t_flags |= TF_ACKNOW;
3172 tp->rcv_nxt += tlen;
3173 if (tlen &&
3174 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
3175 (tp->t_fbyte_in == 0)) {
3176 tp->t_fbyte_in = ticks;
3177 if (tp->t_fbyte_in == 0)
3178 tp->t_fbyte_in = 1;
3179 if (tp->t_fbyte_out && tp->t_fbyte_in)
3180 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
3181 }
3182 thflags = tcp_get_flags(th) & TH_FIN;
3183 TCPSTAT_INC(tcps_rcvpack);
3184 TCPSTAT_ADD(tcps_rcvbyte, tlen);
3185 SOCK_RECVBUF_LOCK(so);
3186 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
3187 m_freem(m);
3188 else
3189 sbappendstream_locked(&so->so_rcv, m, 0);
3190 tp->t_flags |= TF_WAKESOR;
3191 } else {
3192 /*
3193 * XXX: Due to the header drop above, "th" is
3194 * theoretically invalid by now. Fortunately
3195 * m_adj() doesn't actually free any mbufs
3196 * when trimming from the head.
3197 */
3198 tcp_seq temp = save_start;
3199
3200 thflags = tcp_reass(tp, th, &temp, &tlen, m);
3201 tp->t_flags |= TF_ACKNOW;
3202 }
3203 if ((tp->t_flags & TF_SACK_PERMIT) &&
3204 (save_tlen > 0) &&
3205 TCPS_HAVEESTABLISHED(tp->t_state)) {
3206 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
3207 /*
3208 * DSACK actually handled in the fastpath
3209 * above.
3210 */
3211 tcp_update_sack_list(tp, save_start,
3212 save_start + save_tlen);
3213 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
3214 if ((tp->rcv_numsacks >= 1) &&
3215 (tp->sackblks[0].end == save_start)) {
3216 /*
3217 * Partial overlap, recorded at todrop
3218 * above.
3219 */
3220 tcp_update_sack_list(tp,
3221 tp->sackblks[0].start,
3222 tp->sackblks[0].end);
3223 } else {
3224 tcp_update_dsack_list(tp, save_start,
3225 save_start + save_tlen);
3226 }
3227 } else if (tlen >= save_tlen) {
3228 /* Update of sackblks. */
3229 tcp_update_dsack_list(tp, save_start,
3230 save_start + save_tlen);
3231 } else if (tlen > 0) {
3232 tcp_update_dsack_list(tp, save_start,
3233 save_start + tlen);
3234 }
3235 }
3236 tcp_handle_wakeup(tp);
3237 #if 0
3238 /*
3239 * Note the amount of data that peer has sent into
3240 * our window, in order to estimate the sender's
3241 * buffer size.
3242 * XXX: Unused.
3243 */
3244 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
3245 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
3246 else
3247 len = so->so_rcv.sb_hiwat;
3248 #endif
3249 } else {
3250 if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
3251 if (tlen > 0) {
3252 if ((thflags & TH_FIN) != 0) {
3253 log(LOG_DEBUG, "%s; %s: %s: "
3254 "Received %d bytes of data and FIN "
3255 "after having received a FIN, "
3256 "just dropping both\n",
3257 s, __func__,
3258 tcpstates[tp->t_state], tlen);
3259 } else {
3260 log(LOG_DEBUG, "%s; %s: %s: "
3261 "Received %d bytes of data "
3262 "after having received a FIN, "
3263 "just dropping it\n",
3264 s, __func__,
3265 tcpstates[tp->t_state], tlen);
3266 }
3267 } else {
3268 if ((thflags & TH_FIN) != 0) {
3269 log(LOG_DEBUG, "%s; %s: %s: "
3270 "Received FIN "
3271 "after having received a FIN, "
3272 "just dropping it\n",
3273 s, __func__,
3274 tcpstates[tp->t_state]);
3275 }
3276 }
3277 free(s, M_TCPLOG);
3278 }
3279 m_freem(m);
3280 thflags &= ~TH_FIN;
3281 }
3282
3283 /*
3284 * If FIN is received ACK the FIN and let the user know
3285 * that the connection is closing.
3286 */
3287 if (thflags & TH_FIN) {
3288 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
3289 /* The socket upcall is handled by socantrcvmore. */
3290 socantrcvmore(so);
3291 /*
3292 * If connection is half-synchronized
3293 * (ie NEEDSYN flag on) then delay ACK,
3294 * so it may be piggybacked when SYN is sent.
3295 * Otherwise, since we received a FIN then no
3296 * more input can be expected, send ACK now.
3297 */
3298 if (tp->t_flags & TF_NEEDSYN)
3299 tp->t_flags |= TF_DELACK;
3300 else
3301 tp->t_flags |= TF_ACKNOW;
3302 tp->rcv_nxt++;
3303 }
3304 switch (tp->t_state) {
3305 /*
3306 * In SYN_RECEIVED and ESTABLISHED STATES
3307 * enter the CLOSE_WAIT state.
3308 */
3309 case TCPS_SYN_RECEIVED:
3310 tp->t_starttime = ticks;
3311 /* FALLTHROUGH */
3312 case TCPS_ESTABLISHED:
3313 tcp_state_change(tp, TCPS_CLOSE_WAIT);
3314 break;
3315
3316 /*
3317 * If still in FIN_WAIT_1 STATE FIN has not been acked so
3318 * enter the CLOSING state.
3319 */
3320 case TCPS_FIN_WAIT_1:
3321 tcp_state_change(tp, TCPS_CLOSING);
3322 break;
3323
3324 /*
3325 * In FIN_WAIT_2 state enter the TIME_WAIT state,
3326 * starting the time-wait timer, turning off the other
3327 * standard timers.
3328 */
3329 case TCPS_FIN_WAIT_2:
3330 tcp_twstart(tp);
3331 return;
3332 }
3333 }
3334 TCP_PROBE3(debug__input, tp, th, m);
3335
3336 /*
3337 * Return any desired output.
3338 */
3339 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
3340 (void) tcp_output(tp);
3341 }
3342 check_delack:
3343 INP_WLOCK_ASSERT(inp);
3344
3345 if (tp->t_flags & TF_DELACK) {
3346 tp->t_flags &= ~TF_DELACK;
3347 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
3348 }
3349 INP_WUNLOCK(inp);
3350 return;
3351
3352 dropafterack:
3353 /*
3354 * Generate an ACK dropping incoming segment if it occupies
3355 * sequence space, where the ACK reflects our state.
3356 *
3357 * We can now skip the test for the RST flag since all
3358 * paths to this code happen after packets containing
3359 * RST have been dropped.
3360 *
3361 * In the SYN-RECEIVED state, don't send an ACK unless the
3362 * segment we received passes the SYN-RECEIVED ACK test.
3363 * If it fails, send a RST. This breaks the loop in the
3364 * "LAND" DoS attack, and also prevents an ACK storm
3365 * between two listening ports that have been sent forged
3366 * SYN segments, each with the source address of the other.
3367 */
3368 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
3369 (SEQ_GT(tp->snd_una, th->th_ack) ||
3370 SEQ_GT(th->th_ack, tp->snd_max)) ) {
3371 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
3372 goto dropwithreset;
3373 }
3374 TCP_PROBE3(debug__input, tp, th, m);
3375 tp->t_flags |= TF_ACKNOW;
3376 (void) tcp_output(tp);
3377 INP_WUNLOCK(inp);
3378 m_freem(m);
3379 return;
3380
3381 dropwithreset:
3382 tcp_dropwithreset(m, th, tp, tlen);
3383 if (tp != NULL) {
3384 INP_WUNLOCK(inp);
3385 }
3386 return;
3387
3388 drop:
3389 /*
3390 * Drop space held by incoming segment and return.
3391 */
3392 TCP_PROBE3(debug__input, tp, th, m);
3393 if (tp != NULL) {
3394 INP_WUNLOCK(inp);
3395 }
3396 m_freem(m);
3397 }
3398
3399 /*
3400 * Issue RST and make ACK acceptable to originator of segment.
3401 * The mbuf must still include the original packet header.
3402 * tp may be NULL.
3403 */
3404 void
3405 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen)
3406 {
3407 #ifdef INET
3408 struct ip *ip;
3409 #endif
3410 #ifdef INET6
3411 struct ip6_hdr *ip6;
3412 #endif
3413
3414 if (tp != NULL) {
3415 INP_LOCK_ASSERT(tptoinpcb(tp));
3416 }
3417
3418 /* Don't bother if destination was broadcast/multicast. */
3419 if ((tcp_get_flags(th) & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
3420 goto drop;
3421 #ifdef INET6
3422 if (mtod(m, struct ip *)->ip_v == 6) {
3423 ip6 = mtod(m, struct ip6_hdr *);
3424 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
3425 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
3426 goto drop;
3427 /* IPv6 anycast check is done at tcp6_input() */
3428 }
3429 #endif
3430 #if defined(INET) && defined(INET6)
3431 else
3432 #endif
3433 #ifdef INET
3434 {
3435 ip = mtod(m, struct ip *);
3436 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
3437 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
3438 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
3439 in_ifnet_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
3440 goto drop;
3441 }
3442 #endif
3443
3444 /* Perform bandwidth limiting. */
3445 if (badport_bandlim(BANDLIM_TCP_RST) < 0)
3446 goto drop;
3447
3448 /* tcp_respond consumes the mbuf chain. */
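	/*
	 * Editorial note: per RFC 793 reset generation, a RST answering an
	 * ACK-bearing segment is sent with SEQ = SEG.ACK and no ACK bit;
	 * otherwise SEQ = 0 and ACK = SEG.SEQ + SEG.LEN (with SYN and FIN
	 * each counting as one), as computed below.
	 */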
3449 if (tcp_get_flags(th) & TH_ACK) {
3450 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
3451 th->th_ack, TH_RST);
3452 } else {
3453 if (tcp_get_flags(th) & TH_SYN)
3454 tlen++;
3455 if (tcp_get_flags(th) & TH_FIN)
3456 tlen++;
3457 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
3458 (tcp_seq)0, TH_RST|TH_ACK);
3459 }
3460 return;
3461 drop:
3462 m_freem(m);
3463 }
3464
3465 /*
3466 * Parse TCP options and place in tcpopt.
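 *
 * Example (illustrative): a timestamp option arrives as the ten bytes
 * 08 0a <tsval:4> <tsecr:4>; the loop below sets TOF_TS and converts
 * both values to host byte order, while NOP (0x01) merely advances the
 * cursor and EOL (0x00) ends the scan.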
3467 */
3468 void
3469 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
3470 {
3471 int opt, optlen;
3472
3473 to->to_flags = 0;
3474 for (; cnt > 0; cnt -= optlen, cp += optlen) {
3475 opt = cp[0];
3476 if (opt == TCPOPT_EOL)
3477 break;
3478 if (opt == TCPOPT_NOP)
3479 optlen = 1;
3480 else {
3481 if (cnt < 2)
3482 break;
3483 optlen = cp[1];
3484 if (optlen < 2 || optlen > cnt)
3485 break;
3486 }
3487 switch (opt) {
3488 case TCPOPT_MAXSEG:
3489 if (optlen != TCPOLEN_MAXSEG)
3490 continue;
3491 if (!(flags & TO_SYN))
3492 continue;
3493 to->to_flags |= TOF_MSS;
3494 bcopy((char *)cp + 2,
3495 (char *)&to->to_mss, sizeof(to->to_mss));
3496 to->to_mss = ntohs(to->to_mss);
3497 break;
3498 case TCPOPT_WINDOW:
3499 if (optlen != TCPOLEN_WINDOW)
3500 continue;
3501 if (!(flags & TO_SYN))
3502 continue;
3503 to->to_flags |= TOF_SCALE;
3504 to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
3505 break;
3506 case TCPOPT_TIMESTAMP:
3507 if (optlen != TCPOLEN_TIMESTAMP)
3508 continue;
3509 to->to_flags |= TOF_TS;
3510 bcopy((char *)cp + 2,
3511 (char *)&to->to_tsval, sizeof(to->to_tsval));
3512 to->to_tsval = ntohl(to->to_tsval);
3513 bcopy((char *)cp + 6,
3514 (char *)&to->to_tsecr, sizeof(to->to_tsecr));
3515 to->to_tsecr = ntohl(to->to_tsecr);
3516 break;
3517 case TCPOPT_SIGNATURE:
3518 /*
3519 * In order to reply to a host which has set the
3520 * TCP_SIGNATURE option in its initial SYN, we have
3521 * to record the fact that the option was observed
3522 * here for the syncache code to perform the correct
3523 * response.
3524 */
3525 if (optlen != TCPOLEN_SIGNATURE)
3526 continue;
3527 to->to_flags |= TOF_SIGNATURE;
3528 to->to_signature = cp + 2;
3529 break;
3530 case TCPOPT_SACK_PERMITTED:
3531 if (optlen != TCPOLEN_SACK_PERMITTED)
3532 continue;
3533 if (!(flags & TO_SYN))
3534 continue;
3535 if (!V_tcp_do_sack)
3536 continue;
3537 to->to_flags |= TOF_SACKPERM;
3538 break;
3539 case TCPOPT_SACK:
3540 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
3541 continue;
3542 if (flags & TO_SYN)
3543 continue;
3544 to->to_flags |= TOF_SACK;
3545 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
3546 to->to_sacks = cp + 2;
3547 TCPSTAT_INC(tcps_sack_rcv_blocks);
3548 break;
3549 case TCPOPT_FAST_OPEN:
3550 /*
3551 * Cookie length validation is performed by the
3552 * server side cookie checking code or the client
3553 * side cookie cache update code.
3554 */
3555 if (!(flags & TO_SYN))
3556 continue;
3557 if (!V_tcp_fastopen_client_enable &&
3558 !V_tcp_fastopen_server_enable)
3559 continue;
3560 to->to_flags |= TOF_FASTOPEN;
3561 to->to_tfo_len = optlen - 2;
3562 to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
3563 break;
3564 default:
3565 continue;
3566 }
3567 }
3568 }
3569
3570 /*
3571 * Pull out of band byte out of a segment so
3572 * it doesn't appear in the user's data queue.
3573 * It is still reflected in the segment length for
3574 * sequencing purposes.
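 *
 * Worked example (illustrative): with off = 40 (the dropped header
 * length) and th_urp = 3, the out-of-band byte sits at chain offset
 * 40 + 3 - 1 = 42; it is saved in t_iobc and spliced out of the mbuf
 * data.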
3575 */
3576 void
3577 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
3578 int off)
3579 {
3580 int cnt = off + th->th_urp - 1;

	while (cnt >= 0) {
		if (m->m_len > cnt) {
			char *cp = mtod(m, caddr_t) + cnt;
			struct tcpcb *tp = sototcpcb(so);

			INP_WLOCK_ASSERT(tptoinpcb(tp));

			tp->t_iobc = *cp;
			tp->t_oobflags |= TCPOOB_HAVEDATA;
			bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
			m->m_len--;
			if (m->m_flags & M_PKTHDR)
				m->m_pkthdr.len--;
			return;
		}
		cnt -= m->m_len;
		m = m->m_next;
		if (m == NULL)
			break;
	}
	panic("tcp_pulloutofband");
}

/*
 * Collect a new round-trip time estimate
 * and update averages and current timeout.
 */
void
tcp_xmit_timer(struct tcpcb *tp, int rtt)
{
	int delta;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	TCPSTAT_INC(tcps_rttupdated);
	if (tp->t_rttupdated < UCHAR_MAX)
		tp->t_rttupdated++;
#ifdef STATS
	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
	    imax(0, rtt * 1000 / hz));
#endif
	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
		/*
		 * srtt is stored as fixed point with 5 bits after the
		 * binary point (i.e., scaled by 32).  The following magic
		 * is equivalent to the smoothing algorithm in rfc793 with
		 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
		 * point).  Adjust rtt to origin 0.
		 */
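		/*
		 * A worked example of the fixed-point update, assuming
		 * TCP_RTT_SHIFT = 5 and TCP_DELTA_SHIFT = 2: with
		 * t_srtt = 320 (10 ticks scaled by 32) and a new rtt
		 * sample of 14 ticks, delta = (13 << 2) - (320 >> 3) =
		 * 52 - 40 = 12.  Adding that scale-4 delta to the
		 * scale-32 t_srtt gives 332, i.e. 10.375 ticks: the
		 * smoothed estimate moves 1/8 of the way toward the
		 * origin-adjusted sample of 13.
		 */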
		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));

		if ((tp->t_srtt += delta) <= 0)
			tp->t_srtt = 1;

		/*
		 * We accumulate a smoothed rtt variance (actually, a
		 * smoothed mean difference), then set the retransmit
		 * timer to smoothed rtt + 4 times the smoothed variance.
		 * rttvar is stored as fixed point with 4 bits after the
		 * binary point (scaled by 16).  The following is
		 * equivalent to rfc793 smoothing with an alpha of .75
		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
		 * rfc793's wired-in beta.
		 */
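		/*
		 * Continuing the example above (delta = 12 in scale-4
		 * units, i.e. 3 ticks): with t_rttvar = 32 (2 ticks
		 * scaled by 16), delta -= 32 >> 2 gives 12 - 8 = 4, and
		 * t_rttvar becomes 36, i.e. 2.25 ticks -- the mean
		 * deviation moves 1/4 of the way toward |delta|.
		 */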
		if (delta < 0)
			delta = -delta;
		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
		if ((tp->t_rttvar += delta) <= 0)
			tp->t_rttvar = 1;
	} else {
		/*
		 * No rtt measurement yet - use the unsmoothed rtt.
		 * Set the variance to half the rtt (so our first
		 * retransmit happens at 3*rtt).
		 */
		tp->t_srtt = rtt << TCP_RTT_SHIFT;
		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
	}
	tp->t_rtttime = 0;
	tp->t_rxtshift = 0;

	/*
	 * The retransmit should happen at rtt + 4 * rttvar.
	 * Because of the way we do the smoothing, srtt and rttvar
	 * will each average +1/2 tick of bias.  When we compute
	 * the retransmit timer, we want 1/2 tick of rounding and
	 * 1 extra tick because of +-1/2 tick uncertainty in the
	 * firing of the timer.  The bias will give us exactly the
	 * 1.5 tick we need.  But, because the bias is
	 * statistical, we have to test that we don't drop below
	 * the minimum feasible timer (which is 2 ticks).
	 */
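	/*
	 * For instance, with the values above (srtt ~10.4 ticks,
	 * rttvar ~2.25 ticks) the timer lands near 10.4 + 4 * 2.25 =
	 * 19.4 ticks, clamped between max(t_rttmin, rtt + 2) and
	 * tcp_rexmit_max.
	 */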
	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
	    max(tp->t_rttmin, rtt + 2), tcp_rexmit_max);

	/*
	 * We received an ack for a packet that wasn't retransmitted;
	 * it is probably safe to discard any error indications we've
	 * received recently.  This isn't quite right, but close enough
	 * for now (a route might have failed after we sent a segment,
	 * and the return path might not be symmetrical).
	 */
	tp->t_softerror = 0;
}

/*
 * Determine a reasonable value for maxseg size.
 * If the route is known, check route for mtu.
 * If none, use an mss that can be handled on the outgoing interface
 * without forcing IP to fragment.  If no route is found, route has no mtu,
 * or the destination isn't local, use a default, hopefully conservative
 * size (usually 512 or the default IP max size, but no more than the mtu
 * of the interface), as we can't discover anything about intervening
 * gateways or networks.  We also initialize the congestion/slow start
 * window to be a single segment if the destination isn't local.
 * While looking at the routing entry, we also initialize other path-dependent
 * parameters from pre-set or cached values in the routing entry.
 *
 * NOTE that the resulting t_maxseg doesn't include space for TCP options or
 * IP options, e.g. IPSEC data, since the length of this data may vary, and
 * thus it is calculated for every segment separately in tcp_output().
 *
 * NOTE that this routine is only called when we process an incoming
 * segment, or an ICMP need fragmentation datagram.  Outgoing SYN/ACK MSS
 * settings are handled in tcp_mssopt().
 */
void
tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
    struct hc_metrics_lite *metricptr, struct tcp_ifcap *cap)
{
	int mss = 0;
	uint32_t maxmtu = 0;
	struct inpcb *inp = tptoinpcb(tp);
	struct hc_metrics_lite metrics;
#ifdef INET6
	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
	size_t min_protoh = isipv6 ?
	    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
	    sizeof (struct tcpiphdr);
#else
	size_t min_protoh = sizeof(struct tcpiphdr);
#endif
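	/*
	 * min_protoh is 40 bytes for IPv4 (20-byte IP header plus
	 * 20-byte TCP header) and 60 bytes for IPv6, so e.g. a
	 * 1500-byte Ethernet MTU yields an MSS of 1460 or 1440.
	 */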

	INP_WLOCK_ASSERT(inp);

	if (tp->t_port)
		min_protoh += V_tcp_udp_tunneling_overhead;
	if (mtuoffer != -1) {
		KASSERT(offer == -1, ("%s: conflict", __func__));
		offer = mtuoffer - min_protoh;
	}

	/* Initialize. */
#ifdef INET6
	if (isipv6) {
		maxmtu = tcp_maxmtu6(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_v6mssdflt;
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		maxmtu = tcp_maxmtu(&inp->inp_inc, cap);
		tp->t_maxseg = V_tcp_mssdflt;
	}
#endif

	/*
	 * No route to sender, stay with default mss and return.
	 */
	if (maxmtu == 0) {
		/*
		 * In case we return early we need to initialize metrics
		 * to a defined state as tcp_hc_get() would do for us
		 * if there was no cache hit.
		 */
		if (metricptr != NULL)
			bzero(metricptr, sizeof(struct hc_metrics_lite));
		return;
	}

	/* What have we got? */
	switch (offer) {
	case 0:
		/*
		 * Offer == 0 means that there was no MSS on the SYN
		 * segment; in this case we use tcp_mssdflt as
		 * already assigned to t_maxseg above.
		 */
		offer = tp->t_maxseg;
		break;

	case -1:
		/*
		 * Offer == -1 means that we didn't receive a SYN yet.
		 */
		/* FALLTHROUGH */

	default:
		/*
		 * Prevent DoS attack with too small MSS.  Round up
		 * to at least minmss.
		 */
		offer = max(offer, V_tcp_minmss);
	}
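	/*
	 * For example, a peer advertising an absurdly small MSS of 48
	 * would otherwise force one packet per handful of payload
	 * bytes; clamping to V_tcp_minmss (216 by default) bounds that
	 * per-packet overhead.
	 */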

	if (metricptr == NULL)
		metricptr = &metrics;
	tcp_hc_get(&inp->inp_inc, metricptr);

	/*
	 * If there's a discovered mtu in the tcp hostcache, use it.
	 * Else, use the link mtu.
	 */
	if (metricptr->hc_mtu)
		mss = min(metricptr->hc_mtu, maxmtu) - min_protoh;
	else {
#ifdef INET6
		if (isipv6) {
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in6_localaddr(&inp->in6p_faddr))
				mss = min(mss, V_tcp_v6mssdflt);
		}
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
		{
			mss = maxmtu - min_protoh;
			if (!V_path_mtu_discovery &&
			    !in_localaddr(inp->inp_faddr))
				mss = min(mss, V_tcp_mssdflt);
		}
#endif
		/*
		 * XXX - The above conditional (mss = maxmtu - min_protoh)
		 * probably violates the TCP spec.
		 * The problem is that, since we don't know the
		 * other end's MSS, we are supposed to use a conservative
		 * default.  But, if we do that, then MTU discovery will
		 * never actually take place, because the conservative
		 * default is much less than the MTUs typically seen
		 * on the Internet today.  For the moment, we'll sweep
		 * this under the carpet.
		 *
		 * The conservative default might not actually be a problem
		 * if the only case this occurs is when sending an initial
		 * SYN with options and data to a host we've never talked
		 * to before.  Then, they will reply with an MSS value which
		 * will get recorded and the new parameters should get
		 * recomputed.  For Further Study.
		 */
	}
	mss = min(mss, offer);

	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	mss = max(mss, 64);

	tp->t_maxseg = mss;
	if (tp->t_maxseg < V_tcp_mssdflt) {
		/*
		 * The MSS is so small we should not process incoming
		 * SACKs since we are subject to attack in such a
		 * case.
		 */
		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
	} else {
		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
	}
}

void
tcp_mss(struct tcpcb *tp, int offer)
{
	int mss;
	uint32_t bufsize;
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so;
	struct hc_metrics_lite metrics;
	struct tcp_ifcap cap;

	KASSERT(tp != NULL, ("%s: tp == NULL", __func__));

	bzero(&cap, sizeof(cap));
	tcp_mss_update(tp, offer, -1, &metrics, &cap);

	mss = tp->t_maxseg;

	/*
	 * If there's a pipesize, change the socket buffer to that size;
	 * don't change it if sb_hiwat differs from the default (then it
	 * has been changed on purpose with setsockopt).
	 * Make the socket buffers an integral number of mss units;
	 * if the mss is larger than the socket buffer, decrease the mss.
	 */
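	/*
	 * E.g. with mss = 1460 and a 32768-byte send buffer, the
	 * buffer is rounded up to 23 full segments (33580 bytes),
	 * subject to the sb_max ceiling.
	 */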
	so = inp->inp_socket;
	SOCK_SENDBUF_LOCK(so);
	if ((so->so_snd.sb_hiwat == V_tcp_sendspace) && metrics.hc_sendpipe)
		bufsize = metrics.hc_sendpipe;
	else
		bufsize = so->so_snd.sb_hiwat;
	if (bufsize < mss)
		mss = bufsize;
	else {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_snd.sb_hiwat)
			(void)sbreserve_locked(so, SO_SND, bufsize, NULL);
	}
	SOCK_SENDBUF_UNLOCK(so);
	/*
	 * Sanity check: make sure that maxseg will be large
	 * enough to allow some data on segments even if all
	 * the option space is used (40 bytes).  Otherwise
	 * funny things may happen in tcp_output.
	 *
	 * XXXGL: shouldn't we reserve space for IP/IPv6 options?
	 */
	tp->t_maxseg = max(mss, 64);
	if (tp->t_maxseg < V_tcp_mssdflt) {
		/*
		 * The MSS is so small we should not process incoming
		 * SACKs since we are subject to attack in such a
		 * case.
		 */
		tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
	} else {
		tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
	}

	SOCK_RECVBUF_LOCK(so);
	if ((so->so_rcv.sb_hiwat == V_tcp_recvspace) && metrics.hc_recvpipe)
		bufsize = metrics.hc_recvpipe;
	else
		bufsize = so->so_rcv.sb_hiwat;
	if (bufsize > mss) {
		bufsize = roundup(bufsize, mss);
		if (bufsize > sb_max)
			bufsize = sb_max;
		if (bufsize > so->so_rcv.sb_hiwat)
			(void)sbreserve_locked(so, SO_RCV, bufsize, NULL);
	}
	SOCK_RECVBUF_UNLOCK(so);

	/* Check the interface for TSO capabilities. */
	if (cap.ifcap & CSUM_TSO) {
		tp->t_flags |= TF_TSO;
		tp->t_tsomax = cap.tsomax;
		tp->t_tsomaxsegcount = cap.tsomaxsegcount;
		tp->t_tsomaxsegsize = cap.tsomaxsegsize;
		if (cap.ipsec_tso)
			tp->t_flags2 |= TF2_IPSEC_TSO;
	}
}

/*
 * Determine the MSS option to send on an outgoing SYN.
 */
int
tcp_mssopt(struct in_conninfo *inc)
{
	int mss = 0;
	uint32_t thcmtu = 0;
	uint32_t maxmtu = 0;
	size_t min_protoh;

	KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		mss = V_tcp_v6mssdflt;
		maxmtu = tcp_maxmtu6(inc, NULL);
		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		mss = V_tcp_mssdflt;
		maxmtu = tcp_maxmtu(inc, NULL);
		min_protoh = sizeof(struct tcpiphdr);
	}
#endif
#if defined(INET6) || defined(INET)
	thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
#endif
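	/*
	 * Pick the smaller of the link MTU and the hostcache MTU when
	 * both are known; when only one is known the max() below
	 * selects it, since the unknown one is zero.  E.g. maxmtu =
	 * 1500 and thcmtu = 0 yields mss = 1500 - min_protoh.
	 */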

	if (maxmtu && thcmtu)
		mss = min(maxmtu, thcmtu) - min_protoh;
	else if (maxmtu || thcmtu)
		mss = max(maxmtu, thcmtu) - min_protoh;

	return (mss);
}

void
tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
    sackstatus_t sack_changed, u_int *maxsegp)
{
	int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
	u_int maxseg;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (*maxsegp == 0) {
		*maxsegp = tcp_maxseg(tp);
	}
	maxseg = *maxsegp;
	/*
	 * Compute the amount of data that this ACK is indicating
	 * (del_data) and an estimate of how many bytes are in the
	 * network.
	 */
	if (tcp_is_sack_recovery(tp, to) ||
	    (IN_CONGRECOVERY(tp->t_flags) &&
	    !IN_FASTRECOVERY(tp->t_flags))) {
		del_data = tp->sackhint.delivered_data;
		pipe = tcp_compute_pipe(tp);
	} else {
		if (tp->sackhint.prr_delivered < (tcprexmtthresh * maxseg +
		    tp->snd_recover - tp->snd_una)) {
			del_data = maxseg;
		}
		pipe = imax(0, tp->snd_max - tp->snd_una -
		    imin(INT_MAX / 65536, tp->t_dupacks) * maxseg);
	}
	tp->sackhint.prr_delivered += del_data;
	/*
	 * Proportional Rate Reduction
	 */
	if (pipe >= tp->snd_ssthresh) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs =
			    imax(1, tp->snd_nxt - tp->snd_una);
		snd_cnt = howmany((long)tp->sackhint.prr_delivered *
		    tp->snd_ssthresh, tp->sackhint.recover_fs) -
		    tp->sackhint.prr_out + maxseg - 1;
	} else {
		/*
		 * PRR 6937bis heuristic:
		 * - A partial ack without a SACK block beneath snd_recover
		 *   indicates further loss.
		 * - A SACK scoreboard update adding a new hole indicates
		 *   further loss, so be conservative and send at most one
		 *   segment.
		 * - Prevent ACK splitting attacks by being conservative
		 *   when no new data is acked.
		 */
		if ((sack_changed == SACK_NEWLOSS) || (del_data == 0)) {
			limit = tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out;
		} else {
			limit = imax(tp->sackhint.prr_delivered -
			    tp->sackhint.prr_out, del_data) +
			    maxseg;
		}
		snd_cnt = imin((tp->snd_ssthresh - pipe), limit);
	}
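	/*
	 * A worked example of the proportional branch, under assumed
	 * values: with ssthresh = 10 * maxseg, recover_fs = 20 * maxseg,
	 * prr_delivered = 4 * maxseg and prr_out = maxseg, snd_cnt
	 * works out to about 4 * 10/20 - 1 = 1 segment, so data leaves
	 * the network roughly twice as fast as it is sent until pipe
	 * drains to ssthresh.
	 */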
	snd_cnt = imax(snd_cnt, 0) / maxseg;
	/*
	 * Send snd_cnt new data into the network in response to this ack.
	 * If there is going to be a SACK retransmission, adjust snd_cwnd
	 * accordingly.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tcp_is_sack_recovery(tp, to)) {
			tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
		} else {
			tp->snd_cwnd = (tp->snd_max - tp->snd_una) +
			    (snd_cnt * maxseg);
		}
	} else if (IN_CONGRECOVERY(tp->t_flags)) {
		tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg);
	}
	tp->snd_cwnd = imax(maxseg, tp->snd_cwnd);
}

/*
 * When a partial ack arrives, force the retransmission of the
 * next unacknowledged segment.  Do not clear tp->t_dupacks.
 * By setting snd_nxt to th_ack, this forces the retransmission timer
 * to be started again.
 */
void
tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
{
	tcp_seq onxt = tp->snd_nxt;
	uint32_t ocwnd = tp->snd_cwnd;
	u_int maxseg = tcp_maxseg(tp);

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	if (IN_FASTRECOVERY(tp->t_flags)) {
		tp->snd_nxt = th->th_ack;
		/*
		 * Set snd_cwnd to one segment beyond acknowledged offset.
		 * (tp->snd_una has not yet been updated when this function
		 * is called.)
		 */
		tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th);
		tp->t_flags |= TF_ACKNOW;
		(void) tcp_output(tp);
		tp->snd_cwnd = ocwnd;
		if (SEQ_GT(onxt, tp->snd_nxt))
			tp->snd_nxt = onxt;
	}
	/*
	 * Partial window deflation.  Relies on the fact that tp->snd_una
	 * has not been updated yet.
	 */
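	/*
	 * E.g. with snd_cwnd = 10 * maxseg and a partial ack covering
	 * 3 * maxseg, the window deflates to 10 - 3 + 1 = 8 segments,
	 * per the NewReno partial-ack rule in RFC 6582.
	 */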
	if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
		tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += maxseg;
}

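/*
 * Estimate the number of bytes currently in flight.  When a stack-specific
 * method exists, use it; otherwise, with RFC 6675 style accounting enabled
 * via V_tcp_do_newsack, count outstanding data less what the SACK
 * scoreboard reports as received or lost.
 */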
int
tcp_compute_pipe(struct tcpcb *tp)
{
	int pipe;

	if (tp->t_fb->tfb_compute_pipe != NULL) {
		pipe = (*tp->t_fb->tfb_compute_pipe)(tp);
	} else if (V_tcp_do_newsack) {
		pipe = tp->snd_max - tp->snd_una +
		    tp->sackhint.sack_bytes_rexmit -
		    tp->sackhint.sacked_bytes -
		    tp->sackhint.lost_bytes;
	} else {
		pipe = tp->snd_nxt - tp->snd_fack +
		    tp->sackhint.sack_bytes_rexmit;
	}
	return (imax(pipe, 0));
}

uint32_t
tcp_compute_initwnd(uint32_t maxseg)
{
	/*
	 * Calculate the Initial Window, also used as Restart Window.
	 *
	 * RFC5681 Section 3.1 specifies the default conservative values.
	 * RFC3390 specifies slightly more aggressive values.
	 * RFC6928 increases it to ten segments.
	 * A user-specified value for the initial flight size takes
	 * precedence over all of these.
	 */
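	/*
	 * E.g. with maxseg = 1460 and tcp_initcwnd_segments = 10 (the
	 * RFC 6928 setting), the initial window is min(14600,
	 * max(2920, 14600)) = 14600 bytes; with RFC 3390 it would be
	 * min(5840, max(2920, 4380)) = 4380 bytes.
	 */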
	if (V_tcp_initcwnd_segments)
		return min(V_tcp_initcwnd_segments * maxseg,
		    max(2 * maxseg, V_tcp_initcwnd_segments * 1460));
	else if (V_tcp_do_rfc3390)
		return min(4 * maxseg, max(2 * maxseg, 4380));
	else {
		/* Per RFC5681 Section 3.1 */
		if (maxseg > 2190)
			return (2 * maxseg);
		else if (maxseg > 1095)
			return (3 * maxseg);
		else
			return (4 * maxseg);
	}
}
