xref: /freebsd/sys/netinet/tcp_timer.c (revision 9fc5c47fa5c7fa58d61245f0408611943e613164)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38 #include "opt_rss.h"
39 
40 #include <sys/param.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/mbuf.h>
44 #include <sys/mutex.h>
45 #include <sys/protosw.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <net/rss_config.h>
55 #include <net/vnet.h>
56 #include <net/netisr.h>
57 
58 #include <netinet/cc.h>
59 #include <netinet/in.h>
60 #include <netinet/in_pcb.h>
61 #include <netinet/in_rss.h>
62 #include <netinet/in_systm.h>
63 #ifdef INET6
64 #include <netinet6/in6_pcb.h>
65 #endif
66 #include <netinet/ip_var.h>
67 #include <netinet/tcp_fsm.h>
68 #include <netinet/tcp_timer.h>
69 #include <netinet/tcp_var.h>
70 #ifdef INET6
71 #include <netinet6/tcp6_var.h>
72 #endif
73 #include <netinet/tcpip.h>
74 #ifdef TCPDEBUG
75 #include <netinet/tcp_debug.h>
76 #endif
77 
/*
 * Timer tunables.  Stored internally in ticks but exported/imported in
 * milliseconds through the sysctl_msec_to_ticks conversion handler.
 */
int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

/* When non-zero, probe every connection as if SO_KEEPALIVE were set. */
static int	always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");

/* See tcp_timer_2msl(): reap FIN_WAIT_2 sockets whose receive side is closed. */
int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/* max idle probes */
int	tcp_maxpersistidle;

static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");
133 
/*
 * Path MTU blackhole detection state, kept per-VNET.  When the detect
 * knob is enabled, tcp_timer_rexmt() reacts to early retransmission
 * timeouts on an established connection by temporarily lowering the MSS
 * (see the TF2_PLPMTU_* handling there).  The RD-only counters below
 * track activations, activations that fell back to the minimum MSS, and
 * cases where the lowered MSS did not help and was reverted.
 */
static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

/* MSS to fall back to when an IPv4 blackhole is suspected. */
#ifdef INET
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

/* MSS to fall back to when an IPv6 blackhole is suspected. */
#ifdef INET6
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif
181 
/* Run each connection's timers on a per-connection CPU; on by default
 * only when RSS is compiled in. */
#ifdef	RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers , 0, "run tcp timers on all cpus");

/* Disabled: old flowid -> CPU mapping, superseded by inp_to_cpuid(). */
#if 0
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif
194 
195 /*
196  * Map the given inp to a CPU id.
197  *
198  * This queries RSS if it's compiled in, else it defaults to the current
199  * CPU ID.
200  */
201 static inline int
202 inp_to_cpuid(struct inpcb *inp)
203 {
204 	u_int cpuid;
205 
206 #ifdef	RSS
207 	if (per_cpu_timers) {
208 		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
209 		if (cpuid == NETISR_CPUID_NONE)
210 			return (curcpu);	/* XXX */
211 		else
212 			return (cpuid);
213 	}
214 #else
215 	/* Legacy, pre-RSS behaviour */
216 	if (per_cpu_timers) {
217 		/*
218 		 * We don't have a flowid -> cpuid mapping, so cheat and
219 		 * just map unknown cpuids to curcpu.  Not the best, but
220 		 * apparently better than defaulting to swi 0.
221 		 */
222 		cpuid = inp->inp_flowid % (mp_maxid + 1);
223 		if (! CPU_ABSENT(cpuid))
224 			return (cpuid);
225 		return (curcpu);
226 	}
227 #endif
228 	/* Default for RSS and non-RSS - cpuid 0 */
229 	else {
230 		return (0);
231 	}
232 }
233 
/*
 * TCP protocol timeout routine called every 500 ms.
 * Walks every vnet and scans its TIME_WAIT list, expiring connections
 * whose 2*MSL wait has elapsed (see tcp_tw_2msl_scan()).
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
252 
253 int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
254     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
255 
256 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
257     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
258 
259 static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
260 
261 /*
262  * TCP timer processing.
263  */
264 
/*
 * Delayed ACK timer expired: force an ACK out now.
 *
 * Like all the timer callbacks below: resolve the inpcb, take its write
 * lock, then bail if the callout was rescheduled or stopped while we
 * waited for the lock (callout_pending() || !callout_active()), or if
 * the connection has since been dropped (INP_DROPPED).
 */
void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race against callout_reset()/callout_stop(); do nothing. */
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
		("%s: tp %p delack callout should be running", __func__, tp));

	/* Make tcp_output() emit the pending ACK immediately. */
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
298 
299 void
300 tcp_timer_2msl(void *xtp)
301 {
302 	struct tcpcb *tp = xtp;
303 	struct inpcb *inp;
304 	CURVNET_SET(tp->t_vnet);
305 #ifdef TCPDEBUG
306 	int ostate;
307 
308 	ostate = tp->t_state;
309 #endif
310 	INP_INFO_RLOCK(&V_tcbinfo);
311 	inp = tp->t_inpcb;
312 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
313 	INP_WLOCK(inp);
314 	tcp_free_sackholes(tp);
315 	if (callout_pending(&tp->t_timers->tt_2msl) ||
316 	    !callout_active(&tp->t_timers->tt_2msl)) {
317 		INP_WUNLOCK(tp->t_inpcb);
318 		INP_INFO_RUNLOCK(&V_tcbinfo);
319 		CURVNET_RESTORE();
320 		return;
321 	}
322 	callout_deactivate(&tp->t_timers->tt_2msl);
323 	if ((inp->inp_flags & INP_DROPPED) != 0) {
324 		INP_WUNLOCK(inp);
325 		INP_INFO_RUNLOCK(&V_tcbinfo);
326 		CURVNET_RESTORE();
327 		return;
328 	}
329 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
330 		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
331 	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
332 		("%s: tp %p 2msl callout should be running", __func__, tp));
333 	/*
334 	 * 2 MSL timeout in shutdown went off.  If we're closed but
335 	 * still waiting for peer to close and connection has been idle
336 	 * too long delete connection control block.  Otherwise, check
337 	 * again in a bit.
338 	 *
339 	 * If in TIME_WAIT state just ignore as this timeout is handled in
340 	 * tcp_tw_2msl_scan().
341 	 *
342 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
343 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
344 	 * Ignore fact that there were recent incoming segments.
345 	 */
346 	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
347 		INP_WUNLOCK(inp);
348 		INP_INFO_RUNLOCK(&V_tcbinfo);
349 		CURVNET_RESTORE();
350 		return;
351 	}
352 	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
353 	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
354 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
355 		TCPSTAT_INC(tcps_finwait2_drops);
356 		tp = tcp_close(tp);
357 	} else {
358 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
359 			if (!callout_reset(&tp->t_timers->tt_2msl,
360 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
361 				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
362 			}
363 		} else
364 		       tp = tcp_close(tp);
365        }
366 
367 #ifdef TCPDEBUG
368 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
369 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
370 			  PRU_SLOWTIMO);
371 #endif
372 	if (tp != NULL)
373 		INP_WUNLOCK(inp);
374 	INP_INFO_RUNLOCK(&V_tcbinfo);
375 	CURVNET_RESTORE();
376 }
377 
/*
 * Keepalive timer: probe an idle connection, or drop it once the peer
 * has been unresponsive past the full keepalive budget.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race against callout_reset()/callout_stop(); do nothing. */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	/* Before ESTABLISHED this timer doubles as the connection-establish
	 * timeout; give up on the half-open connection. */
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keepidle plus the full probe budget: drop it. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Rearm for the next probe interval. */
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	/* Connection timed out: drop it with ETIMEDOUT. */
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/* tcp_drop() may have freed the tcpcb; unlock only if it survived. */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
476 
/*
 * Persist timer: the peer advertised a zero window; force out a window
 * probe, or drop the connection if probing has gone unanswered too long.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race against callout_reset()/callout_stop(); do nothing. */
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
		("%s: tp %p persist callout should be running", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/* Rearm the persist timer and force a single byte (window probe) out. */
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	/* tcp_drop() may have freed the tcpcb; unlock only if it survived. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
554 
/*
 * Retransmission timer: a segment went unacknowledged for the current
 * RTO.  Back the RTO off exponentially, run Path MTU blackhole
 * detection on early timeouts, signal congestion (CC_RTO) and
 * retransmit from snd_una.  Drops the connection after
 * TCP_MAXRXTSHIFT consecutive timeouts.
 */
void
tcp_timer_rexmt(void * xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	/* Tracks whether V_tcbinfo is still read-locked on the out: path. */
	int headlocked;
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/* Lost a race against callout_reset()/callout_stop(); do nothing. */
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);

		tp = tcp_drop(tp, tp->t_softerror ?
			      tp->t_softerror : ETIMEDOUT);
		headlocked = 1;
		goto out;
	}
	/* Only the drop path above needs the pcbinfo lock; release it now. */
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * first retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	/* Compute the backed-off RTO, clamped to [t_rttmin, TCPTV_REXMTMAX]. */
	if (tp->t_state == TCPS_SYN_SENT)
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if connection is established or, if
	 * connection is FIN_WAIT_1 status, reason for the last is that if
	 * amount of data we send is very small, we could send it in couple of
	 * packets and process straight to FIN. In that case we won't catch
	 * ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
		int optlen;
#ifdef INET6
		int isipv6;
#endif

		/* Only consider blackhole detection on the 1st/2nd timeout
		 * of a connection that sent full-sized segments with DF set. */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift <= 2)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			/* Record that we may have found a black hole. */
			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;

			/* Keep track of previous MSS. */
			optlen = tp->t_maxopd - tp->t_maxseg;
			tp->t_pmtud_saved_maxopd = tp->t_maxopd;

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxopd = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxopd = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			tp->t_maxseg = tp->t_maxopd - optlen;
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift > 4)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				optlen = tp->t_maxopd - tp->t_maxseg;
				tp->t_maxopd = tp->t_pmtud_saved_maxopd;
				tp->t_maxseg = tp->t_maxopd - optlen;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
#endif
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Roll the send pointer back so retransmission starts at snd_una. */
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	/* Notify congestion control of the timeout before retransmitting. */
	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	/* tcp_drop() may have freed the tcpcb; unlock only if it survived. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
801 
/*
 * Arm or disarm one of the connection's TCP timers.
 *
 * timer_type selects the timer (TT_DELACK, TT_REXMT, TT_PERSIST,
 * TT_KEEP or TT_2MSL); delta is the timeout in ticks, with 0 meaning
 * "stop the timer".  A newly started callout is bound to the CPU
 * returned by inp_to_cpuid(); an already-running callout is reset on
 * whatever CPU it is on.  No-op for TOE-offloaded connections and for
 * tcpcbs already marked TT_STOPPED by tcp_timer_stop().
 */
void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);
	uint32_t f_reset;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	/* Select the callout, its handler and its TT_*_RST flag. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}
	if (delta == 0) {
		/* Stop request: clear the flags only if the stop succeeded. */
		if ((tp->t_timers->tt_flags & timer_type) &&
		    callout_stop(t_callout) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		}
	} else {
		if ((tp->t_timers->tt_flags & timer_type) == 0) {
			/* First start: bind the callout to the chosen CPU. */
			tp->t_timers->tt_flags |= (timer_type | f_reset);
			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
		} else {
			/* Reset already running callout on the same CPU. */
			if (!callout_reset(t_callout, delta, f_callout, tp)) {
				/*
				 * Callout not cancelled, consider it as not
				 * properly restarted. */
				tp->t_timers->tt_flags &= ~f_reset;
			}
		}
	}
}
869 
870 int
871 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
872 {
873 	struct callout *t_callout;
874 
875 	switch (timer_type) {
876 		case TT_DELACK:
877 			t_callout = &tp->t_timers->tt_delack;
878 			break;
879 		case TT_REXMT:
880 			t_callout = &tp->t_timers->tt_rexmt;
881 			break;
882 		case TT_PERSIST:
883 			t_callout = &tp->t_timers->tt_persist;
884 			break;
885 		case TT_KEEP:
886 			t_callout = &tp->t_timers->tt_keep;
887 			break;
888 		case TT_2MSL:
889 			t_callout = &tp->t_timers->tt_2msl;
890 			break;
891 		default:
892 			panic("tp %p bad timer_type %#x", tp, timer_type);
893 		}
894 	return callout_active(t_callout);
895 }
896 
/*
 * Permanently stop one of the connection's TCP timers in preparation
 * for tcpcb teardown.  Marks the tcpcb TT_STOPPED so no timer can be
 * re-armed; if the callout cannot be stopped synchronously, a *_discard
 * handler is scheduled to perform the final cleanup.
 */
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	/* Select the callout, its discard handler and its TT_*_RST flag. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack_discard;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt_discard;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist_discard;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep_discard;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl_discard;
			f_reset = TT_2MSL_RST;
			break;
		default:
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}

	if (tp->t_timers->tt_flags & timer_type) {
		if (callout_stop(t_callout) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		} else {
			/*
			 * Can't stop the callout, defer tcpcb actual deletion
			 * to the last tcp timer discard callout.
			 * The TT_STOPPED flag will ensure that no tcp timer
			 * callouts can be restarted on our behalf, and
			 * past this point currently running callouts waiting
			 * on inp lock will return right away after the
			 * classical check for callout reset/stop events:
			 * callout_pending() || !callout_active()
			 */
			callout_reset(t_callout, 1, f_callout, tp);
		}
	}
}
955 
956 #define	ticks_to_msecs(t)	(1000*(t) / hz)
957 
958 void
959 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
960     struct xtcp_timer *xtimer)
961 {
962 	sbintime_t now;
963 
964 	bzero(xtimer, sizeof(*xtimer));
965 	if (timer == NULL)
966 		return;
967 	now = getsbinuptime();
968 	if (callout_active(&timer->tt_delack))
969 		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
970 	if (callout_active(&timer->tt_rexmt))
971 		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
972 	if (callout_active(&timer->tt_persist))
973 		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
974 	if (callout_active(&timer->tt_keep))
975 		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
976 	if (callout_active(&timer->tt_2msl))
977 		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
978 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
979 }
980