xref: /freebsd/sys/netinet/tcp_timer.c (revision 7431dfd4580e850375fe5478d92ec770344db098)
1 /*-
2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38 #include "opt_rss.h"
39 
40 #include <sys/param.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/mbuf.h>
44 #include <sys/mutex.h>
45 #include <sys/protosw.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <net/vnet.h>
55 #include <net/netisr.h>
56 
57 #include <netinet/cc.h>
58 #include <netinet/in.h>
59 #include <netinet/in_pcb.h>
60 #include <netinet/in_rss.h>
61 #include <netinet/in_systm.h>
62 #ifdef INET6
63 #include <netinet6/in6_pcb.h>
64 #endif
65 #include <netinet/ip_var.h>
66 #include <netinet/tcp_fsm.h>
67 #include <netinet/tcp_timer.h>
68 #include <netinet/tcp_var.h>
69 #ifdef INET6
70 #include <netinet6/tcp6_var.h>
71 #endif
72 #include <netinet/tcpip.h>
73 #ifdef TCPDEBUG
74 #include <netinet/tcp_debug.h>
75 #endif
76 
77 int	tcp_keepinit;
78 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
79     &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
80 
81 int	tcp_keepidle;
82 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
83     &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");
84 
85 int	tcp_keepintvl;
86 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
87     &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");
88 
89 int	tcp_delacktime;
90 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
91     &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
92     "Time before a delayed ACK is sent");
93 
94 int	tcp_msl;
95 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
96     &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
97 
98 int	tcp_rexmit_min;
99 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
100     &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
101     "Minimum Retransmission Timeout");
102 
103 int	tcp_rexmit_slop;
104 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
105     &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
106     "Retransmission Timer Slop");
107 
108 static int	always_keepalive = 1;
109 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
110     &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");
111 
112 int    tcp_fast_finwait2_recycle = 0;
113 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
114     &tcp_fast_finwait2_recycle, 0,
115     "Recycle closed FIN_WAIT_2 connections faster");
116 
117 int    tcp_finwait2_timeout;
118 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
119     &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");
120 
121 int	tcp_keepcnt = TCPTV_KEEPCNT;
122 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
123     "Number of keepalive probes to send");
124 
125 	/* max idle probes */
126 int	tcp_maxpersistidle;
127 
128 static int	tcp_rexmit_drop_options = 0;
129 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
130     &tcp_rexmit_drop_options, 0,
131     "Drop TCP options from 3rd and later retransmitted SYN");
132 
133 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
134 #define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
136     CTLFLAG_RW|CTLFLAG_VNET,
137     &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
138     "Path MTU Discovery Black Hole Detection Enabled");
139 
140 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
141 #define	V_tcp_pmtud_blackhole_activated \
142     VNET(tcp_pmtud_blackhole_activated)
143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
144     CTLFLAG_RD|CTLFLAG_VNET,
145     &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
146     "Path MTU Discovery Black Hole Detection, Activation Count");
147 
148 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
149 #define	V_tcp_pmtud_blackhole_activated_min_mss \
150     VNET(tcp_pmtud_blackhole_activated_min_mss)
151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
152     CTLFLAG_RD|CTLFLAG_VNET,
153     &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
154     "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");
155 
156 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
157 #define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
158 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
159     CTLFLAG_RD|CTLFLAG_VNET,
160     &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
161     "Path MTU Discovery Black Hole Detection, Failure Count");
162 
163 #ifdef INET
164 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
165 #define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
167     CTLFLAG_RW|CTLFLAG_VNET,
168     &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
169     "Path MTU Discovery Black Hole Detection lowered MSS");
170 #endif
171 
172 #ifdef INET6
173 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
174 #define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
176     CTLFLAG_RW|CTLFLAG_VNET,
177     &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
178     "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
179 #endif
180 
181 #ifdef	RSS
182 static int	per_cpu_timers = 1;
183 #else
184 static int	per_cpu_timers = 0;
185 #endif
186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
187     &per_cpu_timers , 0, "run tcp timers on all cpus");
188 
189 #if 0
190 #define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
191 		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
192 #endif
193 
194 /*
195  * Map the given inp to a CPU id.
196  *
197  * This queries RSS if it's compiled in, else it defaults to the current
198  * CPU ID.
199  */
200 static inline int
201 inp_to_cpuid(struct inpcb *inp)
202 {
203 	u_int cpuid;
204 
205 #ifdef	RSS
206 	if (per_cpu_timers) {
207 		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
208 		if (cpuid == NETISR_CPUID_NONE)
209 			return (curcpu);	/* XXX */
210 		else
211 			return (cpuid);
212 	}
213 #else
214 	/* Legacy, pre-RSS behaviour */
215 	if (per_cpu_timers) {
216 		/*
217 		 * We don't have a flowid -> cpuid mapping, so cheat and
218 		 * just map unknown cpuids to curcpu.  Not the best, but
219 		 * apparently better than defaulting to swi 0.
220 		 */
221 		cpuid = inp->inp_flowid % (mp_maxid + 1);
222 		if (! CPU_ABSENT(cpuid))
223 			return (cpuid);
224 		return (curcpu);
225 	}
226 #endif
227 	/* Default for RSS and non-RSS - cpuid 0 */
228 	else {
229 		return (0);
230 	}
231 }
232 
233 /*
234  * Tcp protocol timeout routine called every 500 ms.
235  * Updates timestamps used for TCP
236  * causes finite state machine actions if timers expire.
237  */
238 void
239 tcp_slowtimo(void)
240 {
241 	VNET_ITERATOR_DECL(vnet_iter);
242 
243 	VNET_LIST_RLOCK_NOSLEEP();
244 	VNET_FOREACH(vnet_iter) {
245 		CURVNET_SET(vnet_iter);
246 		(void) tcp_tw_2msl_scan(0);
247 		CURVNET_RESTORE();
248 	}
249 	VNET_LIST_RUNLOCK_NOSLEEP();
250 }
251 
252 int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
253     { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
254 
255 int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
256     { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };
257 
258 static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
259 
260 static int tcp_timer_race;
261 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
262     0, "Count of t_inpcb races on tcp_discardcb");
263 
264 /*
265  * TCP timer processing.
266  */
267 
268 void
269 tcp_timer_delack(void *xtp)
270 {
271 	struct tcpcb *tp = xtp;
272 	struct inpcb *inp;
273 	CURVNET_SET(tp->t_vnet);
274 
275 	inp = tp->t_inpcb;
276 	/*
277 	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
278 	 * tear-down mean we need it as a work-around for races between
279 	 * timers and tcp_discardcb().
280 	 *
281 	 * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
282 	 */
283 	if (inp == NULL) {
284 		tcp_timer_race++;
285 		CURVNET_RESTORE();
286 		return;
287 	}
288 	INP_WLOCK(inp);
289 	if (callout_pending(&tp->t_timers->tt_delack) ||
290 	    !callout_active(&tp->t_timers->tt_delack)) {
291 		INP_WUNLOCK(inp);
292 		CURVNET_RESTORE();
293 		return;
294 	}
295 	callout_deactivate(&tp->t_timers->tt_delack);
296 	if ((inp->inp_flags & INP_DROPPED) != 0) {
297 		INP_WUNLOCK(inp);
298 		CURVNET_RESTORE();
299 		return;
300 	}
301 
302 	tp->t_flags |= TF_ACKNOW;
303 	TCPSTAT_INC(tcps_delack);
304 	(void) tcp_output(tp);
305 	INP_WUNLOCK(inp);
306 	CURVNET_RESTORE();
307 }
308 
309 void
310 tcp_timer_2msl(void *xtp)
311 {
312 	struct tcpcb *tp = xtp;
313 	struct inpcb *inp;
314 	CURVNET_SET(tp->t_vnet);
315 #ifdef TCPDEBUG
316 	int ostate;
317 
318 	ostate = tp->t_state;
319 #endif
320 	/*
321 	 * XXXRW: Does this actually happen?
322 	 */
323 	INP_INFO_WLOCK(&V_tcbinfo);
324 	inp = tp->t_inpcb;
325 	/*
326 	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
327 	 * tear-down mean we need it as a work-around for races between
328 	 * timers and tcp_discardcb().
329 	 *
330 	 * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
331 	 */
332 	if (inp == NULL) {
333 		tcp_timer_race++;
334 		INP_INFO_WUNLOCK(&V_tcbinfo);
335 		CURVNET_RESTORE();
336 		return;
337 	}
338 	INP_WLOCK(inp);
339 	tcp_free_sackholes(tp);
340 	if (callout_pending(&tp->t_timers->tt_2msl) ||
341 	    !callout_active(&tp->t_timers->tt_2msl)) {
342 		INP_WUNLOCK(tp->t_inpcb);
343 		INP_INFO_WUNLOCK(&V_tcbinfo);
344 		CURVNET_RESTORE();
345 		return;
346 	}
347 	callout_deactivate(&tp->t_timers->tt_2msl);
348 	if ((inp->inp_flags & INP_DROPPED) != 0) {
349 		INP_WUNLOCK(inp);
350 		INP_INFO_WUNLOCK(&V_tcbinfo);
351 		CURVNET_RESTORE();
352 		return;
353 	}
354 	/*
355 	 * 2 MSL timeout in shutdown went off.  If we're closed but
356 	 * still waiting for peer to close and connection has been idle
357 	 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
358 	 * control block.  Otherwise, check again in a bit.
359 	 *
360 	 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
361 	 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
362 	 * Ignore fact that there were recent incoming segments.
363 	 */
364 	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
365 	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
366 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
367 		TCPSTAT_INC(tcps_finwait2_drops);
368 		tp = tcp_close(tp);
369 	} else {
370 		if (tp->t_state != TCPS_TIME_WAIT &&
371 		   ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
372 		       callout_reset_on(&tp->t_timers->tt_2msl,
373 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp,
374 			   inp_to_cpuid(inp));
375 	       else
376 		       tp = tcp_close(tp);
377        }
378 
379 #ifdef TCPDEBUG
380 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
381 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
382 			  PRU_SLOWTIMO);
383 #endif
384 	if (tp != NULL)
385 		INP_WUNLOCK(inp);
386 	INP_INFO_WUNLOCK(&V_tcbinfo);
387 	CURVNET_RESTORE();
388 }
389 
390 void
391 tcp_timer_keep(void *xtp)
392 {
393 	struct tcpcb *tp = xtp;
394 	struct tcptemp *t_template;
395 	struct inpcb *inp;
396 	CURVNET_SET(tp->t_vnet);
397 #ifdef TCPDEBUG
398 	int ostate;
399 
400 	ostate = tp->t_state;
401 #endif
402 	INP_INFO_WLOCK(&V_tcbinfo);
403 	inp = tp->t_inpcb;
404 	/*
405 	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
406 	 * tear-down mean we need it as a work-around for races between
407 	 * timers and tcp_discardcb().
408 	 *
409 	 * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
410 	 */
411 	if (inp == NULL) {
412 		tcp_timer_race++;
413 		INP_INFO_WUNLOCK(&V_tcbinfo);
414 		CURVNET_RESTORE();
415 		return;
416 	}
417 	INP_WLOCK(inp);
418 	if (callout_pending(&tp->t_timers->tt_keep) ||
419 	    !callout_active(&tp->t_timers->tt_keep)) {
420 		INP_WUNLOCK(inp);
421 		INP_INFO_WUNLOCK(&V_tcbinfo);
422 		CURVNET_RESTORE();
423 		return;
424 	}
425 	callout_deactivate(&tp->t_timers->tt_keep);
426 	if ((inp->inp_flags & INP_DROPPED) != 0) {
427 		INP_WUNLOCK(inp);
428 		INP_INFO_WUNLOCK(&V_tcbinfo);
429 		CURVNET_RESTORE();
430 		return;
431 	}
432 	/*
433 	 * Keep-alive timer went off; send something
434 	 * or drop connection if idle for too long.
435 	 */
436 	TCPSTAT_INC(tcps_keeptimeo);
437 	if (tp->t_state < TCPS_ESTABLISHED)
438 		goto dropit;
439 	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
440 	    tp->t_state <= TCPS_CLOSING) {
441 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
442 			goto dropit;
443 		/*
444 		 * Send a packet designed to force a response
445 		 * if the peer is up and reachable:
446 		 * either an ACK if the connection is still alive,
447 		 * or an RST if the peer has closed the connection
448 		 * due to timeout or reboot.
449 		 * Using sequence number tp->snd_una-1
450 		 * causes the transmitted zero-length segment
451 		 * to lie outside the receive window;
452 		 * by the protocol spec, this requires the
453 		 * correspondent TCP to respond.
454 		 */
455 		TCPSTAT_INC(tcps_keepprobe);
456 		t_template = tcpip_maketemplate(inp);
457 		if (t_template) {
458 			tcp_respond(tp, t_template->tt_ipgen,
459 				    &t_template->tt_t, (struct mbuf *)NULL,
460 				    tp->rcv_nxt, tp->snd_una - 1, 0);
461 			free(t_template, M_TEMP);
462 		}
463 		callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
464 		    tcp_timer_keep, tp, inp_to_cpuid(inp));
465 	} else
466 		callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
467 		    tcp_timer_keep, tp, inp_to_cpuid(inp));
468 
469 #ifdef TCPDEBUG
470 	if (inp->inp_socket->so_options & SO_DEBUG)
471 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
472 			  PRU_SLOWTIMO);
473 #endif
474 	INP_WUNLOCK(inp);
475 	INP_INFO_WUNLOCK(&V_tcbinfo);
476 	CURVNET_RESTORE();
477 	return;
478 
479 dropit:
480 	TCPSTAT_INC(tcps_keepdrops);
481 	tp = tcp_drop(tp, ETIMEDOUT);
482 
483 #ifdef TCPDEBUG
484 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
485 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
486 			  PRU_SLOWTIMO);
487 #endif
488 	if (tp != NULL)
489 		INP_WUNLOCK(tp->t_inpcb);
490 	INP_INFO_WUNLOCK(&V_tcbinfo);
491 	CURVNET_RESTORE();
492 }
493 
494 void
495 tcp_timer_persist(void *xtp)
496 {
497 	struct tcpcb *tp = xtp;
498 	struct inpcb *inp;
499 	CURVNET_SET(tp->t_vnet);
500 #ifdef TCPDEBUG
501 	int ostate;
502 
503 	ostate = tp->t_state;
504 #endif
505 	INP_INFO_WLOCK(&V_tcbinfo);
506 	inp = tp->t_inpcb;
507 	/*
508 	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
509 	 * tear-down mean we need it as a work-around for races between
510 	 * timers and tcp_discardcb().
511 	 *
512 	 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
513 	 */
514 	if (inp == NULL) {
515 		tcp_timer_race++;
516 		INP_INFO_WUNLOCK(&V_tcbinfo);
517 		CURVNET_RESTORE();
518 		return;
519 	}
520 	INP_WLOCK(inp);
521 	if (callout_pending(&tp->t_timers->tt_persist) ||
522 	    !callout_active(&tp->t_timers->tt_persist)) {
523 		INP_WUNLOCK(inp);
524 		INP_INFO_WUNLOCK(&V_tcbinfo);
525 		CURVNET_RESTORE();
526 		return;
527 	}
528 	callout_deactivate(&tp->t_timers->tt_persist);
529 	if ((inp->inp_flags & INP_DROPPED) != 0) {
530 		INP_WUNLOCK(inp);
531 		INP_INFO_WUNLOCK(&V_tcbinfo);
532 		CURVNET_RESTORE();
533 		return;
534 	}
535 	/*
536 	 * Persistance timer into zero window.
537 	 * Force a byte to be output, if possible.
538 	 */
539 	TCPSTAT_INC(tcps_persisttimeo);
540 	/*
541 	 * Hack: if the peer is dead/unreachable, we do not
542 	 * time out if the window is closed.  After a full
543 	 * backoff, drop the connection if the idle time
544 	 * (no responses to probes) reaches the maximum
545 	 * backoff that we would use if retransmitting.
546 	 */
547 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
548 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
549 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
550 		TCPSTAT_INC(tcps_persistdrop);
551 		tp = tcp_drop(tp, ETIMEDOUT);
552 		goto out;
553 	}
554 	/*
555 	 * If the user has closed the socket then drop a persisting
556 	 * connection after a much reduced timeout.
557 	 */
558 	if (tp->t_state > TCPS_CLOSE_WAIT &&
559 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
560 		TCPSTAT_INC(tcps_persistdrop);
561 		tp = tcp_drop(tp, ETIMEDOUT);
562 		goto out;
563 	}
564 	tcp_setpersist(tp);
565 	tp->t_flags |= TF_FORCEDATA;
566 	(void) tcp_output(tp);
567 	tp->t_flags &= ~TF_FORCEDATA;
568 
569 out:
570 #ifdef TCPDEBUG
571 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
572 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
573 #endif
574 	if (tp != NULL)
575 		INP_WUNLOCK(inp);
576 	INP_INFO_WUNLOCK(&V_tcbinfo);
577 	CURVNET_RESTORE();
578 }
579 
580 void
581 tcp_timer_rexmt(void * xtp)
582 {
583 	struct tcpcb *tp = xtp;
584 	CURVNET_SET(tp->t_vnet);
585 	int rexmt;
586 	int headlocked;
587 	struct inpcb *inp;
588 #ifdef TCPDEBUG
589 	int ostate;
590 
591 	ostate = tp->t_state;
592 #endif
593 
594 	INP_INFO_RLOCK(&V_tcbinfo);
595 	inp = tp->t_inpcb;
596 	/*
597 	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
598 	 * tear-down mean we need it as a work-around for races between
599 	 * timers and tcp_discardcb().
600 	 *
601 	 * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
602 	 */
603 	if (inp == NULL) {
604 		tcp_timer_race++;
605 		INP_INFO_RUNLOCK(&V_tcbinfo);
606 		CURVNET_RESTORE();
607 		return;
608 	}
609 	INP_WLOCK(inp);
610 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
611 	    !callout_active(&tp->t_timers->tt_rexmt)) {
612 		INP_WUNLOCK(inp);
613 		INP_INFO_RUNLOCK(&V_tcbinfo);
614 		CURVNET_RESTORE();
615 		return;
616 	}
617 	callout_deactivate(&tp->t_timers->tt_rexmt);
618 	if ((inp->inp_flags & INP_DROPPED) != 0) {
619 		INP_WUNLOCK(inp);
620 		INP_INFO_RUNLOCK(&V_tcbinfo);
621 		CURVNET_RESTORE();
622 		return;
623 	}
624 	tcp_free_sackholes(tp);
625 	/*
626 	 * Retransmission timer went off.  Message has not
627 	 * been acked within retransmit interval.  Back off
628 	 * to a longer retransmit interval and retransmit one segment.
629 	 */
630 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
631 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
632 		TCPSTAT_INC(tcps_timeoutdrop);
633 		in_pcbref(inp);
634 		INP_INFO_RUNLOCK(&V_tcbinfo);
635 		INP_WUNLOCK(inp);
636 		INP_INFO_WLOCK(&V_tcbinfo);
637 		INP_WLOCK(inp);
638 		if (in_pcbrele_wlocked(inp)) {
639 			INP_INFO_WUNLOCK(&V_tcbinfo);
640 			CURVNET_RESTORE();
641 			return;
642 		}
643 		if (inp->inp_flags & INP_DROPPED) {
644 			INP_WUNLOCK(inp);
645 			INP_INFO_WUNLOCK(&V_tcbinfo);
646 			CURVNET_RESTORE();
647 			return;
648 		}
649 
650 		tp = tcp_drop(tp, tp->t_softerror ?
651 			      tp->t_softerror : ETIMEDOUT);
652 		headlocked = 1;
653 		goto out;
654 	}
655 	INP_INFO_RUNLOCK(&V_tcbinfo);
656 	headlocked = 0;
657 	if (tp->t_state == TCPS_SYN_SENT) {
658 		/*
659 		 * If the SYN was retransmitted, indicate CWND to be
660 		 * limited to 1 segment in cc_conn_init().
661 		 */
662 		tp->snd_cwnd = 1;
663 	} else if (tp->t_rxtshift == 1) {
664 		/*
665 		 * first retransmit; record ssthresh and cwnd so they can
666 		 * be recovered if this turns out to be a "bad" retransmit.
667 		 * A retransmit is considered "bad" if an ACK for this
668 		 * segment is received within RTT/2 interval; the assumption
669 		 * here is that the ACK was already in flight.  See
670 		 * "On Estimating End-to-End Network Path Properties" by
671 		 * Allman and Paxson for more details.
672 		 */
673 		tp->snd_cwnd_prev = tp->snd_cwnd;
674 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
675 		tp->snd_recover_prev = tp->snd_recover;
676 		if (IN_FASTRECOVERY(tp->t_flags))
677 			tp->t_flags |= TF_WASFRECOVERY;
678 		else
679 			tp->t_flags &= ~TF_WASFRECOVERY;
680 		if (IN_CONGRECOVERY(tp->t_flags))
681 			tp->t_flags |= TF_WASCRECOVERY;
682 		else
683 			tp->t_flags &= ~TF_WASCRECOVERY;
684 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
685 		tp->t_flags |= TF_PREVVALID;
686 	} else
687 		tp->t_flags &= ~TF_PREVVALID;
688 	TCPSTAT_INC(tcps_rexmttimeo);
689 	if (tp->t_state == TCPS_SYN_SENT)
690 		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
691 	else
692 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
693 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
694 		      tp->t_rttmin, TCPTV_REXMTMAX);
695 
696 	/*
697 	 * We enter the path for PLMTUD if connection is established or, if
698 	 * connection is FIN_WAIT_1 status, reason for the last is that if
699 	 * amount of data we send is very small, we could send it in couple of
700 	 * packets and process straight to FIN. In that case we won't catch
701 	 * ESTABLISHED state.
702 	 */
703 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
704 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
705 		int optlen;
706 #ifdef INET6
707 		int isipv6;
708 #endif
709 
710 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
711 		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
712 		    (tp->t_rxtshift <= 2)) {
713 			/*
714 			 * Enter Path MTU Black-hole Detection mechanism:
715 			 * - Disable Path MTU Discovery (IP "DF" bit).
716 			 * - Reduce MTU to lower value than what we
717 			 *   negotiated with peer.
718 			 */
719 			/* Record that we may have found a black hole. */
720 			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
721 
722 			/* Keep track of previous MSS. */
723 			optlen = tp->t_maxopd - tp->t_maxseg;
724 			tp->t_pmtud_saved_maxopd = tp->t_maxopd;
725 
726 			/*
727 			 * Reduce the MSS to blackhole value or to the default
728 			 * in an attempt to retransmit.
729 			 */
730 #ifdef INET6
731 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
732 			if (isipv6 &&
733 			    tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) {
734 				/* Use the sysctl tuneable blackhole MSS. */
735 				tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss;
736 				V_tcp_pmtud_blackhole_activated++;
737 			} else if (isipv6) {
738 				/* Use the default MSS. */
739 				tp->t_maxopd = V_tcp_v6mssdflt;
740 				/*
741 				 * Disable Path MTU Discovery when we switch to
742 				 * minmss.
743 				 */
744 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
745 				V_tcp_pmtud_blackhole_activated_min_mss++;
746 			}
747 #endif
748 #if defined(INET6) && defined(INET)
749 			else
750 #endif
751 #ifdef INET
752 			if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) {
753 				/* Use the sysctl tuneable blackhole MSS. */
754 				tp->t_maxopd = V_tcp_pmtud_blackhole_mss;
755 				V_tcp_pmtud_blackhole_activated++;
756 			} else {
757 				/* Use the default MSS. */
758 				tp->t_maxopd = V_tcp_mssdflt;
759 				/*
760 				 * Disable Path MTU Discovery when we switch to
761 				 * minmss.
762 				 */
763 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
764 				V_tcp_pmtud_blackhole_activated_min_mss++;
765 			}
766 #endif
767 			tp->t_maxseg = tp->t_maxopd - optlen;
768 			/*
769 			 * Reset the slow-start flight size
770 			 * as it may depend on the new MSS.
771 			 */
772 			if (CC_ALGO(tp)->conn_init != NULL)
773 				CC_ALGO(tp)->conn_init(tp->ccv);
774 		} else {
775 			/*
776 			 * If further retransmissions are still unsuccessful
777 			 * with a lowered MTU, maybe this isn't a blackhole and
778 			 * we restore the previous MSS and blackhole detection
779 			 * flags.
780 			 */
781 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
782 			    (tp->t_rxtshift > 4)) {
783 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
784 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
785 				optlen = tp->t_maxopd - tp->t_maxseg;
786 				tp->t_maxopd = tp->t_pmtud_saved_maxopd;
787 				tp->t_maxseg = tp->t_maxopd - optlen;
788 				V_tcp_pmtud_blackhole_failed++;
789 				/*
790 				 * Reset the slow-start flight size as it
791 				 * may depend on the new MSS.
792 				 */
793 				if (CC_ALGO(tp)->conn_init != NULL)
794 					CC_ALGO(tp)->conn_init(tp->ccv);
795 			}
796 		}
797 	}
798 
799 	/*
800 	 * Disable RFC1323 and SACK if we haven't got any response to
801 	 * our third SYN to work-around some broken terminal servers
802 	 * (most of which have hopefully been retired) that have bad VJ
803 	 * header compression code which trashes TCP segments containing
804 	 * unknown-to-them TCP options.
805 	 */
806 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
807 	    (tp->t_rxtshift == 3))
808 		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
809 	/*
810 	 * If we backed off this far, our srtt estimate is probably bogus.
811 	 * Clobber it so we'll take the next rtt measurement as our srtt;
812 	 * move the current srtt into rttvar to keep the current
813 	 * retransmit times until then.
814 	 */
815 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
816 #ifdef INET6
817 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
818 			in6_losing(tp->t_inpcb);
819 #endif
820 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
821 		tp->t_srtt = 0;
822 	}
823 	tp->snd_nxt = tp->snd_una;
824 	tp->snd_recover = tp->snd_max;
825 	/*
826 	 * Force a segment to be sent.
827 	 */
828 	tp->t_flags |= TF_ACKNOW;
829 	/*
830 	 * If timing a segment in this window, stop the timer.
831 	 */
832 	tp->t_rtttime = 0;
833 
834 	cc_cong_signal(tp, NULL, CC_RTO);
835 
836 	(void) tcp_output(tp);
837 
838 out:
839 #ifdef TCPDEBUG
840 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
841 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
842 			  PRU_SLOWTIMO);
843 #endif
844 	if (tp != NULL)
845 		INP_WUNLOCK(inp);
846 	if (headlocked)
847 		INP_INFO_WUNLOCK(&V_tcbinfo);
848 	CURVNET_RESTORE();
849 }
850 
851 void
852 tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
853 {
854 	struct callout *t_callout;
855 	void *f_callout;
856 	struct inpcb *inp = tp->t_inpcb;
857 	int cpu = inp_to_cpuid(inp);
858 
859 #ifdef TCP_OFFLOAD
860 	if (tp->t_flags & TF_TOE)
861 		return;
862 #endif
863 
864 	switch (timer_type) {
865 		case TT_DELACK:
866 			t_callout = &tp->t_timers->tt_delack;
867 			f_callout = tcp_timer_delack;
868 			break;
869 		case TT_REXMT:
870 			t_callout = &tp->t_timers->tt_rexmt;
871 			f_callout = tcp_timer_rexmt;
872 			break;
873 		case TT_PERSIST:
874 			t_callout = &tp->t_timers->tt_persist;
875 			f_callout = tcp_timer_persist;
876 			break;
877 		case TT_KEEP:
878 			t_callout = &tp->t_timers->tt_keep;
879 			f_callout = tcp_timer_keep;
880 			break;
881 		case TT_2MSL:
882 			t_callout = &tp->t_timers->tt_2msl;
883 			f_callout = tcp_timer_2msl;
884 			break;
885 		default:
886 			panic("bad timer_type");
887 		}
888 	if (delta == 0) {
889 		callout_stop(t_callout);
890 	} else {
891 		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
892 	}
893 }
894 
895 int
896 tcp_timer_active(struct tcpcb *tp, int timer_type)
897 {
898 	struct callout *t_callout;
899 
900 	switch (timer_type) {
901 		case TT_DELACK:
902 			t_callout = &tp->t_timers->tt_delack;
903 			break;
904 		case TT_REXMT:
905 			t_callout = &tp->t_timers->tt_rexmt;
906 			break;
907 		case TT_PERSIST:
908 			t_callout = &tp->t_timers->tt_persist;
909 			break;
910 		case TT_KEEP:
911 			t_callout = &tp->t_timers->tt_keep;
912 			break;
913 		case TT_2MSL:
914 			t_callout = &tp->t_timers->tt_2msl;
915 			break;
916 		default:
917 			panic("bad timer_type");
918 		}
919 	return callout_active(t_callout);
920 }
921 
/*
 * Convert a tick count to milliseconds using the global hz.  The count is
 * widened to 64 bits before multiplying so that 1000 * t cannot overflow
 * a 32-bit intermediate for large idle times.
 */
#define	ticks_to_msecs(t)	((int64_t)(t) * 1000 / hz)
923 
924 void
925 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
926     struct xtcp_timer *xtimer)
927 {
928 	sbintime_t now;
929 
930 	bzero(xtimer, sizeof(*xtimer));
931 	if (timer == NULL)
932 		return;
933 	now = getsbinuptime();
934 	if (callout_active(&timer->tt_delack))
935 		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
936 	if (callout_active(&timer->tt_rexmt))
937 		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
938 	if (callout_active(&timer->tt_persist))
939 		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
940 	if (callout_active(&timer->tt_keep))
941 		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
942 	if (callout_active(&timer->tt_2msl))
943 		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
944 	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
945 }
946