xref: /freebsd/sys/netinet/tcp_usrreq.c (revision 7b71f57f4e514a2ab7308ce4147e14d90e099ad0)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1982, 1986, 1988, 1993
5  *	The Regents of the University of California.
6  * Copyright (c) 2006-2007 Robert N. M. Watson
7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Robert N. M. Watson under
11  * contract to Juniper Networks, Inc.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #include "opt_ddb.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 #include "opt_ipsec.h"
42 #include "opt_kern_tls.h"
43 
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/arb.h>
47 #include <sys/limits.h>
48 #include <sys/malloc.h>
49 #include <sys/refcount.h>
50 #include <sys/kernel.h>
51 #include <sys/ktls.h>
52 #include <sys/qmath.h>
53 #include <sys/sysctl.h>
54 #include <sys/mbuf.h>
55 #ifdef INET6
56 #include <sys/domain.h>
57 #endif /* INET6 */
58 #include <sys/socket.h>
59 #include <sys/socketvar.h>
60 #include <sys/protosw.h>
61 #include <sys/proc.h>
62 #include <sys/jail.h>
63 #include <sys/stats.h>
64 
65 #ifdef DDB
66 #include <ddb/ddb.h>
67 #endif
68 
69 #include <net/if.h>
70 #include <net/if_var.h>
71 #include <net/route.h>
72 #include <net/vnet.h>
73 
74 #include <netinet/in.h>
75 #include <netinet/in_kdtrace.h>
76 #include <netinet/in_pcb.h>
77 #include <netinet/in_rss.h>
78 #include <netinet/in_systm.h>
79 #include <netinet/in_var.h>
80 #include <netinet/ip.h>
81 #include <netinet/ip_var.h>
82 #ifdef INET6
83 #include <netinet/ip6.h>
84 #include <netinet6/in6_pcb.h>
85 #include <netinet6/in6_rss.h>
86 #include <netinet6/ip6_var.h>
87 #include <netinet6/scope6_var.h>
88 #endif
89 #include <netinet/tcp.h>
90 #include <netinet/tcp_fsm.h>
91 #include <netinet/tcp_seq.h>
92 #include <netinet/tcp_timer.h>
93 #include <netinet/tcp_var.h>
94 #include <netinet/tcp_log_buf.h>
95 #include <netinet/tcpip.h>
96 #include <netinet/cc/cc.h>
97 #include <netinet/tcp_fastopen.h>
98 #include <netinet/tcp_hpts.h>
99 #ifdef TCP_OFFLOAD
100 #include <netinet/tcp_offload.h>
101 #endif
102 #include <netipsec/ipsec_support.h>
103 
104 #include <vm/vm.h>
105 #include <vm/vm_param.h>
106 #include <vm/pmap.h>
107 #include <vm/vm_extern.h>
108 #include <vm/vm_map.h>
109 #include <vm/vm_page.h>
110 
111 /*
112  * TCP protocol interface to socket abstraction.
113  */
114 #ifdef INET
115 static int	tcp_connect(struct tcpcb *, struct sockaddr_in *,
116 		    struct thread *td);
117 #endif /* INET */
118 #ifdef INET6
119 static int	tcp6_connect(struct tcpcb *, struct sockaddr_in6 *,
120 		    struct thread *td);
121 #endif /* INET6 */
122 static void	tcp_disconnect(struct tcpcb *);
123 static void	tcp_usrclosed(struct tcpcb *);
124 static void	tcp_fill_info(const struct tcpcb *, struct tcp_info *);
125 
126 static int	tcp_pru_options_support(struct tcpcb *tp, int flags);
127 
128 static void
129 tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error)
130 {
131 	struct tcp_log_buffer *lgb;
132 
133 	KASSERT(tp != NULL, ("tcp_bblog_pru: tp == NULL"));
134 	INP_WLOCK_ASSERT(tptoinpcb(tp));
135 	if (tcp_bblogging_on(tp)) {
136 		lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_PRU, error,
137 		    0, NULL, false, NULL, NULL, 0, NULL);
138 	} else {
139 		lgb = NULL;
140 	}
141 	if (lgb != NULL) {
142 		if (error >= 0) {
143 			lgb->tlb_errno = (uint32_t)error;
144 		}
145 		lgb->tlb_flex1 = pru;
146 	}
147 }
148 
149 /*
150  * TCP attaches to socket via pr_attach(), reserving space,
151  * and an internet control block.
152  */
153 static int
154 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
155 {
156 	struct inpcb *inp;
157 	struct tcpcb *tp = NULL;
158 	int error;
159 
160 	inp = sotoinpcb(so);
161 	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
162 
163 	error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
164 	if (error)
165 		goto out;
166 
167 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
168 	so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE);
169 	error = in_pcballoc(so, &V_tcbinfo);
170 	if (error)
171 		goto out;
172 	inp = sotoinpcb(so);
173 	tp = tcp_newtcpcb(inp, NULL);
174 	if (tp == NULL) {
175 		error = ENOBUFS;
176 		in_pcbfree(inp);
177 		goto out;
178 	}
179 	tp->t_state = TCPS_CLOSED;
180 	tcp_bblog_pru(tp, PRU_ATTACH, error);
181 	INP_WUNLOCK(inp);
182 	TCPSTATES_INC(TCPS_CLOSED);
183 out:
184 	TCP_PROBE2(debug__user, tp, PRU_ATTACH);
185 	return (error);
186 }
187 
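/*
 * Illustrative userland sketch (hypothetical, for clarity): pr_attach is
 * reached from the generic socket layer when a process creates a TCP
 * socket, so a minimal program that ends up in tcp_usr_attach() would do:
 *
 *	int s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
 *	if (s == -1)
 *		err(1, "socket");
 *
 * The buffer reservations above come from V_tcp_sendspace and
 * V_tcp_recvspace, i.e. the net.inet.tcp.sendspace and
 * net.inet.tcp.recvspace sysctls.
 */
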
188 /*
189  * tcp_usr_detach is called when the socket layer loses its final reference
190  * to the socket, be it a file descriptor reference, a reference from TCP,
191  * etc.  At this point, there is only one case in which we will keep around
192  * inpcb state: time wait.
193  */
194 static void
195 tcp_usr_detach(struct socket *so)
196 {
197 	struct inpcb *inp;
198 	struct tcpcb *tp;
199 
200 	inp = sotoinpcb(so);
201 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
202 	INP_WLOCK(inp);
203 	KASSERT(so->so_pcb == inp && inp->inp_socket == so,
204 		("%s: socket %p inp %p mismatch", __func__, so, inp));
205 
206 	tp = intotcpcb(inp);
207 
208 	KASSERT(inp->inp_flags & INP_DROPPED ||
209 	    tp->t_state < TCPS_SYN_SENT,
210 	    ("%s: inp %p not dropped or embryonic", __func__, inp));
211 
212 	tcp_discardcb(tp);
213 	in_pcbfree(inp);
214 }
215 
216 #ifdef INET
217 /*
218  * Give the socket an address.
219  */
220 static int
221 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
222 {
223 	int error = 0;
224 	struct inpcb *inp;
225 	struct tcpcb *tp;
226 	struct sockaddr_in *sinp;
227 
228 	inp = sotoinpcb(so);
229 	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
230 	INP_WLOCK(inp);
231 	if (inp->inp_flags & INP_DROPPED) {
232 		INP_WUNLOCK(inp);
233 		return (EINVAL);
234 	}
235 	tp = intotcpcb(inp);
236 
237 	sinp = (struct sockaddr_in *)nam;
238 	if (nam->sa_family != AF_INET) {
239 		/*
240 		 * Preserve compatibility with old programs.
241 		 */
242 		if (nam->sa_family != AF_UNSPEC ||
243 		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
244 		    sinp->sin_addr.s_addr != INADDR_ANY) {
245 			error = EAFNOSUPPORT;
246 			goto out;
247 		}
248 		nam->sa_family = AF_INET;
249 	}
250 	if (nam->sa_len != sizeof(*sinp)) {
251 		error = EINVAL;
252 		goto out;
253 	}
254 	/*
255 	 * Must check for multicast addresses and disallow binding
256 	 * to them.
257 	 */
258 	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
259 		error = EAFNOSUPPORT;
260 		goto out;
261 	}
262 	INP_HASH_WLOCK(&V_tcbinfo);
263 	error = in_pcbbind(inp, sinp, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
264 	    td->td_ucred);
265 	INP_HASH_WUNLOCK(&V_tcbinfo);
266 out:
267 	tcp_bblog_pru(tp, PRU_BIND, error);
268 	TCP_PROBE2(debug__user, tp, PRU_BIND);
269 	INP_WUNLOCK(inp);
270 
271 	return (error);
272 }
273 #endif /* INET */
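
/*
 * Illustrative userland sketch (hypothetical, for clarity): tcp_usr_bind()
 * above is the pr_bind handler behind bind(2) on an IPv4 TCP socket, e.g.:
 *
 *	struct sockaddr_in sin = {
 *		.sin_len = sizeof(sin),
 *		.sin_family = AF_INET,
 *		.sin_port = htons(8080),
 *		.sin_addr = { .s_addr = htonl(INADDR_ANY) },
 *	};
 *	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1)
 *		err(1, "bind");
 *
 * Multicast addresses are rejected, and the actual address/port
 * reservation is done by in_pcbbind() under the pcbinfo hash lock.
 */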
274 
275 #ifdef INET6
276 static int
277 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
278 {
279 	int error = 0;
280 	struct inpcb *inp;
281 	struct tcpcb *tp;
282 	struct sockaddr_in6 *sin6;
283 	u_char vflagsav;
284 
285 	inp = sotoinpcb(so);
286 	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
287 	INP_WLOCK(inp);
288 	if (inp->inp_flags & INP_DROPPED) {
289 		INP_WUNLOCK(inp);
290 		return (EINVAL);
291 	}
292 	tp = intotcpcb(inp);
293 
294 	vflagsav = inp->inp_vflag;
295 
296 	sin6 = (struct sockaddr_in6 *)nam;
297 	if (nam->sa_family != AF_INET6) {
298 		error = EAFNOSUPPORT;
299 		goto out;
300 	}
301 	if (nam->sa_len != sizeof(*sin6)) {
302 		error = EINVAL;
303 		goto out;
304 	}
305 	/*
306 	 * Must check for multicast addresses and disallow binding
307 	 * to them.
308 	 */
309 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
310 		error = EAFNOSUPPORT;
311 		goto out;
312 	}
313 
314 	INP_HASH_WLOCK(&V_tcbinfo);
315 	inp->inp_vflag &= ~INP_IPV4;
316 	inp->inp_vflag |= INP_IPV6;
317 #ifdef INET
318 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
319 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
320 			inp->inp_vflag |= INP_IPV4;
321 		else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
322 			struct sockaddr_in sin;
323 
324 			in6_sin6_2_sin(&sin, sin6);
325 			if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
326 				error = EAFNOSUPPORT;
327 				INP_HASH_WUNLOCK(&V_tcbinfo);
328 				goto out;
329 			}
330 			inp->inp_vflag |= INP_IPV4;
331 			inp->inp_vflag &= ~INP_IPV6;
332 			error = in_pcbbind(inp, &sin, 0, td->td_ucred);
333 			INP_HASH_WUNLOCK(&V_tcbinfo);
334 			goto out;
335 		}
336 	}
337 #endif
338 	error = in6_pcbbind(inp, sin6, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
339 	    td->td_ucred);
340 	INP_HASH_WUNLOCK(&V_tcbinfo);
341 out:
342 	if (error != 0)
343 		inp->inp_vflag = vflagsav;
344 	tcp_bblog_pru(tp, PRU_BIND, error);
345 	TCP_PROBE2(debug__user, tp, PRU_BIND);
346 	INP_WUNLOCK(inp);
347 	return (error);
348 }
349 #endif /* INET6 */
350 
351 #ifdef INET
352 /*
353  * Prepare to accept connections.
354  */
355 static int
356 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
357 {
358 	struct inpcb *inp;
359 	struct tcpcb *tp;
360 	int error = 0;
361 	bool already_listening;
362 
363 	inp = sotoinpcb(so);
364 	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
365 	INP_WLOCK(inp);
366 	if (inp->inp_flags & INP_DROPPED) {
367 		INP_WUNLOCK(inp);
368 		return (EINVAL);
369 	}
370 	tp = intotcpcb(inp);
371 
372 	SOCK_LOCK(so);
373 	already_listening = SOLISTENING(so);
374 	error = solisten_proto_check(so);
375 	if (error != 0) {
376 		SOCK_UNLOCK(so);
377 		goto out;
378 	}
379 	if (inp->inp_lport == 0) {
380 		INP_HASH_WLOCK(&V_tcbinfo);
381 		error = in_pcbbind(inp, NULL,
382 		    V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
383 		INP_HASH_WUNLOCK(&V_tcbinfo);
384 	}
385 	if (error == 0) {
386 		tcp_state_change(tp, TCPS_LISTEN);
387 		solisten_proto(so, backlog);
388 #ifdef TCP_OFFLOAD
389 		if ((so->so_options & SO_NO_OFFLOAD) == 0)
390 			tcp_offload_listen_start(tp);
391 #endif
392 	} else {
393 		solisten_proto_abort(so);
394 	}
395 	SOCK_UNLOCK(so);
396 	if (already_listening)
397 		goto out;
398 
399 	if (error == 0)
400 		in_pcblisten(inp);
401 	if (tp->t_flags & TF_FASTOPEN)
402 		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
403 
404 out:
405 	tcp_bblog_pru(tp, PRU_LISTEN, error);
406 	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
407 	INP_WUNLOCK(inp);
408 	return (error);
409 }
410 #endif /* INET */
411 
412 #ifdef INET6
413 static int
414 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
415 {
416 	struct inpcb *inp;
417 	struct tcpcb *tp;
418 	u_char vflagsav;
419 	int error = 0;
420 	bool already_listening;
421 
422 	inp = sotoinpcb(so);
423 	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
424 	INP_WLOCK(inp);
425 	if (inp->inp_flags & INP_DROPPED) {
426 		INP_WUNLOCK(inp);
427 		return (EINVAL);
428 	}
429 	tp = intotcpcb(inp);
430 
431 	vflagsav = inp->inp_vflag;
432 
433 	SOCK_LOCK(so);
434 	already_listening = SOLISTENING(so);
435 	error = solisten_proto_check(so);
436 	if (error != 0) {
437 		SOCK_UNLOCK(so);
438 		goto out;
439 	}
440 	INP_HASH_WLOCK(&V_tcbinfo);
441 	if (inp->inp_lport == 0) {
442 		inp->inp_vflag &= ~INP_IPV4;
443 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
444 			inp->inp_vflag |= INP_IPV4;
445 		error = in6_pcbbind(inp, NULL,
446 		    V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
447 	}
448 	INP_HASH_WUNLOCK(&V_tcbinfo);
449 	if (error == 0) {
450 		tcp_state_change(tp, TCPS_LISTEN);
451 		solisten_proto(so, backlog);
452 #ifdef TCP_OFFLOAD
453 		if ((so->so_options & SO_NO_OFFLOAD) == 0)
454 			tcp_offload_listen_start(tp);
455 #endif
456 	} else {
457 		solisten_proto_abort(so);
458 	}
459 	SOCK_UNLOCK(so);
460 	if (already_listening)
461 		goto out;
462 
463 	if (error == 0)
464 		in_pcblisten(inp);
465 	if (tp->t_flags & TF_FASTOPEN)
466 		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
467 
468 	if (error != 0)
469 		inp->inp_vflag = vflagsav;
470 
471 out:
472 	tcp_bblog_pru(tp, PRU_LISTEN, error);
473 	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
474 	INP_WUNLOCK(inp);
475 	return (error);
476 }
477 #endif /* INET6 */
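
/*
 * Illustrative userland sketch (hypothetical, for clarity): both listen
 * handlers above back the listen(2) system call; if the socket is not yet
 * bound they first pick an ephemeral port via in_pcbbind()/in6_pcbbind().
 * A minimal sequence is:
 *
 *	if (listen(s, 128) == -1)
 *		err(1, "listen");
 *
 * where 128 becomes the backlog passed to solisten_proto().
 */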
478 
479 #ifdef INET
480 /*
481  * Initiate connection to peer.
482  * Create a template for use in transmissions on this connection.
483  * Enter SYN_SENT state, and mark socket as connecting.
484  * Start keep-alive timer, and seed output sequence space.
485  * Send initial segment on connection.
486  */
487 static int
488 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
489 {
490 	struct epoch_tracker et;
491 	int error = 0;
492 	struct inpcb *inp;
493 	struct tcpcb *tp;
494 	struct sockaddr_in *sinp;
495 
496 	inp = sotoinpcb(so);
497 	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
498 	INP_WLOCK(inp);
499 	if (inp->inp_flags & INP_DROPPED) {
500 		INP_WUNLOCK(inp);
501 		return (ECONNREFUSED);
502 	}
503 	tp = intotcpcb(inp);
504 
505 	sinp = (struct sockaddr_in *)nam;
506 	if (nam->sa_family != AF_INET) {
507 		error = EAFNOSUPPORT;
508 		goto out;
509 	}
510 	if (nam->sa_len != sizeof (*sinp)) {
511 		error = EINVAL;
512 		goto out;
513 	}
514 	/*
515 	 * Must disallow TCP ``connections'' to multicast addresses.
516 	 */
517 	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
518 		error = EAFNOSUPPORT;
519 		goto out;
520 	}
521 	if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
522 		error = EACCES;
523 		goto out;
524 	}
525 	if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
526 		goto out;
527 	if (SOLISTENING(so)) {
528 		error = EOPNOTSUPP;
529 		goto out;
530 	}
531 	NET_EPOCH_ENTER(et);
532 	if ((error = tcp_connect(tp, sinp, td)) != 0)
533 		goto out_in_epoch;
534 #ifdef TCP_OFFLOAD
535 	if (registered_toedevs > 0 &&
536 	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
537 	    (error = tcp_offload_connect(so, nam)) == 0)
538 		goto out_in_epoch;
539 #endif
540 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
541 	error = tcp_output(tp);
542 	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
543 	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
544 out_in_epoch:
545 	NET_EPOCH_EXIT(et);
546 out:
547 	tcp_bblog_pru(tp, PRU_CONNECT, error);
548 	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
549 	INP_WUNLOCK(inp);
550 	return (error);
551 }
552 #endif /* INET */
553 
554 #ifdef INET6
555 static int
556 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
557 {
558 	struct epoch_tracker et;
559 	int error = 0;
560 	struct inpcb *inp;
561 	struct tcpcb *tp;
562 	struct sockaddr_in6 *sin6;
563 	u_int8_t incflagsav;
564 	u_char vflagsav;
565 
566 	inp = sotoinpcb(so);
567 	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
568 	INP_WLOCK(inp);
569 	if (inp->inp_flags & INP_DROPPED) {
570 		INP_WUNLOCK(inp);
571 		return (ECONNREFUSED);
572 	}
573 	tp = intotcpcb(inp);
574 
575 	vflagsav = inp->inp_vflag;
576 	incflagsav = inp->inp_inc.inc_flags;
577 
578 	sin6 = (struct sockaddr_in6 *)nam;
579 	if (nam->sa_family != AF_INET6) {
580 		error = EAFNOSUPPORT;
581 		goto out;
582 	}
583 	if (nam->sa_len != sizeof (*sin6)) {
584 		error = EINVAL;
585 		goto out;
586 	}
587 	/*
588 	 * Must disallow TCP ``connections'' to multicast addresses.
589 	 */
590 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
591 		error = EAFNOSUPPORT;
592 		goto out;
593 	}
594 	if (SOLISTENING(so)) {
595 		error = EOPNOTSUPP;
596 		goto out;
597 	}
598 #ifdef INET
599 	/*
600 	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
601 	 * therefore probably require the hash lock, which isn't held here.
602 	 * Is this a significant problem?
603 	 */
604 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
605 		struct sockaddr_in sin;
606 
607 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
608 			error = EINVAL;
609 			goto out;
610 		}
611 		if ((inp->inp_vflag & INP_IPV4) == 0) {
612 			error = EAFNOSUPPORT;
613 			goto out;
614 		}
615 
616 		in6_sin6_2_sin(&sin, sin6);
617 		if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
618 			error = EAFNOSUPPORT;
619 			goto out;
620 		}
621 		if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) {
622 			error = EACCES;
623 			goto out;
624 		}
625 		if ((error = prison_remote_ip4(td->td_ucred,
626 		    &sin.sin_addr)) != 0)
627 			goto out;
628 		inp->inp_vflag |= INP_IPV4;
629 		inp->inp_vflag &= ~INP_IPV6;
630 		NET_EPOCH_ENTER(et);
631 		if ((error = tcp_connect(tp, &sin, td)) != 0)
632 			goto out_in_epoch;
633 #ifdef TCP_OFFLOAD
634 		if (registered_toedevs > 0 &&
635 		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
636 		    (error = tcp_offload_connect(so, nam)) == 0)
637 			goto out_in_epoch;
638 #endif
639 		error = tcp_output(tp);
640 		goto out_in_epoch;
641 	} else {
642 		if ((inp->inp_vflag & INP_IPV6) == 0) {
643 			error = EAFNOSUPPORT;
644 			goto out;
645 		}
646 	}
647 #endif
648 	if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0)
649 		goto out;
650 	inp->inp_vflag &= ~INP_IPV4;
651 	inp->inp_vflag |= INP_IPV6;
652 	inp->inp_inc.inc_flags |= INC_ISIPV6;
653 	NET_EPOCH_ENTER(et);
654 	if ((error = tcp6_connect(tp, sin6, td)) != 0)
655 		goto out_in_epoch;
656 #ifdef TCP_OFFLOAD
657 	if (registered_toedevs > 0 &&
658 	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
659 	    (error = tcp_offload_connect(so, nam)) == 0)
660 		goto out_in_epoch;
661 #endif
662 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
663 	error = tcp_output(tp);
664 out_in_epoch:
665 	NET_EPOCH_EXIT(et);
666 out:
667 	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
668 	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
669 	/*
670 	 * If the implicit bind in the connect call fails, restore
671 	 * the flags we modified.
672 	 */
673 	if (error != 0 && inp->inp_lport == 0) {
674 		inp->inp_vflag = vflagsav;
675 		inp->inp_inc.inc_flags = incflagsav;
676 	}
677 
678 	tcp_bblog_pru(tp, PRU_CONNECT, error);
679 	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
680 	INP_WUNLOCK(inp);
681 	return (error);
682 }
683 #endif /* INET6 */
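
/*
 * Illustrative userland sketch (hypothetical, for clarity): the connect
 * handlers above implement connect(2).  A typical non-blocking client path
 * (192.0.2.10 is a documentation address) might be:
 *
 *	struct sockaddr_in peer = {
 *		.sin_len = sizeof(peer),
 *		.sin_family = AF_INET,
 *		.sin_port = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.10", &peer.sin_addr);
 *	if (connect(s, (struct sockaddr *)&peer, sizeof(peer)) == -1 &&
 *	    errno != EINPROGRESS)
 *		err(1, "connect");
 *
 * tcp_connect()/tcp6_connect() further down pick the local address and
 * port, move the connection to SYN_SENT, and tcp_output() emits the
 * initial SYN.
 */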
684 
685 /*
686  * Initiate disconnect from peer.
687  * If connection never passed embryonic stage, just drop;
688  * else if we don't need to let data drain, then can just drop anyway,
689  * else have to begin TCP shutdown process: mark socket disconnecting,
690  * drain unread data, state switch to reflect user close, and
691  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
692  * when peer sends FIN and acks ours.
693  *
694  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
695  */
696 static int
697 tcp_usr_disconnect(struct socket *so)
698 {
699 	struct inpcb *inp;
700 	struct tcpcb *tp = NULL;
701 	struct epoch_tracker et;
702 
703 	NET_EPOCH_ENTER(et);
704 	inp = sotoinpcb(so);
705 	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
706 	INP_WLOCK(inp);
707 	tp = intotcpcb(inp);
708 
709 	if (tp->t_state == TCPS_TIME_WAIT)
710 		goto out;
711 	tcp_disconnect(tp);
712 out:
713 	tcp_bblog_pru(tp, PRU_DISCONNECT, 0);
714 	TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
715 	INP_WUNLOCK(inp);
716 	NET_EPOCH_EXIT(et);
717 	return (0);
718 }
719 
720 #ifdef INET
721 /*
722  * Accept a connection.  Essentially all the work is done at higher levels;
723  * just return the address of the peer, storing through addr.
724  */
725 static int
726 tcp_usr_accept(struct socket *so, struct sockaddr *sa)
727 {
728 	struct inpcb *inp;
729 	struct tcpcb *tp;
730 	int error = 0;
731 
732 	inp = sotoinpcb(so);
733 	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
734 	INP_WLOCK(inp);
735 	if (inp->inp_flags & INP_DROPPED) {
736 		INP_WUNLOCK(inp);
737 		return (ECONNABORTED);
738 	}
739 	tp = intotcpcb(inp);
740 
741 	if (so->so_state & SS_ISDISCONNECTED)
742 		error = ECONNABORTED;
743 	else
744 		*(struct sockaddr_in *)sa = (struct sockaddr_in ){
745 			.sin_family = AF_INET,
746 			.sin_len = sizeof(struct sockaddr_in),
747 			.sin_port = inp->inp_fport,
748 			.sin_addr = inp->inp_faddr,
749 		};
750 	tcp_bblog_pru(tp, PRU_ACCEPT, error);
751 	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
752 	INP_WUNLOCK(inp);
753 
754 	return (error);
755 }
756 #endif /* INET */
757 
758 #ifdef INET6
759 static int
760 tcp6_usr_accept(struct socket *so, struct sockaddr *sa)
761 {
762 	struct inpcb *inp;
763 	struct tcpcb *tp;
764 	int error = 0;
765 
766 	inp = sotoinpcb(so);
767 	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
768 	INP_WLOCK(inp);
769 	if (inp->inp_flags & INP_DROPPED) {
770 		INP_WUNLOCK(inp);
771 		return (ECONNABORTED);
772 	}
773 	tp = intotcpcb(inp);
774 
775 	if (so->so_state & SS_ISDISCONNECTED) {
776 		error = ECONNABORTED;
777 	} else {
778 		if (inp->inp_vflag & INP_IPV4) {
779 			struct sockaddr_in sin = {
780 				.sin_family = AF_INET,
781 				.sin_len = sizeof(struct sockaddr_in),
782 				.sin_port = inp->inp_fport,
783 				.sin_addr = inp->inp_faddr,
784 			};
785 			in6_sin_2_v4mapsin6(&sin, (struct sockaddr_in6 *)sa);
786 		} else {
787 			*(struct sockaddr_in6 *)sa = (struct sockaddr_in6 ){
788 				.sin6_family = AF_INET6,
789 				.sin6_len = sizeof(struct sockaddr_in6),
790 				.sin6_port = inp->inp_fport,
791 				.sin6_addr = inp->in6p_faddr,
792 			};
793 			/* XXX: should catch errors */
794 			(void)sa6_recoverscope((struct sockaddr_in6 *)sa);
795 		}
796 	}
797 
798 	tcp_bblog_pru(tp, PRU_ACCEPT, error);
799 	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
800 	INP_WUNLOCK(inp);
801 
802 	return (error);
803 }
804 #endif /* INET6 */
805 
806 /*
807  * Mark the connection as being incapable of further output.
808  */
809 static int
810 tcp_usr_shutdown(struct socket *so, enum shutdown_how how)
811 {
812 	struct epoch_tracker et;
813 	struct inpcb *inp = sotoinpcb(so);
814 	struct tcpcb *tp = intotcpcb(inp);
815 	int error = 0;
816 
817 	SOCK_LOCK(so);
818 	if (SOLISTENING(so)) {
819 		if (how != SHUT_WR) {
820 			so->so_error = ECONNABORTED;
821 			solisten_wakeup(so);	/* unlocks so */
822 		} else
823 			SOCK_UNLOCK(so);
824 		return (ENOTCONN);
825 	} else if ((so->so_state &
826 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
827 		SOCK_UNLOCK(so);
828 		return (ENOTCONN);
829 	}
830 	SOCK_UNLOCK(so);
831 
832 	switch (how) {
833 	case SHUT_RD:
834 		sorflush(so);
835 		break;
836 	case SHUT_RDWR:
837 		sorflush(so);
838 		/* FALLTHROUGH */
839 	case SHUT_WR:
840 		/*
841 		 * XXXGL: mimicking old soshutdown() here. But shouldn't we
842 		 * return ECONNRESET for SHUT_RD as well?
843 		 */
844 		INP_WLOCK(inp);
845 		if (inp->inp_flags & INP_DROPPED) {
846 			INP_WUNLOCK(inp);
847 			return (ECONNRESET);
848 		}
849 
850 		socantsendmore(so);
851 		NET_EPOCH_ENTER(et);
852 		tcp_usrclosed(tp);
853 		error = tcp_output_nodrop(tp);
854 		tcp_bblog_pru(tp, PRU_SHUTDOWN, error);
855 		TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
856 		error = tcp_unlock_or_drop(tp, error);
857 		NET_EPOCH_EXIT(et);
858 	}
859 	wakeup(&so->so_timeo);
860 
861 	return (error);
862 }
863 
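/*
 * Illustrative userland sketch (hypothetical, for clarity): shutdown(2)
 * with SHUT_WR maps to the SHUT_WR case above and queues a FIN via
 * tcp_usrclosed()/tcp_output(), while SHUT_RD only flushes the receive
 * buffer:
 *
 *	if (shutdown(s, SHUT_WR) == -1)
 *		err(1, "shutdown");
 */
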
864 /*
865  * After a receive, possibly send window update to peer.
866  */
867 static int
868 tcp_usr_rcvd(struct socket *so, int flags)
869 {
870 	struct epoch_tracker et;
871 	struct inpcb *inp;
872 	struct tcpcb *tp;
873 	int outrv = 0, error = 0;
874 
875 	inp = sotoinpcb(so);
876 	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
877 	INP_WLOCK(inp);
878 	if (inp->inp_flags & INP_DROPPED) {
879 		INP_WUNLOCK(inp);
880 		return (ECONNRESET);
881 	}
882 	tp = intotcpcb(inp);
883 
884 	NET_EPOCH_ENTER(et);
885 	/*
886 	 * For passively-created TFO connections, don't attempt a window
887 	 * update while still in SYN_RECEIVED as this may trigger an early
888 	 * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
889 	 * application response data, or failing that, when the DELACK timer
890 	 * expires.
891 	 */
892 	if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED))
893 		goto out;
894 #ifdef TCP_OFFLOAD
895 	if (tp->t_flags & TF_TOE)
896 		tcp_offload_rcvd(tp);
897 	else
898 #endif
899 		outrv = tcp_output_nodrop(tp);
900 out:
901 	tcp_bblog_pru(tp, PRU_RCVD, error);
902 	TCP_PROBE2(debug__user, tp, PRU_RCVD);
903 	(void) tcp_unlock_or_drop(tp, outrv);
904 	NET_EPOCH_EXIT(et);
905 	return (error);
906 }
907 
908 /*
909  * Do a send by putting data in output queue and updating urgent
910  * marker if URG set.  Possibly send more data.  Unlike the other
911  * pr_*() routines, the mbuf chains are our responsibility.  We
912  * must either enqueue them or free them.  The other pr_*() routines
913  * generally are caller-frees.
914  */
915 static int
916 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
917     struct sockaddr *nam, struct mbuf *control, struct thread *td)
918 {
919 	struct epoch_tracker et;
920 	int error = 0;
921 	struct inpcb *inp;
922 	struct tcpcb *tp;
923 #ifdef INET
924 #ifdef INET6
925 	struct sockaddr_in sin;
926 #endif
927 	struct sockaddr_in *sinp;
928 #endif
929 #ifdef INET6
930 	struct sockaddr_in6 *sin6;
931 	int isipv6;
932 #endif
933 	u_int8_t incflagsav;
934 	u_char vflagsav;
935 	bool restoreflags;
936 
937 	inp = sotoinpcb(so);
938 	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
939 	INP_WLOCK(inp);
940 	if (inp->inp_flags & INP_DROPPED) {
941 		if (m != NULL && (flags & PRUS_NOTREADY) == 0)
942 			m_freem(m);
943 		INP_WUNLOCK(inp);
944 		return (ECONNRESET);
945 	}
946 	tp = intotcpcb(inp);
947 
948 	vflagsav = inp->inp_vflag;
949 	incflagsav = inp->inp_inc.inc_flags;
950 	restoreflags = false;
951 
952 	NET_EPOCH_ENTER(et);
953 	if (control != NULL) {
954 		/* TCP doesn't do control messages (rights, creds, etc) */
955 		if (control->m_len > 0) {
956 			m_freem(control);
957 			error = EINVAL;
958 			goto out;
959 		}
960 		m_freem(control);	/* empty control, just free it */
961 	}
962 
963 	if ((flags & PRUS_OOB) != 0 &&
964 	    (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
965 		goto out;
966 
967 	if (nam != NULL && tp->t_state < TCPS_SYN_SENT) {
968 		if (tp->t_state == TCPS_LISTEN) {
969 			error = EINVAL;
970 			goto out;
971 		}
972 		switch (nam->sa_family) {
973 #ifdef INET
974 		case AF_INET:
975 			sinp = (struct sockaddr_in *)nam;
976 			if (sinp->sin_len != sizeof(struct sockaddr_in)) {
977 				error = EINVAL;
978 				goto out;
979 			}
980 			if ((inp->inp_vflag & INP_IPV6) != 0) {
981 				error = EAFNOSUPPORT;
982 				goto out;
983 			}
984 			if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
985 				error = EAFNOSUPPORT;
986 				goto out;
987 			}
988 			if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
989 				error = EACCES;
990 				goto out;
991 			}
992 			if ((error = prison_remote_ip4(td->td_ucred,
993 			    &sinp->sin_addr)))
994 				goto out;
995 #ifdef INET6
996 			isipv6 = 0;
997 #endif
998 			break;
999 #endif /* INET */
1000 #ifdef INET6
1001 		case AF_INET6:
1002 			sin6 = (struct sockaddr_in6 *)nam;
1003 			if (sin6->sin6_len != sizeof(*sin6)) {
1004 				error = EINVAL;
1005 				goto out;
1006 			}
1007 			if ((inp->inp_vflag & INP_IPV6PROTO) == 0) {
1008 				error = EAFNOSUPPORT;
1009 				goto out;
1010 			}
1011 			if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
1012 				error = EAFNOSUPPORT;
1013 				goto out;
1014 			}
1015 			if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
1016 #ifdef INET
1017 				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
1018 					error = EINVAL;
1019 					goto out;
1020 				}
1021 				if ((inp->inp_vflag & INP_IPV4) == 0) {
1022 					error = EAFNOSUPPORT;
1023 					goto out;
1024 				}
1025 				restoreflags = true;
1026 				inp->inp_vflag &= ~INP_IPV6;
1027 				sinp = &sin;
1028 				in6_sin6_2_sin(sinp, sin6);
1029 				if (IN_MULTICAST(
1030 				    ntohl(sinp->sin_addr.s_addr))) {
1031 					error = EAFNOSUPPORT;
1032 					goto out;
1033 				}
1034 				if ((error = prison_remote_ip4(td->td_ucred,
1035 				    &sinp->sin_addr)))
1036 					goto out;
1037 				isipv6 = 0;
1038 #else /* !INET */
1039 				error = EAFNOSUPPORT;
1040 				goto out;
1041 #endif /* INET */
1042 			} else {
1043 				if ((inp->inp_vflag & INP_IPV6) == 0) {
1044 					error = EAFNOSUPPORT;
1045 					goto out;
1046 				}
1047 				restoreflags = true;
1048 				inp->inp_vflag &= ~INP_IPV4;
1049 				inp->inp_inc.inc_flags |= INC_ISIPV6;
1050 				if ((error = prison_remote_ip6(td->td_ucred,
1051 				    &sin6->sin6_addr)))
1052 					goto out;
1053 				isipv6 = 1;
1054 			}
1055 			break;
1056 #endif /* INET6 */
1057 		default:
1058 			error = EAFNOSUPPORT;
1059 			goto out;
1060 		}
1061 	}
1062 	if (!(flags & PRUS_OOB)) {
1063 		if (tp->t_acktime == 0)
1064 			tp->t_acktime = ticks;
1065 		sbappendstream(&so->so_snd, m, flags);
1066 		m = NULL;
1067 		if (nam && tp->t_state < TCPS_SYN_SENT) {
1068 			KASSERT(tp->t_state == TCPS_CLOSED,
1069 			    ("%s: tp %p is listening", __func__, tp));
1070 
1071 			/*
1072 			 * Do implied connect if not yet connected,
1073 			 * initialize window to default value, and
1074 			 * initialize maxseg using peer's cached MSS.
1075 			 */
1076 #ifdef INET6
1077 			if (isipv6)
1078 				error = tcp6_connect(tp, sin6, td);
1079 #endif /* INET6 */
1080 #if defined(INET6) && defined(INET)
1081 			else
1082 #endif
1083 #ifdef INET
1084 				error = tcp_connect(tp, sinp, td);
1085 #endif
1086 			/*
1087 			 * The bind operation in tcp_connect succeeded. We
1088 			 * no longer want to restore the flags if later
1089 			 * operations fail.
1090 			 */
1091 			if (error == 0 || inp->inp_lport != 0)
1092 				restoreflags = false;
1093 
1094 			if (error) {
1095 				/* m is freed if PRUS_NOTREADY is unset. */
1096 				sbflush(&so->so_snd);
1097 				goto out;
1098 			}
1099 			if (tp->t_flags & TF_FASTOPEN)
1100 				tcp_fastopen_connect(tp);
1101 			else {
1102 				tp->snd_wnd = TTCP_CLIENT_SND_WND;
1103 				tcp_mss(tp, -1);
1104 			}
1105 		}
1106 		if (flags & PRUS_EOF) {
1107 			/*
1108 			 * Close the send side of the connection after
1109 			 * the data is sent.
1110 			 */
1111 			socantsendmore(so);
1112 			tcp_usrclosed(tp);
1113 		}
1114 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1115 		    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
1116 		    (tp->t_fbyte_out == 0) &&
1117 		    (so->so_snd.sb_ccc > 0)) {
1118 			tp->t_fbyte_out = ticks;
1119 			if (tp->t_fbyte_out == 0)
1120 				tp->t_fbyte_out = 1;
1121 			if (tp->t_fbyte_out && tp->t_fbyte_in)
1122 				tp->t_flags2 |= TF2_FBYTES_COMPLETE;
1123 		}
1124 		if (!(inp->inp_flags & INP_DROPPED) &&
1125 		    !(flags & PRUS_NOTREADY)) {
1126 			if (flags & PRUS_MORETOCOME)
1127 				tp->t_flags |= TF_MORETOCOME;
1128 			error = tcp_output_nodrop(tp);
1129 			if (flags & PRUS_MORETOCOME)
1130 				tp->t_flags &= ~TF_MORETOCOME;
1131 		}
1132 	} else {
1133 		/*
1134 		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
1135 		 */
1136 		SOCK_SENDBUF_LOCK(so);
1137 		if (sbspace(&so->so_snd) < -512) {
1138 			SOCK_SENDBUF_UNLOCK(so);
1139 			error = ENOBUFS;
1140 			goto out;
1141 		}
1142 		/*
1143 		 * According to RFC961 (Assigned Protocols),
1144 		 * the urgent pointer points to the last octet
1145 		 * of urgent data.  We continue, however,
1146 		 * to consider it to indicate the first octet
1147 		 * of data past the urgent section.
1148 		 * Otherwise, snd_up should be one lower.
1149 		 */
1150 		if (tp->t_acktime == 0)
1151 			tp->t_acktime = ticks;
1152 		sbappendstream_locked(&so->so_snd, m, flags);
1153 		SOCK_SENDBUF_UNLOCK(so);
1154 		m = NULL;
1155 		if (nam && tp->t_state < TCPS_SYN_SENT) {
1156 			/*
1157 			 * Do implied connect if not yet connected,
1158 			 * initialize window to default value, and
1159 			 * initialize maxseg using peer's cached MSS.
1160 			 */
1161 
1162 			/*
1163 			 * Not going to contemplate SYN|URG
1164 			 */
1165 			if (tp->t_flags & TF_FASTOPEN)
1166 				tp->t_flags &= ~TF_FASTOPEN;
1167 #ifdef INET6
1168 			if (isipv6)
1169 				error = tcp6_connect(tp, sin6, td);
1170 #endif /* INET6 */
1171 #if defined(INET6) && defined(INET)
1172 			else
1173 #endif
1174 #ifdef INET
1175 				error = tcp_connect(tp, sinp, td);
1176 #endif
1177 			/*
1178 			 * The bind operation in tcp_connect succeeded. We
1179 			 * no longer want to restore the flags if later
1180 			 * operations fail.
1181 			 */
1182 			if (error == 0 || inp->inp_lport != 0)
1183 				restoreflags = false;
1184 
1185 			if (error != 0) {
1186 				/* m is freed if PRUS_NOTREADY is unset. */
1187 				sbflush(&so->so_snd);
1188 				goto out;
1189 			}
1190 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
1191 			tcp_mss(tp, -1);
1192 		}
1193 		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
1194 		if ((flags & PRUS_NOTREADY) == 0) {
1195 			tp->t_flags |= TF_FORCEDATA;
1196 			error = tcp_output_nodrop(tp);
1197 			tp->t_flags &= ~TF_FORCEDATA;
1198 		}
1199 	}
1200 	TCP_LOG_EVENT(tp, NULL,
1201 	    &inp->inp_socket->so_rcv,
1202 	    &inp->inp_socket->so_snd,
1203 	    TCP_LOG_USERSEND, error,
1204 	    0, NULL, false);
1205 
1206 out:
1207 	/*
1208 	 * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is
1209 	 * responsible for freeing memory.
1210 	 */
1211 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
1212 		m_freem(m);
1213 
1214 	/*
1215 	 * If the request was unsuccessful and we changed flags,
1216 	 * restore the original flags.
1217 	 */
1218 	if (error != 0 && restoreflags) {
1219 		inp->inp_vflag = vflagsav;
1220 		inp->inp_inc.inc_flags = incflagsav;
1221 	}
1222 	tcp_bblog_pru(tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
1223 		      ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), error);
1224 	TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
1225 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1226 	error = tcp_unlock_or_drop(tp, error);
1227 	NET_EPOCH_EXIT(et);
1228 	return (error);
1229 }
1230 
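/*
 * Illustrative userland sketch (hypothetical, for clarity): the PRUS_OOB
 * path in tcp_usr_send() above is reached via send(2) with MSG_OOB, e.g.:
 *
 *	if (send(s, "!", 1, MSG_OOB) == -1)
 *		err(1, "send");
 *
 * which advances snd_up (the urgent pointer) past the queued byte before
 * forcing output with TF_FORCEDATA.
 */
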
1231 static int
1232 tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
1233 {
1234 	struct epoch_tracker et;
1235 	struct inpcb *inp;
1236 	struct tcpcb *tp;
1237 	int error;
1238 
1239 	inp = sotoinpcb(so);
1240 	INP_WLOCK(inp);
1241 	if (inp->inp_flags & INP_DROPPED) {
1242 		INP_WUNLOCK(inp);
1243 		mb_free_notready(m, count);
1244 		return (ECONNRESET);
1245 	}
1246 	tp = intotcpcb(inp);
1247 
1248 	SOCK_SENDBUF_LOCK(so);
1249 	error = sbready(&so->so_snd, m, count);
1250 	SOCK_SENDBUF_UNLOCK(so);
1251 	if (error) {
1252 		INP_WUNLOCK(inp);
1253 		return (error);
1254 	}
1255 	NET_EPOCH_ENTER(et);
1256 	error = tcp_output_unlock(tp);
1257 	NET_EPOCH_EXIT(et);
1258 
1259 	return (error);
1260 }
1261 
1262 /*
1263  * Abort the TCP.  Drop the connection abruptly.
1264  */
1265 static void
1266 tcp_usr_abort(struct socket *so)
1267 {
1268 	struct inpcb *inp;
1269 	struct tcpcb *tp;
1270 	struct epoch_tracker et;
1271 
1272 	inp = sotoinpcb(so);
1273 	KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
1274 
1275 	NET_EPOCH_ENTER(et);
1276 	INP_WLOCK(inp);
1277 	KASSERT(inp->inp_socket != NULL,
1278 	    ("tcp_usr_abort: inp_socket == NULL"));
1279 
1280 	/*
1281 	 * If we still have full TCP state, and we're not dropped, drop.
1282 	 */
1283 	if (!(inp->inp_flags & INP_DROPPED)) {
1284 		tp = intotcpcb(inp);
1285 		tp = tcp_drop(tp, ECONNABORTED);
1286 		if (tp == NULL)
1287 			goto dropped;
1288 		tcp_bblog_pru(tp, PRU_ABORT, 0);
1289 		TCP_PROBE2(debug__user, tp, PRU_ABORT);
1290 	}
1291 	if (!(inp->inp_flags & INP_DROPPED)) {
1292 		soref(so);
1293 		inp->inp_flags |= INP_SOCKREF;
1294 	}
1295 	INP_WUNLOCK(inp);
1296 dropped:
1297 	NET_EPOCH_EXIT(et);
1298 }
1299 
1300 /*
1301  * TCP socket is closed.  Start friendly disconnect.
1302  */
1303 static void
1304 tcp_usr_close(struct socket *so)
1305 {
1306 	struct inpcb *inp;
1307 	struct tcpcb *tp;
1308 	struct epoch_tracker et;
1309 
1310 	inp = sotoinpcb(so);
1311 	KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
1312 
1313 	NET_EPOCH_ENTER(et);
1314 	INP_WLOCK(inp);
1315 	KASSERT(inp->inp_socket != NULL,
1316 	    ("tcp_usr_close: inp_socket == NULL"));
1317 
1318 	/*
1319 	 * If we are still connected and we're not dropped, initiate
1320 	 * a disconnect.
1321 	 */
1322 	if (!(inp->inp_flags & INP_DROPPED)) {
1323 		tp = intotcpcb(inp);
1324 		if (tp->t_state != TCPS_TIME_WAIT) {
1325 			tp->t_flags |= TF_CLOSED;
1326 			tcp_disconnect(tp);
1327 			tcp_bblog_pru(tp, PRU_CLOSE, 0);
1328 			TCP_PROBE2(debug__user, tp, PRU_CLOSE);
1329 		}
1330 	}
1331 	if (!(inp->inp_flags & INP_DROPPED)) {
1332 		soref(so);
1333 		inp->inp_flags |= INP_SOCKREF;
1334 	}
1335 	INP_WUNLOCK(inp);
1336 	NET_EPOCH_EXIT(et);
1337 }
1338 
1339 static int
1340 tcp_pru_options_support(struct tcpcb *tp, int flags)
1341 {
1342 	/*
1343 	 * If the specific TCP stack has a pru_options handler
1344 	 * specified, it may not support all of the PRU_XX
1345 	 * options, so we must ask it.
1346 	 * If the handler is not specified, then all
1347 	 * of the PRU_XX options are supported.
1348 	 */
1349 	int ret = 0;
1350 
1351 	if (tp->t_fb->tfb_pru_options) {
1352 		ret = (*tp->t_fb->tfb_pru_options)(tp, flags);
1353 	}
1354 	return (ret);
1355 }
1356 
1357 /*
1358  * Receive out-of-band data.
1359  */
1360 static int
1361 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1362 {
1363 	int error = 0;
1364 	struct inpcb *inp;
1365 	struct tcpcb *tp;
1366 
1367 	inp = sotoinpcb(so);
1368 	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
1369 	INP_WLOCK(inp);
1370 	if (inp->inp_flags & INP_DROPPED) {
1371 		INP_WUNLOCK(inp);
1372 		return (ECONNRESET);
1373 	}
1374 	tp = intotcpcb(inp);
1375 
1376 	error = tcp_pru_options_support(tp, PRUS_OOB);
1377 	if (error) {
1378 		goto out;
1379 	}
1380 	if ((so->so_oobmark == 0 &&
1381 	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1382 	    so->so_options & SO_OOBINLINE ||
1383 	    tp->t_oobflags & TCPOOB_HADDATA) {
1384 		error = EINVAL;
1385 		goto out;
1386 	}
1387 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1388 		error = EWOULDBLOCK;
1389 		goto out;
1390 	}
1391 	m->m_len = 1;
1392 	*mtod(m, caddr_t) = tp->t_iobc;
1393 	if ((flags & MSG_PEEK) == 0)
1394 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1395 
1396 out:
1397 	tcp_bblog_pru(tp, PRU_RCVOOB, error);
1398 	TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
1399 	INP_WUNLOCK(inp);
1400 	return (error);
1401 }
1402 
1403 #ifdef INET
1404 struct protosw tcp_protosw = {
1405 	.pr_type =		SOCK_STREAM,
1406 	.pr_protocol =		IPPROTO_TCP,
1407 	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
1408 				    PR_CAPATTACH,
1409 	.pr_ctloutput =		tcp_ctloutput,
1410 	.pr_abort =		tcp_usr_abort,
1411 	.pr_accept =		tcp_usr_accept,
1412 	.pr_attach =		tcp_usr_attach,
1413 	.pr_bind =		tcp_usr_bind,
1414 	.pr_connect =		tcp_usr_connect,
1415 	.pr_control =		in_control,
1416 	.pr_detach =		tcp_usr_detach,
1417 	.pr_disconnect =	tcp_usr_disconnect,
1418 	.pr_listen =		tcp_usr_listen,
1419 	.pr_peeraddr =		in_getpeeraddr,
1420 	.pr_rcvd =		tcp_usr_rcvd,
1421 	.pr_rcvoob =		tcp_usr_rcvoob,
1422 	.pr_send =		tcp_usr_send,
1423 	.pr_sendfile_wait =	sendfile_wait_generic,
1424 	.pr_ready =		tcp_usr_ready,
1425 	.pr_shutdown =		tcp_usr_shutdown,
1426 	.pr_sockaddr =		in_getsockaddr,
1427 	.pr_sosetlabel =	in_pcbsosetlabel,
1428 	.pr_close =		tcp_usr_close,
1429 };
1430 #endif /* INET */
1431 
1432 #ifdef INET6
1433 struct protosw tcp6_protosw = {
1434 	.pr_type =		SOCK_STREAM,
1435 	.pr_protocol =		IPPROTO_TCP,
1436 	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
1437 				    PR_CAPATTACH,
1438 	.pr_ctloutput =		tcp_ctloutput,
1439 	.pr_abort =		tcp_usr_abort,
1440 	.pr_accept =		tcp6_usr_accept,
1441 	.pr_attach =		tcp_usr_attach,
1442 	.pr_bind =		tcp6_usr_bind,
1443 	.pr_connect =		tcp6_usr_connect,
1444 	.pr_control =		in6_control,
1445 	.pr_detach =		tcp_usr_detach,
1446 	.pr_disconnect =	tcp_usr_disconnect,
1447 	.pr_listen =		tcp6_usr_listen,
1448 	.pr_peeraddr =		in6_mapped_peeraddr,
1449 	.pr_rcvd =		tcp_usr_rcvd,
1450 	.pr_rcvoob =		tcp_usr_rcvoob,
1451 	.pr_send =		tcp_usr_send,
1452 	.pr_sendfile_wait =	sendfile_wait_generic,
1453 	.pr_ready =		tcp_usr_ready,
1454 	.pr_shutdown =		tcp_usr_shutdown,
1455 	.pr_sockaddr =		in6_mapped_sockaddr,
1456 	.pr_sosetlabel =	in_pcbsosetlabel,
1457 	.pr_close =		tcp_usr_close,
1458 };
1459 #endif /* INET6 */
1460 
1461 #ifdef INET
1462 /*
1463  * Common subroutine to open a TCP connection to remote host specified
1464  * by struct sockaddr_in.  Call in_pcbconnect() to choose local host address
1465  * and assign a local port number and install the inpcb into the hash.
1466  * Initialize connection parameters and enter SYN-SENT state.
1467  */
1468 static int
1469 tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td)
1470 {
1471 	struct inpcb *inp = tptoinpcb(tp);
1472 	struct socket *so = tptosocket(tp);
1473 	int error;
1474 
1475 	NET_EPOCH_ASSERT();
1476 	INP_WLOCK_ASSERT(inp);
1477 
1478 	if (__predict_false((so->so_state &
1479 	    (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
1480 	    SS_ISDISCONNECTED)) != 0))
1481 		return (EISCONN);
1482 	if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
1483 		return (EOPNOTSUPP);
1484 
1485 	INP_HASH_WLOCK(&V_tcbinfo);
1486 	error = in_pcbconnect(inp, sin, td->td_ucred);
1487 	INP_HASH_WUNLOCK(&V_tcbinfo);
1488 	if (error != 0)
1489 		return (error);
1490 
1491 	/* set the hash on the connection */
1492 	rss_proto_software_hash_v4(inp->inp_faddr, inp->inp_laddr,
1493 	    inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
1494 	    &inp->inp_flowid, &inp->inp_flowtype);
1495 	/*
1496 	 * Compute window scaling to request:
1497 	 * Scale to fit into sweet spot.  See tcp_syncache.c.
1498 	 * XXX: This should move to tcp_output().
1499 	 */
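	/*
	 * Worked example (illustrative, assuming the default
	 * kern.ipc.maxsockbuf of 2MB): TCP_MAXWIN is 65535, and
	 * 65535 << 5 is still 32 bytes short of sb_max while
	 * 65535 << 6 is not, so the loop settles on
	 * request_r_scale = 6.
	 */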
1500 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1501 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
1502 		tp->request_r_scale++;
1503 
1504 	soisconnecting(so);
1505 	TCPSTAT_INC(tcps_connattempt);
1506 	tcp_state_change(tp, TCPS_SYN_SENT);
1507 	tp->iss = tcp_new_isn(&inp->inp_inc);
1508 	if (tp->t_flags & TF_REQ_TSTMP)
1509 		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
1510 	tcp_sendseqinit(tp);
1511 
1512 	return (0);
1513 }
1514 #endif /* INET */
1515 
1516 #ifdef INET6
1517 static int
1518 tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td)
1519 {
1520 	struct inpcb *inp = tptoinpcb(tp);
1521 	struct socket *so = tptosocket(tp);
1522 	int error;
1523 
1524 	NET_EPOCH_ASSERT();
1525 	INP_WLOCK_ASSERT(inp);
1526 
1527 	if (__predict_false((so->so_state &
1528 	    (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
1529 	    SS_ISDISCONNECTED)) != 0))
1530 		return (EISCONN);
1531 	if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
1532 		return (EOPNOTSUPP);
1533 
1534 	INP_HASH_WLOCK(&V_tcbinfo);
1535 	error = in6_pcbconnect(inp, sin6, td->td_ucred, true);
1536 	INP_HASH_WUNLOCK(&V_tcbinfo);
1537 	if (error != 0)
1538 		return (error);
1539 
1540 	/* set the hash on the connection */
1541 	rss_proto_software_hash_v6(&inp->in6p_faddr,
1542 	    &inp->in6p_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
1543 	    &inp->inp_flowid, &inp->inp_flowtype);
1544 	/* Compute window scaling to request.  */
1545 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
1546 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
1547 		tp->request_r_scale++;
1548 
1549 	soisconnecting(so);
1550 	TCPSTAT_INC(tcps_connattempt);
1551 	tcp_state_change(tp, TCPS_SYN_SENT);
1552 	tp->iss = tcp_new_isn(&inp->inp_inc);
1553 	if (tp->t_flags & TF_REQ_TSTMP)
1554 		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
1555 	tcp_sendseqinit(tp);
1556 
1557 	return (0);
1558 }
1559 #endif /* INET6 */
1560 
1561 /*
1562  * Export TCP internal state information via a struct tcp_info, based on the
1563  * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
1564  * (TCP state machine, etc).  We export all information using FreeBSD-native
1565  * constants -- for example, the numeric values for tcpi_state will differ
1566  * from Linux.
1567  */
1568 void
1569 tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti)
1570 {
1571 
1572 	INP_LOCK_ASSERT(tptoinpcb(tp));
1573 	bzero(ti, sizeof(*ti));
1574 
1575 	ti->tcpi_state = tp->t_state;
1576 	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1577 		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1578 	if (tp->t_flags & TF_SACK_PERMIT)
1579 		ti->tcpi_options |= TCPI_OPT_SACK;
1580 	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1581 		ti->tcpi_options |= TCPI_OPT_WSCALE;
1582 		ti->tcpi_snd_wscale = tp->snd_scale;
1583 		ti->tcpi_rcv_wscale = tp->rcv_scale;
1584 	}
1585 	switch (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
1586 		case TF2_ECN_PERMIT:
1587 			ti->tcpi_options |= TCPI_OPT_ECN;
1588 			break;
1589 		case TF2_ACE_PERMIT:
1590 			/* FALLTHROUGH */
1591 		case TF2_ECN_PERMIT | TF2_ACE_PERMIT:
1592 			ti->tcpi_options |= TCPI_OPT_ACE;
1593 			break;
1594 		default:
1595 			break;
1596 	}
1597 	if (tp->t_flags & TF_FASTOPEN)
1598 		ti->tcpi_options |= TCPI_OPT_TFO;
1599 
1600 	ti->tcpi_rto = tp->t_rxtcur * tick;
1601 	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
1602 	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
1603 	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
1604 
1605 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1606 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
1607 
1608 	/*
1609 	 * FreeBSD-specific extension fields for tcp_info.
1610 	 */
1611 	ti->tcpi_rcv_space = tp->rcv_wnd;
1612 	ti->tcpi_rcv_nxt = tp->rcv_nxt;
1613 	ti->tcpi_snd_wnd = tp->snd_wnd;
1614 	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
1615 	ti->tcpi_snd_nxt = tp->snd_nxt;
1616 	ti->tcpi_snd_mss = tp->t_maxseg;
1617 	ti->tcpi_rcv_mss = tp->t_maxseg;
1618 	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
1619 	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
1620 	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
1621 	ti->tcpi_snd_una = tp->snd_una;
1622 	ti->tcpi_snd_max = tp->snd_max;
1623 	ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
1624 	ti->tcpi_rcv_adv = tp->rcv_adv;
1625 	ti->tcpi_dupacks = tp->t_dupacks;
1626 	ti->tcpi_rttmin = tp->t_rttlow;
1627 #ifdef TCP_OFFLOAD
1628 	if (tp->t_flags & TF_TOE) {
1629 		ti->tcpi_options |= TCPI_OPT_TOE;
1630 		tcp_offload_tcp_info(tp, ti);
1631 	}
1632 #endif
1633 	/*
1634 	 * AccECN related counters.
1635 	 */
1636 	if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
1637 	    (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
1638 		/*
1639 		 * Internal counter starts at 5 for AccECN
1640 		 * but 0 for RFC3168 ECN.
1641 		 */
1642 		ti->tcpi_delivered_ce = tp->t_scep - 5;
1643 	else
1644 		ti->tcpi_delivered_ce = tp->t_scep;
1645 	ti->tcpi_received_ce = tp->t_rcep;
1646 }
1647 
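/*
 * Illustrative userland sketch (hypothetical, for clarity): this structure
 * is read with the TCP_INFO socket option, e.g.:
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len) == -1)
 *		err(1, "getsockopt");
 *	printf("rtt %u us, cwnd %u bytes\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 *
 * Field names match <netinet/tcp.h>; tcpi_rtt and tcpi_rttvar are reported
 * in microseconds as computed above.
 */
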
1648 /*
1649  * tcp_ctloutput() must drop the inpcb lock before performing copyin on
1650  * socket option arguments.  When it re-acquires the lock after the copy, it
1651  * has to revalidate that the connection is still valid for the socket
1652  * option.
1653  */
1654 #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {			\
1655 	INP_WLOCK(inp);							\
1656 	if (inp->inp_flags & INP_DROPPED) {				\
1657 		INP_WUNLOCK(inp);					\
1658 		cleanup;						\
1659 		return (ECONNRESET);					\
1660 	}								\
1661 	tp = intotcpcb(inp);						\
1662 } while(0)
1663 #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
1664 
1665 int
1666 tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
1667 {
1668 	struct socket *so = inp->inp_socket;
1669 	struct tcpcb *tp = intotcpcb(inp);
1670 	int error = 0;
1671 
1672 	MPASS(sopt->sopt_dir == SOPT_SET);
1673 	INP_WLOCK_ASSERT(inp);
1674 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1675 	    ("inp_flags == %x", inp->inp_flags));
1676 	KASSERT(so != NULL, ("inp_socket == NULL"));
1677 
1678 	if (sopt->sopt_level != IPPROTO_TCP) {
1679 		INP_WUNLOCK(inp);
1680 #ifdef INET6
1681 		if (inp->inp_vflag & INP_IPV6PROTO)
1682 			error = ip6_ctloutput(so, sopt);
1683 #endif
1684 #if defined(INET6) && defined(INET)
1685 		else
1686 #endif
1687 #ifdef INET
1688 			error = ip_ctloutput(so, sopt);
1689 #endif
1690 		/*
1691 		 * When an IP-level socket option affects TCP, pass control
1692 		 * down to the stack's tfb_tcp_ctloutput, otherwise return
1693 		 * what the IP level returned.
1694 		 */
1695 		switch (sopt->sopt_level) {
1696 #ifdef INET6
1697 		case IPPROTO_IPV6:
1698 			if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
1699 				return (error);
1700 			switch (sopt->sopt_name) {
1701 			case IPV6_TCLASS:
1702 				/* Notify tcp stacks that care (e.g. RACK). */
1703 				break;
1704 			case IPV6_USE_MIN_MTU:
1705 				/* Update t_maxseg accordingly. */
1706 				break;
1707 			default:
1708 				return (error);
1709 			}
1710 			break;
1711 #endif
1712 #ifdef INET
1713 		case IPPROTO_IP:
1714 			switch (sopt->sopt_name) {
1715 			case IP_TOS:
1716 				inp->inp_ip_tos &= ~IPTOS_ECN_MASK;
1717 				break;
1718 			case IP_TTL:
1719 				/* Notify tcp stacks that care (e.g. RACK). */
1720 				break;
1721 			default:
1722 				return (error);
1723 			}
1724 			break;
1725 #endif
1726 		default:
1727 			return (error);
1728 		}
1729 		INP_WLOCK_RECHECK(inp);
1730 	} else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
1731 		/*
1732 		 * Protect the TCP option TCP_FUNCTION_BLK so
1733 		 * that a sub-function can *never* overwrite this.
1734 		 */
1735 		struct tcp_function_set fsn;
1736 		struct tcp_function_block *blk;
1737 		void *ptr = NULL;
1738 
1739 		INP_WUNLOCK(inp);
1740 		error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
1741 		if (error)
1742 			return (error);
1743 
1744 		INP_WLOCK_RECHECK(inp);
1745 
1746 		blk = find_and_ref_tcp_functions(&fsn);
1747 		if (blk == NULL) {
1748 			INP_WUNLOCK(inp);
1749 			return (ENOENT);
1750 		}
1751 		if (tp->t_fb == blk) {
1752 			/* You already have this */
1753 			refcount_release(&blk->tfb_refcnt);
1754 			INP_WUNLOCK(inp);
1755 			return (0);
1756 		}
1757 		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
1758 			refcount_release(&blk->tfb_refcnt);
1759 			INP_WUNLOCK(inp);
1760 			return (ENOENT);
1761 		}
1762 		error = (*blk->tfb_tcp_handoff_ok)(tp);
1763 		if (error) {
1764 			refcount_release(&blk->tfb_refcnt);
1765 			INP_WUNLOCK(inp);
1766 			return (error);
1767 		}
1768 		/*
1769 		 * Ensure the new stack takes ownership with a
1770 		 * clean slate on peak rate threshold.
1771 		 */
1772 		if (tp->t_fb->tfb_tcp_timer_stop_all != NULL)
1773 			tp->t_fb->tfb_tcp_timer_stop_all(tp);
1774 		if (blk->tfb_tcp_fb_init) {
1775 			error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
1776 			if (error) {
1777 				/*
1778 				 * Release the ref count the lookup
1779 				 * acquired.
1780 				 */
1781 				refcount_release(&blk->tfb_refcnt);
1782 				/*
1783 				 * Now there is a chance that the
1784 				 * init() function mucked with some
1785 				 * things before it failed, such as
1786 				 * hpts or inp_flags2 or timer granularity.
1787 				 * It should not have, but let's give the old
1788 				 * stack a chance to reset to a known good state.
1789 				 */
1790 				if (tp->t_fb->tfb_switch_failed) {
1791 					(*tp->t_fb->tfb_switch_failed)(tp);
1792 				}
1793 			 	goto err_out;
1794 			}
1795 		}
1796 		if (tp->t_fb->tfb_tcp_fb_fini) {
1797 			struct epoch_tracker et;
1798 			/*
1799 			 * Tell the stack to cleanup with 0 i.e.
1800 			 * Tell the stack to clean up with 0, i.e.,
1801 			 * the tcb is not going away.
1802 			NET_EPOCH_ENTER(et);
1803 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
1804 			NET_EPOCH_EXIT(et);
1805 		}
1806 		/*
1807 		 * Release the old refcnt, the
1808 		 * lookup acquired a ref on the
1809 		 * new one already.
1810 		 */
1811 		refcount_release(&tp->t_fb->tfb_refcnt);
1812 		/*
1813 		 * Set in the new stack.
1814 		 */
1815 		tp->t_fb = blk;
1816 		tp->t_fb_ptr = ptr;
1817 #ifdef TCP_OFFLOAD
1818 		if (tp->t_flags & TF_TOE) {
1819 			tcp_offload_ctloutput(tp, sopt->sopt_dir,
1820 			     sopt->sopt_name);
1821 		}
1822 #endif
1823 err_out:
1824 		INP_WUNLOCK(inp);
1825 		return (error);
1826 
1827 	}
1828 
1829 	/* Pass in the INP locked, callee must unlock it. */
1830 	return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt));
1831 }
1832 
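/*
 * Illustrative userland sketch (hypothetical, for clarity): the
 * TCP_FUNCTION_BLK handling above is what a per-connection stack switch
 * goes through, e.g. moving one connection to the RACK stack if it is
 * loaded:
 *
 *	struct tcp_function_set fsn = { .function_set_name = "rack" };
 *	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *	    &fsn, sizeof(fsn)) == -1)
 *		err(1, "setsockopt");
 *
 * ENOENT means the named stack is not registered; the handoff may also be
 * refused by the target stack's tfb_tcp_handoff_ok() callback.
 */
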
1833 static int
1834 tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
1835 {
1836 	struct socket *so = inp->inp_socket;
1837 	struct tcpcb *tp = intotcpcb(inp);
1838 	int error = 0;
1839 
1840 	MPASS(sopt->sopt_dir == SOPT_GET);
1841 	INP_WLOCK_ASSERT(inp);
1842 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
1843 	    ("inp_flags == %x", inp->inp_flags));
1844 	KASSERT(so != NULL, ("inp_socket == NULL"));
1845 
1846 	if (sopt->sopt_level != IPPROTO_TCP) {
1847 		INP_WUNLOCK(inp);
1848 #ifdef INET6
1849 		if (inp->inp_vflag & INP_IPV6PROTO)
1850 			error = ip6_ctloutput(so, sopt);
1851 #endif /* INET6 */
1852 #if defined(INET6) && defined(INET)
1853 		else
1854 #endif
1855 #ifdef INET
1856 			error = ip_ctloutput(so, sopt);
1857 #endif
1858 		return (error);
1859 	}
1860 	if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
1861 	     (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
1862 		struct tcp_function_set fsn;
1863 
1864 		if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
1865 			memset(&fsn, 0, sizeof(fsn));
1866 			find_tcp_function_alias(tp->t_fb, &fsn);
1867 		} else {
1868 			strncpy(fsn.function_set_name,
1869 			    tp->t_fb->tfb_tcp_block_name,
1870 			    TCP_FUNCTION_NAME_LEN_MAX);
1871 			fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
1872 		}
1873 		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
1874 		INP_WUNLOCK(inp);
1875 		error = sooptcopyout(sopt, &fsn, sizeof fsn);
1876 		return (error);
1877 	}
1878 
1879 	/* Pass in the INP locked, callee must unlock it. */
1880 	return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt));
1881 }
1882 
1883 int
1884 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1885 {
1886 	struct	inpcb *inp;
1887 
1888 	inp = sotoinpcb(so);
1889 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
1890 
1891 	INP_WLOCK(inp);
1892 	if (inp->inp_flags & INP_DROPPED) {
1893 		INP_WUNLOCK(inp);
1894 		return (ECONNRESET);
1895 	}
1896 	if (sopt->sopt_dir == SOPT_SET)
1897 		return (tcp_ctloutput_set(inp, sopt));
1898 	else if (sopt->sopt_dir == SOPT_GET)
1899 		return (tcp_ctloutput_get(inp, sopt));
1900 	else
1901 		panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
1902 }
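/*
 * Illustrative userland usage of TCP_FUNCTION_BLK (a minimal sketch, not
 * part of this file; it assumes the struct tcp_function_set definition
 * exported through <netinet/tcp.h>).  A process selects an alternate TCP
 * stack for a socket like this:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	static int
 *	select_tcp_stack(int s, const char *name)
 *	{
 *		struct tcp_function_set fs;
 *
 *		memset(&fs, 0, sizeof(fs));
 *		strlcpy(fs.function_set_name, name,
 *		    sizeof(fs.function_set_name));
 *		return (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &fs, sizeof(fs)));
 *	}
 *
 * The same option (or TCP_FUNCTION_ALIAS) can be read back with
 * getsockopt(); pcbcnt then reports how many connections currently use
 * the returned stack.
 */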
1903 
1904 /*
1905  * If this assert becomes untrue, we need to change the size of the buf
1906  * variable in tcp_default_ctloutput().
1907  */
1908 #ifdef CTASSERT
1909 CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
1910 CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
1911 #endif
1912 
1913 extern struct cc_algo newreno_cc_algo;
1914 
1915 static int
1916 tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
1917 {
1918 	struct cc_algo *algo;
1919 	void *ptr = NULL;
1920 	struct tcpcb *tp;
1921 	struct cc_var cc_mem;
1922 	char	buf[TCP_CA_NAME_MAX];
1923 	size_t mem_sz;
1924 	int error;
1925 
1926 	INP_WUNLOCK(inp);
1927 	error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
1928 	if (error)
1929 		return (error);
1930 	buf[sopt->sopt_valsize] = '\0';
1931 	CC_LIST_RLOCK();
1932 	STAILQ_FOREACH(algo, &cc_list, entries) {
1933 		if (strncmp(buf, algo->name,
1934 			    TCP_CA_NAME_MAX) == 0) {
1935 			if (algo->flags & CC_MODULE_BEING_REMOVED) {
1936 				/* We can't "see" modules being unloaded */
1937 				continue;
1938 			}
1939 			break;
1940 		}
1941 	}
1942 	if (algo == NULL) {
1943 		CC_LIST_RUNLOCK();
1944 		return (ESRCH);
1945 	}
1946 	/*
1947 	 * With a reference the algorithm cannot be removed
1948 	 * so we hold a reference through the change process.
1949 	 */
1950 	cc_refer(algo);
1951 	CC_LIST_RUNLOCK();
1952 	if (algo->cb_init != NULL) {
1953 		/* We can now pre-get the memory for the CC */
1954 		mem_sz = (*algo->cc_data_sz)();
1955 		if (mem_sz == 0) {
1956 			goto no_mem_needed;
1957 		}
1958 		ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
1959 	} else {
1960 no_mem_needed:
1961 		mem_sz = 0;
1962 		ptr = NULL;
1963 	}
1964 	/*
1965 	 * Make sure it's all clean and zeroed, and also get
1966 	 * back the INP lock.
1967 	 */
1968 	memset(&cc_mem, 0, sizeof(cc_mem));
1969 	INP_WLOCK(inp);
1970 	if (inp->inp_flags & INP_DROPPED) {
1971 		INP_WUNLOCK(inp);
1972 		if (ptr)
1973 			free(ptr, M_CC_MEM);
1974 		/* Release our temp reference */
1975 		CC_LIST_RLOCK();
1976 		cc_release(algo);
1977 		CC_LIST_RUNLOCK();
1978 		return (ECONNRESET);
1979 	}
1980 	tp = intotcpcb(inp);
1981 	if (ptr != NULL)
1982 		memset(ptr, 0, mem_sz);
1983 	cc_mem.tp = tp;
1984 	/*
1985 	 * We once again hold a write lock over the tcb so it's
1986 	 * safe to do these things without ordering concerns.
1987 	 * Note here we init into stack memory.
1988 	 */
1989 	if (algo->cb_init != NULL)
1990 		error = algo->cb_init(&cc_mem, ptr);
1991 	else
1992 		error = 0;
1993 	/*
1994 	 * The CC algorithms, when given their memory,
1995 	 * should not fail; we could in theory have a
1996 	 * KASSERT here.
1997 	 */
1998 	if (error == 0) {
1999 		/*
2000 		 * Touchdown, let's go ahead and move the
2001 		 * connection to the new CC module by
2002 		 * copying in the cc_mem after we call
2003 		 * the old one's cleanup (if any).
2004 		 */
2005 		if (CC_ALGO(tp)->cb_destroy != NULL)
2006 			CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
2007 		/* Detach the old CC from the tcpcb  */
2008 		cc_detach(tp);
2009 		/* Copy in our temp memory that was inited */
2010 		memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var));
2011 		/* Now attach the new, which takes a reference */
2012 		cc_attach(tp, algo);
2013 		/* Ok, have we already gotten past any conn_init? */
2014 		if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
2015 			/* Yep run the connection init for the new CC */
2016 			CC_ALGO(tp)->conn_init(&tp->t_ccv);
2017 		}
2018 	} else if (ptr)
2019 		free(ptr, M_CC_MEM);
2020 	INP_WUNLOCK(inp);
2021 	/* Now let's release our temp reference */
2022 	CC_LIST_RLOCK();
2023 	cc_release(algo);
2024 	CC_LIST_RUNLOCK();
2025 	return (error);
2026 }
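/*
 * Illustrative userland counterpart of the path above (a minimal sketch,
 * assuming only the standard TCP_CONGESTION option and TCP_CA_NAME_MAX
 * from <netinet/tcp.h>).  Switching a socket to another congestion
 * control module and reading back the active name:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	static int
 *	set_cc(int s, const char *name)
 *	{
 *		return (setsockopt(s, IPPROTO_TCP, TCP_CONGESTION,
 *		    name, strlen(name)));
 *	}
 *
 *	static int
 *	get_cc(int s, char buf[TCP_CA_NAME_MAX])
 *	{
 *		socklen_t len = TCP_CA_NAME_MAX;
 *
 *		return (getsockopt(s, IPPROTO_TCP, TCP_CONGESTION,
 *		    buf, &len));
 *	}
 *
 * The module must already be loaded (e.g. cc_cubic.ko); otherwise the
 * lookup above fails and the set returns ESRCH.
 */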
2027 
2028 int
2029 tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
2030 {
2031 	struct inpcb *inp = tptoinpcb(tp);
2032 	int	error, opt, optval;
2033 	u_int	ui;
2034 	struct	tcp_info ti;
2035 #ifdef KERN_TLS
2036 	struct tls_enable tls;
2037 	struct socket *so = inp->inp_socket;
2038 #endif
2039 	char	*pbuf, buf[TCP_LOG_ID_LEN];
2040 #ifdef STATS
2041 	struct statsblob *sbp;
2042 #endif
2043 	size_t	len;
2044 
2045 	INP_WLOCK_ASSERT(inp);
2046 	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
2047 	    ("inp_flags == %x", inp->inp_flags));
2048 	KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL"));
2049 
2050 	switch (sopt->sopt_level) {
2051 #ifdef INET6
2052 	case IPPROTO_IPV6:
2053 		MPASS(inp->inp_vflag & INP_IPV6PROTO);
2054 		switch (sopt->sopt_name) {
2055 		case IPV6_USE_MIN_MTU:
2056 			tcp6_use_min_mtu(tp);
2057 			/* FALLTHROUGH */
2058 		}
2059 		INP_WUNLOCK(inp);
2060 		return (0);
2061 #endif
2062 #ifdef INET
2063 	case IPPROTO_IP:
2064 		INP_WUNLOCK(inp);
2065 		return (0);
2066 #endif
2067 	}
2068 
2069 	/*
2070 	 * For TCP_CCALGOOPT forward the control to CC module, for both
2071 	 * SOPT_SET and SOPT_GET.
2072 	 */
2073 	switch (sopt->sopt_name) {
2074 	case TCP_CCALGOOPT:
2075 		INP_WUNLOCK(inp);
2076 		if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT)
2077 			return (EINVAL);
2078 		pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
2079 		error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
2080 		    sopt->sopt_valsize);
2081 		if (error) {
2082 			free(pbuf, M_TEMP);
2083 			return (error);
2084 		}
2085 		INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
2086 		if (CC_ALGO(tp)->ctl_output != NULL)
2087 			error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, sopt, pbuf);
2088 		else
2089 			error = ENOENT;
2090 		INP_WUNLOCK(inp);
2091 		if (error == 0 && sopt->sopt_dir == SOPT_GET)
2092 			error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
2093 		free(pbuf, M_TEMP);
2094 		return (error);
2095 	}
2096 
2097 	switch (sopt->sopt_dir) {
2098 	case SOPT_SET:
2099 		switch (sopt->sopt_name) {
2100 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2101 		case TCP_MD5SIG:
2102 			INP_WUNLOCK(inp);
2103 			if (!TCPMD5_ENABLED())
2104 				return (ENOPROTOOPT);
2105 			error = TCPMD5_PCBCTL(inp, sopt);
2106 			if (error)
2107 				return (error);
2108 			INP_WLOCK_RECHECK(inp);
2109 			goto unlock_and_done;
2110 #endif /* IPSEC */
2111 
2112 		case TCP_NODELAY:
2113 		case TCP_NOOPT:
2114 			INP_WUNLOCK(inp);
2115 			error = sooptcopyin(sopt, &optval, sizeof optval,
2116 			    sizeof optval);
2117 			if (error)
2118 				return (error);
2119 
2120 			INP_WLOCK_RECHECK(inp);
2121 			switch (sopt->sopt_name) {
2122 			case TCP_NODELAY:
2123 				opt = TF_NODELAY;
2124 				break;
2125 			case TCP_NOOPT:
2126 				opt = TF_NOOPT;
2127 				break;
2128 			default:
2129 				opt = 0; /* dead code to fool gcc */
2130 				break;
2131 			}
2132 
2133 			if (optval)
2134 				tp->t_flags |= opt;
2135 			else
2136 				tp->t_flags &= ~opt;
2137 unlock_and_done:
2138 #ifdef TCP_OFFLOAD
2139 			if (tp->t_flags & TF_TOE) {
2140 				tcp_offload_ctloutput(tp, sopt->sopt_dir,
2141 				    sopt->sopt_name);
2142 			}
2143 #endif
2144 			INP_WUNLOCK(inp);
2145 			break;
2146 
2147 		case TCP_NOPUSH:
2148 			INP_WUNLOCK(inp);
2149 			error = sooptcopyin(sopt, &optval, sizeof optval,
2150 			    sizeof optval);
2151 			if (error)
2152 				return (error);
2153 
2154 			INP_WLOCK_RECHECK(inp);
2155 			if (optval)
2156 				tp->t_flags |= TF_NOPUSH;
2157 			else if (tp->t_flags & TF_NOPUSH) {
2158 				tp->t_flags &= ~TF_NOPUSH;
2159 				if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2160 					struct epoch_tracker et;
2161 
2162 					NET_EPOCH_ENTER(et);
2163 					error = tcp_output_nodrop(tp);
2164 					NET_EPOCH_EXIT(et);
2165 				}
2166 			}
2167 			goto unlock_and_done;
2168 
2169 		case TCP_REMOTE_UDP_ENCAPS_PORT:
2170 			INP_WUNLOCK(inp);
2171 			error = sooptcopyin(sopt, &optval, sizeof optval,
2172 			    sizeof optval);
2173 			if (error)
2174 				return (error);
2175 			if ((optval < TCP_TUNNELING_PORT_MIN) ||
2176 			    (optval > TCP_TUNNELING_PORT_MAX)) {
2177 				/* It has to be in range */
2178 				return (EINVAL);
2179 			}
2180 			if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) {
2181 				/* You have to have enabled a UDP tunneling port first */
2182 				return (EINVAL);
2183 			}
2184 			INP_WLOCK_RECHECK(inp);
2185 			if (tp->t_state != TCPS_CLOSED) {
2186 				/* You can't change after you are connected */
2187 				error = EINVAL;
2188 			} else {
2189 				/* Ok, we are all good; set the port */
2190 				tp->t_port = htons(optval);
2191 			}
2192 			goto unlock_and_done;
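			/*
			 * Illustrative usage (a sketch, assuming the
			 * net.inet.tcp.udp_tunneling_port sysctl has already
			 * been set to a nonzero value): the peer's UDP
			 * encapsulation port is passed as a plain int in host
			 * byte order before connect(2), e.g.
			 *
			 *	int port = 9899;
			 *
			 *	setsockopt(s, IPPROTO_TCP,
			 *	    TCP_REMOTE_UDP_ENCAPS_PORT,
			 *	    &port, sizeof(port));
			 */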
2193 
2194 		case TCP_MAXSEG:
2195 			INP_WUNLOCK(inp);
2196 			error = sooptcopyin(sopt, &optval, sizeof optval,
2197 			    sizeof optval);
2198 			if (error)
2199 				return (error);
2200 
2201 			INP_WLOCK_RECHECK(inp);
2202 			if (optval > 0 && optval <= tp->t_maxseg &&
2203 			    optval + 40 >= V_tcp_minmss) {
2204 				tp->t_maxseg = optval;
2205 				if (tp->t_maxseg < V_tcp_mssdflt) {
2206 					/*
2207 					 * The MSS is so small we should not process incoming
2208 					 * SACKs since we are subject to attack in such a
2209 					 * case.
2210 					 */
2211 					tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
2212 				} else {
2213 					tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
2214 				}
2215 			} else
2216 				error = EINVAL;
2217 			goto unlock_and_done;
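			/*
			 * Illustrative usage (a sketch): TCP_MAXSEG can only
			 * lower the effective MSS, e.g.
			 *
			 *	int mss = 1200;
			 *
			 *	setsockopt(s, IPPROTO_TCP, TCP_MAXSEG,
			 *	    &mss, sizeof(mss));
			 *
			 * Values above the current t_maxseg, or smaller than
			 * V_tcp_minmss - 40, are rejected with EINVAL.
			 */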
2218 
2219 		case TCP_INFO:
2220 			INP_WUNLOCK(inp);
2221 			error = EINVAL;
2222 			break;
2223 
2224 		case TCP_STATS:
2225 			INP_WUNLOCK(inp);
2226 #ifdef STATS
2227 			error = sooptcopyin(sopt, &optval, sizeof optval,
2228 			    sizeof optval);
2229 			if (error)
2230 				return (error);
2231 
2232 			if (optval > 0)
2233 				sbp = stats_blob_alloc(
2234 				    V_tcp_perconn_stats_dflt_tpl, 0);
2235 			else
2236 				sbp = NULL;
2237 
2238 			INP_WLOCK_RECHECK(inp);
2239 			if ((tp->t_stats != NULL && sbp == NULL) ||
2240 			    (tp->t_stats == NULL && sbp != NULL)) {
2241 				struct statsblob *t = tp->t_stats;
2242 				tp->t_stats = sbp;
2243 				sbp = t;
2244 			}
2245 			INP_WUNLOCK(inp);
2246 
2247 			stats_blob_destroy(sbp);
2248 #else
2249 			return (EOPNOTSUPP);
2250 #endif /* !STATS */
2251 			break;
2252 
2253 		case TCP_CONGESTION:
2254 			error = tcp_set_cc_mod(inp, sopt);
2255 			break;
2256 
2257 		case TCP_REUSPORT_LB_NUMA:
2258 			INP_WUNLOCK(inp);
2259 			error = sooptcopyin(sopt, &optval, sizeof(optval),
2260 			    sizeof(optval));
2261 			INP_WLOCK_RECHECK(inp);
2262 			if (!error)
2263 				error = in_pcblbgroup_numa(inp, optval);
2264 			INP_WUNLOCK(inp);
2265 			break;
2266 
2267 #ifdef KERN_TLS
2268 		case TCP_TXTLS_ENABLE:
2269 			INP_WUNLOCK(inp);
2270 			error = ktls_copyin_tls_enable(sopt, &tls);
2271 			if (error != 0)
2272 				break;
2273 			error = ktls_enable_tx(so, &tls);
2274 			ktls_cleanup_tls_enable(&tls);
2275 			break;
2276 		case TCP_TXTLS_MODE:
2277 			INP_WUNLOCK(inp);
2278 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2279 			if (error != 0)
2280 				return (error);
2281 
2282 			INP_WLOCK_RECHECK(inp);
2283 			error = ktls_set_tx_mode(so, ui);
2284 			INP_WUNLOCK(inp);
2285 			break;
2286 		case TCP_RXTLS_ENABLE:
2287 			INP_WUNLOCK(inp);
2288 			error = ktls_copyin_tls_enable(sopt, &tls);
2289 			if (error != 0)
2290 				break;
2291 			error = ktls_enable_rx(so, &tls);
2292 			ktls_cleanup_tls_enable(&tls);
2293 			break;
2294 #endif
2295 		case TCP_MAXUNACKTIME:
2296 		case TCP_KEEPIDLE:
2297 		case TCP_KEEPINTVL:
2298 		case TCP_KEEPINIT:
2299 			INP_WUNLOCK(inp);
2300 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2301 			if (error)
2302 				return (error);
2303 
2304 			if (ui > (UINT_MAX / hz)) {
2305 				error = EINVAL;
2306 				break;
2307 			}
2308 			ui *= hz;
2309 
2310 			INP_WLOCK_RECHECK(inp);
2311 			switch (sopt->sopt_name) {
2312 			case TCP_MAXUNACKTIME:
2313 				tp->t_maxunacktime = ui;
2314 				break;
2315 
2316 			case TCP_KEEPIDLE:
2317 				tp->t_keepidle = ui;
2318 				/*
2319 				 * XXX: better check current remaining
2320 				 * timeout and "merge" it with new value.
2321 				 */
2322 				if ((tp->t_state > TCPS_LISTEN) &&
2323 				    (tp->t_state <= TCPS_CLOSING))
2324 					tcp_timer_activate(tp, TT_KEEP,
2325 					    TP_KEEPIDLE(tp));
2326 				break;
2327 			case TCP_KEEPINTVL:
2328 				tp->t_keepintvl = ui;
2329 				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2330 				    (TP_MAXIDLE(tp) > 0))
2331 					tcp_timer_activate(tp, TT_2MSL,
2332 					    TP_MAXIDLE(tp));
2333 				break;
2334 			case TCP_KEEPINIT:
2335 				tp->t_keepinit = ui;
2336 				if (tp->t_state == TCPS_SYN_RECEIVED ||
2337 				    tp->t_state == TCPS_SYN_SENT)
2338 					tcp_timer_activate(tp, TT_KEEP,
2339 					    TP_KEEPINIT(tp));
2340 				break;
2341 			}
2342 			goto unlock_and_done;
2343 
2344 		case TCP_KEEPCNT:
2345 			INP_WUNLOCK(inp);
2346 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2347 			if (error)
2348 				return (error);
2349 
2350 			INP_WLOCK_RECHECK(inp);
2351 			tp->t_keepcnt = ui;
2352 			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2353 			    (TP_MAXIDLE(tp) > 0))
2354 				tcp_timer_activate(tp, TT_2MSL,
2355 				    TP_MAXIDLE(tp));
2356 			goto unlock_and_done;
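			/*
			 * Illustrative userland usage of the keep-alive knobs
			 * above (a minimal sketch; values are in seconds and,
			 * as above, are converted to ticks internally):
			 *
			 *	u_int idle = 60, intvl = 10, cnt = 5;
			 *	int on = 1;
			 *
			 *	setsockopt(s, SOL_SOCKET, SO_KEEPALIVE,
			 *	    &on, sizeof(on));
			 *	setsockopt(s, IPPROTO_TCP, TCP_KEEPIDLE,
			 *	    &idle, sizeof(idle));
			 *	setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL,
			 *	    &intvl, sizeof(intvl));
			 *	setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT,
			 *	    &cnt, sizeof(cnt));
			 */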
2357 
2358 		case TCP_FASTOPEN: {
2359 			struct tcp_fastopen tfo_optval;
2360 
2361 			INP_WUNLOCK(inp);
2362 			if (!V_tcp_fastopen_client_enable &&
2363 			    !V_tcp_fastopen_server_enable)
2364 				return (EPERM);
2365 
2366 			error = sooptcopyin(sopt, &tfo_optval,
2367 				    sizeof(tfo_optval), sizeof(int));
2368 			if (error)
2369 				return (error);
2370 
2371 			INP_WLOCK_RECHECK(inp);
2372 			if ((tp->t_state != TCPS_CLOSED) &&
2373 			    (tp->t_state != TCPS_LISTEN)) {
2374 				error = EINVAL;
2375 				goto unlock_and_done;
2376 			}
2377 			if (tfo_optval.enable) {
2378 				if (tp->t_state == TCPS_LISTEN) {
2379 					if (!V_tcp_fastopen_server_enable) {
2380 						error = EPERM;
2381 						goto unlock_and_done;
2382 					}
2383 
2384 					if (tp->t_tfo_pending == NULL)
2385 						tp->t_tfo_pending =
2386 						    tcp_fastopen_alloc_counter();
2387 				} else {
2388 					/*
2389 					 * If a pre-shared key was provided,
2390 					 * stash it in the client cookie
2391 					 * field of the tcpcb for use during
2392 					 * connect.
2393 					 */
2394 					if (sopt->sopt_valsize ==
2395 					    sizeof(tfo_optval)) {
2396 						memcpy(tp->t_tfo_cookie.client,
2397 						       tfo_optval.psk,
2398 						       TCP_FASTOPEN_PSK_LEN);
2399 						tp->t_tfo_client_cookie_len =
2400 						    TCP_FASTOPEN_PSK_LEN;
2401 					}
2402 				}
2403 				tp->t_flags |= TF_FASTOPEN;
2404 			} else
2405 				tp->t_flags &= ~TF_FASTOPEN;
2406 			goto unlock_and_done;
2407 		}
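		/*
		 * Illustrative server-side usage of TCP_FASTOPEN (a sketch,
		 * assuming net.inet.tcp.fastopen.server_enable is nonzero;
		 * otherwise the set fails with EPERM as above):
		 *
		 *	int on = 1;
		 *
		 *	setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN,
		 *	    &on, sizeof(on));
		 *	listen(s, 128);
		 *
		 * A client may instead pass a struct tcp_fastopen carrying a
		 * pre-shared key in its psk field, as handled above.
		 */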
2408 
2409 #ifdef TCP_BLACKBOX
2410 		case TCP_LOG:
2411 			INP_WUNLOCK(inp);
2412 			error = sooptcopyin(sopt, &optval, sizeof optval,
2413 			    sizeof optval);
2414 			if (error)
2415 				return (error);
2416 
2417 			INP_WLOCK_RECHECK(inp);
2418 			error = tcp_log_state_change(tp, optval);
2419 			goto unlock_and_done;
2420 
2421 		case TCP_LOGBUF:
2422 			INP_WUNLOCK(inp);
2423 			error = EINVAL;
2424 			break;
2425 
2426 		case TCP_LOGID:
2427 			INP_WUNLOCK(inp);
2428 			error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
2429 			if (error)
2430 				break;
2431 			buf[sopt->sopt_valsize] = '\0';
2432 			INP_WLOCK_RECHECK(inp);
2433 			error = tcp_log_set_id(tp, buf);
2434 			/* tcp_log_set_id() unlocks the INP. */
2435 			break;
2436 
2437 		case TCP_LOGDUMP:
2438 		case TCP_LOGDUMPID:
2439 			INP_WUNLOCK(inp);
2440 			error =
2441 			    sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
2442 			if (error)
2443 				break;
2444 			buf[sopt->sopt_valsize] = '\0';
2445 			INP_WLOCK_RECHECK(inp);
2446 			if (sopt->sopt_name == TCP_LOGDUMP) {
2447 				error = tcp_log_dump_tp_logbuf(tp, buf,
2448 				    M_WAITOK, true);
2449 				INP_WUNLOCK(inp);
2450 			} else {
2451 				tcp_log_dump_tp_bucket_logbufs(tp, buf);
2452 				/*
2453 				 * tcp_log_dump_tp_bucket_logbufs() drops the
2454 				 * INP lock.
2455 				 */
2456 			}
2457 			break;
2458 #endif
2459 
2460 		default:
2461 			INP_WUNLOCK(inp);
2462 			error = ENOPROTOOPT;
2463 			break;
2464 		}
2465 		break;
2466 
2467 	case SOPT_GET:
2468 		tp = intotcpcb(inp);
2469 		switch (sopt->sopt_name) {
2470 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2471 		case TCP_MD5SIG:
2472 			INP_WUNLOCK(inp);
2473 			if (!TCPMD5_ENABLED())
2474 				return (ENOPROTOOPT);
2475 			error = TCPMD5_PCBCTL(inp, sopt);
2476 			break;
2477 #endif
2478 
2479 		case TCP_NODELAY:
2480 			optval = tp->t_flags & TF_NODELAY;
2481 			INP_WUNLOCK(inp);
2482 			error = sooptcopyout(sopt, &optval, sizeof optval);
2483 			break;
2484 		case TCP_MAXSEG:
2485 			optval = tp->t_maxseg;
2486 			INP_WUNLOCK(inp);
2487 			error = sooptcopyout(sopt, &optval, sizeof optval);
2488 			break;
2489 		case TCP_REMOTE_UDP_ENCAPS_PORT:
2490 			optval = ntohs(tp->t_port);
2491 			INP_WUNLOCK(inp);
2492 			error = sooptcopyout(sopt, &optval, sizeof optval);
2493 			break;
2494 		case TCP_NOOPT:
2495 			optval = tp->t_flags & TF_NOOPT;
2496 			INP_WUNLOCK(inp);
2497 			error = sooptcopyout(sopt, &optval, sizeof optval);
2498 			break;
2499 		case TCP_NOPUSH:
2500 			optval = tp->t_flags & TF_NOPUSH;
2501 			INP_WUNLOCK(inp);
2502 			error = sooptcopyout(sopt, &optval, sizeof optval);
2503 			break;
2504 		case TCP_INFO:
2505 			tcp_fill_info(tp, &ti);
2506 			INP_WUNLOCK(inp);
2507 			error = sooptcopyout(sopt, &ti, sizeof ti);
2508 			break;
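		/*
		 * Illustrative usage (a sketch): a monitoring program reads
		 * the snapshot filled in by tcp_fill_info() with
		 *
		 *	struct tcp_info ti;
		 *	socklen_t len = sizeof(ti);
		 *
		 *	getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len);
		 */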
2509 		case TCP_STATS:
2510 			{
2511 #ifdef STATS
2512 			int nheld;
2513 			TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
2514 
2515 			error = 0;
2516 			socklen_t outsbsz = sopt->sopt_valsize;
2517 			if (tp->t_stats == NULL)
2518 				error = ENOENT;
2519 			else if (outsbsz >= tp->t_stats->cursz)
2520 				outsbsz = tp->t_stats->cursz;
2521 			else if (outsbsz >= sizeof(struct statsblob))
2522 				outsbsz = sizeof(struct statsblob);
2523 			else
2524 				error = EINVAL;
2525 			INP_WUNLOCK(inp);
2526 			if (error)
2527 				break;
2528 
2529 			sbp = sopt->sopt_val;
2530 			nheld = atop(round_page(((vm_offset_t)sbp) +
2531 			    (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
2532 			vm_page_t ma[nheld];
2533 			if (vm_fault_quick_hold_pages(
2534 			    &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
2535 			    outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
2536 			    nheld) < 0) {
2537 				error = EFAULT;
2538 				break;
2539 			}
2540 
2541 			if ((error = copyin_nofault(&(sbp->flags), &sbflags,
2542 			    SIZEOF_MEMBER(struct statsblob, flags))))
2543 				goto unhold;
2544 
2545 			INP_WLOCK_RECHECK(inp);
2546 			error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
2547 			    sbflags | SB_CLONE_USRDSTNOFAULT);
2548 			INP_WUNLOCK(inp);
2549 			sopt->sopt_valsize = outsbsz;
2550 unhold:
2551 			vm_page_unhold_pages(ma, nheld);
2552 #else
2553 			INP_WUNLOCK(inp);
2554 			error = EOPNOTSUPP;
2555 #endif /* !STATS */
2556 			break;
2557 			}
2558 		case TCP_CONGESTION:
2559 			len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
2560 			INP_WUNLOCK(inp);
2561 			error = sooptcopyout(sopt, buf, len + 1);
2562 			break;
2563 		case TCP_MAXUNACKTIME:
2564 		case TCP_KEEPIDLE:
2565 		case TCP_KEEPINTVL:
2566 		case TCP_KEEPINIT:
2567 		case TCP_KEEPCNT:
2568 			switch (sopt->sopt_name) {
2569 			case TCP_MAXUNACKTIME:
2570 				ui = TP_MAXUNACKTIME(tp) / hz;
2571 				break;
2572 			case TCP_KEEPIDLE:
2573 				ui = TP_KEEPIDLE(tp) / hz;
2574 				break;
2575 			case TCP_KEEPINTVL:
2576 				ui = TP_KEEPINTVL(tp) / hz;
2577 				break;
2578 			case TCP_KEEPINIT:
2579 				ui = TP_KEEPINIT(tp) / hz;
2580 				break;
2581 			case TCP_KEEPCNT:
2582 				ui = TP_KEEPCNT(tp);
2583 				break;
2584 			}
2585 			INP_WUNLOCK(inp);
2586 			error = sooptcopyout(sopt, &ui, sizeof(ui));
2587 			break;
2588 		case TCP_FASTOPEN:
2589 			optval = tp->t_flags & TF_FASTOPEN;
2590 			INP_WUNLOCK(inp);
2591 			error = sooptcopyout(sopt, &optval, sizeof optval);
2592 			break;
2593 #ifdef TCP_BLACKBOX
2594 		case TCP_LOG:
2595 			optval = tcp_get_bblog_state(tp);
2596 			INP_WUNLOCK(inp);
2597 			error = sooptcopyout(sopt, &optval, sizeof(optval));
2598 			break;
2599 		case TCP_LOGBUF:
2600 			/* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
2601 			error = tcp_log_getlogbuf(sopt, tp);
2602 			break;
2603 		case TCP_LOGID:
2604 			len = tcp_log_get_id(tp, buf);
2605 			INP_WUNLOCK(inp);
2606 			error = sooptcopyout(sopt, buf, len + 1);
2607 			break;
2608 		case TCP_LOGDUMP:
2609 		case TCP_LOGDUMPID:
2610 			INP_WUNLOCK(inp);
2611 			error = EINVAL;
2612 			break;
2613 #endif
2614 #ifdef KERN_TLS
2615 		case TCP_TXTLS_MODE:
2616 			error = ktls_get_tx_mode(so, &optval);
2617 			INP_WUNLOCK(inp);
2618 			if (error == 0)
2619 				error = sooptcopyout(sopt, &optval,
2620 				    sizeof(optval));
2621 			break;
2622 		case TCP_RXTLS_MODE:
2623 			error = ktls_get_rx_mode(so, &optval);
2624 			INP_WUNLOCK(inp);
2625 			if (error == 0)
2626 				error = sooptcopyout(sopt, &optval,
2627 				    sizeof(optval));
2628 			break;
2629 #endif
2630 		default:
2631 			INP_WUNLOCK(inp);
2632 			error = ENOPROTOOPT;
2633 			break;
2634 		}
2635 		break;
2636 	}
2637 	return (error);
2638 }
2639 #undef INP_WLOCK_RECHECK
2640 #undef INP_WLOCK_RECHECK_CLEANUP
2641 
2642 /*
2643  * Initiate (or continue) disconnect.
2644  * If embryonic state, just send reset (once).
2645  * If in ``let data drain'' option and linger null, just drop.
2646  * Otherwise (hard), mark socket disconnecting and drop
2647  * current input data; switch states based on user close, and
2648  * send segment to peer (with FIN).
2649  */
2650 static void
2651 tcp_disconnect(struct tcpcb *tp)
2652 {
2653 	struct inpcb *inp = tptoinpcb(tp);
2654 	struct socket *so = tptosocket(tp);
2655 
2656 	NET_EPOCH_ASSERT();
2657 	INP_WLOCK_ASSERT(inp);
2658 
2659 	/*
2660 	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
2661 	 * socket is still open.
2662 	 */
2663 	if (tp->t_state < TCPS_ESTABLISHED &&
2664 	    !(tp->t_state > TCPS_LISTEN && (tp->t_flags & TF_FASTOPEN))) {
2665 		tp = tcp_close(tp);
2666 		KASSERT(tp != NULL,
2667 		    ("tcp_disconnect: tcp_close() returned NULL"));
2668 	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
2669 		tp = tcp_drop(tp, 0);
2670 		KASSERT(tp != NULL,
2671 		    ("tcp_disconnect: tcp_drop() returned NULL"));
2672 	} else {
2673 		soisdisconnecting(so);
2674 		sbflush(&so->so_rcv);
2675 		tcp_usrclosed(tp);
2676 		if (!(inp->inp_flags & INP_DROPPED))
2677 			/* Ignore the stack's drop request, we are already at it. */
2678 			(void)tcp_output_nodrop(tp);
2679 	}
2680 }
2681 
2682 /*
2683  * User issued close, and wish to trail through shutdown states:
2684  * if never received SYN, just forget it.  If got a SYN from peer,
2685  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
2686  * If already got a FIN from peer, then almost done; go to LAST_ACK
2687  * state.  In all other cases, have already sent FIN to peer (e.g.
2688  * after PRU_SHUTDOWN), and just have to play tedious game waiting
2689  * for peer to send FIN or not respond to keep-alives, etc.
2690  * We can let the user exit from the close as soon as the FIN is acked.
2691  */
2692 static void
2693 tcp_usrclosed(struct tcpcb *tp)
2694 {
2695 
2696 	NET_EPOCH_ASSERT();
2697 	INP_WLOCK_ASSERT(tptoinpcb(tp));
2698 
2699 	switch (tp->t_state) {
2700 	case TCPS_LISTEN:
2701 #ifdef TCP_OFFLOAD
2702 		tcp_offload_listen_stop(tp);
2703 #endif
2704 		tcp_state_change(tp, TCPS_CLOSED);
2705 		/* FALLTHROUGH */
2706 	case TCPS_CLOSED:
2707 		tp = tcp_close(tp);
2708 		/*
2709 		 * tcp_close() should never return NULL here as the socket is
2710 		 * still open.
2711 		 */
2712 		KASSERT(tp != NULL,
2713 		    ("tcp_usrclosed: tcp_close() returned NULL"));
2714 		break;
2715 
2716 	case TCPS_SYN_SENT:
2717 	case TCPS_SYN_RECEIVED:
2718 		tp->t_flags |= TF_NEEDFIN;
2719 		break;
2720 
2721 	case TCPS_ESTABLISHED:
2722 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
2723 		break;
2724 
2725 	case TCPS_CLOSE_WAIT:
2726 		tcp_state_change(tp, TCPS_LAST_ACK);
2727 		break;
2728 	}
2729 	if (tp->t_acktime == 0)
2730 		tp->t_acktime = ticks;
2731 	if (tp->t_state >= TCPS_FIN_WAIT_2) {
2732 		tcp_free_sackholes(tp);
2733 		soisdisconnected(tptosocket(tp));
2734 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
2735 		if (tp->t_state == TCPS_FIN_WAIT_2) {
2736 			int timeout;
2737 
2738 			timeout = (tcp_fast_finwait2_recycle) ?
2739 			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
2740 			tcp_timer_activate(tp, TT_2MSL, timeout);
2741 		}
2742 	}
2743 }
2744 
2745 #ifdef DDB
2746 static void
2747 db_print_indent(int indent)
2748 {
2749 	int i;
2750 
2751 	for (i = 0; i < indent; i++)
2752 		db_printf(" ");
2753 }
2754 
2755 static void
2756 db_print_tstate(int t_state)
2757 {
2758 
2759 	switch (t_state) {
2760 	case TCPS_CLOSED:
2761 		db_printf("TCPS_CLOSED");
2762 		return;
2763 
2764 	case TCPS_LISTEN:
2765 		db_printf("TCPS_LISTEN");
2766 		return;
2767 
2768 	case TCPS_SYN_SENT:
2769 		db_printf("TCPS_SYN_SENT");
2770 		return;
2771 
2772 	case TCPS_SYN_RECEIVED:
2773 		db_printf("TCPS_SYN_RECEIVED");
2774 		return;
2775 
2776 	case TCPS_ESTABLISHED:
2777 		db_printf("TCPS_ESTABLISHED");
2778 		return;
2779 
2780 	case TCPS_CLOSE_WAIT:
2781 		db_printf("TCPS_CLOSE_WAIT");
2782 		return;
2783 
2784 	case TCPS_FIN_WAIT_1:
2785 		db_printf("TCPS_FIN_WAIT_1");
2786 		return;
2787 
2788 	case TCPS_CLOSING:
2789 		db_printf("TCPS_CLOSING");
2790 		return;
2791 
2792 	case TCPS_LAST_ACK:
2793 		db_printf("TCPS_LAST_ACK");
2794 		return;
2795 
2796 	case TCPS_FIN_WAIT_2:
2797 		db_printf("TCPS_FIN_WAIT_2");
2798 		return;
2799 
2800 	case TCPS_TIME_WAIT:
2801 		db_printf("TCPS_TIME_WAIT");
2802 		return;
2803 
2804 	default:
2805 		db_printf("unknown");
2806 		return;
2807 	}
2808 }
2809 
2810 static void
2811 db_print_bblog_state(int state)
2812 {
2813 	switch (state) {
2814 	case TCP_LOG_STATE_RATIO_OFF:
2815 		db_printf("TCP_LOG_STATE_RATIO_OFF");
2816 		break;
2817 	case TCP_LOG_STATE_CLEAR:
2818 		db_printf("TCP_LOG_STATE_CLEAR");
2819 		break;
2820 	case TCP_LOG_STATE_OFF:
2821 		db_printf("TCP_LOG_STATE_OFF");
2822 		break;
2823 	case TCP_LOG_STATE_TAIL:
2824 		db_printf("TCP_LOG_STATE_TAIL");
2825 		break;
2826 	case TCP_LOG_STATE_HEAD:
2827 		db_printf("TCP_LOG_STATE_HEAD");
2828 		break;
2829 	case TCP_LOG_STATE_HEAD_AUTO:
2830 		db_printf("TCP_LOG_STATE_HEAD_AUTO");
2831 		break;
2832 	case TCP_LOG_STATE_CONTINUAL:
2833 		db_printf("TCP_LOG_STATE_CONTINUAL");
2834 		break;
2835 	case TCP_LOG_STATE_TAIL_AUTO:
2836 		db_printf("TCP_LOG_STATE_TAIL_AUTO");
2837 		break;
2838 	case TCP_LOG_VIA_BBPOINTS:
2839 		db_printf("TCP_LOG_STATE_BBPOINTS");
2840 		break;
2841 	default:
2842 		db_printf("UNKNOWN(%d)", state);
2843 		break;
2844 	}
2845 }
2846 
2847 static void
2848 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog,
2849     bool show_inpcb)
2850 {
2851 
2852 	db_print_indent(indent);
2853 	db_printf("%s at %p\n", name, tp);
2854 
2855 	indent += 2;
2856 
2857 	if (show_inpcb)
2858 		db_print_inpcb(tptoinpcb(tp), "t_inpcb", indent);
2859 
2860 	db_print_indent(indent);
2861 	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
2862 	   TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
2863 
2864 	db_print_indent(indent);
2865 	db_printf("t_callout: %p   t_timers: %p\n",
2866 	    &tp->t_callout, &tp->t_timers);
2867 
2868 	db_print_indent(indent);
2869 	db_printf("t_state: %d (", tp->t_state);
2870 	db_print_tstate(tp->t_state);
2871 	db_printf(")\n");
2872 
2873 	db_print_indent(indent);
2874 	db_printf("t_flags: 0x%b\n", tp->t_flags, TF_BITS);
2875 
2876 	db_print_indent(indent);
2877 	db_printf("t_flags2: 0x%b\n", tp->t_flags2, TF2_BITS);
2878 
2879 	db_print_indent(indent);
2880 	db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: 0x%08x\n",
2881 	    tp->snd_una, tp->snd_max, tp->snd_nxt);
2882 
2883 	db_print_indent(indent);
2884 	db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
2885 	   tp->snd_up, tp->snd_wl1, tp->snd_wl2);
2886 
2887 	db_print_indent(indent);
2888 	db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
2889 	    tp->iss, tp->irs, tp->rcv_nxt);
2890 
2891 	db_print_indent(indent);
2892 	db_printf("rcv_adv: 0x%08x   rcv_wnd: %u   rcv_up: 0x%08x\n",
2893 	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
2894 
2895 	db_print_indent(indent);
2896 	db_printf("snd_wnd: %u   snd_cwnd: %u\n",
2897 	   tp->snd_wnd, tp->snd_cwnd);
2898 
2899 	db_print_indent(indent);
2900 	db_printf("snd_ssthresh: %u   snd_recover: "
2901 	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
2902 
2903 	db_print_indent(indent);
2904 	db_printf("t_rcvtime: %u   t_starttime: %u\n",
2905 	    tp->t_rcvtime, tp->t_starttime);
2906 
2907 	db_print_indent(indent);
2908 	db_printf("t_rtttime: %u   t_rtseq: 0x%08x\n",
2909 	    tp->t_rtttime, tp->t_rtseq);
2910 
2911 	db_print_indent(indent);
2912 	db_printf("t_rxtcur: %d   t_maxseg: %u   t_srtt: %d\n",
2913 	    tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
2914 
2915 	db_print_indent(indent);
2916 	db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u\n",
2917 	    tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin);
2918 
2919 	db_print_indent(indent);
2920 	db_printf("t_rttupdated: %u   max_sndwnd: %u   t_softerror: %d\n",
2921 	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
2922 
2923 	db_print_indent(indent);
2924 	db_printf("t_oobflags: 0x%b   t_iobc: 0x%02x\n", tp->t_oobflags,
2925 	    TCPOOB_BITS, tp->t_iobc);
2926 
2927 	db_print_indent(indent);
2928 	db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
2929 	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
2930 
2931 	db_print_indent(indent);
2932 	db_printf("ts_recent: %u   ts_recent_age: %u\n",
2933 	    tp->ts_recent, tp->ts_recent_age);
2934 
2935 	db_print_indent(indent);
2936 	db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
2937 	    "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
2938 
2939 	db_print_indent(indent);
2940 	db_printf("snd_ssthresh_prev: %u   snd_recover_prev: 0x%08x   "
2941 	    "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
2942 	    tp->snd_recover_prev, tp->t_badrxtwin);
2943 
2944 	db_print_indent(indent);
2945 	db_printf("snd_numholes: %d  snd_holes first: %p\n",
2946 	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
2947 
2948 	db_print_indent(indent);
2949 	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d\n",
2950 	    tp->snd_fack, tp->rcv_numsacks);
2951 
2952 	/* Skip sackblks, sackhint. */
2953 
2954 	db_print_indent(indent);
2955 	db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
2956 	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
2957 
2958 	db_print_indent(indent);
2959 	db_printf("t_fb.tfb_tcp_block_name: %s\n", tp->t_fb->tfb_tcp_block_name);
2960 
2961 	db_print_indent(indent);
2962 	db_printf("t_cc.name: %s\n", tp->t_cc->name);
2963 
2964 	db_print_indent(indent);
2965 	db_printf("_t_logstate: %d (", tp->_t_logstate);
2966 	db_print_bblog_state(tp->_t_logstate);
2967 	db_printf(")\n");
2968 
2969 	db_print_indent(indent);
2970 	db_printf("t_lognum: %d   t_loglimit: %d   t_logsn: %u\n",
2971 	    tp->t_lognum, tp->t_loglimit, tp->t_logsn);
2972 
2973 	if (show_bblog) {
2974 #ifdef TCP_BLACKBOX
2975 		db_print_bblog_entries(&tp->t_logs, indent);
2976 #else
2977 		db_print_indent(indent);
2978 		db_printf("BBLog not supported\n");
2979 #endif
2980 	}
2981 }
2982 
2983 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
2984 {
2985 	struct tcpcb *tp;
2986 	bool show_bblog, show_inpcb;
2987 
2988 	if (!have_addr) {
2989 		db_printf("usage: show tcpcb[/bi] <addr>\n");
2990 		return;
2991 	}
2992 	show_bblog = strchr(modif, 'b') != NULL;
2993 	show_inpcb = strchr(modif, 'i') != NULL;
2994 	tp = (struct tcpcb *)addr;
2995 	db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
2996 }
2997 
2998 DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
2999 {
3000 	VNET_ITERATOR_DECL(vnet_iter);
3001 	struct inpcb *inp;
3002 	struct tcpcb *tp;
3003 	bool only_locked, show_bblog, show_inpcb;
3004 
3005 	only_locked = strchr(modif, 'l') != NULL;
3006 	show_bblog = strchr(modif, 'b') != NULL;
3007 	show_inpcb = strchr(modif, 'i') != NULL;
3008 	VNET_FOREACH(vnet_iter) {
3009 		CURVNET_SET(vnet_iter);
3010 		CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
3011 			if (only_locked &&
3012 			    inp->inp_lock.rw_lock == RW_UNLOCKED)
3013 				continue;
3014 			tp = intotcpcb(inp);
3015 			db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
3016 			if (db_pager_quit)
3017 				break;
3018 		}
3019 		CURVNET_RESTORE();
3020 		if (db_pager_quit)
3021 			break;
3022 	}
3023 }
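/*
 * Example DDB usage (a sketch): "show tcpcb/bi <addr>" prints a single
 * control block together with its inpcb and BBLog entries, while
 * "show all tcpcbs/l" walks every vnet and prints only the tcpcbs whose
 * inpcb lock is currently held.
 */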
3024 #endif
3025