1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1993
5 * The Regents of the University of California.
6 * Copyright (c) 2006-2007 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * All rights reserved.
9 *
10 * Portions of this software were developed by Robert N. M. Watson under
11 * contract to Juniper Networks, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38 #include <sys/cdefs.h>
39 #include "opt_ddb.h"
40 #include "opt_inet.h"
41 #include "opt_inet6.h"
42 #include "opt_ipsec.h"
43 #include "opt_kern_tls.h"
44
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/arb.h>
48 #include <sys/limits.h>
49 #include <sys/malloc.h>
50 #include <sys/refcount.h>
51 #include <sys/kernel.h>
52 #include <sys/ktls.h>
53 #include <sys/qmath.h>
54 #include <sys/sysctl.h>
55 #include <sys/mbuf.h>
56 #ifdef INET6
57 #include <sys/domain.h>
58 #endif /* INET6 */
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/protosw.h>
62 #include <sys/proc.h>
63 #include <sys/jail.h>
64 #include <sys/stats.h>
65
66 #ifdef DDB
67 #include <ddb/ddb.h>
68 #endif
69
70 #include <net/if.h>
71 #include <net/if_var.h>
72 #include <net/route.h>
73 #include <net/vnet.h>
74
75 #include <netinet/in.h>
76 #include <netinet/in_kdtrace.h>
77 #include <netinet/in_pcb.h>
78 #include <netinet/in_rss.h>
79 #include <netinet/in_systm.h>
80 #include <netinet/in_var.h>
81 #include <netinet/ip.h>
82 #include <netinet/ip_var.h>
83 #ifdef INET6
84 #include <netinet/ip6.h>
85 #include <netinet6/in6_pcb.h>
86 #include <netinet6/in6_rss.h>
87 #include <netinet6/ip6_var.h>
88 #include <netinet6/scope6_var.h>
89 #endif
90 #include <netinet/tcp.h>
91 #include <netinet/tcp_fsm.h>
92 #include <netinet/tcp_seq.h>
93 #include <netinet/tcp_timer.h>
94 #include <netinet/tcp_var.h>
95 #include <netinet/tcp_log_buf.h>
96 #include <netinet/tcpip.h>
97 #include <netinet/cc/cc.h>
98 #include <netinet/tcp_fastopen.h>
99 #include <netinet/tcp_hpts.h>
100 #ifdef TCP_OFFLOAD
101 #include <netinet/tcp_offload.h>
102 #endif
103 #include <netipsec/ipsec_support.h>
104
105 #include <vm/vm.h>
106 #include <vm/vm_param.h>
107 #include <vm/pmap.h>
108 #include <vm/vm_extern.h>
109 #include <vm/vm_map.h>
110 #include <vm/vm_page.h>
111
112 /*
113 * TCP protocol interface to socket abstraction.
114 */
115 #ifdef INET
116 static int tcp_connect(struct tcpcb *, struct sockaddr_in *,
117 struct thread *td);
118 #endif /* INET */
119 #ifdef INET6
120 static int tcp6_connect(struct tcpcb *, struct sockaddr_in6 *,
121 struct thread *td);
122 #endif /* INET6 */
123 static void tcp_disconnect(struct tcpcb *);
124 static void tcp_usrclosed(struct tcpcb *);
125 static void tcp_fill_info(const struct tcpcb *, struct tcp_info *);
126
127 static int tcp_pru_options_support(struct tcpcb *tp, int flags);
128
129 static void
tcp_bblog_pru(struct tcpcb * tp,uint32_t pru,int error)130 tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error)
131 {
132 struct tcp_log_buffer *lgb;
133
134 KASSERT(tp != NULL, ("tcp_bblog_pru: tp == NULL"));
135 INP_WLOCK_ASSERT(tptoinpcb(tp));
136 if (tcp_bblogging_on(tp)) {
137 lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_PRU, error,
138 0, NULL, false, NULL, NULL, 0, NULL);
139 } else {
140 lgb = NULL;
141 }
142 if (lgb != NULL) {
143 if (error >= 0) {
144 lgb->tlb_errno = (uint32_t)error;
145 }
146 lgb->tlb_flex1 = pru;
147 }
148 }
149
150 /*
151 * TCP attaches to socket via pr_attach(), reserving space,
152 * and an internet control block.
153 */
154 static int
tcp_usr_attach(struct socket * so,int proto,struct thread * td)155 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
156 {
157 struct inpcb *inp;
158 struct tcpcb *tp = NULL;
159 int error;
160
161 inp = sotoinpcb(so);
162 KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
163
164 error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
165 if (error)
166 goto out;
167
168 so->so_rcv.sb_flags |= SB_AUTOSIZE;
169 so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE);
170 error = in_pcballoc(so, &V_tcbinfo);
171 if (error)
172 goto out;
173 inp = sotoinpcb(so);
174 tp = tcp_newtcpcb(inp, NULL);
175 if (tp == NULL) {
176 error = ENOBUFS;
177 in_pcbfree(inp);
178 goto out;
179 }
180 tp->t_state = TCPS_CLOSED;
181 tcp_bblog_pru(tp, PRU_ATTACH, error);
182 INP_WUNLOCK(inp);
183 TCPSTATES_INC(TCPS_CLOSED);
184 out:
185 TCP_PROBE2(debug__user, tp, PRU_ATTACH);
186 return (error);
187 }
188
189 /*
190 * tcp_usr_detach is called when the socket layer loses its final reference
191 * to the socket, be it a file descriptor reference, a reference from TCP,
192 * etc. At this point, there is only one case in which we will keep around
193 * inpcb state: time wait.
194 */
195 static void
tcp_usr_detach(struct socket * so)196 tcp_usr_detach(struct socket *so)
197 {
198 struct inpcb *inp;
199 struct tcpcb *tp;
200
201 inp = sotoinpcb(so);
202 KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
203 INP_WLOCK(inp);
204 KASSERT(so->so_pcb == inp && inp->inp_socket == so,
205 ("%s: socket %p inp %p mismatch", __func__, so, inp));
206
207 tp = intotcpcb(inp);
208
209 KASSERT(inp->inp_flags & INP_DROPPED ||
210 tp->t_state < TCPS_SYN_SENT,
211 ("%s: inp %p not dropped or embryonic", __func__, inp));
212
213 tcp_discardcb(tp);
214 in_pcbfree(inp);
215 }
216
217 #ifdef INET
218 /*
219 * Give the socket an address.
220 */
221 static int
tcp_usr_bind(struct socket * so,struct sockaddr * nam,struct thread * td)222 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
223 {
224 int error = 0;
225 struct inpcb *inp;
226 struct tcpcb *tp;
227 struct sockaddr_in *sinp;
228
229 inp = sotoinpcb(so);
230 KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
231 INP_WLOCK(inp);
232 if (inp->inp_flags & INP_DROPPED) {
233 INP_WUNLOCK(inp);
234 return (EINVAL);
235 }
236 tp = intotcpcb(inp);
237
238 sinp = (struct sockaddr_in *)nam;
239 if (nam->sa_family != AF_INET) {
240 /*
241 * Preserve compatibility with old programs.
242 */
243 if (nam->sa_family != AF_UNSPEC ||
244 nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
245 sinp->sin_addr.s_addr != INADDR_ANY) {
246 error = EAFNOSUPPORT;
247 goto out;
248 }
249 nam->sa_family = AF_INET;
250 }
251 if (nam->sa_len != sizeof(*sinp)) {
252 error = EINVAL;
253 goto out;
254 }
255 /*
256 * Must check for multicast addresses and disallow binding
257 * to them.
258 */
259 if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
260 error = EAFNOSUPPORT;
261 goto out;
262 }
263 INP_HASH_WLOCK(&V_tcbinfo);
264 error = in_pcbbind(inp, sinp, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
265 td->td_ucred);
266 INP_HASH_WUNLOCK(&V_tcbinfo);
267 out:
268 tcp_bblog_pru(tp, PRU_BIND, error);
269 TCP_PROBE2(debug__user, tp, PRU_BIND);
270 INP_WUNLOCK(inp);
271
272 return (error);
273 }
274 #endif /* INET */
275
276 #ifdef INET6
/*
 * Give an IPv6 TCP socket an address.  Handles v4-mapped addresses for
 * non-V6ONLY sockets by diverting to the IPv4 bind path.
 */
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6;
	u_char vflagsav;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	tp = intotcpcb(inp);

	/* Save the address-family flags so a failed bind can restore them. */
	vflagsav = inp->inp_vflag;

	sin6 = (struct sockaddr_in6 *)nam;
	if (nam->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (nam->sa_len != sizeof(*sin6)) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Must check for multicast addresses and disallow binding
	 * to them.
	 */
	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	INP_HASH_WLOCK(&V_tcbinfo);
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
#ifdef INET
	/*
	 * A non-V6ONLY socket may bind the v6 wildcard (which also covers
	 * IPv4) or an explicit v4-mapped address; the latter is converted
	 * and handed to the IPv4 bind path.
	 */
	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			inp->inp_vflag |= INP_IPV4;
		else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			struct sockaddr_in sin;

			in6_sin6_2_sin(&sin, sin6);
			if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
				error = EAFNOSUPPORT;
				INP_HASH_WUNLOCK(&V_tcbinfo);
				goto out;
			}
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, &sin, 0, td->td_ucred);
			INP_HASH_WUNLOCK(&V_tcbinfo);
			goto out;
		}
	}
#endif
	error = in6_pcbbind(inp, sin6, V_tcp_bind_all_fibs ? 0 : INPBIND_FIB,
	    td->td_ucred);
	INP_HASH_WUNLOCK(&V_tcbinfo);
out:
	/* On any failure, undo our INP_IPV4/INP_IPV6 flag edits. */
	if (error != 0)
		inp->inp_vflag = vflagsav;
	tcp_bblog_pru(tp, PRU_BIND, error);
	TCP_PROBE2(debug__user, tp, PRU_BIND);
	INP_WUNLOCK(inp);
	return (error);
}
350 #endif /* INET6 */
351
352 #ifdef INET
353 /*
354 * Prepare to accept connections.
355 */
356 static int
tcp_usr_listen(struct socket * so,int backlog,struct thread * td)357 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
358 {
359 struct inpcb *inp;
360 struct tcpcb *tp;
361 int error = 0;
362 bool already_listening;
363
364 inp = sotoinpcb(so);
365 KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
366 INP_WLOCK(inp);
367 if (inp->inp_flags & INP_DROPPED) {
368 INP_WUNLOCK(inp);
369 return (EINVAL);
370 }
371 tp = intotcpcb(inp);
372
373 SOCK_LOCK(so);
374 already_listening = SOLISTENING(so);
375 error = solisten_proto_check(so);
376 if (error != 0) {
377 SOCK_UNLOCK(so);
378 goto out;
379 }
380 if (inp->inp_lport == 0) {
381 INP_HASH_WLOCK(&V_tcbinfo);
382 error = in_pcbbind(inp, NULL,
383 V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
384 INP_HASH_WUNLOCK(&V_tcbinfo);
385 }
386 if (error == 0) {
387 tcp_state_change(tp, TCPS_LISTEN);
388 solisten_proto(so, backlog);
389 #ifdef TCP_OFFLOAD
390 if ((so->so_options & SO_NO_OFFLOAD) == 0)
391 tcp_offload_listen_start(tp);
392 #endif
393 } else {
394 solisten_proto_abort(so);
395 }
396 SOCK_UNLOCK(so);
397 if (already_listening)
398 goto out;
399
400 if (error == 0)
401 in_pcblisten(inp);
402 if (tp->t_flags & TF_FASTOPEN)
403 tp->t_tfo_pending = tcp_fastopen_alloc_counter();
404
405 out:
406 tcp_bblog_pru(tp, PRU_LISTEN, error);
407 TCP_PROBE2(debug__user, tp, PRU_LISTEN);
408 INP_WUNLOCK(inp);
409 return (error);
410 }
411 #endif /* INET */
412
413 #ifdef INET6
/*
 * Prepare an IPv6 TCP socket to accept connections.  Mirrors
 * tcp_usr_listen(), additionally managing the INP_IPV4 flag for
 * non-V6ONLY sockets during the implicit bind.
 */
static int
tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	u_char vflagsav;
	int error = 0;
	bool already_listening;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	tp = intotcpcb(inp);

	/* Save the address-family flags for restoration on failure. */
	vflagsav = inp->inp_vflag;

	SOCK_LOCK(so);
	/* Skip one-time listen setup below on a repeated listen(2). */
	already_listening = SOLISTENING(so);
	error = solisten_proto_check(so);
	if (error != 0) {
		SOCK_UNLOCK(so);
		goto out;
	}
	INP_HASH_WLOCK(&V_tcbinfo);
	/* Implicit bind: pick a local port if none was bound yet. */
	if (inp->inp_lport == 0) {
		inp->inp_vflag &= ~INP_IPV4;
		/* A non-V6ONLY wildcard listener also accepts IPv4. */
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, NULL,
		    V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
	}
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error == 0) {
		tcp_state_change(tp, TCPS_LISTEN);
		solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
		if ((so->so_options & SO_NO_OFFLOAD) == 0)
			tcp_offload_listen_start(tp);
#endif
	} else {
		solisten_proto_abort(so);
	}
	SOCK_UNLOCK(so);
	if (already_listening)
		goto out;

	if (error == 0)
		in_pcblisten(inp);
	/* Allocate the TCP Fast Open pending-connection counter if enabled. */
	if (tp->t_flags & TF_FASTOPEN)
		tp->t_tfo_pending = tcp_fastopen_alloc_counter();

	if (error != 0)
		inp->inp_vflag = vflagsav;

out:
	tcp_bblog_pru(tp, PRU_LISTEN, error);
	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
	INP_WUNLOCK(inp);
	return (error);
}
478 #endif /* INET6 */
479
480 #ifdef INET
481 /*
482 * Initiate connection to peer.
483 * Create a template for use in transmissions on this connection.
484 * Enter SYN_SENT state, and mark socket as connecting.
485 * Start keep-alive timer, and seed output sequence space.
486 * Send initial segment on connection.
487 */
488 static int
tcp_usr_connect(struct socket * so,struct sockaddr * nam,struct thread * td)489 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
490 {
491 struct epoch_tracker et;
492 int error = 0;
493 struct inpcb *inp;
494 struct tcpcb *tp;
495 struct sockaddr_in *sinp;
496
497 inp = sotoinpcb(so);
498 KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
499 INP_WLOCK(inp);
500 if (inp->inp_flags & INP_DROPPED) {
501 INP_WUNLOCK(inp);
502 return (ECONNREFUSED);
503 }
504 tp = intotcpcb(inp);
505
506 sinp = (struct sockaddr_in *)nam;
507 if (nam->sa_family != AF_INET) {
508 error = EAFNOSUPPORT;
509 goto out;
510 }
511 if (nam->sa_len != sizeof (*sinp)) {
512 error = EINVAL;
513 goto out;
514 }
515 /*
516 * Must disallow TCP ``connections'' to multicast addresses.
517 */
518 if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
519 error = EAFNOSUPPORT;
520 goto out;
521 }
522 if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
523 error = EACCES;
524 goto out;
525 }
526 if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
527 goto out;
528 if (SOLISTENING(so)) {
529 error = EOPNOTSUPP;
530 goto out;
531 }
532 NET_EPOCH_ENTER(et);
533 if ((error = tcp_connect(tp, sinp, td)) != 0)
534 goto out_in_epoch;
535 #ifdef TCP_OFFLOAD
536 if (registered_toedevs > 0 &&
537 (so->so_options & SO_NO_OFFLOAD) == 0 &&
538 (error = tcp_offload_connect(so, nam)) == 0)
539 goto out_in_epoch;
540 #endif
541 tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
542 error = tcp_output(tp);
543 KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
544 ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
545 out_in_epoch:
546 NET_EPOCH_EXIT(et);
547 out:
548 tcp_bblog_pru(tp, PRU_CONNECT, error);
549 TCP_PROBE2(debug__user, tp, PRU_CONNECT);
550 INP_WUNLOCK(inp);
551 return (error);
552 }
553 #endif /* INET */
554
555 #ifdef INET6
/*
 * Initiate an IPv6 connection to peer.  v4-mapped destinations on
 * non-V6ONLY sockets are converted and routed through tcp_connect();
 * native v6 destinations go through tcp6_connect().
 */
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct epoch_tracker et;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6;
	u_int8_t incflagsav;
	u_char vflagsav;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNREFUSED);
	}
	tp = intotcpcb(inp);

	/* Save flags we may modify, for restoration if the bind fails. */
	vflagsav = inp->inp_vflag;
	incflagsav = inp->inp_inc.inc_flags;

	sin6 = (struct sockaddr_in6 *)nam;
	if (nam->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (nam->sa_len != sizeof (*sin6)) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (SOLISTENING(so)) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef INET
	/*
	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
	 * therefore probably require the hash lock, which isn't held here.
	 * Is this a significant problem?
	 */
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		struct sockaddr_in sin;

		/* v4-mapped destinations are invalid on a V6ONLY socket. */
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}
		if ((inp->inp_vflag & INP_IPV4) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}

		in6_sin6_2_sin(&sin, sin6);
		if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
			error = EAFNOSUPPORT;
			goto out;
		}
		if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) {
			error = EACCES;
			goto out;
		}
		/* Jail policy may rewrite or reject the destination. */
		if ((error = prison_remote_ip4(td->td_ucred,
		    &sin.sin_addr)) != 0)
			goto out;
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		NET_EPOCH_ENTER(et);
		if ((error = tcp_connect(tp, &sin, td)) != 0)
			goto out_in_epoch;
#ifdef TCP_OFFLOAD
		if (registered_toedevs > 0 &&
		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
		    (error = tcp_offload_connect(so, nam)) == 0)
			goto out_in_epoch;
#endif
		error = tcp_output(tp);
		goto out_in_epoch;
	} else {
		if ((inp->inp_vflag & INP_IPV6) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}
	}
#endif
	if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0)
		goto out;
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_flags |= INC_ISIPV6;
	/* Transmission paths below must run inside the network epoch. */
	NET_EPOCH_ENTER(et);
	if ((error = tcp6_connect(tp, sin6, td)) != 0)
		goto out_in_epoch;
#ifdef TCP_OFFLOAD
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
	    (error = tcp_offload_connect(so, nam)) == 0)
		goto out_in_epoch;
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tcp_output(tp);
out_in_epoch:
	NET_EPOCH_EXIT(et);
out:
	/*
	 * A negative return from tcp_output() is a stack's request to drop
	 * the connection; that must never happen on the connect path.
	 */
	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
	/*
	 * If the implicit bind in the connect call fails, restore
	 * the flags we modified.
	 */
	if (error != 0 && inp->inp_lport == 0) {
		inp->inp_vflag = vflagsav;
		inp->inp_inc.inc_flags = incflagsav;
	}

	tcp_bblog_pru(tp, PRU_CONNECT, error);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
684 #endif /* INET6 */
685
686 /*
687 * Initiate disconnect from peer.
688 * If connection never passed embryonic stage, just drop;
689 * else if don't need to let data drain, then can just drop anyways,
690 * else have to begin TCP shutdown process: mark socket disconnecting,
691 * drain unread data, state switch to reflect user close, and
692 * send segment (e.g. FIN) to peer. Socket will be really disconnected
693 * when peer sends FIN and acks ours.
694 *
695 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
696 */
697 static int
tcp_usr_disconnect(struct socket * so)698 tcp_usr_disconnect(struct socket *so)
699 {
700 struct inpcb *inp;
701 struct tcpcb *tp = NULL;
702 struct epoch_tracker et;
703
704 NET_EPOCH_ENTER(et);
705 inp = sotoinpcb(so);
706 KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
707 INP_WLOCK(inp);
708 tp = intotcpcb(inp);
709
710 if (tp->t_state == TCPS_TIME_WAIT)
711 goto out;
712 tcp_disconnect(tp);
713 out:
714 tcp_bblog_pru(tp, PRU_DISCONNECT, 0);
715 TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
716 INP_WUNLOCK(inp);
717 NET_EPOCH_EXIT(et);
718 return (0);
719 }
720
721 #ifdef INET
722 /*
723 * Accept a connection. Essentially all the work is done at higher levels;
724 * just return the address of the peer, storing through addr.
725 */
726 static int
tcp_usr_accept(struct socket * so,struct sockaddr * sa)727 tcp_usr_accept(struct socket *so, struct sockaddr *sa)
728 {
729 struct inpcb *inp;
730 struct tcpcb *tp;
731 int error = 0;
732
733 inp = sotoinpcb(so);
734 KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
735 INP_WLOCK(inp);
736 if (inp->inp_flags & INP_DROPPED) {
737 INP_WUNLOCK(inp);
738 return (ECONNABORTED);
739 }
740 tp = intotcpcb(inp);
741
742 if (so->so_state & SS_ISDISCONNECTED)
743 error = ECONNABORTED;
744 else
745 *(struct sockaddr_in *)sa = (struct sockaddr_in ){
746 .sin_family = AF_INET,
747 .sin_len = sizeof(struct sockaddr_in),
748 .sin_port = inp->inp_fport,
749 .sin_addr = inp->inp_faddr,
750 };
751 tcp_bblog_pru(tp, PRU_ACCEPT, error);
752 TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
753 INP_WUNLOCK(inp);
754
755 return (error);
756 }
757 #endif /* INET */
758
759 #ifdef INET6
/*
 * Accept a connection on an IPv6 socket, returning the peer address.
 * A connection that arrived over IPv4 (INP_IPV4 set) is reported as a
 * v4-mapped IPv6 address.
 */
static int
tcp6_usr_accept(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNABORTED);
	}
	tp = intotcpcb(inp);

	if (so->so_state & SS_ISDISCONNECTED) {
		error = ECONNABORTED;
	} else {
		if (inp->inp_vflag & INP_IPV4) {
			/* IPv4 peer: report it as a v4-mapped sockaddr_in6. */
			struct sockaddr_in sin = {
				.sin_family = AF_INET,
				.sin_len = sizeof(struct sockaddr_in),
				.sin_port = inp->inp_fport,
				.sin_addr = inp->inp_faddr,
			};
			in6_sin_2_v4mapsin6(&sin, (struct sockaddr_in6 *)sa);
		} else {
			*(struct sockaddr_in6 *)sa = (struct sockaddr_in6 ){
				.sin6_family = AF_INET6,
				.sin6_len = sizeof(struct sockaddr_in6),
				.sin6_port = inp->inp_fport,
				.sin6_addr = inp->in6p_faddr,
			};
			/* XXX: should catch errors */
			(void)sa6_recoverscope((struct sockaddr_in6 *)sa);
		}
	}

	tcp_bblog_pru(tp, PRU_ACCEPT, error);
	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
	INP_WUNLOCK(inp);

	return (error);
}
805 #endif /* INET6 */
806
807 /*
808 * Mark the connection as being incapable of further output.
809 */
810 static int
tcp_usr_shutdown(struct socket * so,enum shutdown_how how)811 tcp_usr_shutdown(struct socket *so, enum shutdown_how how)
812 {
813 struct epoch_tracker et;
814 struct inpcb *inp = sotoinpcb(so);
815 struct tcpcb *tp = intotcpcb(inp);
816 int error = 0;
817
818 SOCK_LOCK(so);
819 if (SOLISTENING(so)) {
820 if (how != SHUT_WR) {
821 so->so_error = ECONNABORTED;
822 solisten_wakeup(so); /* unlocks so */
823 } else
824 SOCK_UNLOCK(so);
825 return (ENOTCONN);
826 } else if ((so->so_state &
827 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
828 SOCK_UNLOCK(so);
829 return (ENOTCONN);
830 }
831 SOCK_UNLOCK(so);
832
833 switch (how) {
834 case SHUT_RD:
835 sorflush(so);
836 break;
837 case SHUT_RDWR:
838 sorflush(so);
839 /* FALLTHROUGH */
840 case SHUT_WR:
841 /*
842 * XXXGL: mimicing old soshutdown() here. But shouldn't we
843 * return ECONNRESEST for SHUT_RD as well?
844 */
845 INP_WLOCK(inp);
846 if (inp->inp_flags & INP_DROPPED) {
847 INP_WUNLOCK(inp);
848 return (ECONNRESET);
849 }
850
851 socantsendmore(so);
852 NET_EPOCH_ENTER(et);
853 tcp_usrclosed(tp);
854 error = tcp_output_nodrop(tp);
855 tcp_bblog_pru(tp, PRU_SHUTDOWN, error);
856 TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
857 error = tcp_unlock_or_drop(tp, error);
858 NET_EPOCH_EXIT(et);
859 }
860 wakeup(&so->so_timeo);
861
862 return (error);
863 }
864
865 /*
866 * After a receive, possibly send window update to peer.
867 */
868 static int
tcp_usr_rcvd(struct socket * so,int flags)869 tcp_usr_rcvd(struct socket *so, int flags)
870 {
871 struct epoch_tracker et;
872 struct inpcb *inp;
873 struct tcpcb *tp;
874 int outrv = 0, error = 0;
875
876 inp = sotoinpcb(so);
877 KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
878 INP_WLOCK(inp);
879 if (inp->inp_flags & INP_DROPPED) {
880 INP_WUNLOCK(inp);
881 return (ECONNRESET);
882 }
883 tp = intotcpcb(inp);
884
885 NET_EPOCH_ENTER(et);
886 /*
887 * For passively-created TFO connections, don't attempt a window
888 * update while still in SYN_RECEIVED as this may trigger an early
889 * SYN|ACK. It is preferable to have the SYN|ACK be sent along with
890 * application response data, or failing that, when the DELACK timer
891 * expires.
892 */
893 if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED))
894 goto out;
895 #ifdef TCP_OFFLOAD
896 if (tp->t_flags & TF_TOE)
897 tcp_offload_rcvd(tp);
898 else
899 #endif
900 outrv = tcp_output_nodrop(tp);
901 out:
902 tcp_bblog_pru(tp, PRU_RCVD, error);
903 TCP_PROBE2(debug__user, tp, PRU_RCVD);
904 (void) tcp_unlock_or_drop(tp, outrv);
905 NET_EPOCH_EXIT(et);
906 return (error);
907 }
908
909 /*
910 * Do a send by putting data in output queue and updating urgent
911 * marker if URG set. Possibly send more data. Unlike the other
912 * pr_*() routines, the mbuf chains are our responsibility. We
913 * must either enqueue them or free them. The other pr_*() routines
914 * generally are caller-frees.
915 */
916 static int
tcp_usr_send(struct socket * so,int flags,struct mbuf * m,struct sockaddr * nam,struct mbuf * control,struct thread * td)917 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
918 struct sockaddr *nam, struct mbuf *control, struct thread *td)
919 {
920 struct epoch_tracker et;
921 int error = 0;
922 struct inpcb *inp;
923 struct tcpcb *tp;
924 #ifdef INET
925 #ifdef INET6
926 struct sockaddr_in sin;
927 #endif
928 struct sockaddr_in *sinp;
929 #endif
930 #ifdef INET6
931 struct sockaddr_in6 *sin6;
932 int isipv6;
933 #endif
934 u_int8_t incflagsav;
935 u_char vflagsav;
936 bool restoreflags;
937
938 inp = sotoinpcb(so);
939 KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
940 INP_WLOCK(inp);
941 if (inp->inp_flags & INP_DROPPED) {
942 if (m != NULL && (flags & PRUS_NOTREADY) == 0)
943 m_freem(m);
944 INP_WUNLOCK(inp);
945 return (ECONNRESET);
946 }
947 tp = intotcpcb(inp);
948
949 vflagsav = inp->inp_vflag;
950 incflagsav = inp->inp_inc.inc_flags;
951 restoreflags = false;
952
953 NET_EPOCH_ENTER(et);
954 if (control != NULL) {
955 /* TCP doesn't do control messages (rights, creds, etc) */
956 if (control->m_len > 0) {
957 m_freem(control);
958 error = EINVAL;
959 goto out;
960 }
961 m_freem(control); /* empty control, just free it */
962 }
963
964 if ((flags & PRUS_OOB) != 0 &&
965 (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
966 goto out;
967
968 if (nam != NULL && tp->t_state < TCPS_SYN_SENT) {
969 if (tp->t_state == TCPS_LISTEN) {
970 error = EINVAL;
971 goto out;
972 }
973 switch (nam->sa_family) {
974 #ifdef INET
975 case AF_INET:
976 sinp = (struct sockaddr_in *)nam;
977 if (sinp->sin_len != sizeof(struct sockaddr_in)) {
978 error = EINVAL;
979 goto out;
980 }
981 if ((inp->inp_vflag & INP_IPV6) != 0) {
982 error = EAFNOSUPPORT;
983 goto out;
984 }
985 if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
986 error = EAFNOSUPPORT;
987 goto out;
988 }
989 if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
990 error = EACCES;
991 goto out;
992 }
993 if ((error = prison_remote_ip4(td->td_ucred,
994 &sinp->sin_addr)))
995 goto out;
996 #ifdef INET6
997 isipv6 = 0;
998 #endif
999 break;
1000 #endif /* INET */
1001 #ifdef INET6
1002 case AF_INET6:
1003 sin6 = (struct sockaddr_in6 *)nam;
1004 if (sin6->sin6_len != sizeof(*sin6)) {
1005 error = EINVAL;
1006 goto out;
1007 }
1008 if ((inp->inp_vflag & INP_IPV6PROTO) == 0) {
1009 error = EAFNOSUPPORT;
1010 goto out;
1011 }
1012 if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
1013 error = EAFNOSUPPORT;
1014 goto out;
1015 }
1016 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
1017 #ifdef INET
1018 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
1019 error = EINVAL;
1020 goto out;
1021 }
1022 if ((inp->inp_vflag & INP_IPV4) == 0) {
1023 error = EAFNOSUPPORT;
1024 goto out;
1025 }
1026 restoreflags = true;
1027 inp->inp_vflag &= ~INP_IPV6;
1028 sinp = &sin;
1029 in6_sin6_2_sin(sinp, sin6);
1030 if (IN_MULTICAST(
1031 ntohl(sinp->sin_addr.s_addr))) {
1032 error = EAFNOSUPPORT;
1033 goto out;
1034 }
1035 if ((error = prison_remote_ip4(td->td_ucred,
1036 &sinp->sin_addr)))
1037 goto out;
1038 isipv6 = 0;
1039 #else /* !INET */
1040 error = EAFNOSUPPORT;
1041 goto out;
1042 #endif /* INET */
1043 } else {
1044 if ((inp->inp_vflag & INP_IPV6) == 0) {
1045 error = EAFNOSUPPORT;
1046 goto out;
1047 }
1048 restoreflags = true;
1049 inp->inp_vflag &= ~INP_IPV4;
1050 inp->inp_inc.inc_flags |= INC_ISIPV6;
1051 if ((error = prison_remote_ip6(td->td_ucred,
1052 &sin6->sin6_addr)))
1053 goto out;
1054 isipv6 = 1;
1055 }
1056 break;
1057 #endif /* INET6 */
1058 default:
1059 error = EAFNOSUPPORT;
1060 goto out;
1061 }
1062 }
1063 if (!(flags & PRUS_OOB)) {
1064 if (tp->t_acktime == 0)
1065 tp->t_acktime = ticks;
1066 sbappendstream(&so->so_snd, m, flags);
1067 m = NULL;
1068 if (nam && tp->t_state < TCPS_SYN_SENT) {
1069 KASSERT(tp->t_state == TCPS_CLOSED,
1070 ("%s: tp %p is listening", __func__, tp));
1071
1072 /*
1073 * Do implied connect if not yet connected,
1074 * initialize window to default value, and
1075 * initialize maxseg using peer's cached MSS.
1076 */
1077 #ifdef INET6
1078 if (isipv6)
1079 error = tcp6_connect(tp, sin6, td);
1080 #endif /* INET6 */
1081 #if defined(INET6) && defined(INET)
1082 else
1083 #endif
1084 #ifdef INET
1085 error = tcp_connect(tp, sinp, td);
1086 #endif
1087 /*
1088 * The bind operation in tcp_connect succeeded. We
1089 * no longer want to restore the flags if later
1090 * operations fail.
1091 */
1092 if (error == 0 || inp->inp_lport != 0)
1093 restoreflags = false;
1094
1095 if (error) {
1096 /* m is freed if PRUS_NOTREADY is unset. */
1097 sbflush(&so->so_snd);
1098 goto out;
1099 }
1100 if (tp->t_flags & TF_FASTOPEN)
1101 tcp_fastopen_connect(tp);
1102 else {
1103 tp->snd_wnd = TTCP_CLIENT_SND_WND;
1104 tcp_mss(tp, -1);
1105 }
1106 }
1107 if (flags & PRUS_EOF) {
1108 /*
1109 * Close the send side of the connection after
1110 * the data is sent.
1111 */
1112 socantsendmore(so);
1113 tcp_usrclosed(tp);
1114 }
1115 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1116 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
1117 (tp->t_fbyte_out == 0) &&
1118 (so->so_snd.sb_ccc > 0)) {
1119 tp->t_fbyte_out = ticks;
1120 if (tp->t_fbyte_out == 0)
1121 tp->t_fbyte_out = 1;
1122 if (tp->t_fbyte_out && tp->t_fbyte_in)
1123 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
1124 }
1125 if (!(inp->inp_flags & INP_DROPPED) &&
1126 !(flags & PRUS_NOTREADY)) {
1127 if (flags & PRUS_MORETOCOME)
1128 tp->t_flags |= TF_MORETOCOME;
1129 error = tcp_output_nodrop(tp);
1130 if (flags & PRUS_MORETOCOME)
1131 tp->t_flags &= ~TF_MORETOCOME;
1132 }
1133 } else {
1134 /*
1135 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
1136 */
1137 SOCK_SENDBUF_LOCK(so);
1138 if (sbspace(&so->so_snd) < -512) {
1139 SOCK_SENDBUF_UNLOCK(so);
1140 error = ENOBUFS;
1141 goto out;
1142 }
1143 /*
1144 * According to RFC961 (Assigned Protocols),
1145 * the urgent pointer points to the last octet
1146 * of urgent data. We continue, however,
1147 * to consider it to indicate the first octet
1148 * of data past the urgent section.
1149 * Otherwise, snd_up should be one lower.
1150 */
1151 if (tp->t_acktime == 0)
1152 tp->t_acktime = ticks;
1153 sbappendstream_locked(&so->so_snd, m, flags);
1154 SOCK_SENDBUF_UNLOCK(so);
1155 m = NULL;
1156 if (nam && tp->t_state < TCPS_SYN_SENT) {
1157 /*
1158 * Do implied connect if not yet connected,
1159 * initialize window to default value, and
1160 * initialize maxseg using peer's cached MSS.
1161 */
1162
1163 /*
1164 * Not going to contemplate SYN|URG
1165 */
1166 if (tp->t_flags & TF_FASTOPEN)
1167 tp->t_flags &= ~TF_FASTOPEN;
1168 #ifdef INET6
1169 if (isipv6)
1170 error = tcp6_connect(tp, sin6, td);
1171 #endif /* INET6 */
1172 #if defined(INET6) && defined(INET)
1173 else
1174 #endif
1175 #ifdef INET
1176 error = tcp_connect(tp, sinp, td);
1177 #endif
1178 /*
1179 * The bind operation in tcp_connect succeeded. We
1180 * no longer want to restore the flags if later
1181 * operations fail.
1182 */
1183 if (error == 0 || inp->inp_lport != 0)
1184 restoreflags = false;
1185
1186 if (error != 0) {
1187 /* m is freed if PRUS_NOTREADY is unset. */
1188 sbflush(&so->so_snd);
1189 goto out;
1190 }
1191 tp->snd_wnd = TTCP_CLIENT_SND_WND;
1192 tcp_mss(tp, -1);
1193 }
1194 tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
1195 if ((flags & PRUS_NOTREADY) == 0) {
1196 tp->t_flags |= TF_FORCEDATA;
1197 error = tcp_output_nodrop(tp);
1198 tp->t_flags &= ~TF_FORCEDATA;
1199 }
1200 }
1201 TCP_LOG_EVENT(tp, NULL,
1202 &inp->inp_socket->so_rcv,
1203 &inp->inp_socket->so_snd,
1204 TCP_LOG_USERSEND, error,
1205 0, NULL, false);
1206
1207 out:
1208 /*
1209 * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is
1210 * responsible for freeing memory.
1211 */
1212 if (m != NULL && (flags & PRUS_NOTREADY) == 0)
1213 m_freem(m);
1214
1215 /*
1216 * If the request was unsuccessful and we changed flags,
1217 * restore the original flags.
1218 */
1219 if (error != 0 && restoreflags) {
1220 inp->inp_vflag = vflagsav;
1221 inp->inp_inc.inc_flags = incflagsav;
1222 }
1223 tcp_bblog_pru(tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
1224 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), error);
1225 TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
1226 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
1227 error = tcp_unlock_or_drop(tp, error);
1228 NET_EPOCH_EXIT(et);
1229 return (error);
1230 }
1231
static int
tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
{
	struct epoch_tracker et;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		/*
		 * Connection is gone: the not-ready mbuf chain was never
		 * handed to the socket buffer, so free it here.
		 */
		INP_WUNLOCK(inp);
		mb_free_notready(m, count);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);

	/* Mark the previously PRUS_NOTREADY mbufs ready for transmission. */
	SOCK_SENDBUF_LOCK(so);
	error = sbready(&so->so_snd, m, count);
	SOCK_SENDBUF_UNLOCK(so);
	if (error) {
		INP_WUNLOCK(inp);
		return (error);
	}
	/*
	 * Push the now-ready data out.  tcp_output_unlock() consumes the
	 * inpcb write lock, so no explicit INP_WUNLOCK() on this path.
	 */
	NET_EPOCH_ENTER(et);
	error = tcp_output_unlock(tp);
	NET_EPOCH_EXIT(et);

	return (error);
}
1262
1263 /*
1264 * Abort the TCP. Drop the connection abruptly.
1265 */
1266 static void
tcp_usr_abort(struct socket * so)1267 tcp_usr_abort(struct socket *so)
1268 {
1269 struct inpcb *inp;
1270 struct tcpcb *tp;
1271 struct epoch_tracker et;
1272
1273 inp = sotoinpcb(so);
1274 KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
1275
1276 NET_EPOCH_ENTER(et);
1277 INP_WLOCK(inp);
1278 KASSERT(inp->inp_socket != NULL,
1279 ("tcp_usr_abort: inp_socket == NULL"));
1280
1281 /*
1282 * If we still have full TCP state, and we're not dropped, drop.
1283 */
1284 if (!(inp->inp_flags & INP_DROPPED)) {
1285 tp = intotcpcb(inp);
1286 tp = tcp_drop(tp, ECONNABORTED);
1287 if (tp == NULL)
1288 goto dropped;
1289 tcp_bblog_pru(tp, PRU_ABORT, 0);
1290 TCP_PROBE2(debug__user, tp, PRU_ABORT);
1291 }
1292 if (!(inp->inp_flags & INP_DROPPED)) {
1293 soref(so);
1294 inp->inp_flags |= INP_SOCKREF;
1295 }
1296 INP_WUNLOCK(inp);
1297 dropped:
1298 NET_EPOCH_EXIT(et);
1299 }
1300
1301 /*
1302 * TCP socket is closed. Start friendly disconnect.
1303 */
1304 static void
tcp_usr_close(struct socket * so)1305 tcp_usr_close(struct socket *so)
1306 {
1307 struct inpcb *inp;
1308 struct tcpcb *tp;
1309 struct epoch_tracker et;
1310
1311 inp = sotoinpcb(so);
1312 KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
1313
1314 NET_EPOCH_ENTER(et);
1315 INP_WLOCK(inp);
1316 KASSERT(inp->inp_socket != NULL,
1317 ("tcp_usr_close: inp_socket == NULL"));
1318
1319 /*
1320 * If we are still connected and we're not dropped, initiate
1321 * a disconnect.
1322 */
1323 if (!(inp->inp_flags & INP_DROPPED)) {
1324 tp = intotcpcb(inp);
1325 if (tp->t_state != TCPS_TIME_WAIT) {
1326 tp->t_flags |= TF_CLOSED;
1327 tcp_disconnect(tp);
1328 tcp_bblog_pru(tp, PRU_CLOSE, 0);
1329 TCP_PROBE2(debug__user, tp, PRU_CLOSE);
1330 }
1331 }
1332 if (!(inp->inp_flags & INP_DROPPED)) {
1333 soref(so);
1334 inp->inp_flags |= INP_SOCKREF;
1335 }
1336 INP_WUNLOCK(inp);
1337 NET_EPOCH_EXIT(et);
1338 }
1339
1340 static int
tcp_pru_options_support(struct tcpcb * tp,int flags)1341 tcp_pru_options_support(struct tcpcb *tp, int flags)
1342 {
1343 /*
1344 * If the specific TCP stack has a pru_options
1345 * specified then it does not always support
1346 * all the PRU_XX options and we must ask it.
1347 * If the function is not specified then all
1348 * of the PRU_XX options are supported.
1349 */
1350 int ret = 0;
1351
1352 if (tp->t_fb->tfb_pru_options) {
1353 ret = (*tp->t_fb->tfb_pru_options)(tp, flags);
1354 }
1355 return (ret);
1356 }
1357
1358 /*
1359 * Receive out-of-band data.
1360 */
1361 static int
tcp_usr_rcvoob(struct socket * so,struct mbuf * m,int flags)1362 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
1363 {
1364 int error = 0;
1365 struct inpcb *inp;
1366 struct tcpcb *tp;
1367
1368 inp = sotoinpcb(so);
1369 KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
1370 INP_WLOCK(inp);
1371 if (inp->inp_flags & INP_DROPPED) {
1372 INP_WUNLOCK(inp);
1373 return (ECONNRESET);
1374 }
1375 tp = intotcpcb(inp);
1376
1377 error = tcp_pru_options_support(tp, PRUS_OOB);
1378 if (error) {
1379 goto out;
1380 }
1381 if ((so->so_oobmark == 0 &&
1382 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
1383 so->so_options & SO_OOBINLINE ||
1384 tp->t_oobflags & TCPOOB_HADDATA) {
1385 error = EINVAL;
1386 goto out;
1387 }
1388 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
1389 error = EWOULDBLOCK;
1390 goto out;
1391 }
1392 m->m_len = 1;
1393 *mtod(m, caddr_t) = tp->t_iobc;
1394 if ((flags & MSG_PEEK) == 0)
1395 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
1396
1397 out:
1398 tcp_bblog_pru(tp, PRU_RCVOOB, error);
1399 TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
1400 INP_WUNLOCK(inp);
1401 return (error);
1402 }
1403
#ifdef INET
/* Protocol switch table for TCP over IPv4 stream sockets. */
struct protosw tcp_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
				    PR_CAPATTACH,
	.pr_ctloutput =		tcp_ctloutput,
	.pr_abort =		tcp_usr_abort,
	.pr_accept =		tcp_usr_accept,
	.pr_attach =		tcp_usr_attach,
	.pr_bind =		tcp_usr_bind,
	.pr_connect =		tcp_usr_connect,
	.pr_control =		in_control,
	.pr_detach =		tcp_usr_detach,
	.pr_disconnect =	tcp_usr_disconnect,
	.pr_listen =		tcp_usr_listen,
	.pr_peeraddr =		in_getpeeraddr,
	.pr_rcvd =		tcp_usr_rcvd,
	.pr_rcvoob =		tcp_usr_rcvoob,
	.pr_send =		tcp_usr_send,
	.pr_sendfile_wait =	sendfile_wait_generic,
	.pr_ready =		tcp_usr_ready,
	.pr_shutdown =		tcp_usr_shutdown,
	.pr_sockaddr =		in_getsockaddr,
	.pr_sosetlabel =	in_pcbsosetlabel,
	.pr_close =		tcp_usr_close,
};
#endif /* INET */
1432
#ifdef INET6
/*
 * Protocol switch table for TCP over IPv6 stream sockets.  Differs from
 * tcp_protosw only in the address-family-specific entry points (accept,
 * bind, connect, control, listen, peeraddr, sockaddr).
 */
struct protosw tcp6_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL |PR_WANTRCVD |
				    PR_CAPATTACH,
	.pr_ctloutput =		tcp_ctloutput,
	.pr_abort =		tcp_usr_abort,
	.pr_accept =		tcp6_usr_accept,
	.pr_attach =		tcp_usr_attach,
	.pr_bind =		tcp6_usr_bind,
	.pr_connect =		tcp6_usr_connect,
	.pr_control =		in6_control,
	.pr_detach =		tcp_usr_detach,
	.pr_disconnect =	tcp_usr_disconnect,
	.pr_listen =		tcp6_usr_listen,
	.pr_peeraddr =		in6_mapped_peeraddr,
	.pr_rcvd =		tcp_usr_rcvd,
	.pr_rcvoob =		tcp_usr_rcvoob,
	.pr_send =		tcp_usr_send,
	.pr_sendfile_wait =	sendfile_wait_generic,
	.pr_ready =		tcp_usr_ready,
	.pr_shutdown =		tcp_usr_shutdown,
	.pr_sockaddr =		in6_mapped_sockaddr,
	.pr_sosetlabel =	in_pcbsosetlabel,
	.pr_close =		tcp_usr_close,
};
#endif /* INET6 */
1461
#ifdef INET
/*
 * Common subroutine to open a TCP connection to remote host specified
 * by struct sockaddr_in.  Call in_pcbconnect() to choose local host address
 * and assign a local port number and install the inpcb into the hash.
 * Initialize connection parameters and enter SYN-SENT state.
 *
 * Returns 0 on success; EISCONN if the socket is already (dis)connecting
 * or (dis)connected, EOPNOTSUPP for SO_REUSEPORT_LB sockets, or the
 * error from in_pcbconnect().
 */
static int
tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	int error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);

	if (__predict_false((so->so_state &
	    (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
	    SS_ISDISCONNECTED)) != 0))
		return (EISCONN);
	if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
		return (EOPNOTSUPP);

	/* The pcb hash write lock covers re-insertion by in_pcbconnect(). */
	INP_HASH_WLOCK(&V_tcbinfo);
	error = in_pcbconnect(inp, sin, td->td_ucred);
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error != 0)
		return (error);

	/* set the hash on the connection */
	rss_proto_software_hash_v4(inp->inp_faddr, inp->inp_laddr,
	    inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
	    &inp->inp_flowid, &inp->inp_flowtype);
	/*
	 * Compute window scaling to request:
	 * Scale to fit into sweet spot.  See tcp_syncache.c.
	 * XXX: This should move to tcp_output().
	 */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	/* Enter SYN-SENT: pick the ISS and seed send sequence state. */
	soisconnecting(so);
	TCPSTAT_INC(tcps_connattempt);
	tcp_state_change(tp, TCPS_SYN_SENT);
	tp->iss = tcp_new_isn(&inp->inp_inc);
	if (tp->t_flags & TF_REQ_TSTMP)
		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
	tcp_sendseqinit(tp);

	return (0);
}
#endif /* INET */
1516
#ifdef INET6
/*
 * IPv6 counterpart of tcp_connect(): bind/connect the inpcb via
 * in6_pcbconnect(), set the RSS flow id, compute the requested window
 * scale, and enter SYN-SENT.  Same return contract as tcp_connect().
 */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	int error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);

	if (__predict_false((so->so_state &
	    (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
	    SS_ISDISCONNECTED)) != 0))
		return (EISCONN);
	if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
		return (EOPNOTSUPP);

	/* The pcb hash write lock covers re-insertion by in6_pcbconnect(). */
	INP_HASH_WLOCK(&V_tcbinfo);
	error = in6_pcbconnect(inp, sin6, td->td_ucred, true);
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error != 0)
		return (error);

	/* set the hash on the connection */
	rss_proto_software_hash_v6(&inp->in6p_faddr,
	    &inp->in6p_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
	    &inp->inp_flowid, &inp->inp_flowtype);
	/* Compute window scaling to request. */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	/* Enter SYN-SENT: pick the ISS and seed send sequence state. */
	soisconnecting(so);
	TCPSTAT_INC(tcps_connattempt);
	tcp_state_change(tp, TCPS_SYN_SENT);
	tp->iss = tcp_new_isn(&inp->inp_inc);
	if (tp->t_flags & TF_REQ_TSTMP)
		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
	tcp_sendseqinit(tp);

	return (0);
}
#endif /* INET6 */
1561
1562 /*
1563 * Export TCP internal state information via a struct tcp_info, based on the
1564 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently
1565 * (TCP state machine, etc). We export all information using FreeBSD-native
1566 * constants -- for example, the numeric values for tcpi_state will differ
1567 * from Linux.
1568 */
1569 void
tcp_fill_info(const struct tcpcb * tp,struct tcp_info * ti)1570 tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti)
1571 {
1572
1573 INP_LOCK_ASSERT(tptoinpcb(tp));
1574 bzero(ti, sizeof(*ti));
1575
1576 ti->tcpi_state = tp->t_state;
1577 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
1578 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
1579 if (tp->t_flags & TF_SACK_PERMIT)
1580 ti->tcpi_options |= TCPI_OPT_SACK;
1581 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
1582 ti->tcpi_options |= TCPI_OPT_WSCALE;
1583 ti->tcpi_snd_wscale = tp->snd_scale;
1584 ti->tcpi_rcv_wscale = tp->rcv_scale;
1585 }
1586 switch (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
1587 case TF2_ECN_PERMIT:
1588 ti->tcpi_options |= TCPI_OPT_ECN;
1589 break;
1590 case TF2_ACE_PERMIT:
1591 /* FALLTHROUGH */
1592 case TF2_ECN_PERMIT | TF2_ACE_PERMIT:
1593 ti->tcpi_options |= TCPI_OPT_ACE;
1594 break;
1595 default:
1596 break;
1597 }
1598 if (tp->t_flags & TF_FASTOPEN)
1599 ti->tcpi_options |= TCPI_OPT_TFO;
1600
1601 ti->tcpi_rto = tp->t_rxtcur * tick;
1602 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
1603 ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
1604 ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
1605
1606 ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
1607 ti->tcpi_snd_cwnd = tp->snd_cwnd;
1608
1609 /*
1610 * FreeBSD-specific extension fields for tcp_info.
1611 */
1612 ti->tcpi_rcv_space = tp->rcv_wnd;
1613 ti->tcpi_rcv_nxt = tp->rcv_nxt;
1614 ti->tcpi_snd_wnd = tp->snd_wnd;
1615 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
1616 ti->tcpi_snd_nxt = tp->snd_nxt;
1617 ti->tcpi_snd_mss = tp->t_maxseg;
1618 ti->tcpi_rcv_mss = tp->t_maxseg;
1619 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
1620 ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
1621 ti->tcpi_snd_zerowin = tp->t_sndzerowin;
1622 ti->tcpi_snd_una = tp->snd_una;
1623 ti->tcpi_snd_max = tp->snd_max;
1624 ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
1625 ti->tcpi_rcv_adv = tp->rcv_adv;
1626 ti->tcpi_dupacks = tp->t_dupacks;
1627 ti->tcpi_rttmin = tp->t_rttlow;
1628 #ifdef TCP_OFFLOAD
1629 if (tp->t_flags & TF_TOE) {
1630 ti->tcpi_options |= TCPI_OPT_TOE;
1631 tcp_offload_tcp_info(tp, ti);
1632 }
1633 #endif
1634 /*
1635 * AccECN related counters.
1636 */
1637 if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
1638 (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
1639 /*
1640 * Internal counter starts at 5 for AccECN
1641 * but 0 for RFC3168 ECN.
1642 */
1643 ti->tcpi_delivered_ce = tp->t_scep - 5;
1644 else
1645 ti->tcpi_delivered_ce = tp->t_scep;
1646 ti->tcpi_received_ce = tp->t_rcep;
1647 }
1648
1649 /*
1650 * tcp_ctloutput() must drop the inpcb lock before performing copyin on
1651 * socket option arguments. When it re-acquires the lock after the copy, it
1652 * has to revalidate that the connection is still valid for the socket
1653 * option.
1654 */
1655 #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do { \
1656 INP_WLOCK(inp); \
1657 if (inp->inp_flags & INP_DROPPED) { \
1658 INP_WUNLOCK(inp); \
1659 cleanup; \
1660 return (ECONNRESET); \
1661 } \
1662 tp = intotcpcb(inp); \
1663 } while(0)
1664 #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
1665
/*
 * Handle SOPT_SET socket options for TCP.  Called with the inpcb write
 * lock held and a live (non-dropped) connection; the lock is released on
 * all return paths (either here or by the stack's tfb_tcp_ctloutput).
 *
 * Non-TCP-level options are forwarded to the IP/IPv6 layer first; a
 * short list of IP-level options that also affect TCP is then passed on
 * to the stack.  TCP_FUNCTION_BLK (stack switching) is handled here so a
 * stack can never override it.
 */
int
tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
{
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	int error = 0;

	MPASS(sopt->sopt_dir == SOPT_SET);
	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("inp_flags == %x", inp->inp_flags));
	KASSERT(so != NULL, ("inp_socket == NULL"));

	if (sopt->sopt_level != IPPROTO_TCP) {
		/* Drop the lock: the IP-level handlers may sleep in copyin. */
		INP_WUNLOCK(inp);
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6PROTO)
			error = ip6_ctloutput(so, sopt);
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET
			error = ip_ctloutput(so, sopt);
#endif
		/*
		 * When an IP-level socket option affects TCP, pass control
		 * down to stack tfb_tcp_ctloutput, otherwise return what
		 * IP level returned.
		 */
		switch (sopt->sopt_level) {
#ifdef INET6
		case IPPROTO_IPV6:
			if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
				return (error);
			switch (sopt->sopt_name) {
			case IPV6_TCLASS:
				/* Notify tcp stacks that care (e.g. RACK). */
				break;
			case IPV6_USE_MIN_MTU:
				/* Update t_maxseg accordingly. */
				break;
			default:
				return (error);
			}
			break;
#endif
#ifdef INET
		case IPPROTO_IP:
			switch (sopt->sopt_name) {
			case IP_TOS:
				/* ECN bits are managed by TCP, not the user. */
				inp->inp_ip_tos &= ~IPTOS_ECN_MASK;
				break;
			case IP_TTL:
				/* Notify tcp stacks that care (e.g. RACK). */
				break;
			default:
				return (error);
			}
			break;
#endif
		default:
			return (error);
		}
		/* Re-lock; bail with ECONNRESET if the conn was dropped. */
		INP_WLOCK_RECHECK(inp);
	} else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
		/*
		 * Protect the TCP option TCP_FUNCTION_BLK so
		 * that a sub-function can *never* overwrite this.
		 */
		struct tcp_function_set fsn;
		struct tcp_function_block *blk;
		void *ptr = NULL;

		INP_WUNLOCK(inp);
		error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
		if (error)
			return (error);

		INP_WLOCK_RECHECK(inp);

		/* Look up the requested stack; takes a reference on it. */
		blk = find_and_ref_tcp_functions(&fsn);
		if (blk == NULL) {
			INP_WUNLOCK(inp);
			return (ENOENT);
		}
		if (tp->t_fb == blk) {
			/* You already have this */
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (0);
		}
		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (ENOENT);
		}
		/* The new stack may refuse this connection. */
		error = (*blk->tfb_tcp_handoff_ok)(tp);
		if (error) {
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (error);
		}
		/* Stop the old stack's timers before handing the tcb over. */
		if (tp->t_fb->tfb_tcp_timer_stop_all != NULL)
			tp->t_fb->tfb_tcp_timer_stop_all(tp);
		if (blk->tfb_tcp_fb_init) {
			error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
			if (error) {
				/*
				 * Release the ref count the lookup
				 * acquired.
				 */
				refcount_release(&blk->tfb_refcnt);
				/*
				 * Now there is a chance that the
				 * init() function mucked with some
				 * things before it failed, such as
				 * hpts or inp_flags2 or timer granularity.
				 * It should not have, but let's give the old
				 * stack a chance to reset to a known good state.
				 */
				if (tp->t_fb->tfb_switch_failed) {
					(*tp->t_fb->tfb_switch_failed)(tp);
				}
				goto err_out;
			}
		}
		if (tp->t_fb->tfb_tcp_fb_fini) {
			struct epoch_tracker et;
			/*
			 * Tell the stack to cleanup with 0 i.e.
			 * the tcb is not going away.
			 */
			NET_EPOCH_ENTER(et);
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
			NET_EPOCH_EXIT(et);
		}
		/*
		 * Release the old refcnt, the
		 * lookup acquired a ref on the
		 * new one already.
		 */
		refcount_release(&tp->t_fb->tfb_refcnt);
		/*
		 * Set in the new stack.
		 */
		tp->t_fb = blk;
		tp->t_fb_ptr = ptr;
#ifdef TCP_OFFLOAD
		if (tp->t_flags & TF_TOE) {
			tcp_offload_ctloutput(tp, sopt->sopt_dir,
			    sopt->sopt_name);
		}
#endif
err_out:
		INP_WUNLOCK(inp);
		return (error);

	}

	/* Pass in the INP locked, callee must unlock it. */
	return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt));
}
1833
/*
 * Handle SOPT_GET socket options for TCP.  Called with the inpcb write
 * lock held and a live connection; the lock is released on all return
 * paths (either here or by the stack's tfb_tcp_ctloutput).
 */
static int
tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
{
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	int error = 0;

	MPASS(sopt->sopt_dir == SOPT_GET);
	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("inp_flags == %x", inp->inp_flags));
	KASSERT(so != NULL, ("inp_socket == NULL"));

	if (sopt->sopt_level != IPPROTO_TCP) {
		/* Non-TCP level: forward to IP/IPv6 and return its answer. */
		INP_WUNLOCK(inp);
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6PROTO)
			error = ip6_ctloutput(so, sopt);
#endif /* INET6 */
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET
			error = ip_ctloutput(so, sopt);
#endif
		return (error);
	}
	if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
	     (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
		/* Report the name (or alias) of the stack in use. */
		struct tcp_function_set fsn;

		if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
			memset(&fsn, 0, sizeof(fsn));
			find_tcp_function_alias(tp->t_fb, &fsn);
		} else {
			strncpy(fsn.function_set_name,
			    tp->t_fb->tfb_tcp_block_name,
			    TCP_FUNCTION_NAME_LEN_MAX);
			fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
		}
		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
		/* Unlock before copyout, which may fault. */
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &fsn, sizeof fsn);
		return (error);
	}

	/* Pass in the INP locked, callee must unlock it. */
	return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt));
}
1883
1884 int
tcp_ctloutput(struct socket * so,struct sockopt * sopt)1885 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1886 {
1887 struct inpcb *inp;
1888
1889 inp = sotoinpcb(so);
1890 KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
1891
1892 INP_WLOCK(inp);
1893 if (inp->inp_flags & INP_DROPPED) {
1894 INP_WUNLOCK(inp);
1895 return (ECONNRESET);
1896 }
1897 if (sopt->sopt_dir == SOPT_SET)
1898 return (tcp_ctloutput_set(inp, sopt));
1899 else if (sopt->sopt_dir == SOPT_GET)
1900 return (tcp_ctloutput_get(inp, sopt));
1901 else
1902 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
1903 }
1904
1905 /*
1906 * If this assert becomes untrue, we need to change the size of the buf
1907 * variable in tcp_default_ctloutput().
1908 */
1909 #ifdef CTASSERT
1910 CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
1911 CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
1912 #endif
1913
1914 extern struct cc_algo newreno_cc_algo;
1915
/*
 * TCP_CONGESTION setsockopt helper: switch the connection to the named
 * congestion control algorithm.
 *
 * Entered with the inpcb write lock held; the lock is dropped for the
 * copyin and any allocation, re-taken, and released before returning.
 * A temporary reference pins the algorithm module across the switch.
 * Returns 0 on success, ESRCH if no such algorithm, ECONNRESET if the
 * connection was dropped while unlocked, or the cb_init() error.
 */
static int
tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
{
	struct cc_algo *algo;
	void *ptr = NULL;
	struct tcpcb *tp;
	struct cc_var cc_mem;
	char buf[TCP_CA_NAME_MAX];
	size_t mem_sz;
	int error;

	INP_WUNLOCK(inp);
	/* Copy in at most TCP_CA_NAME_MAX - 1 bytes, then NUL-terminate. */
	error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
	if (error)
		return(error);
	buf[sopt->sopt_valsize] = '\0';
	CC_LIST_RLOCK();
	STAILQ_FOREACH(algo, &cc_list, entries) {
		if (strncmp(buf, algo->name,
			    TCP_CA_NAME_MAX) == 0) {
			if (algo->flags & CC_MODULE_BEING_REMOVED) {
				/* We can't "see" modules being unloaded */
				continue;
			}
			break;
		}
	}
	if (algo == NULL) {
		CC_LIST_RUNLOCK();
		return(ESRCH);
	}
	/*
	 * With a reference the algorithm cannot be removed
	 * so we hold a reference through the change process.
	 */
	cc_refer(algo);
	CC_LIST_RUNLOCK();
	if (algo->cb_init != NULL) {
		/* We can now pre-get the memory for the CC */
		mem_sz = (*algo->cc_data_sz)();
		if (mem_sz == 0) {
			goto no_mem_needed;
		}
		/* Allocate while unlocked: M_WAITOK may sleep. */
		ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
	} else {
no_mem_needed:
		mem_sz = 0;
		ptr = NULL;
	}
	/*
	 * Make sure its all clean and zero and also get
	 * back the inplock.
	 */
	memset(&cc_mem, 0, sizeof(cc_mem));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		/* Connection died while we slept; undo everything. */
		INP_WUNLOCK(inp);
		if (ptr)
			free(ptr, M_CC_MEM);
		/* Release our temp reference */
		CC_LIST_RLOCK();
		cc_release(algo);
		CC_LIST_RUNLOCK();
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	if (ptr != NULL)
		memset(ptr, 0, mem_sz);
	cc_mem.tp = tp;
	/*
	 * We once again hold a write lock over the tcb so it's
	 * safe to do these things without ordering concerns.
	 * Note here we init into stack memory.
	 */
	if (algo->cb_init != NULL)
		error = algo->cb_init(&cc_mem, ptr);
	else
		error = 0;
	/*
	 * The CC algorithms, when given their memory
	 * should not fail we could in theory have a
	 * KASSERT here.
	 */
	if (error == 0) {
		/*
		 * Touchdown, lets go ahead and move the
		 * connection to the new CC module by
		 * copying in the cc_mem after we call
		 * the old ones cleanup (if any).
		 */
		if (CC_ALGO(tp)->cb_destroy != NULL)
			CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
		/* Detach the old CC from the tcpcb */
		cc_detach(tp);
		/* Copy in our temp memory that was inited */
		memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var));
		/* Now attach the new, which takes a reference */
		cc_attach(tp, algo);
		/* Ok now are we where we have gotten past any conn_init? */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
			/* Yep run the connection init for the new CC */
			CC_ALGO(tp)->conn_init(&tp->t_ccv);
		}
	} else if (ptr)
		free(ptr, M_CC_MEM);
	INP_WUNLOCK(inp);
	/* Now lets release our temp reference */
	CC_LIST_RLOCK();
	cc_release(algo);
	CC_LIST_RUNLOCK();
	return (error);
}
2028
2029 int
tcp_default_ctloutput(struct tcpcb * tp,struct sockopt * sopt)2030 tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
2031 {
2032 struct inpcb *inp = tptoinpcb(tp);
2033 int error, opt, optval;
2034 u_int ui;
2035 struct tcp_info ti;
2036 #ifdef KERN_TLS
2037 struct tls_enable tls;
2038 struct socket *so = inp->inp_socket;
2039 #endif
2040 char *pbuf, buf[TCP_LOG_ID_LEN];
2041 #ifdef STATS
2042 struct statsblob *sbp;
2043 #endif
2044 size_t len;
2045
2046 INP_WLOCK_ASSERT(inp);
2047 KASSERT((inp->inp_flags & INP_DROPPED) == 0,
2048 ("inp_flags == %x", inp->inp_flags));
2049 KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL"));
2050
2051 switch (sopt->sopt_level) {
2052 #ifdef INET6
2053 case IPPROTO_IPV6:
2054 MPASS(inp->inp_vflag & INP_IPV6PROTO);
2055 switch (sopt->sopt_name) {
2056 case IPV6_USE_MIN_MTU:
2057 tcp6_use_min_mtu(tp);
2058 /* FALLTHROUGH */
2059 }
2060 INP_WUNLOCK(inp);
2061 return (0);
2062 #endif
2063 #ifdef INET
2064 case IPPROTO_IP:
2065 INP_WUNLOCK(inp);
2066 return (0);
2067 #endif
2068 }
2069
2070 /*
2071 * For TCP_CCALGOOPT forward the control to CC module, for both
2072 * SOPT_SET and SOPT_GET.
2073 */
2074 switch (sopt->sopt_name) {
2075 case TCP_CCALGOOPT:
2076 INP_WUNLOCK(inp);
2077 if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT)
2078 return (EINVAL);
2079 pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
2080 error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
2081 sopt->sopt_valsize);
2082 if (error) {
2083 free(pbuf, M_TEMP);
2084 return (error);
2085 }
2086 INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
2087 if (CC_ALGO(tp)->ctl_output != NULL)
2088 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, sopt, pbuf);
2089 else
2090 error = ENOENT;
2091 INP_WUNLOCK(inp);
2092 if (error == 0 && sopt->sopt_dir == SOPT_GET)
2093 error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
2094 free(pbuf, M_TEMP);
2095 return (error);
2096 }
2097
2098 switch (sopt->sopt_dir) {
2099 case SOPT_SET:
2100 switch (sopt->sopt_name) {
2101 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2102 case TCP_MD5SIG:
2103 INP_WUNLOCK(inp);
2104 if (!TCPMD5_ENABLED())
2105 return (ENOPROTOOPT);
2106 error = TCPMD5_PCBCTL(inp, sopt);
2107 if (error)
2108 return (error);
2109 INP_WLOCK_RECHECK(inp);
2110 goto unlock_and_done;
2111 #endif /* IPSEC */
2112
2113 case TCP_NODELAY:
2114 case TCP_NOOPT:
2115 INP_WUNLOCK(inp);
2116 error = sooptcopyin(sopt, &optval, sizeof optval,
2117 sizeof optval);
2118 if (error)
2119 return (error);
2120
2121 INP_WLOCK_RECHECK(inp);
2122 switch (sopt->sopt_name) {
2123 case TCP_NODELAY:
2124 opt = TF_NODELAY;
2125 break;
2126 case TCP_NOOPT:
2127 opt = TF_NOOPT;
2128 break;
2129 default:
2130 opt = 0; /* dead code to fool gcc */
2131 break;
2132 }
2133
2134 if (optval)
2135 tp->t_flags |= opt;
2136 else
2137 tp->t_flags &= ~opt;
2138 unlock_and_done:
2139 #ifdef TCP_OFFLOAD
2140 if (tp->t_flags & TF_TOE) {
2141 tcp_offload_ctloutput(tp, sopt->sopt_dir,
2142 sopt->sopt_name);
2143 }
2144 #endif
2145 INP_WUNLOCK(inp);
2146 break;
2147
2148 case TCP_NOPUSH:
2149 INP_WUNLOCK(inp);
2150 error = sooptcopyin(sopt, &optval, sizeof optval,
2151 sizeof optval);
2152 if (error)
2153 return (error);
2154
2155 INP_WLOCK_RECHECK(inp);
2156 if (optval)
2157 tp->t_flags |= TF_NOPUSH;
2158 else if (tp->t_flags & TF_NOPUSH) {
2159 tp->t_flags &= ~TF_NOPUSH;
2160 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2161 struct epoch_tracker et;
2162
2163 NET_EPOCH_ENTER(et);
2164 error = tcp_output_nodrop(tp);
2165 NET_EPOCH_EXIT(et);
2166 }
2167 }
2168 goto unlock_and_done;
2169
2170 case TCP_REMOTE_UDP_ENCAPS_PORT:
2171 INP_WUNLOCK(inp);
2172 error = sooptcopyin(sopt, &optval, sizeof optval,
2173 sizeof optval);
2174 if (error)
2175 return (error);
2176 if ((optval < TCP_TUNNELING_PORT_MIN) ||
2177 (optval > TCP_TUNNELING_PORT_MAX)) {
2178 /* Its got to be in range */
2179 return (EINVAL);
2180 }
2181 if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) {
2182 /* You have to have enabled a UDP tunneling port first */
2183 return (EINVAL);
2184 }
2185 INP_WLOCK_RECHECK(inp);
2186 if (tp->t_state != TCPS_CLOSED) {
2187 /* You can't change after you are connected */
2188 error = EINVAL;
2189 } else {
2190 /* Ok we are all good set the port */
2191 tp->t_port = htons(optval);
2192 }
2193 goto unlock_and_done;
2194
2195 case TCP_MAXSEG:
2196 INP_WUNLOCK(inp);
2197 error = sooptcopyin(sopt, &optval, sizeof optval,
2198 sizeof optval);
2199 if (error)
2200 return (error);
2201
2202 INP_WLOCK_RECHECK(inp);
2203 if (optval > 0 && optval <= tp->t_maxseg &&
2204 optval + 40 >= V_tcp_minmss) {
2205 tp->t_maxseg = optval;
2206 if (tp->t_maxseg < V_tcp_mssdflt) {
2207 /*
2208 * The MSS is so small we should not process incoming
2209 * SACK's since we are subject to attack in such a
2210 * case.
2211 */
2212 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
2213 } else {
2214 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
2215 }
2216 } else
2217 error = EINVAL;
2218 goto unlock_and_done;
2219
2220 case TCP_INFO:
2221 INP_WUNLOCK(inp);
2222 error = EINVAL;
2223 break;
2224
2225 case TCP_STATS:
2226 INP_WUNLOCK(inp);
2227 #ifdef STATS
2228 error = sooptcopyin(sopt, &optval, sizeof optval,
2229 sizeof optval);
2230 if (error)
2231 return (error);
2232
2233 if (optval > 0)
2234 sbp = stats_blob_alloc(
2235 V_tcp_perconn_stats_dflt_tpl, 0);
2236 else
2237 sbp = NULL;
2238
2239 INP_WLOCK_RECHECK(inp);
2240 if ((tp->t_stats != NULL && sbp == NULL) ||
2241 (tp->t_stats == NULL && sbp != NULL)) {
2242 struct statsblob *t = tp->t_stats;
2243 tp->t_stats = sbp;
2244 sbp = t;
2245 }
2246 INP_WUNLOCK(inp);
2247
2248 stats_blob_destroy(sbp);
2249 #else
2250 return (EOPNOTSUPP);
2251 #endif /* !STATS */
2252 break;
2253
2254 case TCP_CONGESTION:
2255 error = tcp_set_cc_mod(inp, sopt);
2256 break;
2257
2258 case TCP_REUSPORT_LB_NUMA:
2259 INP_WUNLOCK(inp);
2260 error = sooptcopyin(sopt, &optval, sizeof(optval),
2261 sizeof(optval));
2262 INP_WLOCK_RECHECK(inp);
2263 if (!error)
2264 error = in_pcblbgroup_numa(inp, optval);
2265 INP_WUNLOCK(inp);
2266 break;
2267
2268 #ifdef KERN_TLS
2269 case TCP_TXTLS_ENABLE:
2270 INP_WUNLOCK(inp);
2271 error = ktls_copyin_tls_enable(sopt, &tls);
2272 if (error != 0)
2273 break;
2274 error = ktls_enable_tx(so, &tls);
2275 ktls_cleanup_tls_enable(&tls);
2276 break;
2277 case TCP_TXTLS_MODE:
2278 INP_WUNLOCK(inp);
2279 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2280 if (error != 0)
2281 return (error);
2282
2283 INP_WLOCK_RECHECK(inp);
2284 error = ktls_set_tx_mode(so, ui);
2285 INP_WUNLOCK(inp);
2286 break;
2287 case TCP_RXTLS_ENABLE:
2288 INP_WUNLOCK(inp);
2289 error = ktls_copyin_tls_enable(sopt, &tls);
2290 if (error != 0)
2291 break;
2292 error = ktls_enable_rx(so, &tls);
2293 ktls_cleanup_tls_enable(&tls);
2294 break;
2295 #endif
2296 case TCP_MAXUNACKTIME:
2297 case TCP_KEEPIDLE:
2298 case TCP_KEEPINTVL:
2299 case TCP_KEEPINIT:
2300 INP_WUNLOCK(inp);
2301 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2302 if (error)
2303 return (error);
2304
2305 if (ui > (UINT_MAX / hz)) {
2306 error = EINVAL;
2307 break;
2308 }
2309 ui *= hz;
2310
2311 INP_WLOCK_RECHECK(inp);
2312 switch (sopt->sopt_name) {
2313 case TCP_MAXUNACKTIME:
2314 tp->t_maxunacktime = ui;
2315 break;
2316
2317 case TCP_KEEPIDLE:
2318 tp->t_keepidle = ui;
2319 /*
2320 * XXX: better check current remaining
2321 * timeout and "merge" it with new value.
2322 */
2323 if ((tp->t_state > TCPS_LISTEN) &&
2324 (tp->t_state <= TCPS_CLOSING))
2325 tcp_timer_activate(tp, TT_KEEP,
2326 TP_KEEPIDLE(tp));
2327 break;
2328 case TCP_KEEPINTVL:
2329 tp->t_keepintvl = ui;
2330 if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2331 (TP_MAXIDLE(tp) > 0))
2332 tcp_timer_activate(tp, TT_2MSL,
2333 TP_MAXIDLE(tp));
2334 break;
2335 case TCP_KEEPINIT:
2336 tp->t_keepinit = ui;
2337 if (tp->t_state == TCPS_SYN_RECEIVED ||
2338 tp->t_state == TCPS_SYN_SENT)
2339 tcp_timer_activate(tp, TT_KEEP,
2340 TP_KEEPINIT(tp));
2341 break;
2342 }
2343 goto unlock_and_done;
2344
2345 case TCP_KEEPCNT:
2346 INP_WUNLOCK(inp);
2347 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
2348 if (error)
2349 return (error);
2350
2351 INP_WLOCK_RECHECK(inp);
2352 tp->t_keepcnt = ui;
2353 if ((tp->t_state == TCPS_FIN_WAIT_2) &&
2354 (TP_MAXIDLE(tp) > 0))
2355 tcp_timer_activate(tp, TT_2MSL,
2356 TP_MAXIDLE(tp));
2357 goto unlock_and_done;
2358
2359 case TCP_FASTOPEN: {
2360 struct tcp_fastopen tfo_optval;
2361
2362 INP_WUNLOCK(inp);
2363 if (!V_tcp_fastopen_client_enable &&
2364 !V_tcp_fastopen_server_enable)
2365 return (EPERM);
2366
2367 error = sooptcopyin(sopt, &tfo_optval,
2368 sizeof(tfo_optval), sizeof(int));
2369 if (error)
2370 return (error);
2371
2372 INP_WLOCK_RECHECK(inp);
2373 if ((tp->t_state != TCPS_CLOSED) &&
2374 (tp->t_state != TCPS_LISTEN)) {
2375 error = EINVAL;
2376 goto unlock_and_done;
2377 }
2378 if (tfo_optval.enable) {
2379 if (tp->t_state == TCPS_LISTEN) {
2380 if (!V_tcp_fastopen_server_enable) {
2381 error = EPERM;
2382 goto unlock_and_done;
2383 }
2384
2385 if (tp->t_tfo_pending == NULL)
2386 tp->t_tfo_pending =
2387 tcp_fastopen_alloc_counter();
2388 } else {
2389 /*
2390 * If a pre-shared key was provided,
2391 * stash it in the client cookie
2392 * field of the tcpcb for use during
2393 * connect.
2394 */
2395 if (sopt->sopt_valsize ==
2396 sizeof(tfo_optval)) {
2397 memcpy(tp->t_tfo_cookie.client,
2398 tfo_optval.psk,
2399 TCP_FASTOPEN_PSK_LEN);
2400 tp->t_tfo_client_cookie_len =
2401 TCP_FASTOPEN_PSK_LEN;
2402 }
2403 }
2404 tp->t_flags |= TF_FASTOPEN;
2405 } else
2406 tp->t_flags &= ~TF_FASTOPEN;
2407 goto unlock_and_done;
2408 }
2409
2410 #ifdef TCP_BLACKBOX
2411 case TCP_LOG:
2412 INP_WUNLOCK(inp);
2413 error = sooptcopyin(sopt, &optval, sizeof optval,
2414 sizeof optval);
2415 if (error)
2416 return (error);
2417
2418 INP_WLOCK_RECHECK(inp);
2419 error = tcp_log_state_change(tp, optval);
2420 goto unlock_and_done;
2421
2422 case TCP_LOGBUF:
2423 INP_WUNLOCK(inp);
2424 error = EINVAL;
2425 break;
2426
2427 case TCP_LOGID:
2428 INP_WUNLOCK(inp);
2429 error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
2430 if (error)
2431 break;
2432 buf[sopt->sopt_valsize] = '\0';
2433 INP_WLOCK_RECHECK(inp);
2434 error = tcp_log_set_id(tp, buf);
2435 /* tcp_log_set_id() unlocks the INP. */
2436 break;
2437
2438 case TCP_LOGDUMP:
2439 case TCP_LOGDUMPID:
2440 INP_WUNLOCK(inp);
2441 error =
2442 sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
2443 if (error)
2444 break;
2445 buf[sopt->sopt_valsize] = '\0';
2446 INP_WLOCK_RECHECK(inp);
2447 if (sopt->sopt_name == TCP_LOGDUMP) {
2448 error = tcp_log_dump_tp_logbuf(tp, buf,
2449 M_WAITOK, true);
2450 INP_WUNLOCK(inp);
2451 } else {
2452 tcp_log_dump_tp_bucket_logbufs(tp, buf);
2453 /*
2454 * tcp_log_dump_tp_bucket_logbufs() drops the
2455 * INP lock.
2456 */
2457 }
2458 break;
2459 #endif
2460
2461 default:
2462 INP_WUNLOCK(inp);
2463 error = ENOPROTOOPT;
2464 break;
2465 }
2466 break;
2467
2468 case SOPT_GET:
2469 tp = intotcpcb(inp);
2470 switch (sopt->sopt_name) {
2471 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
2472 case TCP_MD5SIG:
2473 INP_WUNLOCK(inp);
2474 if (!TCPMD5_ENABLED())
2475 return (ENOPROTOOPT);
2476 error = TCPMD5_PCBCTL(inp, sopt);
2477 break;
2478 #endif
2479
2480 case TCP_NODELAY:
2481 optval = tp->t_flags & TF_NODELAY;
2482 INP_WUNLOCK(inp);
2483 error = sooptcopyout(sopt, &optval, sizeof optval);
2484 break;
2485 case TCP_MAXSEG:
2486 optval = tp->t_maxseg;
2487 INP_WUNLOCK(inp);
2488 error = sooptcopyout(sopt, &optval, sizeof optval);
2489 break;
2490 case TCP_REMOTE_UDP_ENCAPS_PORT:
2491 optval = ntohs(tp->t_port);
2492 INP_WUNLOCK(inp);
2493 error = sooptcopyout(sopt, &optval, sizeof optval);
2494 break;
2495 case TCP_NOOPT:
2496 optval = tp->t_flags & TF_NOOPT;
2497 INP_WUNLOCK(inp);
2498 error = sooptcopyout(sopt, &optval, sizeof optval);
2499 break;
2500 case TCP_NOPUSH:
2501 optval = tp->t_flags & TF_NOPUSH;
2502 INP_WUNLOCK(inp);
2503 error = sooptcopyout(sopt, &optval, sizeof optval);
2504 break;
2505 case TCP_INFO:
2506 tcp_fill_info(tp, &ti);
2507 INP_WUNLOCK(inp);
2508 error = sooptcopyout(sopt, &ti, sizeof ti);
2509 break;
2510 case TCP_STATS:
2511 {
2512 #ifdef STATS
2513 int nheld;
2514 TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
2515
2516 error = 0;
2517 socklen_t outsbsz = sopt->sopt_valsize;
2518 if (tp->t_stats == NULL)
2519 error = ENOENT;
2520 else if (outsbsz >= tp->t_stats->cursz)
2521 outsbsz = tp->t_stats->cursz;
2522 else if (outsbsz >= sizeof(struct statsblob))
2523 outsbsz = sizeof(struct statsblob);
2524 else
2525 error = EINVAL;
2526 INP_WUNLOCK(inp);
2527 if (error)
2528 break;
2529
2530 sbp = sopt->sopt_val;
2531 nheld = atop(round_page(((vm_offset_t)sbp) +
2532 (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
2533 vm_page_t ma[nheld];
2534 if (vm_fault_quick_hold_pages(
2535 &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
2536 outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
2537 nheld) < 0) {
2538 error = EFAULT;
2539 break;
2540 }
2541
2542 if ((error = copyin_nofault(&(sbp->flags), &sbflags,
2543 SIZEOF_MEMBER(struct statsblob, flags))))
2544 goto unhold;
2545
2546 INP_WLOCK_RECHECK(inp);
2547 error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
2548 sbflags | SB_CLONE_USRDSTNOFAULT);
2549 INP_WUNLOCK(inp);
2550 sopt->sopt_valsize = outsbsz;
2551 unhold:
2552 vm_page_unhold_pages(ma, nheld);
2553 #else
2554 INP_WUNLOCK(inp);
2555 error = EOPNOTSUPP;
2556 #endif /* !STATS */
2557 break;
2558 }
2559 case TCP_CONGESTION:
2560 len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
2561 INP_WUNLOCK(inp);
2562 error = sooptcopyout(sopt, buf, len + 1);
2563 break;
2564 case TCP_MAXUNACKTIME:
2565 case TCP_KEEPIDLE:
2566 case TCP_KEEPINTVL:
2567 case TCP_KEEPINIT:
2568 case TCP_KEEPCNT:
2569 switch (sopt->sopt_name) {
2570 case TCP_MAXUNACKTIME:
2571 ui = TP_MAXUNACKTIME(tp) / hz;
2572 break;
2573 case TCP_KEEPIDLE:
2574 ui = TP_KEEPIDLE(tp) / hz;
2575 break;
2576 case TCP_KEEPINTVL:
2577 ui = TP_KEEPINTVL(tp) / hz;
2578 break;
2579 case TCP_KEEPINIT:
2580 ui = TP_KEEPINIT(tp) / hz;
2581 break;
2582 case TCP_KEEPCNT:
2583 ui = TP_KEEPCNT(tp);
2584 break;
2585 }
2586 INP_WUNLOCK(inp);
2587 error = sooptcopyout(sopt, &ui, sizeof(ui));
2588 break;
2589 case TCP_FASTOPEN:
2590 optval = tp->t_flags & TF_FASTOPEN;
2591 INP_WUNLOCK(inp);
2592 error = sooptcopyout(sopt, &optval, sizeof optval);
2593 break;
2594 #ifdef TCP_BLACKBOX
2595 case TCP_LOG:
2596 optval = tcp_get_bblog_state(tp);
2597 INP_WUNLOCK(inp);
2598 error = sooptcopyout(sopt, &optval, sizeof(optval));
2599 break;
2600 case TCP_LOGBUF:
2601 /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
2602 error = tcp_log_getlogbuf(sopt, tp);
2603 break;
2604 case TCP_LOGID:
2605 len = tcp_log_get_id(tp, buf);
2606 INP_WUNLOCK(inp);
2607 error = sooptcopyout(sopt, buf, len + 1);
2608 break;
2609 case TCP_LOGDUMP:
2610 case TCP_LOGDUMPID:
2611 INP_WUNLOCK(inp);
2612 error = EINVAL;
2613 break;
2614 #endif
2615 #ifdef KERN_TLS
2616 case TCP_TXTLS_MODE:
2617 error = ktls_get_tx_mode(so, &optval);
2618 INP_WUNLOCK(inp);
2619 if (error == 0)
2620 error = sooptcopyout(sopt, &optval,
2621 sizeof(optval));
2622 break;
2623 case TCP_RXTLS_MODE:
2624 error = ktls_get_rx_mode(so, &optval);
2625 INP_WUNLOCK(inp);
2626 if (error == 0)
2627 error = sooptcopyout(sopt, &optval,
2628 sizeof(optval));
2629 break;
2630 #endif
2631 default:
2632 INP_WUNLOCK(inp);
2633 error = ENOPROTOOPT;
2634 break;
2635 }
2636 break;
2637 }
2638 return (error);
2639 }
2640 #undef INP_WLOCK_RECHECK
2641 #undef INP_WLOCK_RECHECK_CLEANUP
2642
2643 /*
2644 * Initiate (or continue) disconnect.
2645 * If embryonic state, just send reset (once).
2646 * If in ``let data drain'' option and linger null, just drop.
2647 * Otherwise (hard), mark socket disconnecting and drop
2648 * current input data; switch states based on user close, and
2649 * send segment to peer (with FIN).
2650 */
static void
tcp_disconnect(struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);

	/* Caller must be within a network epoch and hold the inp wlock. */
	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);

	/*
	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    !(tp->t_state > TCPS_LISTEN && (tp->t_flags & TF_FASTOPEN))) {
		/*
		 * Embryonic connection: discard it immediately.  Exception:
		 * a TFO connection past LISTEN (TF_FASTOPEN set) may already
		 * carry data, so it falls through to the graceful path below.
		 */
		tp = tcp_close(tp);
		KASSERT(tp != NULL,
		    ("tcp_disconnect: tcp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		/* SO_LINGER with a zero timeout: hard-drop instead of draining. */
		tp = tcp_drop(tp, 0);
		KASSERT(tp != NULL,
		    ("tcp_disconnect: tcp_drop() returned NULL"));
	} else {
		/*
		 * Graceful disconnect: mark the socket as disconnecting,
		 * discard unread receive data, advance the TCP state machine
		 * toward close, and send a segment (FIN) unless the PCB has
		 * already been dropped.
		 */
		soisdisconnecting(so);
		sbflush(&so->so_rcv);
		tcp_usrclosed(tp);
		if (!(inp->inp_flags & INP_DROPPED))
			/* Ignore stack's drop request, we already at it. */
			(void)tcp_output_nodrop(tp);
	}
}
2682
2683 /*
2684 * User issued close, and wish to trail through shutdown states:
2685 * if never received SYN, just forget it. If got a SYN from peer,
2686 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
2687 * If already got a FIN from peer, then almost done; go to LAST_ACK
2688 * state. In all other cases, have already sent FIN to peer (e.g.
2689 * after PRU_SHUTDOWN), and just have to play tedious game waiting
2690 * for peer to send FIN or not respond to keep-alives, etc.
2691 * We can let the user exit from the close as soon as the FIN is acked.
2692 */
static void
tcp_usrclosed(struct tcpcb *tp)
{

	/* Caller must be within a network epoch and hold the inp wlock. */
	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	/* Advance the state machine according to how far the handshake got. */
	switch (tp->t_state) {
	case TCPS_LISTEN:
#ifdef TCP_OFFLOAD
		tcp_offload_listen_stop(tp);
#endif
		tcp_state_change(tp, TCPS_CLOSED);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		tp = tcp_close(tp);
		/*
		 * tcp_close() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(tp != NULL,
		    ("tcp_usrclosed: tcp_close() returned NULL"));
		break;

	case TCPS_SYN_SENT:
	case TCPS_SYN_RECEIVED:
		/* Handshake in flight: remember to append a FIN later. */
		tp->t_flags |= TF_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_FIN_WAIT_1);
		break;

	case TCPS_CLOSE_WAIT:
		/* Peer already sent its FIN; only our FIN/ACK remains. */
		tcp_state_change(tp, TCPS_LAST_ACK);
		break;
	}
	/* Start tracking ack progress if not already doing so. */
	if (tp->t_acktime == 0)
		tp->t_acktime = ticks;
	if (tp->t_state >= TCPS_FIN_WAIT_2) {
		tcp_free_sackholes(tp);
		/* Our FIN has been acked; the user close can complete. */
		soisdisconnected(tptosocket(tp));
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (tp->t_state == TCPS_FIN_WAIT_2) {
			int timeout;

			timeout = (tcp_fast_finwait2_recycle) ?
			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
			tcp_timer_activate(tp, TT_2MSL, timeout);
		}
	}
}
2745
2746 #ifdef DDB
/*
 * Emit 'indent' single-space characters to align nested DDB output.
 */
static void
db_print_indent(int indent)
{

	while (indent-- > 0)
		db_printf(" ");
}
2755
2756 static void
db_print_tstate(int t_state)2757 db_print_tstate(int t_state)
2758 {
2759
2760 switch (t_state) {
2761 case TCPS_CLOSED:
2762 db_printf("TCPS_CLOSED");
2763 return;
2764
2765 case TCPS_LISTEN:
2766 db_printf("TCPS_LISTEN");
2767 return;
2768
2769 case TCPS_SYN_SENT:
2770 db_printf("TCPS_SYN_SENT");
2771 return;
2772
2773 case TCPS_SYN_RECEIVED:
2774 db_printf("TCPS_SYN_RECEIVED");
2775 return;
2776
2777 case TCPS_ESTABLISHED:
2778 db_printf("TCPS_ESTABLISHED");
2779 return;
2780
2781 case TCPS_CLOSE_WAIT:
2782 db_printf("TCPS_CLOSE_WAIT");
2783 return;
2784
2785 case TCPS_FIN_WAIT_1:
2786 db_printf("TCPS_FIN_WAIT_1");
2787 return;
2788
2789 case TCPS_CLOSING:
2790 db_printf("TCPS_CLOSING");
2791 return;
2792
2793 case TCPS_LAST_ACK:
2794 db_printf("TCPS_LAST_ACK");
2795 return;
2796
2797 case TCPS_FIN_WAIT_2:
2798 db_printf("TCPS_FIN_WAIT_2");
2799 return;
2800
2801 case TCPS_TIME_WAIT:
2802 db_printf("TCPS_TIME_WAIT");
2803 return;
2804
2805 default:
2806 db_printf("unknown");
2807 return;
2808 }
2809 }
2810
2811 static void
db_print_bblog_state(int state)2812 db_print_bblog_state(int state)
2813 {
2814 switch (state) {
2815 case TCP_LOG_STATE_RATIO_OFF:
2816 db_printf("TCP_LOG_STATE_RATIO_OFF");
2817 break;
2818 case TCP_LOG_STATE_CLEAR:
2819 db_printf("TCP_LOG_STATE_CLEAR");
2820 break;
2821 case TCP_LOG_STATE_OFF:
2822 db_printf("TCP_LOG_STATE_OFF");
2823 break;
2824 case TCP_LOG_STATE_TAIL:
2825 db_printf("TCP_LOG_STATE_TAIL");
2826 break;
2827 case TCP_LOG_STATE_HEAD:
2828 db_printf("TCP_LOG_STATE_HEAD");
2829 break;
2830 case TCP_LOG_STATE_HEAD_AUTO:
2831 db_printf("TCP_LOG_STATE_HEAD_AUTO");
2832 break;
2833 case TCP_LOG_STATE_CONTINUAL:
2834 db_printf("TCP_LOG_STATE_CONTINUAL");
2835 break;
2836 case TCP_LOG_STATE_TAIL_AUTO:
2837 db_printf("TCP_LOG_STATE_TAIL_AUTO");
2838 break;
2839 case TCP_LOG_VIA_BBPOINTS:
2840 db_printf("TCP_LOG_STATE_BBPOINTS");
2841 break;
2842 default:
2843 db_printf("UNKNOWN(%d)", state);
2844 break;
2845 }
2846 }
2847
/*
 * Dump one tcpcb's fields for DDB.  Optionally also prints the attached
 * inpcb (show_inpcb) and the black box log entries (show_bblog, only when
 * TCP_BLACKBOX is compiled in).  'indent' is the starting indentation;
 * nested structures are printed two columns deeper.
 */
static void
db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog,
    bool show_inpcb)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, tp);

	indent += 2;

	if (show_inpcb)
		db_print_inpcb(tptoinpcb(tp), "t_inpcb", indent);

	/* Reassembly queue and dupack counter. */
	db_print_indent(indent);
	db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n",
	    TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);

	db_print_indent(indent);
	db_printf("t_callout: %p t_timers: %p\n",
	    &tp->t_callout, &tp->t_timers);

	/* Connection state, numeric and symbolic. */
	db_print_indent(indent);
	db_printf("t_state: %d (", tp->t_state);
	db_print_tstate(tp->t_state);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("t_flags: 0x%b\n", tp->t_flags, TF_BITS);

	db_print_indent(indent);
	db_printf("t_flags2: 0x%b\n", tp->t_flags2, TF2_BITS);

	/* Send sequence space. */
	db_print_indent(indent);
	db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: 0x%08x\n",
	    tp->snd_una, tp->snd_max, tp->snd_nxt);

	db_print_indent(indent);
	db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n",
	    tp->snd_up, tp->snd_wl1, tp->snd_wl2);

	/* Receive sequence space. */
	db_print_indent(indent);
	db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n",
	    tp->iss, tp->irs, tp->rcv_nxt);

	db_print_indent(indent);
	db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n",
	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);

	/* Congestion control state. */
	db_print_indent(indent);
	db_printf("snd_wnd: %u snd_cwnd: %u\n",
	    tp->snd_wnd, tp->snd_cwnd);

	db_print_indent(indent);
	db_printf("snd_ssthresh: %u snd_recover: "
	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);

	/* RTT measurement and retransmit state. */
	db_print_indent(indent);
	db_printf("t_rcvtime: %u t_startime: %u\n",
	    tp->t_rcvtime, tp->t_starttime);

	db_print_indent(indent);
	db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
	    tp->t_rtttime, tp->t_rtseq);

	db_print_indent(indent);
	db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n",
	    tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);

	db_print_indent(indent);
	db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u\n",
	    tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin);

	db_print_indent(indent);
	db_printf("t_rttupdated: %u max_sndwnd: %u t_softerror: %d\n",
	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);

	/* Out-of-band data state. */
	db_print_indent(indent);
	db_printf("t_oobflags: 0x%b t_iobc: 0x%02x\n", tp->t_oobflags,
	    TCPOOB_BITS, tp->t_iobc);

	/* Window scaling and timestamp options. */
	db_print_indent(indent);
	db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n",
	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);

	db_print_indent(indent);
	db_printf("ts_recent: %u ts_recent_age: %u\n",
	    tp->ts_recent, tp->ts_recent_age);

	db_print_indent(indent);
	db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: "
	    "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);

	/* Saved values for bad-retransmit recovery. */
	db_print_indent(indent);
	db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x "
	    "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
	    tp->snd_recover_prev, tp->t_badrxtwin);

	/* SACK state. */
	db_print_indent(indent);
	db_printf("snd_numholes: %d snd_holes first: %p\n",
	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));

	db_print_indent(indent);
	db_printf("snd_fack: 0x%08x rcv_numsacks: %d\n",
	    tp->snd_fack, tp->rcv_numsacks);

	/* Skip sackblks, sackhint. */

	db_print_indent(indent);
	db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n",
	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);

	/* Function block (stack) and congestion control module names. */
	db_print_indent(indent);
	db_printf("t_fb.tfb_tcp_block_name: %s\n", tp->t_fb->tfb_tcp_block_name);

	db_print_indent(indent);
	db_printf("t_cc.name: %s\n", tp->t_cc->name);

	/* Black box logging state, numeric and symbolic. */
	db_print_indent(indent);
	db_printf("_t_logstate: %d (", tp->_t_logstate);
	db_print_bblog_state(tp->_t_logstate);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("t_lognum: %d t_loglimit: %d t_logsn: %u\n",
	    tp->t_lognum, tp->t_loglimit, tp->t_logsn);

	if (show_bblog) {
#ifdef TCP_BLACKBOX
		db_print_bblog_entries(&tp->t_logs, indent);
#else
		db_print_indent(indent);
		db_printf("BBLog not supported\n");
#endif
	}
}
2983
DB_SHOW_COMMAND(tcpcb,db_show_tcpcb)2984 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
2985 {
2986 struct tcpcb *tp;
2987 bool show_bblog, show_inpcb;
2988
2989 if (!have_addr) {
2990 db_printf("usage: show tcpcb[/bi] <addr>\n");
2991 return;
2992 }
2993 show_bblog = strchr(modif, 'b') != NULL;
2994 show_inpcb = strchr(modif, 'i') != NULL;
2995 tp = (struct tcpcb *)addr;
2996 db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
2997 }
2998
/*
 * DDB "show all tcpcbs" command: walk every vnet's tcbinfo list and dump
 * each tcpcb.  Modifiers: 'l' prints only inpcbs whose lock is held,
 * 'b' also dumps black box logs, 'i' also dumps the attached inpcbs.
 */
DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct inpcb *inp;
	struct tcpcb *tp;
	bool only_locked, show_bblog, show_inpcb;

	only_locked = strchr(modif, 'l') != NULL;
	show_bblog = strchr(modif, 'b') != NULL;
	show_inpcb = strchr(modif, 'i') != NULL;
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) {
			/*
			 * With 'l', peek at the rwlock word directly (we are
			 * in the debugger; no locking) and skip unlocked PCBs.
			 */
			if (only_locked &&
			    inp->inp_lock.rw_lock == RW_UNLOCKED)
				continue;
			tp = intotcpcb(inp);
			db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb);
			/* Honor the pager's quit request at both loop levels. */
			if (db_pager_quit)
				break;
		}
		CURVNET_RESTORE();
		if (db_pager_quit)
			break;
	}
}
3025 #endif
3026