1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1993 5 * The Regents of the University of California. 6 * Copyright (c) 2006-2007 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 */ 37 38 #include <sys/cdefs.h> 39 #include "opt_ddb.h" 40 #include "opt_inet.h" 41 #include "opt_inet6.h" 42 #include "opt_ipsec.h" 43 #include "opt_kern_tls.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/arb.h> 48 #include <sys/limits.h> 49 #include <sys/malloc.h> 50 #include <sys/refcount.h> 51 #include <sys/kernel.h> 52 #include <sys/ktls.h> 53 #include <sys/qmath.h> 54 #include <sys/sysctl.h> 55 #include <sys/mbuf.h> 56 #ifdef INET6 57 #include <sys/domain.h> 58 #endif /* INET6 */ 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/protosw.h> 62 #include <sys/proc.h> 63 #include <sys/jail.h> 64 #include <sys/stats.h> 65 66 #ifdef DDB 67 #include <ddb/ddb.h> 68 #endif 69 70 #include <net/if.h> 71 #include <net/if_var.h> 72 #include <net/route.h> 73 #include <net/vnet.h> 74 75 #include <netinet/in.h> 76 #include <netinet/in_kdtrace.h> 77 #include <netinet/in_pcb.h> 78 #include <netinet/in_rss.h> 79 #include <netinet/in_systm.h> 80 #include <netinet/in_var.h> 81 #include <netinet/ip.h> 82 #include <netinet/ip_var.h> 83 #ifdef INET6 84 #include <netinet/ip6.h> 85 #include <netinet6/in6_pcb.h> 86 #include <netinet6/in6_rss.h> 87 #include <netinet6/ip6_var.h> 88 #include <netinet6/scope6_var.h> 89 #endif 90 #include <netinet/tcp.h> 91 #include <netinet/tcp_fsm.h> 92 #include <netinet/tcp_seq.h> 93 #include <netinet/tcp_timer.h> 94 #include <netinet/tcp_var.h> 95 #include <netinet/tcp_log_buf.h> 96 #include <netinet/tcpip.h> 97 #include <netinet/cc/cc.h> 98 #include <netinet/tcp_fastopen.h> 99 #include <netinet/tcp_hpts.h> 100 #ifdef TCP_OFFLOAD 101 #include <netinet/tcp_offload.h> 102 #endif 103 #include <netipsec/ipsec_support.h> 104 105 #include <vm/vm.h> 106 #include <vm/vm_param.h> 107 #include <vm/pmap.h> 108 #include <vm/vm_extern.h> 109 #include <vm/vm_map.h> 110 #include <vm/vm_page.h> 111 112 /* 113 * TCP protocol interface to socket abstraction. 
 */
#ifdef INET
static int	tcp_connect(struct tcpcb *, struct sockaddr_in *,
		    struct thread *td);
#endif /* INET */
#ifdef INET6
static int	tcp6_connect(struct tcpcb *, struct sockaddr_in6 *,
		    struct thread *td);
#endif /* INET6 */
static void	tcp_disconnect(struct tcpcb *);
static void	tcp_usrclosed(struct tcpcb *);
static void	tcp_fill_info(const struct tcpcb *, struct tcp_info *);

static int	tcp_pru_options_support(struct tcpcb *tp, int flags);

/*
 * Log a black-box record for a protocol user request (PRU).  The PRU number
 * is stored in tlb_flex1 and, for non-negative values, the error that the
 * request is about to return is stored in tlb_errno.  (Negative values are
 * internal tcp_drop() requests from the stack, not errnos — see the
 * KASSERTs at the connect() sites below.)  Requires the inpcb write lock.
 */
static void
tcp_bblog_pru(struct tcpcb *tp, uint32_t pru, int error)
{
	struct tcp_log_buffer *lgb;

	KASSERT(tp != NULL, ("tcp_bblog_pru: tp == NULL"));
	INP_WLOCK_ASSERT(tptoinpcb(tp));
	/* Only pay for the log event when BB logging is enabled on this tp. */
	if (tcp_bblogging_on(tp)) {
		lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_PRU, error,
		    0, NULL, false, NULL, NULL, 0, NULL);
	} else {
		lgb = NULL;
	}
	if (lgb != NULL) {
		if (error >= 0) {
			lgb->tlb_errno = (uint32_t)error;
		}
		lgb->tlb_flex1 = pru;
	}
}

/*
 * TCP attaches to socket via pr_attach(), reserving space,
 * and an internet control block.
 */
static int
tcp_usr_attach(struct socket *so, int proto, struct thread *td)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	int error;

	inp = sotoinpcb(so);
	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));

	/* Reserve default send/receive buffer space for the socket. */
	error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
	if (error)
		goto out;

	/* Let the socket buffers auto-tune their sizes with use. */
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= (SB_AUTOLOWAT | SB_AUTOSIZE);
	error = in_pcballoc(so, &V_tcbinfo);
	if (error)
		goto out;
	/* The new inpcb is returned locked; we unlock it below. */
	inp = sotoinpcb(so);
	tp = tcp_newtcpcb(inp, NULL);
	if (tp == NULL) {
		error = ENOBUFS;
		in_pcbfree(inp);
		goto out;
	}
	tp->t_state = TCPS_CLOSED;
	tcp_bblog_pru(tp, PRU_ATTACH, error);
	INP_WUNLOCK(inp);
	TCPSTATES_INC(TCPS_CLOSED);
out:
	TCP_PROBE2(debug__user, tp, PRU_ATTACH);
	return (error);
}

/*
 * tcp_usr_detach is called when the socket layer loses its final reference
 * to the socket, be it a file descriptor reference, a reference from TCP,
 * etc.  At this point, there is only one case in which we will keep around
 * inpcb state: time wait.
 */
static void
tcp_usr_detach(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	INP_WLOCK(inp);
	KASSERT(so->so_pcb == inp && inp->inp_socket == so,
	    ("%s: socket %p inp %p mismatch", __func__, so, inp));

	tp = intotcpcb(inp);

	/* Only dropped or never-connected (embryonic) PCBs may be detached. */
	KASSERT(inp->inp_flags & INP_DROPPED ||
	    tp->t_state < TCPS_SYN_SENT,
	    ("%s: inp %p not dropped or embryonic", __func__, inp));

	tcp_discardcb(tp);
	in_pcbfree(inp);
}

#ifdef INET
/*
 * Give the socket an address.
 */
static int
tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in *sinp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	tp = intotcpcb(inp);

	sinp = (struct sockaddr_in *)nam;
	if (nam->sa_family != AF_INET) {
		/*
		 * Preserve compatibility with old programs.
		 * An AF_UNSPEC bind to INADDR_ANY is accepted and
		 * treated as AF_INET.
		 */
		if (nam->sa_family != AF_UNSPEC ||
		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
		    sinp->sin_addr.s_addr != INADDR_ANY) {
			error = EAFNOSUPPORT;
			goto out;
		}
		nam->sa_family = AF_INET;
	}
	if (nam->sa_len != sizeof(*sinp)) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Must check for multicast addresses and disallow binding
	 * to them.
	 */
	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
		error = EAFNOSUPPORT;
		goto out;
	}
	INP_HASH_WLOCK(&V_tcbinfo);
	error = in_pcbbind(inp, sinp, V_tcp_bind_all_fibs ?
	    0 : INPBIND_FIB, td->td_ucred);
	INP_HASH_WUNLOCK(&V_tcbinfo);
out:
	tcp_bblog_pru(tp, PRU_BIND, error);
	TCP_PROBE2(debug__user, tp, PRU_BIND);
	INP_WUNLOCK(inp);

	return (error);
}
#endif /* INET */

#ifdef INET6
static int
tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6;
	u_char vflagsav;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	tp = intotcpcb(inp);

	/* Saved so a failed bind can restore the address-family flags. */
	vflagsav = inp->inp_vflag;

	sin6 = (struct sockaddr_in6 *)nam;
	if (nam->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (nam->sa_len != sizeof(*sin6)) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Must check for multicast addresses and disallow binding
	 * to them.
	 */
	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}

	INP_HASH_WLOCK(&V_tcbinfo);
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
#ifdef INET
	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			inp->inp_vflag |= INP_IPV4;
		else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
			struct sockaddr_in sin;

			/* A v4-mapped address binds as a plain IPv4 bind. */
			in6_sin6_2_sin(&sin, sin6);
			if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
				error = EAFNOSUPPORT;
				INP_HASH_WUNLOCK(&V_tcbinfo);
				goto out;
			}
			inp->inp_vflag |= INP_IPV4;
			inp->inp_vflag &= ~INP_IPV6;
			error = in_pcbbind(inp, &sin, 0, td->td_ucred);
			INP_HASH_WUNLOCK(&V_tcbinfo);
			goto out;
		}
	}
#endif
	error = in6_pcbbind(inp, sin6, V_tcp_bind_all_fibs ?
	    0 : INPBIND_FIB, td->td_ucred);
	INP_HASH_WUNLOCK(&V_tcbinfo);
out:
	/* On failure, undo the address-family flag changes made above. */
	if (error != 0)
		inp->inp_vflag = vflagsav;
	tcp_bblog_pru(tp, PRU_BIND, error);
	TCP_PROBE2(debug__user, tp, PRU_BIND);
	INP_WUNLOCK(inp);
	return (error);
}
#endif /* INET6 */

#ifdef INET
/*
 * Prepare to accept connections.
 */
static int
tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;
	bool already_listening;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	tp = intotcpcb(inp);

	SOCK_LOCK(so);
	already_listening = SOLISTENING(so);
	error = solisten_proto_check(so);
	if (error != 0) {
		SOCK_UNLOCK(so);
		goto out;
	}
	/* listen() on an unbound socket implicitly picks a local port. */
	if (inp->inp_lport == 0) {
		INP_HASH_WLOCK(&V_tcbinfo);
		error = in_pcbbind(inp, NULL,
		    V_tcp_bind_all_fibs ? 0 : INPBIND_FIB, td->td_ucred);
		INP_HASH_WUNLOCK(&V_tcbinfo);
	}
	if (error == 0) {
		tcp_state_change(tp, TCPS_LISTEN);
		solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
		if ((so->so_options & SO_NO_OFFLOAD) == 0)
			tcp_offload_listen_start(tp);
#endif
	} else {
		solisten_proto_abort(so);
	}
	SOCK_UNLOCK(so);
	/* The remaining setup is only performed on the first listen() call. */
	if (already_listening)
		goto out;

	if (error == 0)
		in_pcblisten(inp);
	if (tp->t_flags & TF_FASTOPEN)
		tp->t_tfo_pending = tcp_fastopen_alloc_counter();

out:
	tcp_bblog_pru(tp, PRU_LISTEN, error);
	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
	INP_WUNLOCK(inp);
	return (error);
}
#endif /* INET */

#ifdef INET6
static int
tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	u_char vflagsav;
	int error = 0;
	bool already_listening;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	tp = intotcpcb(inp);

	/* Saved so a failed implicit bind can restore the flags. */
	vflagsav = inp->inp_vflag;

	SOCK_LOCK(so);
	already_listening = SOLISTENING(so);
	error = solisten_proto_check(so);
	if (error != 0) {
		SOCK_UNLOCK(so);
		goto out;
	}
	INP_HASH_WLOCK(&V_tcbinfo);
	if (inp->inp_lport == 0) {
		inp->inp_vflag &= ~INP_IPV4;
		/* Unless v6-only, the listener also accepts IPv4 peers. */
		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
			inp->inp_vflag |= INP_IPV4;
		error = in6_pcbbind(inp, NULL,
		    V_tcp_bind_all_fibs ?
		    0 : INPBIND_FIB, td->td_ucred);
	}
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error == 0) {
		tcp_state_change(tp, TCPS_LISTEN);
		solisten_proto(so, backlog);
#ifdef TCP_OFFLOAD
		if ((so->so_options & SO_NO_OFFLOAD) == 0)
			tcp_offload_listen_start(tp);
#endif
	} else {
		solisten_proto_abort(so);
	}
	SOCK_UNLOCK(so);
	/* The remaining setup is only performed on the first listen() call. */
	if (already_listening)
		goto out;

	if (error == 0)
		in_pcblisten(inp);
	if (tp->t_flags & TF_FASTOPEN)
		tp->t_tfo_pending = tcp_fastopen_alloc_counter();

	/* On failure, undo the address-family flag changes made above. */
	if (error != 0)
		inp->inp_vflag = vflagsav;

out:
	tcp_bblog_pru(tp, PRU_LISTEN, error);
	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
	INP_WUNLOCK(inp);
	return (error);
}
#endif /* INET6 */

#ifdef INET
/*
 * Initiate connection to peer.
 * Create a template for use in transmissions on this connection.
 * Enter SYN_SENT state, and mark socket as connecting.
 * Start keep-alive timer, and seed output sequence space.
 * Send initial segment on connection.
 */
static int
tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct epoch_tracker et;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in *sinp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNREFUSED);
	}
	tp = intotcpcb(inp);

	sinp = (struct sockaddr_in *)nam;
	if (nam->sa_family != AF_INET) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (nam->sa_len != sizeof (*sinp)) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
		error = EACCES;
		goto out;
	}
	/* Enforce jail restrictions on the remote address. */
	if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
		goto out;
	if (SOLISTENING(so)) {
		error = EOPNOTSUPP;
		goto out;
	}
	NET_EPOCH_ENTER(et);
	if ((error = tcp_connect(tp, sinp, td)) != 0)
		goto out_in_epoch;
#ifdef TCP_OFFLOAD
	/* Hand the connection to the TOE device, if one will take it. */
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
	    (error = tcp_offload_connect(so, nam)) == 0)
		goto out_in_epoch;
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tcp_output(tp);
	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
out_in_epoch:
	NET_EPOCH_EXIT(et);
out:
	tcp_bblog_pru(tp, PRU_CONNECT, error);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
#endif /* INET */

#ifdef INET6
static int
tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	struct epoch_tracker et;
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct sockaddr_in6 *sin6;
	u_int8_t incflagsav;
	u_char vflagsav;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNREFUSED);
	}
	tp = intotcpcb(inp);

	/* Saved so a failed implicit bind can restore the flags. */
	vflagsav = inp->inp_vflag;
	incflagsav = inp->inp_inc.inc_flags;

	sin6 = (struct sockaddr_in6 *)nam;
	if (nam->sa_family != AF_INET6) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (nam->sa_len != sizeof (*sin6)) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Must disallow TCP ``connections'' to multicast addresses.
	 */
	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
		error = EAFNOSUPPORT;
		goto out;
	}
	if (SOLISTENING(so)) {
		error = EOPNOTSUPP;
		goto out;
	}
#ifdef INET
	/*
	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
	 * therefore probably require the hash lock, which isn't held here.
	 * Is this a significant problem?
	 */
	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
		struct sockaddr_in sin;

		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
			error = EINVAL;
			goto out;
		}
		if ((inp->inp_vflag & INP_IPV4) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}

		/* Treat a v4-mapped destination as a plain IPv4 connect. */
		in6_sin6_2_sin(&sin, sin6);
		if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
			error = EAFNOSUPPORT;
			goto out;
		}
		if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) {
			error = EACCES;
			goto out;
		}
		if ((error = prison_remote_ip4(td->td_ucred,
		    &sin.sin_addr)) != 0)
			goto out;
		inp->inp_vflag |= INP_IPV4;
		inp->inp_vflag &= ~INP_IPV6;
		NET_EPOCH_ENTER(et);
		if ((error = tcp_connect(tp, &sin, td)) != 0)
			goto out_in_epoch;
#ifdef TCP_OFFLOAD
		if (registered_toedevs > 0 &&
		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
		    (error = tcp_offload_connect(so, nam)) == 0)
			goto out_in_epoch;
#endif
		error = tcp_output(tp);
		goto out_in_epoch;
	} else {
		if ((inp->inp_vflag & INP_IPV6) == 0) {
			error = EAFNOSUPPORT;
			goto out;
		}
	}
#endif
	/* Enforce jail restrictions on the remote address. */
	if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0)
		goto out;
	inp->inp_vflag &= ~INP_IPV4;
	inp->inp_vflag |= INP_IPV6;
	inp->inp_inc.inc_flags |= INC_ISIPV6;
	NET_EPOCH_ENTER(et);
	if ((error = tcp6_connect(tp, sin6, td)) != 0)
		goto out_in_epoch;
#ifdef TCP_OFFLOAD
	if (registered_toedevs > 0 &&
	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
	    (error = tcp_offload_connect(so, nam)) == 0)
		goto out_in_epoch;
#endif
	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
	error = tcp_output(tp);
out_in_epoch:
	NET_EPOCH_EXIT(et);
out:
	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
	/*
	 * If the implicit bind in the connect call fails, restore
	 * the flags we modified.
	 */
	if (error != 0 && inp->inp_lport == 0) {
		inp->inp_vflag = vflagsav;
		inp->inp_inc.inc_flags = incflagsav;
	}

	tcp_bblog_pru(tp, PRU_CONNECT, error);
	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
	INP_WUNLOCK(inp);
	return (error);
}
#endif /* INET6 */

/*
 * Initiate disconnect from peer.
 * If connection never passed embryonic stage, just drop;
 * else if don't need to let data drain, then can just drop anyways,
 * else have to begin TCP shutdown process: mark socket disconnecting,
 * drain unread data, state switch to reflect user close, and
 * send segment (e.g. FIN) to peer.  Socket will be really disconnected
 * when peer sends FIN and acks ours.
 *
 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
 */
static int
tcp_usr_disconnect(struct socket *so)
{
	struct inpcb *inp;
	struct tcpcb *tp = NULL;
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
	INP_WLOCK(inp);
	tp = intotcpcb(inp);

	/* A TIME_WAIT connection has already completed the FIN exchange. */
	if (tp->t_state == TCPS_TIME_WAIT)
		goto out;
	tcp_disconnect(tp);
out:
	tcp_bblog_pru(tp, PRU_DISCONNECT, 0);
	TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
	INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	return (0);
}

#ifdef INET
/*
 * Accept a connection.  Essentially all the work is done at higher levels;
 * just return the address of the peer, storing through addr.
 */
static int
tcp_usr_accept(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNABORTED);
	}
	tp = intotcpcb(inp);

	if (so->so_state & SS_ISDISCONNECTED)
		error = ECONNABORTED;
	else
		/* Copy out the foreign (peer) address and port. */
		*(struct sockaddr_in *)sa = (struct sockaddr_in ){
			.sin_family = AF_INET,
			.sin_len = sizeof(struct sockaddr_in),
			.sin_port = inp->inp_fport,
			.sin_addr = inp->inp_faddr,
		};
	tcp_bblog_pru(tp, PRU_ACCEPT, error);
	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
	INP_WUNLOCK(inp);

	return (error);
}
#endif /* INET */

#ifdef INET6
static int
tcp6_usr_accept(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error = 0;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNABORTED);
	}
	tp = intotcpcb(inp);

	if (so->so_state & SS_ISDISCONNECTED) {
		error = ECONNABORTED;
	} else {
		if (inp->inp_vflag & INP_IPV4) {
			/* IPv4 peer on a v6 socket: return a v4-mapped address. */
			struct sockaddr_in sin = {
				.sin_family = AF_INET,
				.sin_len = sizeof(struct sockaddr_in),
				.sin_port = inp->inp_fport,
				.sin_addr = inp->inp_faddr,
			};
			in6_sin_2_v4mapsin6(&sin, (struct sockaddr_in6 *)sa);
		} else {
			*(struct sockaddr_in6 *)sa = (struct sockaddr_in6 ){
				.sin6_family = AF_INET6,
				.sin6_len = sizeof(struct sockaddr_in6),
				.sin6_port = inp->inp_fport,
				.sin6_addr = inp->in6p_faddr,
			};
			/* XXX: should catch errors */
			(void)sa6_recoverscope((struct sockaddr_in6 *)sa);
		}
	}

	tcp_bblog_pru(tp, PRU_ACCEPT, error);
	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
	INP_WUNLOCK(inp);

	return (error);
}
#endif /* INET6 */

/*
 * Mark the connection as being incapable of further output.
 */
static int
tcp_usr_shutdown(struct socket *so, enum shutdown_how how)
{
	struct epoch_tracker et;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	int error = 0;

	SOCK_LOCK(so);
	if (SOLISTENING(so)) {
		if (how != SHUT_WR) {
			so->so_error = ECONNABORTED;
			solisten_wakeup(so);	/* unlocks so */
		} else
			SOCK_UNLOCK(so);
		return (ENOTCONN);
	} else if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
		SOCK_UNLOCK(so);
		return (ENOTCONN);
	}
	SOCK_UNLOCK(so);

	switch (how) {
	case SHUT_RD:
		sorflush(so);
		break;
	case SHUT_RDWR:
		sorflush(so);
		/* FALLTHROUGH */
	case SHUT_WR:
		/*
		 * XXXGL: mimicing old soshutdown() here.  But shouldn't we
		 * return ECONNRESEST for SHUT_RD as well?
		 */
		INP_WLOCK(inp);
		if (inp->inp_flags & INP_DROPPED) {
			INP_WUNLOCK(inp);
			return (ECONNRESET);
		}

		socantsendmore(so);
		NET_EPOCH_ENTER(et);
		tcp_usrclosed(tp);
		error = tcp_output_nodrop(tp);
		tcp_bblog_pru(tp, PRU_SHUTDOWN, error);
		TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
		/* Unlocks the inpcb, dropping the connection if requested. */
		error = tcp_unlock_or_drop(tp, error);
		NET_EPOCH_EXIT(et);
	}
	wakeup(&so->so_timeo);

	return (error);
}

/*
 * After a receive, possibly send window update to peer.
 */
static int
tcp_usr_rcvd(struct socket *so, int flags)
{
	struct epoch_tracker et;
	struct inpcb *inp;
	struct tcpcb *tp;
	int outrv = 0, error = 0;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);

	NET_EPOCH_ENTER(et);
	/*
	 * For passively-created TFO connections, don't attempt a window
	 * update while still in SYN_RECEIVED as this may trigger an early
	 * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
	 * application response data, or failing that, when the DELACK timer
	 * expires.
	 */
	if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED))
		goto out;
#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		tcp_offload_rcvd(tp);
	else
#endif
		outrv = tcp_output_nodrop(tp);
out:
	tcp_bblog_pru(tp, PRU_RCVD, error);
	TCP_PROBE2(debug__user, tp, PRU_RCVD);
	/* Unlocks the inpcb, dropping the connection if requested. */
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
	return (error);
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pr_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pr_*() routines
 * generally are caller-frees.
915 */ 916 static int 917 tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 918 struct sockaddr *nam, struct mbuf *control, struct thread *td) 919 { 920 struct epoch_tracker et; 921 int error = 0; 922 struct inpcb *inp; 923 struct tcpcb *tp; 924 #ifdef INET 925 #ifdef INET6 926 struct sockaddr_in sin; 927 #endif 928 struct sockaddr_in *sinp; 929 #endif 930 #ifdef INET6 931 struct sockaddr_in6 *sin6; 932 int isipv6; 933 #endif 934 u_int8_t incflagsav; 935 u_char vflagsav; 936 bool restoreflags; 937 938 inp = sotoinpcb(so); 939 KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); 940 INP_WLOCK(inp); 941 if (inp->inp_flags & INP_DROPPED) { 942 if (m != NULL && (flags & PRUS_NOTREADY) == 0) 943 m_freem(m); 944 INP_WUNLOCK(inp); 945 return (ECONNRESET); 946 } 947 tp = intotcpcb(inp); 948 949 vflagsav = inp->inp_vflag; 950 incflagsav = inp->inp_inc.inc_flags; 951 restoreflags = false; 952 953 NET_EPOCH_ENTER(et); 954 if (control != NULL) { 955 /* TCP doesn't do control messages (rights, creds, etc) */ 956 if (control->m_len > 0) { 957 m_freem(control); 958 error = EINVAL; 959 goto out; 960 } 961 m_freem(control); /* empty control, just free it */ 962 } 963 964 if ((flags & PRUS_OOB) != 0 && 965 (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0) 966 goto out; 967 968 if (nam != NULL && tp->t_state < TCPS_SYN_SENT) { 969 if (tp->t_state == TCPS_LISTEN) { 970 error = EINVAL; 971 goto out; 972 } 973 switch (nam->sa_family) { 974 #ifdef INET 975 case AF_INET: 976 sinp = (struct sockaddr_in *)nam; 977 if (sinp->sin_len != sizeof(struct sockaddr_in)) { 978 error = EINVAL; 979 goto out; 980 } 981 if ((inp->inp_vflag & INP_IPV6) != 0) { 982 error = EAFNOSUPPORT; 983 goto out; 984 } 985 if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 986 error = EAFNOSUPPORT; 987 goto out; 988 } 989 if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) { 990 error = EACCES; 991 goto out; 992 } 993 if ((error = prison_remote_ip4(td->td_ucred, 994 &sinp->sin_addr))) 995 goto out; 996 
#ifdef INET6 997 isipv6 = 0; 998 #endif 999 break; 1000 #endif /* INET */ 1001 #ifdef INET6 1002 case AF_INET6: 1003 sin6 = (struct sockaddr_in6 *)nam; 1004 if (sin6->sin6_len != sizeof(*sin6)) { 1005 error = EINVAL; 1006 goto out; 1007 } 1008 if ((inp->inp_vflag & INP_IPV6PROTO) == 0) { 1009 error = EAFNOSUPPORT; 1010 goto out; 1011 } 1012 if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 1013 error = EAFNOSUPPORT; 1014 goto out; 1015 } 1016 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 1017 #ifdef INET 1018 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 1019 error = EINVAL; 1020 goto out; 1021 } 1022 if ((inp->inp_vflag & INP_IPV4) == 0) { 1023 error = EAFNOSUPPORT; 1024 goto out; 1025 } 1026 restoreflags = true; 1027 inp->inp_vflag &= ~INP_IPV6; 1028 sinp = &sin; 1029 in6_sin6_2_sin(sinp, sin6); 1030 if (IN_MULTICAST( 1031 ntohl(sinp->sin_addr.s_addr))) { 1032 error = EAFNOSUPPORT; 1033 goto out; 1034 } 1035 if ((error = prison_remote_ip4(td->td_ucred, 1036 &sinp->sin_addr))) 1037 goto out; 1038 isipv6 = 0; 1039 #else /* !INET */ 1040 error = EAFNOSUPPORT; 1041 goto out; 1042 #endif /* INET */ 1043 } else { 1044 if ((inp->inp_vflag & INP_IPV6) == 0) { 1045 error = EAFNOSUPPORT; 1046 goto out; 1047 } 1048 restoreflags = true; 1049 inp->inp_vflag &= ~INP_IPV4; 1050 inp->inp_inc.inc_flags |= INC_ISIPV6; 1051 if ((error = prison_remote_ip6(td->td_ucred, 1052 &sin6->sin6_addr))) 1053 goto out; 1054 isipv6 = 1; 1055 } 1056 break; 1057 #endif /* INET6 */ 1058 default: 1059 error = EAFNOSUPPORT; 1060 goto out; 1061 } 1062 } 1063 if (!(flags & PRUS_OOB)) { 1064 if (tp->t_acktime == 0) 1065 tp->t_acktime = ticks; 1066 sbappendstream(&so->so_snd, m, flags); 1067 m = NULL; 1068 if (nam && tp->t_state < TCPS_SYN_SENT) { 1069 KASSERT(tp->t_state == TCPS_CLOSED, 1070 ("%s: tp %p is listening", __func__, tp)); 1071 1072 /* 1073 * Do implied connect if not yet connected, 1074 * initialize window to default value, and 1075 * initialize maxseg using peer's cached MSS. 
1076 */ 1077 #ifdef INET6 1078 if (isipv6) 1079 error = tcp6_connect(tp, sin6, td); 1080 #endif /* INET6 */ 1081 #if defined(INET6) && defined(INET) 1082 else 1083 #endif 1084 #ifdef INET 1085 error = tcp_connect(tp, sinp, td); 1086 #endif 1087 /* 1088 * The bind operation in tcp_connect succeeded. We 1089 * no longer want to restore the flags if later 1090 * operations fail. 1091 */ 1092 if (error == 0 || inp->inp_lport != 0) 1093 restoreflags = false; 1094 1095 if (error) { 1096 /* m is freed if PRUS_NOTREADY is unset. */ 1097 sbflush(&so->so_snd); 1098 goto out; 1099 } 1100 if (tp->t_flags & TF_FASTOPEN) 1101 tcp_fastopen_connect(tp); 1102 else { 1103 tp->snd_wnd = TTCP_CLIENT_SND_WND; 1104 tcp_mss(tp, -1); 1105 } 1106 } 1107 if (flags & PRUS_EOF) { 1108 /* 1109 * Close the send side of the connection after 1110 * the data is sent. 1111 */ 1112 socantsendmore(so); 1113 tcp_usrclosed(tp); 1114 } 1115 if (TCPS_HAVEESTABLISHED(tp->t_state) && 1116 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 1117 (tp->t_fbyte_out == 0) && 1118 (so->so_snd.sb_ccc > 0)) { 1119 tp->t_fbyte_out = ticks; 1120 if (tp->t_fbyte_out == 0) 1121 tp->t_fbyte_out = 1; 1122 if (tp->t_fbyte_out && tp->t_fbyte_in) 1123 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 1124 } 1125 if (!(inp->inp_flags & INP_DROPPED) && 1126 !(flags & PRUS_NOTREADY)) { 1127 if (flags & PRUS_MORETOCOME) 1128 tp->t_flags |= TF_MORETOCOME; 1129 error = tcp_output_nodrop(tp); 1130 if (flags & PRUS_MORETOCOME) 1131 tp->t_flags &= ~TF_MORETOCOME; 1132 } 1133 } else { 1134 /* 1135 * XXXRW: PRUS_EOF not implemented with PRUS_OOB? 1136 */ 1137 SOCK_SENDBUF_LOCK(so); 1138 if (sbspace(&so->so_snd) < -512) { 1139 SOCK_SENDBUF_UNLOCK(so); 1140 error = ENOBUFS; 1141 goto out; 1142 } 1143 /* 1144 * According to RFC961 (Assigned Protocols), 1145 * the urgent pointer points to the last octet 1146 * of urgent data. We continue, however, 1147 * to consider it to indicate the first octet 1148 * of data past the urgent section. 
1149 * Otherwise, snd_up should be one lower. 1150 */ 1151 if (tp->t_acktime == 0) 1152 tp->t_acktime = ticks; 1153 sbappendstream_locked(&so->so_snd, m, flags); 1154 SOCK_SENDBUF_UNLOCK(so); 1155 m = NULL; 1156 if (nam && tp->t_state < TCPS_SYN_SENT) { 1157 /* 1158 * Do implied connect if not yet connected, 1159 * initialize window to default value, and 1160 * initialize maxseg using peer's cached MSS. 1161 */ 1162 1163 /* 1164 * Not going to contemplate SYN|URG 1165 */ 1166 if (tp->t_flags & TF_FASTOPEN) 1167 tp->t_flags &= ~TF_FASTOPEN; 1168 #ifdef INET6 1169 if (isipv6) 1170 error = tcp6_connect(tp, sin6, td); 1171 #endif /* INET6 */ 1172 #if defined(INET6) && defined(INET) 1173 else 1174 #endif 1175 #ifdef INET 1176 error = tcp_connect(tp, sinp, td); 1177 #endif 1178 /* 1179 * The bind operation in tcp_connect succeeded. We 1180 * no longer want to restore the flags if later 1181 * operations fail. 1182 */ 1183 if (error == 0 || inp->inp_lport != 0) 1184 restoreflags = false; 1185 1186 if (error != 0) { 1187 /* m is freed if PRUS_NOTREADY is unset. */ 1188 sbflush(&so->so_snd); 1189 goto out; 1190 } 1191 tp->snd_wnd = TTCP_CLIENT_SND_WND; 1192 tcp_mss(tp, -1); 1193 } 1194 tp->snd_up = tp->snd_una + sbavail(&so->so_snd); 1195 if ((flags & PRUS_NOTREADY) == 0) { 1196 tp->t_flags |= TF_FORCEDATA; 1197 error = tcp_output_nodrop(tp); 1198 tp->t_flags &= ~TF_FORCEDATA; 1199 } 1200 } 1201 TCP_LOG_EVENT(tp, NULL, 1202 &inp->inp_socket->so_rcv, 1203 &inp->inp_socket->so_snd, 1204 TCP_LOG_USERSEND, error, 1205 0, NULL, false); 1206 1207 out: 1208 /* 1209 * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is 1210 * responsible for freeing memory. 1211 */ 1212 if (m != NULL && (flags & PRUS_NOTREADY) == 0) 1213 m_freem(m); 1214 1215 /* 1216 * If the request was unsuccessful and we changed flags, 1217 * restore the original flags. 
1218 */ 1219 if (error != 0 && restoreflags) { 1220 inp->inp_vflag = vflagsav; 1221 inp->inp_inc.inc_flags = incflagsav; 1222 } 1223 tcp_bblog_pru(tp, (flags & PRUS_OOB) ? PRU_SENDOOB : 1224 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND), error); 1225 TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB : 1226 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 1227 error = tcp_unlock_or_drop(tp, error); 1228 NET_EPOCH_EXIT(et); 1229 return (error); 1230 } 1231 1232 static int 1233 tcp_usr_ready(struct socket *so, struct mbuf *m, int count) 1234 { 1235 struct epoch_tracker et; 1236 struct inpcb *inp; 1237 struct tcpcb *tp; 1238 int error; 1239 1240 inp = sotoinpcb(so); 1241 INP_WLOCK(inp); 1242 if (inp->inp_flags & INP_DROPPED) { 1243 INP_WUNLOCK(inp); 1244 mb_free_notready(m, count); 1245 return (ECONNRESET); 1246 } 1247 tp = intotcpcb(inp); 1248 1249 SOCK_SENDBUF_LOCK(so); 1250 error = sbready(&so->so_snd, m, count); 1251 SOCK_SENDBUF_UNLOCK(so); 1252 if (error) { 1253 INP_WUNLOCK(inp); 1254 return (error); 1255 } 1256 NET_EPOCH_ENTER(et); 1257 error = tcp_output_unlock(tp); 1258 NET_EPOCH_EXIT(et); 1259 1260 return (error); 1261 } 1262 1263 /* 1264 * Abort the TCP. Drop the connection abruptly. 1265 */ 1266 static void 1267 tcp_usr_abort(struct socket *so) 1268 { 1269 struct inpcb *inp; 1270 struct tcpcb *tp; 1271 struct epoch_tracker et; 1272 1273 inp = sotoinpcb(so); 1274 KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); 1275 1276 NET_EPOCH_ENTER(et); 1277 INP_WLOCK(inp); 1278 KASSERT(inp->inp_socket != NULL, 1279 ("tcp_usr_abort: inp_socket == NULL")); 1280 1281 /* 1282 * If we still have full TCP state, and we're not dropped, drop. 
1283 */ 1284 if (!(inp->inp_flags & INP_DROPPED)) { 1285 tp = intotcpcb(inp); 1286 tp = tcp_drop(tp, ECONNABORTED); 1287 if (tp == NULL) 1288 goto dropped; 1289 tcp_bblog_pru(tp, PRU_ABORT, 0); 1290 TCP_PROBE2(debug__user, tp, PRU_ABORT); 1291 } 1292 if (!(inp->inp_flags & INP_DROPPED)) { 1293 soref(so); 1294 inp->inp_flags |= INP_SOCKREF; 1295 } 1296 INP_WUNLOCK(inp); 1297 dropped: 1298 NET_EPOCH_EXIT(et); 1299 } 1300 1301 /* 1302 * TCP socket is closed. Start friendly disconnect. 1303 */ 1304 static void 1305 tcp_usr_close(struct socket *so) 1306 { 1307 struct inpcb *inp; 1308 struct tcpcb *tp; 1309 struct epoch_tracker et; 1310 1311 inp = sotoinpcb(so); 1312 KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); 1313 1314 NET_EPOCH_ENTER(et); 1315 INP_WLOCK(inp); 1316 KASSERT(inp->inp_socket != NULL, 1317 ("tcp_usr_close: inp_socket == NULL")); 1318 1319 /* 1320 * If we are still connected and we're not dropped, initiate 1321 * a disconnect. 1322 */ 1323 if (!(inp->inp_flags & INP_DROPPED)) { 1324 tp = intotcpcb(inp); 1325 if (tp->t_state != TCPS_TIME_WAIT) { 1326 tp->t_flags |= TF_CLOSED; 1327 tcp_disconnect(tp); 1328 tcp_bblog_pru(tp, PRU_CLOSE, 0); 1329 TCP_PROBE2(debug__user, tp, PRU_CLOSE); 1330 } 1331 } 1332 if (!(inp->inp_flags & INP_DROPPED)) { 1333 soref(so); 1334 inp->inp_flags |= INP_SOCKREF; 1335 } 1336 INP_WUNLOCK(inp); 1337 NET_EPOCH_EXIT(et); 1338 } 1339 1340 static int 1341 tcp_pru_options_support(struct tcpcb *tp, int flags) 1342 { 1343 /* 1344 * If the specific TCP stack has a pru_options 1345 * specified then it does not always support 1346 * all the PRU_XX options and we must ask it. 1347 * If the function is not specified then all 1348 * of the PRU_XX options are supported. 1349 */ 1350 int ret = 0; 1351 1352 if (tp->t_fb->tfb_pru_options) { 1353 ret = (*tp->t_fb->tfb_pru_options)(tp, flags); 1354 } 1355 return (ret); 1356 } 1357 1358 /* 1359 * Receive out-of-band data. 
 */
static int
tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct inpcb *inp;
	struct tcpcb *tp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);

	error = tcp_pru_options_support(tp, PRUS_OOB);
	if (error) {
		goto out;
	}
	/*
	 * OOB reads are invalid when there is no mark pending, when
	 * urgent data is delivered inline, or when the urgent byte has
	 * already been consumed (TCPOOB_HADDATA).
	 */
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    tp->t_oobflags & TCPOOB_HADDATA) {
		error = EINVAL;
		goto out;
	}
	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	/* Hand back the single saved urgent byte (t_iobc). */
	m->m_len = 1;
	*mtod(m, caddr_t) = tp->t_iobc;
	/* A real read (not MSG_PEEK) flips HAVEDATA -> HADDATA. */
	if ((flags & MSG_PEEK) == 0)
		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);

out:
	tcp_bblog_pru(tp, PRU_RCVOOB, error);
	TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
	INP_WUNLOCK(inp);
	return (error);
}

#ifdef INET
/* TCP over IPv4 protocol switch. */
struct protosw tcp_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
				    PR_CAPATTACH,
	.pr_ctloutput =		tcp_ctloutput,
	.pr_abort =		tcp_usr_abort,
	.pr_accept =		tcp_usr_accept,
	.pr_attach =		tcp_usr_attach,
	.pr_bind =		tcp_usr_bind,
	.pr_connect =		tcp_usr_connect,
	.pr_control =		in_control,
	.pr_detach =		tcp_usr_detach,
	.pr_disconnect =	tcp_usr_disconnect,
	.pr_listen =		tcp_usr_listen,
	.pr_peeraddr =		in_getpeeraddr,
	.pr_rcvd =		tcp_usr_rcvd,
	.pr_rcvoob =		tcp_usr_rcvoob,
	.pr_send =		tcp_usr_send,
	.pr_sendfile_wait =	sendfile_wait_generic,
	.pr_ready =		tcp_usr_ready,
	.pr_shutdown =		tcp_usr_shutdown,
	.pr_sockaddr =		in_getsockaddr,
	.pr_sosetlabel =	in_pcbsosetlabel,
	.pr_close =		tcp_usr_close,
};
#endif /* INET */

#ifdef INET6
/* TCP over IPv6 protocol switch; v6-specific entry points where needed. */
struct protosw tcp6_protosw = {
	.pr_type =		SOCK_STREAM,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
				    PR_CAPATTACH,
	.pr_ctloutput =		tcp_ctloutput,
	.pr_abort =		tcp_usr_abort,
	.pr_accept =		tcp6_usr_accept,
	.pr_attach =		tcp_usr_attach,
	.pr_bind =		tcp6_usr_bind,
	.pr_connect =		tcp6_usr_connect,
	.pr_control =		in6_control,
	.pr_detach =		tcp_usr_detach,
	.pr_disconnect =	tcp_usr_disconnect,
	.pr_listen =		tcp6_usr_listen,
	.pr_peeraddr =		in6_mapped_peeraddr,
	.pr_rcvd =		tcp_usr_rcvd,
	.pr_rcvoob =		tcp_usr_rcvoob,
	.pr_send =		tcp_usr_send,
	.pr_sendfile_wait =	sendfile_wait_generic,
	.pr_ready =		tcp_usr_ready,
	.pr_shutdown =		tcp_usr_shutdown,
	.pr_sockaddr =		in6_mapped_sockaddr,
	.pr_sosetlabel =	in_pcbsosetlabel,
	.pr_close =		tcp_usr_close,
};
#endif /* INET6 */

#ifdef INET
/*
 * Common subroutine to open a TCP connection to remote host specified
 * by struct sockaddr_in.  Call in_pcbconnect() to choose local host address
 * and assign a local port number and install the inpcb into the hash.
 * Initialize connection parameters and enter SYN-SENT state.
 */
static int
tcp_connect(struct tcpcb *tp, struct sockaddr_in *sin, struct thread *td)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	int error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);

	/* Reject a connect on a socket already in (dis)connecting state. */
	if (__predict_false((so->so_state &
	    (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
	    SS_ISDISCONNECTED)) != 0))
		return (EISCONN);
	if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
		return (EOPNOTSUPP);

	INP_HASH_WLOCK(&V_tcbinfo);
	error = in_pcbconnect(inp, sin, td->td_ucred);
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error != 0)
		return (error);

	/* set the hash on the connection */
	rss_proto_software_hash_v4(inp->inp_faddr, inp->inp_laddr,
	    inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
	    &inp->inp_flowid, &inp->inp_flowtype);
	/*
	 * Compute window scaling to request:
	 * Scale to fit into sweet spot.  See tcp_syncache.c.
	 * XXX: This should move to tcp_output().
	 */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	/* Enter SYN-SENT: pick an ISS and initialize send sequence state. */
	soisconnecting(so);
	TCPSTAT_INC(tcps_connattempt);
	tcp_state_change(tp, TCPS_SYN_SENT);
	tp->iss = tcp_new_isn(&inp->inp_inc);
	if (tp->t_flags & TF_REQ_TSTMP)
		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
	tcp_sendseqinit(tp);

	return (0);
}
#endif /* INET */

#ifdef INET6
/* IPv6 counterpart of tcp_connect(); see the comment above it. */
static int
tcp6_connect(struct tcpcb *tp, struct sockaddr_in6 *sin6, struct thread *td)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);
	int error;

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);

	/* Reject a connect on a socket already in (dis)connecting state. */
	if (__predict_false((so->so_state &
	    (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING |
	    SS_ISDISCONNECTED)) != 0))
		return (EISCONN);
	if (__predict_false((so->so_options & SO_REUSEPORT_LB) != 0))
		return (EOPNOTSUPP);

	INP_HASH_WLOCK(&V_tcbinfo);
	error = in6_pcbconnect(inp, sin6, td->td_ucred, true);
	INP_HASH_WUNLOCK(&V_tcbinfo);
	if (error != 0)
		return (error);

	/* set the hash on the connection */
	rss_proto_software_hash_v6(&inp->in6p_faddr,
	    &inp->in6p_laddr, inp->inp_fport, inp->inp_lport, IPPROTO_TCP,
	    &inp->inp_flowid, &inp->inp_flowtype);
	/* Compute window scaling to request.
	 */
	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
		tp->request_r_scale++;

	/* Enter SYN-SENT: pick an ISS and initialize send sequence state. */
	soisconnecting(so);
	TCPSTAT_INC(tcps_connattempt);
	tcp_state_change(tp, TCPS_SYN_SENT);
	tp->iss = tcp_new_isn(&inp->inp_inc);
	if (tp->t_flags & TF_REQ_TSTMP)
		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
	tcp_sendseqinit(tp);

	return (0);
}
#endif /* INET6 */

/*
 * Export TCP internal state information via a struct tcp_info, based on the
 * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
 * (TCP state machine, etc).  We export all information using FreeBSD-native
 * constants -- for example, the numeric values for tcpi_state will differ
 * from Linux.
 */
void
tcp_fill_info(const struct tcpcb *tp, struct tcp_info *ti)
{

	INP_LOCK_ASSERT(tptoinpcb(tp));
	bzero(ti, sizeof(*ti));

	ti->tcpi_state = tp->t_state;
	/* Options count only when both requested and acknowledged by peer. */
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
	/* Classic ECN vs. Accurate ECN, from the TF2 permit bits. */
	switch (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
	case TF2_ECN_PERMIT:
		ti->tcpi_options |= TCPI_OPT_ECN;
		break;
	case TF2_ACE_PERMIT:
		/* FALLTHROUGH */
	case TF2_ECN_PERMIT | TF2_ACE_PERMIT:
		ti->tcpi_options |= TCPI_OPT_ACE;
		break;
	default:
		break;
	}
	if (tp->t_flags & TF_FASTOPEN)
		ti->tcpi_options |= TCPI_OPT_TFO;

	/* Time quantities are scaled from ticks to microsecond units. */
	ti->tcpi_rto = tp->t_rxtcur * tick;
	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;

	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;

	/*
	 * FreeBSD-specific extension fields for tcp_info.
	 */
	ti->tcpi_rcv_space = tp->rcv_wnd;
	ti->tcpi_rcv_nxt = tp->rcv_nxt;
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
	ti->tcpi_snd_nxt = tp->snd_nxt;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_maxseg;
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
	ti->tcpi_snd_una = tp->snd_una;
	ti->tcpi_snd_max = tp->snd_max;
	ti->tcpi_rcv_numsacks = tp->rcv_numsacks;
	ti->tcpi_rcv_adv = tp->rcv_adv;
	ti->tcpi_dupacks = tp->t_dupacks;
	ti->tcpi_rttmin = tp->t_rttlow;
#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		ti->tcpi_options |= TCPI_OPT_TOE;
		tcp_offload_tcp_info(tp, ti);
	}
#endif
	/*
	 * AccECN related counters.
	 */
	if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
	    (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
		/*
		 * Internal counter starts at 5 for AccECN
		 * but 0 for RFC3168 ECN.
		 */
		ti->tcpi_delivered_ce = tp->t_scep - 5;
	else
		ti->tcpi_delivered_ce = tp->t_scep;
	ti->tcpi_received_ce = tp->t_rcep;
}

/*
 * tcp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
#define	INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {			\
	INP_WLOCK(inp);							\
	if (inp->inp_flags & INP_DROPPED) {				\
		INP_WUNLOCK(inp);					\
		cleanup;						\
		return (ECONNRESET);					\
	}								\
	tp = intotcpcb(inp);						\
} while(0)
#define	INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)

/*
 * Handle SOPT_SET for TCP.  Non-TCP levels are forwarded to the IP
 * layer first; TCP_FUNCTION_BLK (stack switch) is handled here so no
 * stack can override it; everything else goes to the current stack's
 * tfb_tcp_ctloutput.  Called with the inpcb write-locked; the lock is
 * always released before returning.
 */
int
tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
{
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	int error = 0;

	MPASS(sopt->sopt_dir == SOPT_SET);
	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("inp_flags == %x", inp->inp_flags));
	KASSERT(so != NULL, ("inp_socket == NULL"));

	if (sopt->sopt_level != IPPROTO_TCP) {
		INP_WUNLOCK(inp);
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6PROTO)
			error = ip6_ctloutput(so, sopt);
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET
			error = ip_ctloutput(so, sopt);
#endif
		/*
		 * When an IP-level socket option affects TCP, pass control
		 * down to stack tfb_tcp_ctloutput, otherwise return what
		 * IP level returned.
		 */
		switch (sopt->sopt_level) {
#ifdef INET6
		case IPPROTO_IPV6:
			if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
				return (error);
			switch (sopt->sopt_name) {
			case IPV6_TCLASS:
				/* Notify tcp stacks that care (e.g. RACK). */
				break;
			case IPV6_USE_MIN_MTU:
				/* Update t_maxseg accordingly. */
				break;
			default:
				return (error);
			}
			break;
#endif
#ifdef INET
		case IPPROTO_IP:
			switch (sopt->sopt_name) {
			case IP_TOS:
				/* ECN bits are owned by TCP, not the user. */
				inp->inp_ip_tos &= ~IPTOS_ECN_MASK;
				break;
			case IP_TTL:
				/* Notify tcp stacks that care (e.g. RACK). */
				break;
			default:
				return (error);
			}
			break;
#endif
		default:
			return (error);
		}
		/* Lock was dropped above; revalidate before continuing. */
		INP_WLOCK_RECHECK(inp);
	} else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
		/*
		 * Protect the TCP option TCP_FUNCTION_BLK so
		 * that a sub-function can *never* overwrite this.
		 */
		struct tcp_function_set fsn;
		struct tcp_function_block *blk;
		void *ptr = NULL;

		INP_WUNLOCK(inp);
		error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
		if (error)
			return (error);

		INP_WLOCK_RECHECK(inp);

		blk = find_and_ref_tcp_functions(&fsn);
		if (blk == NULL) {
			INP_WUNLOCK(inp);
			return (ENOENT);
		}
		if (tp->t_fb == blk) {
			/* You already have this */
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (0);
		}
		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (ENOENT);
		}
		/* Let the new stack veto the handoff for this connection. */
		error = (*blk->tfb_tcp_handoff_ok)(tp);
		if (error) {
			refcount_release(&blk->tfb_refcnt);
			INP_WUNLOCK(inp);
			return (error);
		}
		/*
		 * Ensure the new stack takes ownership with a
		 * clean slate on peak rate threshold.
		 */
		if (tp->t_fb->tfb_tcp_timer_stop_all != NULL)
			tp->t_fb->tfb_tcp_timer_stop_all(tp);
		if (blk->tfb_tcp_fb_init) {
			error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
			if (error) {
				/*
				 * Release the ref count the lookup
				 * acquired.
				 */
				refcount_release(&blk->tfb_refcnt);
				/*
				 * Now there is a chance that the
				 * init() function mucked with some
				 * things before it failed, such as
				 * hpts or inp_flags2 or timer granularity.
				 * It should not have, but let's give the
				 * old stack a chance to reset to a known
				 * good state.
				 */
				if (tp->t_fb->tfb_switch_failed) {
					(*tp->t_fb->tfb_switch_failed)(tp);
				}
				goto err_out;
			}
		}
		if (tp->t_fb->tfb_tcp_fb_fini) {
			struct epoch_tracker et;
			/*
			 * Tell the stack to cleanup with 0 i.e.
			 * the tcb is not going away.
			 */
			NET_EPOCH_ENTER(et);
			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
			NET_EPOCH_EXIT(et);
		}
		/*
		 * Release the old refcnt, the
		 * lookup acquired a ref on the
		 * new one already.
		 */
		refcount_release(&tp->t_fb->tfb_refcnt);
		/*
		 * Set in the new stack.
		 */
		tp->t_fb = blk;
		tp->t_fb_ptr = ptr;
#ifdef TCP_OFFLOAD
		if (tp->t_flags & TF_TOE) {
			tcp_offload_ctloutput(tp, sopt->sopt_dir,
			    sopt->sopt_name);
		}
#endif
err_out:
		INP_WUNLOCK(inp);
		return (error);

	}

	/* Pass in the INP locked, callee must unlock it. */
	return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt));
}

/*
 * Handle SOPT_GET for TCP.  Non-TCP levels go to the IP layer;
 * TCP_FUNCTION_BLK/TCP_FUNCTION_ALIAS report the current stack name
 * here; everything else is answered by the stack's tfb_tcp_ctloutput.
 * Called with the inpcb write-locked; the lock is always released
 * before returning.
 */
static int
tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
{
	struct socket *so = inp->inp_socket;
	struct tcpcb *tp = intotcpcb(inp);
	int error = 0;

	MPASS(sopt->sopt_dir == SOPT_GET);
	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("inp_flags == %x", inp->inp_flags));
	KASSERT(so != NULL, ("inp_socket == NULL"));

	if (sopt->sopt_level != IPPROTO_TCP) {
		INP_WUNLOCK(inp);
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6PROTO)
			error = ip6_ctloutput(so, sopt);
#endif /* INET6 */
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET
			error = ip_ctloutput(so, sopt);
#endif
		return (error);
	}
	if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
	     (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
		struct tcp_function_set fsn;

		if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
			memset(&fsn, 0, sizeof(fsn));
			find_tcp_function_alias(tp->t_fb, &fsn);
		} else {
			strncpy(fsn.function_set_name,
			    tp->t_fb->tfb_tcp_block_name,
			    TCP_FUNCTION_NAME_LEN_MAX);
			/* strncpy() does not guarantee termination. */
			fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
		}
		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &fsn, sizeof fsn);
		return (error);
	}

	/* Pass in the INP locked, callee must unlock it. */
	return (tp->t_fb->tfb_tcp_ctloutput(tp, sopt));
}

/*
 * Socket option entry point for TCP.  Dispatches to the set/get
 * handlers above, which consume (unlock) the inpcb lock.
 */
int
tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));

	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	if (sopt->sopt_dir == SOPT_SET)
		return (tcp_ctloutput_set(inp, sopt));
	else if (sopt->sopt_dir == SOPT_GET)
		return (tcp_ctloutput_get(inp, sopt));
	else
		panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
}

/*
 * If this assert becomes untrue, we need to change the size of the buf
 * variable in tcp_default_ctloutput().
 */
#ifdef CTASSERT
CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
#endif

extern struct cc_algo newreno_cc_algo;

/*
 * Switch this connection to a new congestion control algorithm
 * (TCP_CONGESTION).  The inpcb lock must be dropped for the copyin and
 * the (possibly sleeping) memory allocation, so the pcb is revalidated
 * after re-locking; a temporary reference on the algorithm keeps it
 * from being unloaded across the unlocked window.  Called with the
 * inpcb write-locked; always returns with it unlocked.
 */
static int
tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
{
	struct cc_algo *algo;
	void *ptr = NULL;
	struct tcpcb *tp;
	struct cc_var cc_mem;
	char buf[TCP_CA_NAME_MAX];
	size_t mem_sz;
	int error;

	INP_WUNLOCK(inp);
	/* sooptcopyin() caps sopt_valsize, so the '\0' store is in bounds. */
	error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
	if (error)
		return(error);
	buf[sopt->sopt_valsize] = '\0';
	CC_LIST_RLOCK();
	STAILQ_FOREACH(algo, &cc_list, entries) {
		if (strncmp(buf, algo->name,
		    TCP_CA_NAME_MAX) == 0) {
			if (algo->flags & CC_MODULE_BEING_REMOVED) {
				/* We can't "see" modules being unloaded */
				continue;
			}
			break;
		}
	}
	if (algo == NULL) {
		CC_LIST_RUNLOCK();
		return(ESRCH);
	}
	/*
	 * With a reference the algorithm cannot be removed
	 * so we hold a reference through the change process.
	 */
	cc_refer(algo);
	CC_LIST_RUNLOCK();
	if (algo->cb_init != NULL) {
		/* We can now pre-get the memory for the CC */
		mem_sz = (*algo->cc_data_sz)();
		if (mem_sz == 0) {
			goto no_mem_needed;
		}
		ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
	} else {
no_mem_needed:
		mem_sz = 0;
		ptr = NULL;
	}
	/*
	 * Make sure its all clean and zero and also get
	 * back the inplock.
	 */
	memset(&cc_mem, 0, sizeof(cc_mem));
	INP_WLOCK(inp);
	if (inp->inp_flags & INP_DROPPED) {
		/* Connection went away while unlocked; undo everything. */
		INP_WUNLOCK(inp);
		if (ptr)
			free(ptr, M_CC_MEM);
		/* Release our temp reference */
		CC_LIST_RLOCK();
		cc_release(algo);
		CC_LIST_RUNLOCK();
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	if (ptr != NULL)
		memset(ptr, 0, mem_sz);
	cc_mem.tp = tp;
	/*
	 * We once again hold a write lock over the tcb so it's
	 * safe to do these things without ordering concerns.
	 * Note here we init into stack memory.
	 */
	if (algo->cb_init != NULL)
		error = algo->cb_init(&cc_mem, ptr);
	else
		error = 0;
	/*
	 * The CC algorithms, when given their memory
	 * should not fail we could in theory have a
	 * KASSERT here.
	 */
	if (error == 0) {
		/*
		 * Touchdown, lets go ahead and move the
		 * connection to the new CC module by
		 * copying in the cc_mem after we call
		 * the old ones cleanup (if any).
		 */
		if (CC_ALGO(tp)->cb_destroy != NULL)
			CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
		/* Detach the old CC from the tcpcb  */
		cc_detach(tp);
		/* Copy in our temp memory that was inited */
		memcpy(&tp->t_ccv, &cc_mem, sizeof(struct cc_var));
		/* Now attach the new, which takes a reference */
		cc_attach(tp, algo);
		/* Ok now are we where we have gotten past any conn_init? */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
			/* Yep run the connection init for the new CC */
			CC_ALGO(tp)->conn_init(&tp->t_ccv);
		}
	} else if (ptr)
		free(ptr, M_CC_MEM);
	INP_WUNLOCK(inp);
	/* Now lets release our temp reference */
	CC_LIST_RLOCK();
	cc_release(algo);
	CC_LIST_RUNLOCK();
	return (error);
}

int
tcp_default_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
{
	struct inpcb *inp = tptoinpcb(tp);
	int	error, opt, optval;
	u_int	ui;
	struct	tcp_info ti;
#ifdef KERN_TLS
	struct tls_enable tls;
	struct socket *so = inp->inp_socket;
#endif
	char	*pbuf, buf[TCP_LOG_ID_LEN];
#ifdef STATS
	struct statsblob *sbp;
#endif
	size_t	len;

	INP_WLOCK_ASSERT(inp);
	KASSERT((inp->inp_flags & INP_DROPPED) == 0,
	    ("inp_flags == %x", inp->inp_flags));
	KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL"));

	switch (sopt->sopt_level) {
#ifdef INET6
	case IPPROTO_IPV6:
		MPASS(inp->inp_vflag & INP_IPV6PROTO);
		switch (sopt->sopt_name) {
		case IPV6_USE_MIN_MTU:
			tcp6_use_min_mtu(tp);
			/* FALLTHROUGH */
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
#ifdef INET
	case IPPROTO_IP:
		INP_WUNLOCK(inp);
		return (0);
#endif
	}

	/*
	 * For TCP_CCALGOOPT forward the control to CC module, for both
	 * SOPT_SET and SOPT_GET.
2073 */ 2074 switch (sopt->sopt_name) { 2075 case TCP_CCALGOOPT: 2076 INP_WUNLOCK(inp); 2077 if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT) 2078 return (EINVAL); 2079 pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO); 2080 error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize, 2081 sopt->sopt_valsize); 2082 if (error) { 2083 free(pbuf, M_TEMP); 2084 return (error); 2085 } 2086 INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP)); 2087 if (CC_ALGO(tp)->ctl_output != NULL) 2088 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, sopt, pbuf); 2089 else 2090 error = ENOENT; 2091 INP_WUNLOCK(inp); 2092 if (error == 0 && sopt->sopt_dir == SOPT_GET) 2093 error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize); 2094 free(pbuf, M_TEMP); 2095 return (error); 2096 } 2097 2098 switch (sopt->sopt_dir) { 2099 case SOPT_SET: 2100 switch (sopt->sopt_name) { 2101 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 2102 case TCP_MD5SIG: 2103 INP_WUNLOCK(inp); 2104 if (!TCPMD5_ENABLED()) 2105 return (ENOPROTOOPT); 2106 error = TCPMD5_PCBCTL(inp, sopt); 2107 if (error) 2108 return (error); 2109 INP_WLOCK_RECHECK(inp); 2110 goto unlock_and_done; 2111 #endif /* IPSEC */ 2112 2113 case TCP_NODELAY: 2114 case TCP_NOOPT: 2115 INP_WUNLOCK(inp); 2116 error = sooptcopyin(sopt, &optval, sizeof optval, 2117 sizeof optval); 2118 if (error) 2119 return (error); 2120 2121 INP_WLOCK_RECHECK(inp); 2122 switch (sopt->sopt_name) { 2123 case TCP_NODELAY: 2124 opt = TF_NODELAY; 2125 break; 2126 case TCP_NOOPT: 2127 opt = TF_NOOPT; 2128 break; 2129 default: 2130 opt = 0; /* dead code to fool gcc */ 2131 break; 2132 } 2133 2134 if (optval) 2135 tp->t_flags |= opt; 2136 else 2137 tp->t_flags &= ~opt; 2138 unlock_and_done: 2139 #ifdef TCP_OFFLOAD 2140 if (tp->t_flags & TF_TOE) { 2141 tcp_offload_ctloutput(tp, sopt->sopt_dir, 2142 sopt->sopt_name); 2143 } 2144 #endif 2145 INP_WUNLOCK(inp); 2146 break; 2147 2148 case TCP_NOPUSH: 2149 INP_WUNLOCK(inp); 2150 error = sooptcopyin(sopt, &optval, sizeof optval, 2151 sizeof 
optval); 2152 if (error) 2153 return (error); 2154 2155 INP_WLOCK_RECHECK(inp); 2156 if (optval) 2157 tp->t_flags |= TF_NOPUSH; 2158 else if (tp->t_flags & TF_NOPUSH) { 2159 tp->t_flags &= ~TF_NOPUSH; 2160 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 2161 struct epoch_tracker et; 2162 2163 NET_EPOCH_ENTER(et); 2164 error = tcp_output_nodrop(tp); 2165 NET_EPOCH_EXIT(et); 2166 } 2167 } 2168 goto unlock_and_done; 2169 2170 case TCP_REMOTE_UDP_ENCAPS_PORT: 2171 INP_WUNLOCK(inp); 2172 error = sooptcopyin(sopt, &optval, sizeof optval, 2173 sizeof optval); 2174 if (error) 2175 return (error); 2176 if ((optval < TCP_TUNNELING_PORT_MIN) || 2177 (optval > TCP_TUNNELING_PORT_MAX)) { 2178 /* Its got to be in range */ 2179 return (EINVAL); 2180 } 2181 if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) { 2182 /* You have to have enabled a UDP tunneling port first */ 2183 return (EINVAL); 2184 } 2185 INP_WLOCK_RECHECK(inp); 2186 if (tp->t_state != TCPS_CLOSED) { 2187 /* You can't change after you are connected */ 2188 error = EINVAL; 2189 } else { 2190 /* Ok we are all good set the port */ 2191 tp->t_port = htons(optval); 2192 } 2193 goto unlock_and_done; 2194 2195 case TCP_MAXSEG: 2196 INP_WUNLOCK(inp); 2197 error = sooptcopyin(sopt, &optval, sizeof optval, 2198 sizeof optval); 2199 if (error) 2200 return (error); 2201 2202 INP_WLOCK_RECHECK(inp); 2203 if (optval > 0 && optval <= tp->t_maxseg && 2204 optval + 40 >= V_tcp_minmss) { 2205 tp->t_maxseg = optval; 2206 if (tp->t_maxseg < V_tcp_mssdflt) { 2207 /* 2208 * The MSS is so small we should not process incoming 2209 * SACK's since we are subject to attack in such a 2210 * case. 
2211 */ 2212 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; 2213 } else { 2214 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; 2215 } 2216 } else 2217 error = EINVAL; 2218 goto unlock_and_done; 2219 2220 case TCP_INFO: 2221 INP_WUNLOCK(inp); 2222 error = EINVAL; 2223 break; 2224 2225 case TCP_STATS: 2226 INP_WUNLOCK(inp); 2227 #ifdef STATS 2228 error = sooptcopyin(sopt, &optval, sizeof optval, 2229 sizeof optval); 2230 if (error) 2231 return (error); 2232 2233 if (optval > 0) 2234 sbp = stats_blob_alloc( 2235 V_tcp_perconn_stats_dflt_tpl, 0); 2236 else 2237 sbp = NULL; 2238 2239 INP_WLOCK_RECHECK(inp); 2240 if ((tp->t_stats != NULL && sbp == NULL) || 2241 (tp->t_stats == NULL && sbp != NULL)) { 2242 struct statsblob *t = tp->t_stats; 2243 tp->t_stats = sbp; 2244 sbp = t; 2245 } 2246 INP_WUNLOCK(inp); 2247 2248 stats_blob_destroy(sbp); 2249 #else 2250 return (EOPNOTSUPP); 2251 #endif /* !STATS */ 2252 break; 2253 2254 case TCP_CONGESTION: 2255 error = tcp_set_cc_mod(inp, sopt); 2256 break; 2257 2258 case TCP_REUSPORT_LB_NUMA: 2259 INP_WUNLOCK(inp); 2260 error = sooptcopyin(sopt, &optval, sizeof(optval), 2261 sizeof(optval)); 2262 INP_WLOCK_RECHECK(inp); 2263 if (!error) 2264 error = in_pcblbgroup_numa(inp, optval); 2265 INP_WUNLOCK(inp); 2266 break; 2267 2268 #ifdef KERN_TLS 2269 case TCP_TXTLS_ENABLE: 2270 INP_WUNLOCK(inp); 2271 error = ktls_copyin_tls_enable(sopt, &tls); 2272 if (error != 0) 2273 break; 2274 error = ktls_enable_tx(so, &tls); 2275 ktls_cleanup_tls_enable(&tls); 2276 break; 2277 case TCP_TXTLS_MODE: 2278 INP_WUNLOCK(inp); 2279 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); 2280 if (error != 0) 2281 return (error); 2282 2283 INP_WLOCK_RECHECK(inp); 2284 error = ktls_set_tx_mode(so, ui); 2285 INP_WUNLOCK(inp); 2286 break; 2287 case TCP_RXTLS_ENABLE: 2288 INP_WUNLOCK(inp); 2289 error = ktls_copyin_tls_enable(sopt, &tls); 2290 if (error != 0) 2291 break; 2292 error = ktls_enable_rx(so, &tls); 2293 ktls_cleanup_tls_enable(&tls); 2294 break; 2295 #endif 2296 
case TCP_MAXUNACKTIME: 2297 case TCP_KEEPIDLE: 2298 case TCP_KEEPINTVL: 2299 case TCP_KEEPINIT: 2300 INP_WUNLOCK(inp); 2301 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); 2302 if (error) 2303 return (error); 2304 2305 if (ui > (UINT_MAX / hz)) { 2306 error = EINVAL; 2307 break; 2308 } 2309 ui *= hz; 2310 2311 INP_WLOCK_RECHECK(inp); 2312 switch (sopt->sopt_name) { 2313 case TCP_MAXUNACKTIME: 2314 tp->t_maxunacktime = ui; 2315 break; 2316 2317 case TCP_KEEPIDLE: 2318 tp->t_keepidle = ui; 2319 /* 2320 * XXX: better check current remaining 2321 * timeout and "merge" it with new value. 2322 */ 2323 if ((tp->t_state > TCPS_LISTEN) && 2324 (tp->t_state <= TCPS_CLOSING)) 2325 tcp_timer_activate(tp, TT_KEEP, 2326 TP_KEEPIDLE(tp)); 2327 break; 2328 case TCP_KEEPINTVL: 2329 tp->t_keepintvl = ui; 2330 if ((tp->t_state == TCPS_FIN_WAIT_2) && 2331 (TP_MAXIDLE(tp) > 0)) 2332 tcp_timer_activate(tp, TT_2MSL, 2333 TP_MAXIDLE(tp)); 2334 break; 2335 case TCP_KEEPINIT: 2336 tp->t_keepinit = ui; 2337 if (tp->t_state == TCPS_SYN_RECEIVED || 2338 tp->t_state == TCPS_SYN_SENT) 2339 tcp_timer_activate(tp, TT_KEEP, 2340 TP_KEEPINIT(tp)); 2341 break; 2342 } 2343 goto unlock_and_done; 2344 2345 case TCP_KEEPCNT: 2346 INP_WUNLOCK(inp); 2347 error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui)); 2348 if (error) 2349 return (error); 2350 2351 INP_WLOCK_RECHECK(inp); 2352 tp->t_keepcnt = ui; 2353 if ((tp->t_state == TCPS_FIN_WAIT_2) && 2354 (TP_MAXIDLE(tp) > 0)) 2355 tcp_timer_activate(tp, TT_2MSL, 2356 TP_MAXIDLE(tp)); 2357 goto unlock_and_done; 2358 2359 case TCP_FASTOPEN: { 2360 struct tcp_fastopen tfo_optval; 2361 2362 INP_WUNLOCK(inp); 2363 if (!V_tcp_fastopen_client_enable && 2364 !V_tcp_fastopen_server_enable) 2365 return (EPERM); 2366 2367 error = sooptcopyin(sopt, &tfo_optval, 2368 sizeof(tfo_optval), sizeof(int)); 2369 if (error) 2370 return (error); 2371 2372 INP_WLOCK_RECHECK(inp); 2373 if ((tp->t_state != TCPS_CLOSED) && 2374 (tp->t_state != TCPS_LISTEN)) { 2375 error = 
EINVAL; 2376 goto unlock_and_done; 2377 } 2378 if (tfo_optval.enable) { 2379 if (tp->t_state == TCPS_LISTEN) { 2380 if (!V_tcp_fastopen_server_enable) { 2381 error = EPERM; 2382 goto unlock_and_done; 2383 } 2384 2385 if (tp->t_tfo_pending == NULL) 2386 tp->t_tfo_pending = 2387 tcp_fastopen_alloc_counter(); 2388 } else { 2389 /* 2390 * If a pre-shared key was provided, 2391 * stash it in the client cookie 2392 * field of the tcpcb for use during 2393 * connect. 2394 */ 2395 if (sopt->sopt_valsize == 2396 sizeof(tfo_optval)) { 2397 memcpy(tp->t_tfo_cookie.client, 2398 tfo_optval.psk, 2399 TCP_FASTOPEN_PSK_LEN); 2400 tp->t_tfo_client_cookie_len = 2401 TCP_FASTOPEN_PSK_LEN; 2402 } 2403 } 2404 tp->t_flags |= TF_FASTOPEN; 2405 } else 2406 tp->t_flags &= ~TF_FASTOPEN; 2407 goto unlock_and_done; 2408 } 2409 2410 #ifdef TCP_BLACKBOX 2411 case TCP_LOG: 2412 INP_WUNLOCK(inp); 2413 error = sooptcopyin(sopt, &optval, sizeof optval, 2414 sizeof optval); 2415 if (error) 2416 return (error); 2417 2418 INP_WLOCK_RECHECK(inp); 2419 error = tcp_log_state_change(tp, optval); 2420 goto unlock_and_done; 2421 2422 case TCP_LOGBUF: 2423 INP_WUNLOCK(inp); 2424 error = EINVAL; 2425 break; 2426 2427 case TCP_LOGID: 2428 INP_WUNLOCK(inp); 2429 error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0); 2430 if (error) 2431 break; 2432 buf[sopt->sopt_valsize] = '\0'; 2433 INP_WLOCK_RECHECK(inp); 2434 error = tcp_log_set_id(tp, buf); 2435 /* tcp_log_set_id() unlocks the INP. */ 2436 break; 2437 2438 case TCP_LOGDUMP: 2439 case TCP_LOGDUMPID: 2440 INP_WUNLOCK(inp); 2441 error = 2442 sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0); 2443 if (error) 2444 break; 2445 buf[sopt->sopt_valsize] = '\0'; 2446 INP_WLOCK_RECHECK(inp); 2447 if (sopt->sopt_name == TCP_LOGDUMP) { 2448 error = tcp_log_dump_tp_logbuf(tp, buf, 2449 M_WAITOK, true); 2450 INP_WUNLOCK(inp); 2451 } else { 2452 tcp_log_dump_tp_bucket_logbufs(tp, buf); 2453 /* 2454 * tcp_log_dump_tp_bucket_logbufs() drops the 2455 * INP lock. 
2456 */ 2457 } 2458 break; 2459 #endif 2460 2461 default: 2462 INP_WUNLOCK(inp); 2463 error = ENOPROTOOPT; 2464 break; 2465 } 2466 break; 2467 2468 case SOPT_GET: 2469 tp = intotcpcb(inp); 2470 switch (sopt->sopt_name) { 2471 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 2472 case TCP_MD5SIG: 2473 INP_WUNLOCK(inp); 2474 if (!TCPMD5_ENABLED()) 2475 return (ENOPROTOOPT); 2476 error = TCPMD5_PCBCTL(inp, sopt); 2477 break; 2478 #endif 2479 2480 case TCP_NODELAY: 2481 optval = tp->t_flags & TF_NODELAY; 2482 INP_WUNLOCK(inp); 2483 error = sooptcopyout(sopt, &optval, sizeof optval); 2484 break; 2485 case TCP_MAXSEG: 2486 optval = tp->t_maxseg; 2487 INP_WUNLOCK(inp); 2488 error = sooptcopyout(sopt, &optval, sizeof optval); 2489 break; 2490 case TCP_REMOTE_UDP_ENCAPS_PORT: 2491 optval = ntohs(tp->t_port); 2492 INP_WUNLOCK(inp); 2493 error = sooptcopyout(sopt, &optval, sizeof optval); 2494 break; 2495 case TCP_NOOPT: 2496 optval = tp->t_flags & TF_NOOPT; 2497 INP_WUNLOCK(inp); 2498 error = sooptcopyout(sopt, &optval, sizeof optval); 2499 break; 2500 case TCP_NOPUSH: 2501 optval = tp->t_flags & TF_NOPUSH; 2502 INP_WUNLOCK(inp); 2503 error = sooptcopyout(sopt, &optval, sizeof optval); 2504 break; 2505 case TCP_INFO: 2506 tcp_fill_info(tp, &ti); 2507 INP_WUNLOCK(inp); 2508 error = sooptcopyout(sopt, &ti, sizeof ti); 2509 break; 2510 case TCP_STATS: 2511 { 2512 #ifdef STATS 2513 int nheld; 2514 TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0; 2515 2516 error = 0; 2517 socklen_t outsbsz = sopt->sopt_valsize; 2518 if (tp->t_stats == NULL) 2519 error = ENOENT; 2520 else if (outsbsz >= tp->t_stats->cursz) 2521 outsbsz = tp->t_stats->cursz; 2522 else if (outsbsz >= sizeof(struct statsblob)) 2523 outsbsz = sizeof(struct statsblob); 2524 else 2525 error = EINVAL; 2526 INP_WUNLOCK(inp); 2527 if (error) 2528 break; 2529 2530 sbp = sopt->sopt_val; 2531 nheld = atop(round_page(((vm_offset_t)sbp) + 2532 (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp)); 2533 vm_page_t ma[nheld]; 
2534 if (vm_fault_quick_hold_pages( 2535 &curproc->p_vmspace->vm_map, (vm_offset_t)sbp, 2536 outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma, 2537 nheld) < 0) { 2538 error = EFAULT; 2539 break; 2540 } 2541 2542 if ((error = copyin_nofault(&(sbp->flags), &sbflags, 2543 SIZEOF_MEMBER(struct statsblob, flags)))) 2544 goto unhold; 2545 2546 INP_WLOCK_RECHECK(inp); 2547 error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats, 2548 sbflags | SB_CLONE_USRDSTNOFAULT); 2549 INP_WUNLOCK(inp); 2550 sopt->sopt_valsize = outsbsz; 2551 unhold: 2552 vm_page_unhold_pages(ma, nheld); 2553 #else 2554 INP_WUNLOCK(inp); 2555 error = EOPNOTSUPP; 2556 #endif /* !STATS */ 2557 break; 2558 } 2559 case TCP_CONGESTION: 2560 len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); 2561 INP_WUNLOCK(inp); 2562 error = sooptcopyout(sopt, buf, len + 1); 2563 break; 2564 case TCP_MAXUNACKTIME: 2565 case TCP_KEEPIDLE: 2566 case TCP_KEEPINTVL: 2567 case TCP_KEEPINIT: 2568 case TCP_KEEPCNT: 2569 switch (sopt->sopt_name) { 2570 case TCP_MAXUNACKTIME: 2571 ui = TP_MAXUNACKTIME(tp) / hz; 2572 break; 2573 case TCP_KEEPIDLE: 2574 ui = TP_KEEPIDLE(tp) / hz; 2575 break; 2576 case TCP_KEEPINTVL: 2577 ui = TP_KEEPINTVL(tp) / hz; 2578 break; 2579 case TCP_KEEPINIT: 2580 ui = TP_KEEPINIT(tp) / hz; 2581 break; 2582 case TCP_KEEPCNT: 2583 ui = TP_KEEPCNT(tp); 2584 break; 2585 } 2586 INP_WUNLOCK(inp); 2587 error = sooptcopyout(sopt, &ui, sizeof(ui)); 2588 break; 2589 case TCP_FASTOPEN: 2590 optval = tp->t_flags & TF_FASTOPEN; 2591 INP_WUNLOCK(inp); 2592 error = sooptcopyout(sopt, &optval, sizeof optval); 2593 break; 2594 #ifdef TCP_BLACKBOX 2595 case TCP_LOG: 2596 optval = tcp_get_bblog_state(tp); 2597 INP_WUNLOCK(inp); 2598 error = sooptcopyout(sopt, &optval, sizeof(optval)); 2599 break; 2600 case TCP_LOGBUF: 2601 /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */ 2602 error = tcp_log_getlogbuf(sopt, tp); 2603 break; 2604 case TCP_LOGID: 2605 len = tcp_log_get_id(tp, buf); 2606 INP_WUNLOCK(inp); 2607 error = 
sooptcopyout(sopt, buf, len + 1);
			break;
		case TCP_LOGDUMP:
		case TCP_LOGDUMPID:
			/* BBLog dumps are triggered on the set side only. */
			INP_WUNLOCK(inp);
			error = EINVAL;
			break;
#endif
#ifdef KERN_TLS
		case TCP_TXTLS_MODE:
			error = ktls_get_tx_mode(so, &optval);
			INP_WUNLOCK(inp);
			if (error == 0)
				error = sooptcopyout(sopt, &optval,
				    sizeof(optval));
			break;
		case TCP_RXTLS_MODE:
			error = ktls_get_rx_mode(so, &optval);
			INP_WUNLOCK(inp);
			if (error == 0)
				error = sooptcopyout(sopt, &optval,
				    sizeof(optval));
			break;
#endif
		default:
			INP_WUNLOCK(inp);
			error = ENOPROTOOPT;
			break;
		}
		break;
	}
	return (error);
}
#undef INP_WLOCK_RECHECK
#undef INP_WLOCK_RECHECK_CLEANUP

/*
 * Initiate (or continue) disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 *
 * Called with the inpcb write lock held and within a network epoch
 * section (asserted below); tcp_close()/tcp_drop() may replace tp.
 */
static void
tcp_disconnect(struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct socket *so = tptosocket(tp);

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(inp);

	/*
	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
	 * socket is still open.
	 *
	 * The TF_FASTOPEN exception below keeps a TFO connection that is
	 * past LISTEN (and may already carry data) from being silently
	 * closed before it reaches ESTABLISHED.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    !(tp->t_state > TCPS_LISTEN && (tp->t_flags & TF_FASTOPEN))) {
		tp = tcp_close(tp);
		KASSERT(tp != NULL,
		    ("tcp_disconnect: tcp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		/* SO_LINGER with zero timeout: abortive close (RST). */
		tp = tcp_drop(tp, 0);
		KASSERT(tp != NULL,
		    ("tcp_disconnect: tcp_drop() returned NULL"));
	} else {
		/* Graceful close: discard pending receive data, send FIN. */
		soisdisconnecting(so);
		sbflush(&so->so_rcv);
		tcp_usrclosed(tp);
		if (!(inp->inp_flags & INP_DROPPED))
			/* Ignore stack's drop request, we already at it. */
			(void)tcp_output_nodrop(tp);
	}
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 *
 * Caller holds the inpcb write lock and is in a network epoch section.
 */
static void
tcp_usrclosed(struct tcpcb *tp)
{

	NET_EPOCH_ASSERT();
	INP_WLOCK_ASSERT(tptoinpcb(tp));

	switch (tp->t_state) {
	case TCPS_LISTEN:
#ifdef TCP_OFFLOAD
		tcp_offload_listen_stop(tp);
#endif
		tcp_state_change(tp, TCPS_CLOSED);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		tp = tcp_close(tp);
		/*
		 * tcp_close() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(tp != NULL,
		    ("tcp_usrclosed: tcp_close() returned NULL"));
		break;

	case TCPS_SYN_SENT:
	case TCPS_SYN_RECEIVED:
		/* Handshake in flight; emit FIN once it completes. */
		tp->t_flags |= TF_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		tcp_state_change(tp, TCPS_FIN_WAIT_1);
		break;

	case TCPS_CLOSE_WAIT:
		tcp_state_change(tp, TCPS_LAST_ACK);
		break;
	}
	/* Start tracking ACK progress for the close if not already. */
	if (tp->t_acktime == 0)
		tp->t_acktime = ticks;
	if (tp->t_state >= TCPS_FIN_WAIT_2) {
		tcp_free_sackholes(tp);
		soisdisconnected(tptosocket(tp));
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (tp->t_state == TCPS_FIN_WAIT_2) {
			int timeout;

			timeout = (tcp_fast_finwait2_recycle) ?
			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
			tcp_timer_activate(tp, TT_2MSL, timeout);
		}
	}
}

#ifdef DDB
/*
 * Emit 'indent' spaces on the DDB console.
 */
static void
db_print_indent(int indent)
{
	int i;

	for (i = 0; i < indent; i++)
		db_printf(" ");
}

/*
 * Print the symbolic name of a TCP FSM state value.
 */
static void
db_print_tstate(int t_state)
{

	switch (t_state) {
	case TCPS_CLOSED:
		db_printf("TCPS_CLOSED");
		return;

	case TCPS_LISTEN:
		db_printf("TCPS_LISTEN");
		return;

	case TCPS_SYN_SENT:
		db_printf("TCPS_SYN_SENT");
		return;

	case TCPS_SYN_RECEIVED:
		db_printf("TCPS_SYN_RECEIVED");
		return;

	case TCPS_ESTABLISHED:
		db_printf("TCPS_ESTABLISHED");
		return;

	case TCPS_CLOSE_WAIT:
		db_printf("TCPS_CLOSE_WAIT");
		return;

	case TCPS_FIN_WAIT_1:
		db_printf("TCPS_FIN_WAIT_1");
		return;

	case TCPS_CLOSING:
		db_printf("TCPS_CLOSING");
		return;

	case TCPS_LAST_ACK:
		db_printf("TCPS_LAST_ACK");
		return;

	case TCPS_FIN_WAIT_2:
		db_printf("TCPS_FIN_WAIT_2");
		return;

	case TCPS_TIME_WAIT:
		db_printf("TCPS_TIME_WAIT");
		return;

	default:
		db_printf("unknown");
		return;
} 2809 } 2810 2811 static void 2812 db_print_bblog_state(int state) 2813 { 2814 switch (state) { 2815 case TCP_LOG_STATE_RATIO_OFF: 2816 db_printf("TCP_LOG_STATE_RATIO_OFF"); 2817 break; 2818 case TCP_LOG_STATE_CLEAR: 2819 db_printf("TCP_LOG_STATE_CLEAR"); 2820 break; 2821 case TCP_LOG_STATE_OFF: 2822 db_printf("TCP_LOG_STATE_OFF"); 2823 break; 2824 case TCP_LOG_STATE_TAIL: 2825 db_printf("TCP_LOG_STATE_TAIL"); 2826 break; 2827 case TCP_LOG_STATE_HEAD: 2828 db_printf("TCP_LOG_STATE_HEAD"); 2829 break; 2830 case TCP_LOG_STATE_HEAD_AUTO: 2831 db_printf("TCP_LOG_STATE_HEAD_AUTO"); 2832 break; 2833 case TCP_LOG_STATE_CONTINUAL: 2834 db_printf("TCP_LOG_STATE_CONTINUAL"); 2835 break; 2836 case TCP_LOG_STATE_TAIL_AUTO: 2837 db_printf("TCP_LOG_STATE_TAIL_AUTO"); 2838 break; 2839 case TCP_LOG_VIA_BBPOINTS: 2840 db_printf("TCP_LOG_STATE_BBPOINTS"); 2841 break; 2842 default: 2843 db_printf("UNKNOWN(%d)", state); 2844 break; 2845 } 2846 } 2847 2848 static void 2849 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent, bool show_bblog, 2850 bool show_inpcb) 2851 { 2852 2853 db_print_indent(indent); 2854 db_printf("%s at %p\n", name, tp); 2855 2856 indent += 2; 2857 2858 if (show_inpcb) 2859 db_print_inpcb(tptoinpcb(tp), "t_inpcb", indent); 2860 2861 db_print_indent(indent); 2862 db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", 2863 TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); 2864 2865 db_print_indent(indent); 2866 db_printf("t_callout: %p t_timers: %p\n", 2867 &tp->t_callout, &tp->t_timers); 2868 2869 db_print_indent(indent); 2870 db_printf("t_state: %d (", tp->t_state); 2871 db_print_tstate(tp->t_state); 2872 db_printf(")\n"); 2873 2874 db_print_indent(indent); 2875 db_printf("t_flags: 0x%b\n", tp->t_flags, TF_BITS); 2876 2877 db_print_indent(indent); 2878 db_printf("t_flags2: 0x%b\n", tp->t_flags2, TF2_BITS); 2879 2880 db_print_indent(indent); 2881 db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: 0x%08x\n", 2882 tp->snd_una, tp->snd_max, 
tp->snd_nxt); 2883 2884 db_print_indent(indent); 2885 db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", 2886 tp->snd_up, tp->snd_wl1, tp->snd_wl2); 2887 2888 db_print_indent(indent); 2889 db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", 2890 tp->iss, tp->irs, tp->rcv_nxt); 2891 2892 db_print_indent(indent); 2893 db_printf("rcv_adv: 0x%08x rcv_wnd: %u rcv_up: 0x%08x\n", 2894 tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); 2895 2896 db_print_indent(indent); 2897 db_printf("snd_wnd: %u snd_cwnd: %u\n", 2898 tp->snd_wnd, tp->snd_cwnd); 2899 2900 db_print_indent(indent); 2901 db_printf("snd_ssthresh: %u snd_recover: " 2902 "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); 2903 2904 db_print_indent(indent); 2905 db_printf("t_rcvtime: %u t_startime: %u\n", 2906 tp->t_rcvtime, tp->t_starttime); 2907 2908 db_print_indent(indent); 2909 db_printf("t_rttime: %u t_rtsq: 0x%08x\n", 2910 tp->t_rtttime, tp->t_rtseq); 2911 2912 db_print_indent(indent); 2913 db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", 2914 tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); 2915 2916 db_print_indent(indent); 2917 db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u\n", 2918 tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin); 2919 2920 db_print_indent(indent); 2921 db_printf("t_rttupdated: %u max_sndwnd: %u t_softerror: %d\n", 2922 tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); 2923 2924 db_print_indent(indent); 2925 db_printf("t_oobflags: 0x%b t_iobc: 0x%02x\n", tp->t_oobflags, 2926 TCPOOB_BITS, tp->t_iobc); 2927 2928 db_print_indent(indent); 2929 db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", 2930 tp->snd_scale, tp->rcv_scale, tp->request_r_scale); 2931 2932 db_print_indent(indent); 2933 db_printf("ts_recent: %u ts_recent_age: %u\n", 2934 tp->ts_recent, tp->ts_recent_age); 2935 2936 db_print_indent(indent); 2937 db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " 2938 "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); 2939 2940 db_print_indent(indent); 2941 
db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x " 2942 "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, 2943 tp->snd_recover_prev, tp->t_badrxtwin); 2944 2945 db_print_indent(indent); 2946 db_printf("snd_numholes: %d snd_holes first: %p\n", 2947 tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); 2948 2949 db_print_indent(indent); 2950 db_printf("snd_fack: 0x%08x rcv_numsacks: %d\n", 2951 tp->snd_fack, tp->rcv_numsacks); 2952 2953 /* Skip sackblks, sackhint. */ 2954 2955 db_print_indent(indent); 2956 db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", 2957 tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); 2958 2959 db_print_indent(indent); 2960 db_printf("t_fb.tfb_tcp_block_name: %s\n", tp->t_fb->tfb_tcp_block_name); 2961 2962 db_print_indent(indent); 2963 db_printf("t_cc.name: %s\n", tp->t_cc->name); 2964 2965 db_print_indent(indent); 2966 db_printf("_t_logstate: %d (", tp->_t_logstate); 2967 db_print_bblog_state(tp->_t_logstate); 2968 db_printf(")\n"); 2969 2970 db_print_indent(indent); 2971 db_printf("t_lognum: %d t_loglimit: %d t_logsn: %u\n", 2972 tp->t_lognum, tp->t_loglimit, tp->t_logsn); 2973 2974 if (show_bblog) { 2975 #ifdef TCP_BLACKBOX 2976 db_print_bblog_entries(&tp->t_logs, indent); 2977 #else 2978 db_print_indent(indent); 2979 db_printf("BBLog not supported\n"); 2980 #endif 2981 } 2982 } 2983 2984 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) 2985 { 2986 struct tcpcb *tp; 2987 bool show_bblog, show_inpcb; 2988 2989 if (!have_addr) { 2990 db_printf("usage: show tcpcb[/bi] <addr>\n"); 2991 return; 2992 } 2993 show_bblog = strchr(modif, 'b') != NULL; 2994 show_inpcb = strchr(modif, 'i') != NULL; 2995 tp = (struct tcpcb *)addr; 2996 db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb); 2997 } 2998 2999 DB_SHOW_ALL_COMMAND(tcpcbs, db_show_all_tcpcbs) 3000 { 3001 VNET_ITERATOR_DECL(vnet_iter); 3002 struct inpcb *inp; 3003 struct tcpcb *tp; 3004 bool only_locked, show_bblog, show_inpcb; 3005 3006 only_locked = strchr(modif, 'l') != NULL; 3007 show_bblog = 
strchr(modif, 'b') != NULL; 3008 show_inpcb = strchr(modif, 'i') != NULL; 3009 VNET_FOREACH(vnet_iter) { 3010 CURVNET_SET(vnet_iter); 3011 CK_LIST_FOREACH(inp, &V_tcbinfo.ipi_listhead, inp_list) { 3012 if (only_locked && 3013 inp->inp_lock.rw_lock == RW_UNLOCKED) 3014 continue; 3015 tp = intotcpcb(inp); 3016 db_print_tcpcb(tp, "tcpcb", 0, show_bblog, show_inpcb); 3017 if (db_pager_quit) 3018 break; 3019 } 3020 CURVNET_RESTORE(); 3021 if (db_pager_quit) 3022 break; 3023 } 3024 } 3025 #endif 3026