1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 30 * $FreeBSD$ 31 */ 32 33 #include "opt_ipsec.h" 34 #include "opt_inet.h" 35 #include "opt_inet6.h" 36 #include "opt_tcpdebug.h" 37 38 #include <sys/param.h> 39 #include <sys/systm.h> 40 #include <sys/malloc.h> 41 #include <sys/kernel.h> 42 #include <sys/sysctl.h> 43 #include <sys/mbuf.h> 44 #ifdef INET6 45 #include <sys/domain.h> 46 #endif /* INET6 */ 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/protosw.h> 50 #include <sys/proc.h> 51 #include <sys/jail.h> 52 53 #include <net/if.h> 54 #include <net/route.h> 55 56 #include <netinet/in.h> 57 #include <netinet/in_systm.h> 58 #ifdef INET6 59 #include <netinet/ip6.h> 60 #endif 61 #include <netinet/in_pcb.h> 62 #ifdef INET6 63 #include <netinet6/in6_pcb.h> 64 #endif 65 #include <netinet/in_var.h> 66 #include <netinet/ip_var.h> 67 #ifdef INET6 68 #include <netinet6/ip6_var.h> 69 #include <netinet6/scope6_var.h> 70 #endif 71 #include <netinet/tcp.h> 72 #include <netinet/tcp_fsm.h> 73 #include <netinet/tcp_seq.h> 74 #include <netinet/tcp_timer.h> 75 #include <netinet/tcp_var.h> 76 #include <netinet/tcpip.h> 77 #ifdef TCPDEBUG 78 #include <netinet/tcp_debug.h> 79 #endif 80 81 #ifdef IPSEC 82 #include <netinet6/ipsec.h> 83 #endif /*IPSEC*/ 84 85 /* 86 * TCP protocol interface to socket abstraction. 87 */ 88 extern char *tcpstates[]; /* XXX ??? */ 89 90 static int tcp_attach(struct socket *); 91 static int tcp_connect(struct tcpcb *, struct sockaddr *, 92 struct thread *td); 93 #ifdef INET6 94 static int tcp6_connect(struct tcpcb *, struct sockaddr *, 95 struct thread *td); 96 #endif /* INET6 */ 97 static struct tcpcb * 98 tcp_disconnect(struct tcpcb *); 99 static struct tcpcb * 100 tcp_usrclosed(struct tcpcb *); 101 static void tcp_fill_info(struct tcpcb *, struct tcp_info *); 102 103 #ifdef TCPDEBUG 104 #define TCPDEBUG0 int ostate = 0 105 #define TCPDEBUG1() ostate = tp ? tp->t_state : 0 106 #define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 107 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 108 #else 109 #define TCPDEBUG0 110 #define TCPDEBUG1() 111 #define TCPDEBUG2(req) 112 #endif 113 114 /* 115 * TCP attaches to socket via pru_attach(), reserving space, 116 * and an internet control block. 117 */ 118 static int 119 tcp_usr_attach(struct socket *so, int proto, struct thread *td) 120 { 121 int error; 122 struct inpcb *inp; 123 struct tcpcb *tp = 0; 124 TCPDEBUG0; 125 126 INP_INFO_WLOCK(&tcbinfo); 127 TCPDEBUG1(); 128 inp = sotoinpcb(so); 129 if (inp) { 130 error = EISCONN; 131 goto out; 132 } 133 134 error = tcp_attach(so); 135 if (error) 136 goto out; 137 138 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 139 so->so_linger = TCP_LINGERTIME; 140 141 inp = sotoinpcb(so); 142 tp = intotcpcb(inp); 143 out: 144 TCPDEBUG2(PRU_ATTACH); 145 INP_INFO_WUNLOCK(&tcbinfo); 146 return error; 147 } 148 149 /* 150 * pru_detach() detaches the TCP protocol from the socket. 151 * If the protocol state is non-embryonic, then can't 152 * do this directly: have to initiate a pru_disconnect(), 153 * which may finish later; embryonic TCB's can just 154 * be discarded here. 155 */ 156 static int 157 tcp_usr_detach(struct socket *so) 158 { 159 int error = 0; 160 struct inpcb *inp; 161 struct tcpcb *tp; 162 TCPDEBUG0; 163 164 INP_INFO_WLOCK(&tcbinfo); 165 inp = sotoinpcb(so); 166 if (inp == NULL) { 167 INP_INFO_WUNLOCK(&tcbinfo); 168 return error; 169 } 170 INP_LOCK(inp); 171 tp = intotcpcb(inp); 172 TCPDEBUG1(); 173 tp = tcp_disconnect(tp); 174 175 TCPDEBUG2(PRU_DETACH); 176 if (tp) 177 INP_UNLOCK(inp); 178 INP_INFO_WUNLOCK(&tcbinfo); 179 return error; 180 } 181 182 #define INI_NOLOCK 0 183 #define INI_READ 1 184 #define INI_WRITE 2 185 186 #define COMMON_START() \ 187 TCPDEBUG0; \ 188 do { \ 189 if (inirw == INI_READ) \ 190 INP_INFO_RLOCK(&tcbinfo); \ 191 else if (inirw == INI_WRITE) \ 192 INP_INFO_WLOCK(&tcbinfo); \ 193 inp = sotoinpcb(so); \ 194 if (inp == 0) { \ 195 if (inirw == INI_READ) \ 196 INP_INFO_RUNLOCK(&tcbinfo); \ 197 else if (inirw == INI_WRITE) \ 198 INP_INFO_WUNLOCK(&tcbinfo); \ 199 return EINVAL; \ 200 } \ 201 INP_LOCK(inp); \ 202 if (inirw == INI_READ) \ 203 INP_INFO_RUNLOCK(&tcbinfo); \ 204 tp = intotcpcb(inp); \ 205 TCPDEBUG1(); \ 206 } while(0) 207 208 #define COMMON_END(req) \ 209 out: TCPDEBUG2(req); \ 210 do { \ 211 if (tp) \ 212 INP_UNLOCK(inp); \ 213 if (inirw == INI_WRITE) \ 214 INP_INFO_WUNLOCK(&tcbinfo); \ 215 return error; \ 216 goto out; \ 217 } while(0) 218 219 /* 220 * Give the socket an address. 221 */ 222 static int 223 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 224 { 225 int error = 0; 226 struct inpcb *inp; 227 struct tcpcb *tp; 228 struct sockaddr_in *sinp; 229 const int inirw = INI_WRITE; 230 231 sinp = (struct sockaddr_in *)nam; 232 if (nam->sa_len != sizeof (*sinp)) 233 return (EINVAL); 234 /* 235 * Must check for multicast addresses and disallow binding 236 * to them. 237 */ 238 if (sinp->sin_family == AF_INET && 239 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 240 return (EAFNOSUPPORT); 241 242 COMMON_START(); 243 error = in_pcbbind(inp, nam, td->td_ucred); 244 if (error) 245 goto out; 246 COMMON_END(PRU_BIND); 247 } 248 249 #ifdef INET6 250 static int 251 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 252 { 253 int error = 0; 254 struct inpcb *inp; 255 struct tcpcb *tp; 256 struct sockaddr_in6 *sin6p; 257 const int inirw = INI_WRITE; 258 259 sin6p = (struct sockaddr_in6 *)nam; 260 if (nam->sa_len != sizeof (*sin6p)) 261 return (EINVAL); 262 /* 263 * Must check for multicast addresses and disallow binding 264 * to them. 265 */ 266 if (sin6p->sin6_family == AF_INET6 && 267 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 268 return (EAFNOSUPPORT); 269 270 COMMON_START(); 271 inp->inp_vflag &= ~INP_IPV4; 272 inp->inp_vflag |= INP_IPV6; 273 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 274 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 275 inp->inp_vflag |= INP_IPV4; 276 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 277 struct sockaddr_in sin; 278 279 in6_sin6_2_sin(&sin, sin6p); 280 inp->inp_vflag |= INP_IPV4; 281 inp->inp_vflag &= ~INP_IPV6; 282 error = in_pcbbind(inp, (struct sockaddr *)&sin, 283 td->td_ucred); 284 goto out; 285 } 286 } 287 error = in6_pcbbind(inp, nam, td->td_ucred); 288 if (error) 289 goto out; 290 COMMON_END(PRU_BIND); 291 } 292 #endif /* INET6 */ 293 294 /* 295 * Prepare to accept connections. 296 */ 297 static int 298 tcp_usr_listen(struct socket *so, struct thread *td) 299 { 300 int error = 0; 301 struct inpcb *inp; 302 struct tcpcb *tp; 303 const int inirw = INI_WRITE; 304 305 COMMON_START(); 306 SOCK_LOCK(so); 307 error = solisten_proto_check(so); 308 if (error == 0 && inp->inp_lport == 0) 309 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 310 if (error == 0) { 311 tp->t_state = TCPS_LISTEN; 312 solisten_proto(so); 313 } 314 SOCK_UNLOCK(so); 315 COMMON_END(PRU_LISTEN); 316 } 317 318 #ifdef INET6 319 static int 320 tcp6_usr_listen(struct socket *so, struct thread *td) 321 { 322 int error = 0; 323 struct inpcb *inp; 324 struct tcpcb *tp; 325 const int inirw = INI_WRITE; 326 327 COMMON_START(); 328 SOCK_LOCK(so); 329 error = solisten_proto_check(so); 330 if (error == 0 && inp->inp_lport == 0) { 331 inp->inp_vflag &= ~INP_IPV4; 332 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) 333 inp->inp_vflag |= INP_IPV4; 334 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 335 } 336 if (error == 0) { 337 tp->t_state = TCPS_LISTEN; 338 solisten_proto(so); 339 } 340 SOCK_UNLOCK(so); 341 COMMON_END(PRU_LISTEN); 342 } 343 #endif /* INET6 */ 344 345 /* 346 * Initiate connection to peer. 347 * Create a template for use in transmissions on this connection. 348 * Enter SYN_SENT state, and mark socket as connecting. 349 * Start keep-alive timer, and seed output sequence space. 350 * Send initial segment on connection. 351 */ 352 static int 353 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 354 { 355 int error = 0; 356 struct inpcb *inp; 357 struct tcpcb *tp; 358 struct sockaddr_in *sinp; 359 const int inirw = INI_WRITE; 360 361 sinp = (struct sockaddr_in *)nam; 362 if (nam->sa_len != sizeof (*sinp)) 363 return (EINVAL); 364 /* 365 * Must disallow TCP ``connections'' to multicast addresses. 366 */ 367 if (sinp->sin_family == AF_INET 368 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 369 return (EAFNOSUPPORT); 370 if (jailed(td->td_ucred)) 371 prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); 372 373 COMMON_START(); 374 if ((error = tcp_connect(tp, nam, td)) != 0) 375 goto out; 376 error = tcp_output(tp); 377 COMMON_END(PRU_CONNECT); 378 } 379 380 #ifdef INET6 381 static int 382 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 383 { 384 int error = 0; 385 struct inpcb *inp; 386 struct tcpcb *tp; 387 struct sockaddr_in6 *sin6p; 388 const int inirw = INI_WRITE; 389 390 sin6p = (struct sockaddr_in6 *)nam; 391 if (nam->sa_len != sizeof (*sin6p)) 392 return (EINVAL); 393 /* 394 * Must disallow TCP ``connections'' to multicast addresses. 395 */ 396 if (sin6p->sin6_family == AF_INET6 397 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 398 return (EAFNOSUPPORT); 399 400 COMMON_START(); 401 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 402 struct sockaddr_in sin; 403 404 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 405 error = EINVAL; 406 goto out; 407 } 408 409 in6_sin6_2_sin(&sin, sin6p); 410 inp->inp_vflag |= INP_IPV4; 411 inp->inp_vflag &= ~INP_IPV6; 412 if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) 413 goto out; 414 error = tcp_output(tp); 415 goto out; 416 } 417 inp->inp_vflag &= ~INP_IPV4; 418 inp->inp_vflag |= INP_IPV6; 419 inp->inp_inc.inc_isipv6 = 1; 420 if ((error = tcp6_connect(tp, nam, td)) != 0) 421 goto out; 422 error = tcp_output(tp); 423 COMMON_END(PRU_CONNECT); 424 } 425 #endif /* INET6 */ 426 427 /* 428 * Initiate disconnect from peer. 429 * If connection never passed embryonic stage, just drop; 430 * else if don't need to let data drain, then can just drop anyways, 431 * else have to begin TCP shutdown process: mark socket disconnecting, 432 * drain unread data, state switch to reflect user close, and 433 * send segment (e.g. FIN) to peer. Socket will be really disconnected 434 * when peer sends FIN and acks ours. 435 * 436 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 437 */ 438 static int 439 tcp_usr_disconnect(struct socket *so) 440 { 441 int error = 0; 442 struct inpcb *inp; 443 struct tcpcb *tp; 444 const int inirw = INI_WRITE; 445 446 COMMON_START(); 447 tp = tcp_disconnect(tp); 448 COMMON_END(PRU_DISCONNECT); 449 } 450 451 /* 452 * Accept a connection. Essentially all the work is 453 * done at higher levels; just return the address 454 * of the peer, storing through addr. 455 */ 456 static int 457 tcp_usr_accept(struct socket *so, struct sockaddr **nam) 458 { 459 int error = 0; 460 struct inpcb *inp = NULL; 461 struct tcpcb *tp = NULL; 462 struct in_addr addr; 463 in_port_t port = 0; 464 TCPDEBUG0; 465 466 if (so->so_state & SS_ISDISCONNECTED) { 467 error = ECONNABORTED; 468 goto out; 469 } 470 471 INP_INFO_RLOCK(&tcbinfo); 472 inp = sotoinpcb(so); 473 if (!inp) { 474 INP_INFO_RUNLOCK(&tcbinfo); 475 return (EINVAL); 476 } 477 INP_LOCK(inp); 478 INP_INFO_RUNLOCK(&tcbinfo); 479 tp = intotcpcb(inp); 480 TCPDEBUG1(); 481 482 /* 483 * We inline in_setpeeraddr and COMMON_END here, so that we can 484 * copy the data of interest and defer the malloc until after we 485 * release the lock. 486 */ 487 port = inp->inp_fport; 488 addr = inp->inp_faddr; 489 490 out: TCPDEBUG2(PRU_ACCEPT); 491 if (tp) 492 INP_UNLOCK(inp); 493 if (error == 0) 494 *nam = in_sockaddr(port, &addr); 495 return error; 496 } 497 498 #ifdef INET6 499 static int 500 tcp6_usr_accept(struct socket *so, struct sockaddr **nam) 501 { 502 struct inpcb *inp = NULL; 503 int error = 0; 504 struct tcpcb *tp = NULL; 505 struct in_addr addr; 506 struct in6_addr addr6; 507 in_port_t port = 0; 508 int v4 = 0; 509 TCPDEBUG0; 510 511 if (so->so_state & SS_ISDISCONNECTED) { 512 error = ECONNABORTED; 513 goto out; 514 } 515 516 INP_INFO_RLOCK(&tcbinfo); 517 inp = sotoinpcb(so); 518 if (inp == 0) { 519 INP_INFO_RUNLOCK(&tcbinfo); 520 return (EINVAL); 521 } 522 INP_LOCK(inp); 523 INP_INFO_RUNLOCK(&tcbinfo); 524 tp = intotcpcb(inp); 525 TCPDEBUG1(); 526 /* 527 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can 528 * copy the data of interest and defer the malloc until after we 529 * release the lock. 530 */ 531 if (inp->inp_vflag & INP_IPV4) { 532 v4 = 1; 533 port = inp->inp_fport; 534 addr = inp->inp_faddr; 535 } else { 536 port = inp->inp_fport; 537 addr6 = inp->in6p_faddr; 538 } 539 540 out: TCPDEBUG2(PRU_ACCEPT); 541 if (tp) 542 INP_UNLOCK(inp); 543 if (error == 0) { 544 if (v4) 545 *nam = in6_v4mapsin6_sockaddr(port, &addr); 546 else 547 *nam = in6_sockaddr(port, &addr6); 548 } 549 return error; 550 } 551 #endif /* INET6 */ 552 553 /* 554 * This is the wrapper function for in_setsockaddr. We just pass down 555 * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking 556 * here because in_setsockaddr will call malloc and can block. 557 */ 558 static int 559 tcp_sockaddr(struct socket *so, struct sockaddr **nam) 560 { 561 return (in_setsockaddr(so, nam, &tcbinfo)); 562 } 563 564 /* 565 * This is the wrapper function for in_setpeeraddr. We just pass down 566 * the pcbinfo for in_setpeeraddr to lock. 567 */ 568 static int 569 tcp_peeraddr(struct socket *so, struct sockaddr **nam) 570 { 571 return (in_setpeeraddr(so, nam, &tcbinfo)); 572 } 573 574 /* 575 * Mark the connection as being incapable of further output. 576 */ 577 static int 578 tcp_usr_shutdown(struct socket *so) 579 { 580 int error = 0; 581 struct inpcb *inp; 582 struct tcpcb *tp; 583 const int inirw = INI_WRITE; 584 585 COMMON_START(); 586 socantsendmore(so); 587 tp = tcp_usrclosed(tp); 588 if (tp) 589 error = tcp_output(tp); 590 COMMON_END(PRU_SHUTDOWN); 591 } 592 593 /* 594 * After a receive, possibly send window update to peer. 595 */ 596 static int 597 tcp_usr_rcvd(struct socket *so, int flags) 598 { 599 int error = 0; 600 struct inpcb *inp; 601 struct tcpcb *tp; 602 const int inirw = INI_READ; 603 604 COMMON_START(); 605 tcp_output(tp); 606 COMMON_END(PRU_RCVD); 607 } 608 609 /* 610 * Do a send by putting data in output queue and updating urgent 611 * marker if URG set. Possibly send more data. Unlike the other 612 * pru_*() routines, the mbuf chains are our responsibility. We 613 * must either enqueue them or free them. The other pru_* routines 614 * generally are caller-frees. 615 */ 616 static int 617 tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 618 struct sockaddr *nam, struct mbuf *control, struct thread *td) 619 { 620 int error = 0; 621 struct inpcb *inp; 622 struct tcpcb *tp; 623 int unlocked = 0; 624 #ifdef INET6 625 int isipv6; 626 #endif 627 TCPDEBUG0; 628 629 /* 630 * Need write lock here because this function might call 631 * tcp_connect or tcp_usrclosed. 632 * We really want to have to this function upgrade from read lock 633 * to write lock. XXX 634 */ 635 INP_INFO_WLOCK(&tcbinfo); 636 inp = sotoinpcb(so); 637 if (inp == NULL) { 638 /* 639 * OOPS! we lost a race, the TCP session got reset after 640 * we checked SBS_CANTSENDMORE, eg: while doing uiomove or a 641 * network interrupt in the non-splnet() section of sosend(). 642 */ 643 if (m) 644 m_freem(m); 645 if (control) 646 m_freem(control); 647 error = ECONNRESET; /* XXX EPIPE? */ 648 tp = NULL; 649 TCPDEBUG1(); 650 goto out; 651 } 652 INP_LOCK(inp); 653 #ifdef INET6 654 isipv6 = nam && nam->sa_family == AF_INET6; 655 #endif /* INET6 */ 656 tp = intotcpcb(inp); 657 TCPDEBUG1(); 658 if (control) { 659 /* TCP doesn't do control messages (rights, creds, etc) */ 660 if (control->m_len) { 661 m_freem(control); 662 if (m) 663 m_freem(m); 664 error = EINVAL; 665 goto out; 666 } 667 m_freem(control); /* empty control, just free it */ 668 } 669 if (!(flags & PRUS_OOB)) { 670 sbappendstream(&so->so_snd, m); 671 if (nam && tp->t_state < TCPS_SYN_SENT) { 672 /* 673 * Do implied connect if not yet connected, 674 * initialize window to default value, and 675 * initialize maxseg/maxopd using peer's cached 676 * MSS. 677 */ 678 #ifdef INET6 679 if (isipv6) 680 error = tcp6_connect(tp, nam, td); 681 else 682 #endif /* INET6 */ 683 error = tcp_connect(tp, nam, td); 684 if (error) 685 goto out; 686 tp->snd_wnd = TTCP_CLIENT_SND_WND; 687 tcp_mss(tp, -1); 688 } 689 690 if (flags & PRUS_EOF) { 691 /* 692 * Close the send side of the connection after 693 * the data is sent. 694 */ 695 socantsendmore(so); 696 tp = tcp_usrclosed(tp); 697 } 698 INP_INFO_WUNLOCK(&tcbinfo); 699 unlocked = 1; 700 if (tp != NULL) { 701 if (flags & PRUS_MORETOCOME) 702 tp->t_flags |= TF_MORETOCOME; 703 error = tcp_output(tp); 704 if (flags & PRUS_MORETOCOME) 705 tp->t_flags &= ~TF_MORETOCOME; 706 } 707 } else { 708 SOCKBUF_LOCK(&so->so_snd); 709 if (sbspace(&so->so_snd) < -512) { 710 SOCKBUF_UNLOCK(&so->so_snd); 711 m_freem(m); 712 error = ENOBUFS; 713 goto out; 714 } 715 /* 716 * According to RFC961 (Assigned Protocols), 717 * the urgent pointer points to the last octet 718 * of urgent data. We continue, however, 719 * to consider it to indicate the first octet 720 * of data past the urgent section. 721 * Otherwise, snd_up should be one lower. 722 */ 723 sbappendstream_locked(&so->so_snd, m); 724 SOCKBUF_UNLOCK(&so->so_snd); 725 if (nam && tp->t_state < TCPS_SYN_SENT) { 726 /* 727 * Do implied connect if not yet connected, 728 * initialize window to default value, and 729 * initialize maxseg/maxopd using peer's cached 730 * MSS. 731 */ 732 #ifdef INET6 733 if (isipv6) 734 error = tcp6_connect(tp, nam, td); 735 else 736 #endif /* INET6 */ 737 error = tcp_connect(tp, nam, td); 738 if (error) 739 goto out; 740 tp->snd_wnd = TTCP_CLIENT_SND_WND; 741 tcp_mss(tp, -1); 742 } 743 INP_INFO_WUNLOCK(&tcbinfo); 744 unlocked = 1; 745 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 746 tp->t_flags |= TF_FORCEDATA; 747 error = tcp_output(tp); 748 tp->t_flags &= ~TF_FORCEDATA; 749 } 750 out: 751 TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : 752 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 753 if (tp) 754 INP_UNLOCK(inp); 755 if (!unlocked) 756 INP_INFO_WUNLOCK(&tcbinfo); 757 return (error); 758 } 759 760 /* 761 * Abort the TCP. 762 */ 763 static int 764 tcp_usr_abort(struct socket *so) 765 { 766 int error = 0; 767 struct inpcb *inp; 768 struct tcpcb *tp; 769 const int inirw = INI_WRITE; 770 771 COMMON_START(); 772 tp = tcp_drop(tp, ECONNABORTED); 773 COMMON_END(PRU_ABORT); 774 } 775 776 /* 777 * Receive out-of-band data. 778 */ 779 static int 780 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) 781 { 782 int error = 0; 783 struct inpcb *inp; 784 struct tcpcb *tp; 785 const int inirw = INI_READ; 786 787 COMMON_START(); 788 if ((so->so_oobmark == 0 && 789 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 790 so->so_options & SO_OOBINLINE || 791 tp->t_oobflags & TCPOOB_HADDATA) { 792 error = EINVAL; 793 goto out; 794 } 795 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 796 error = EWOULDBLOCK; 797 goto out; 798 } 799 m->m_len = 1; 800 *mtod(m, caddr_t) = tp->t_iobc; 801 if ((flags & MSG_PEEK) == 0) 802 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 803 COMMON_END(PRU_RCVOOB); 804 } 805 806 struct pr_usrreqs tcp_usrreqs = { 807 .pru_abort = tcp_usr_abort, 808 .pru_accept = tcp_usr_accept, 809 .pru_attach = tcp_usr_attach, 810 .pru_bind = tcp_usr_bind, 811 .pru_connect = tcp_usr_connect, 812 .pru_control = in_control, 813 .pru_detach = tcp_usr_detach, 814 .pru_disconnect = tcp_usr_disconnect, 815 .pru_listen = tcp_usr_listen, 816 .pru_peeraddr = tcp_peeraddr, 817 .pru_rcvd = tcp_usr_rcvd, 818 .pru_rcvoob = tcp_usr_rcvoob, 819 .pru_send = tcp_usr_send, 820 .pru_shutdown = tcp_usr_shutdown, 821 .pru_sockaddr = tcp_sockaddr, 822 .pru_sosetlabel = in_pcbsosetlabel 823 }; 824 825 #ifdef INET6 826 struct pr_usrreqs tcp6_usrreqs = { 827 .pru_abort = tcp_usr_abort, 828 .pru_accept = tcp6_usr_accept, 829 .pru_attach = tcp_usr_attach, 830 .pru_bind = tcp6_usr_bind, 831 .pru_connect = tcp6_usr_connect, 832 .pru_control = in6_control, 833 .pru_detach = tcp_usr_detach, 834 .pru_disconnect = tcp_usr_disconnect, 835 .pru_listen = tcp6_usr_listen, 836 .pru_peeraddr = in6_mapped_peeraddr, 837 .pru_rcvd = tcp_usr_rcvd, 838 .pru_rcvoob = tcp_usr_rcvoob, 839 .pru_send = tcp_usr_send, 840 .pru_shutdown = tcp_usr_shutdown, 841 .pru_sockaddr = in6_mapped_sockaddr, 842 .pru_sosetlabel = in_pcbsosetlabel 843 }; 844 #endif /* INET6 */ 845 846 /* 847 * Common subroutine to open a TCP connection to remote host specified 848 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 849 * port number if needed. Call in_pcbconnect_setup to do the routing and 850 * to choose a local host address (interface). If there is an existing 851 * incarnation of the same connection in TIME-WAIT state and if the remote 852 * host was sending CC options and if the connection duration was < MSL, then 853 * truncate the previous TIME-WAIT state and proceed. 854 * Initialize connection parameters and enter SYN-SENT state. 855 */ 856 static int 857 tcp_connect(tp, nam, td) 858 register struct tcpcb *tp; 859 struct sockaddr *nam; 860 struct thread *td; 861 { 862 struct inpcb *inp = tp->t_inpcb, *oinp; 863 struct socket *so = inp->inp_socket; 864 struct in_addr laddr; 865 u_short lport; 866 int error; 867 868 if (inp->inp_lport == 0) { 869 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 870 if (error) 871 return error; 872 } 873 874 /* 875 * Cannot simply call in_pcbconnect, because there might be an 876 * earlier incarnation of this same connection still in 877 * TIME_WAIT state, creating an ADDRINUSE error. 878 */ 879 laddr = inp->inp_laddr; 880 lport = inp->inp_lport; 881 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, 882 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); 883 if (error && oinp == NULL) 884 return error; 885 if (oinp) 886 return EADDRINUSE; 887 inp->inp_laddr = laddr; 888 in_pcbrehash(inp); 889 890 /* Compute window scaling to request. */ 891 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 892 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 893 tp->request_r_scale++; 894 895 soisconnecting(so); 896 tcpstat.tcps_connattempt++; 897 tp->t_state = TCPS_SYN_SENT; 898 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); 899 tp->iss = tcp_new_isn(tp); 900 tp->t_bw_rtseq = tp->iss; 901 tcp_sendseqinit(tp); 902 903 return 0; 904 } 905 906 #ifdef INET6 907 static int 908 tcp6_connect(tp, nam, td) 909 register struct tcpcb *tp; 910 struct sockaddr *nam; 911 struct thread *td; 912 { 913 struct inpcb *inp = tp->t_inpcb, *oinp; 914 struct socket *so = inp->inp_socket; 915 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 916 struct in6_addr *addr6; 917 int error; 918 919 if (inp->inp_lport == 0) { 920 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 921 if (error) 922 return error; 923 } 924 925 /* 926 * Cannot simply call in_pcbconnect, because there might be an 927 * earlier incarnation of this same connection still in 928 * TIME_WAIT state, creating an ADDRINUSE error. 929 * in6_pcbladdr() also handles scope zone IDs. 930 */ 931 error = in6_pcbladdr(inp, nam, &addr6); 932 if (error) 933 return error; 934 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 935 &sin6->sin6_addr, sin6->sin6_port, 936 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) 937 ? addr6 938 : &inp->in6p_laddr, 939 inp->inp_lport, 0, NULL); 940 if (oinp) 941 return EADDRINUSE; 942 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 943 inp->in6p_laddr = *addr6; 944 inp->in6p_faddr = sin6->sin6_addr; 945 inp->inp_fport = sin6->sin6_port; 946 /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ 947 inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; 948 if (inp->in6p_flags & IN6P_AUTOFLOWLABEL) 949 inp->in6p_flowinfo |= 950 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); 951 in_pcbrehash(inp); 952 953 /* Compute window scaling to request. */ 954 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 955 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 956 tp->request_r_scale++; 957 958 soisconnecting(so); 959 tcpstat.tcps_connattempt++; 960 tp->t_state = TCPS_SYN_SENT; 961 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); 962 tp->iss = tcp_new_isn(tp); 963 tp->t_bw_rtseq = tp->iss; 964 tcp_sendseqinit(tp); 965 966 return 0; 967 } 968 #endif /* INET6 */ 969 970 /* 971 * Export TCP internal state information via a struct tcp_info, based on the 972 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently 973 * (TCP state machine, etc). We export all information using FreeBSD-native 974 * constants -- for example, the numeric values for tcpi_state will differ 975 * from Linux. 976 */ 977 static void 978 tcp_fill_info(tp, ti) 979 struct tcpcb *tp; 980 struct tcp_info *ti; 981 { 982 983 INP_LOCK_ASSERT(tp->t_inpcb); 984 bzero(ti, sizeof(*ti)); 985 986 ti->tcpi_state = tp->t_state; 987 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 988 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 989 if (tp->sack_enable) 990 ti->tcpi_options |= TCPI_OPT_SACK; 991 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 992 ti->tcpi_options |= TCPI_OPT_WSCALE; 993 ti->tcpi_snd_wscale = tp->snd_scale; 994 ti->tcpi_rcv_wscale = tp->rcv_scale; 995 } 996 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 997 ti->tcpi_snd_cwnd = tp->snd_cwnd; 998 999 /* 1000 * FreeBSD-specific extension fields for tcp_info. 1001 */ 1002 ti->tcpi_rcv_space = tp->rcv_wnd; 1003 ti->tcpi_snd_wnd = tp->snd_wnd; 1004 ti->tcpi_snd_bwnd = tp->snd_bwnd; 1005 } 1006 1007 /* 1008 * The new sockopt interface makes it possible for us to block in the 1009 * copyin/out step (if we take a page fault). Taking a page fault at 1010 * splnet() is probably a Bad Thing. (Since sockets and pcbs both now 1011 * use TSM, there probably isn't any need for this function to run at 1012 * splnet() any more. This needs more examination.) 1013 * 1014 * XXXRW: The locking here is wrong; we may take a page fault while holding 1015 * the inpcb lock. 1016 */ 1017 int 1018 tcp_ctloutput(so, sopt) 1019 struct socket *so; 1020 struct sockopt *sopt; 1021 { 1022 int error, opt, optval; 1023 struct inpcb *inp; 1024 struct tcpcb *tp; 1025 struct tcp_info ti; 1026 1027 error = 0; 1028 INP_INFO_RLOCK(&tcbinfo); 1029 inp = sotoinpcb(so); 1030 if (inp == NULL) { 1031 INP_INFO_RUNLOCK(&tcbinfo); 1032 return (ECONNRESET); 1033 } 1034 INP_LOCK(inp); 1035 INP_INFO_RUNLOCK(&tcbinfo); 1036 if (sopt->sopt_level != IPPROTO_TCP) { 1037 INP_UNLOCK(inp); 1038 #ifdef INET6 1039 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1040 error = ip6_ctloutput(so, sopt); 1041 else 1042 #endif /* INET6 */ 1043 error = ip_ctloutput(so, sopt); 1044 return (error); 1045 } 1046 tp = intotcpcb(inp); 1047 1048 switch (sopt->sopt_dir) { 1049 case SOPT_SET: 1050 switch (sopt->sopt_name) { 1051 #ifdef TCP_SIGNATURE 1052 case TCP_MD5SIG: 1053 error = sooptcopyin(sopt, &optval, sizeof optval, 1054 sizeof optval); 1055 if (error) 1056 break; 1057 1058 if (optval > 0) 1059 tp->t_flags |= TF_SIGNATURE; 1060 else 1061 tp->t_flags &= ~TF_SIGNATURE; 1062 break; 1063 #endif /* TCP_SIGNATURE */ 1064 case TCP_NODELAY: 1065 case TCP_NOOPT: 1066 error = sooptcopyin(sopt, &optval, sizeof optval, 1067 sizeof optval); 1068 if (error) 1069 break; 1070 1071 switch (sopt->sopt_name) { 1072 case TCP_NODELAY: 1073 opt = TF_NODELAY; 1074 break; 1075 case TCP_NOOPT: 1076 opt = TF_NOOPT; 1077 break; 1078 default: 1079 opt = 0; /* dead code to fool gcc */ 1080 break; 1081 } 1082 1083 if (optval) 1084 tp->t_flags |= opt; 1085 else 1086 tp->t_flags &= ~opt; 1087 break; 1088 1089 case TCP_NOPUSH: 1090 error = sooptcopyin(sopt, &optval, sizeof optval, 1091 sizeof optval); 1092 if (error) 1093 break; 1094 1095 if (optval) 1096 tp->t_flags |= TF_NOPUSH; 1097 else { 1098 tp->t_flags &= ~TF_NOPUSH; 1099 error = tcp_output(tp); 1100 } 1101 break; 1102 1103 case TCP_MAXSEG: 1104 error = sooptcopyin(sopt, &optval, sizeof optval, 1105 sizeof optval); 1106 if (error) 1107 break; 1108 1109 if (optval > 0 && optval <= tp->t_maxseg && 1110 optval + 40 >= tcp_minmss) 1111 tp->t_maxseg = optval; 1112 else 1113 error = EINVAL; 1114 break; 1115 1116 case TCP_INFO: 1117 error = EINVAL; 1118 break; 1119 1120 default: 1121 error = ENOPROTOOPT; 1122 break; 1123 } 1124 break; 1125 1126 case SOPT_GET: 1127 switch (sopt->sopt_name) { 1128 #ifdef TCP_SIGNATURE 1129 case TCP_MD5SIG: 1130 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1131 error = sooptcopyout(sopt, &optval, sizeof optval); 1132 break; 1133 #endif 1134 case TCP_NODELAY: 1135 optval = tp->t_flags & TF_NODELAY; 1136 error = sooptcopyout(sopt, &optval, sizeof optval); 1137 break; 1138 case TCP_MAXSEG: 1139 optval = tp->t_maxseg; 1140 error = sooptcopyout(sopt, &optval, sizeof optval); 1141 break; 1142 case TCP_NOOPT: 1143 optval = tp->t_flags & TF_NOOPT; 1144 error = sooptcopyout(sopt, &optval, sizeof optval); 1145 break; 1146 case TCP_NOPUSH: 1147 optval = tp->t_flags & TF_NOPUSH; 1148 error = sooptcopyout(sopt, &optval, sizeof optval); 1149 break; 1150 case TCP_INFO: 1151 tcp_fill_info(tp, &ti); 1152 error = sooptcopyout(sopt, &ti, sizeof ti); 1153 break; 1154 default: 1155 error = ENOPROTOOPT; 1156 break; 1157 } 1158 break; 1159 } 1160 INP_UNLOCK(inp); 1161 return (error); 1162 } 1163 1164 /* 1165 * tcp_sendspace and tcp_recvspace are the default send and receive window 1166 * sizes, respectively. These are obsolescent (this information should 1167 * be set by the route). 1168 */ 1169 u_long tcp_sendspace = 1024*32; 1170 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1171 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1172 u_long tcp_recvspace = 1024*64; 1173 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1174 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1175 1176 /* 1177 * Attach TCP protocol to socket, allocating 1178 * internet protocol control block, tcp control block, 1179 * bufer space, and entering LISTEN state if to accept connections. 1180 */ 1181 static int 1182 tcp_attach(so) 1183 struct socket *so; 1184 { 1185 register struct tcpcb *tp; 1186 struct inpcb *inp; 1187 int error; 1188 #ifdef INET6 1189 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; 1190 #endif 1191 1192 INP_INFO_WLOCK_ASSERT(&tcbinfo); 1193 1194 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 1195 error = soreserve(so, tcp_sendspace, tcp_recvspace); 1196 if (error) 1197 return (error); 1198 } 1199 error = in_pcballoc(so, &tcbinfo, "tcpinp"); 1200 if (error) 1201 return (error); 1202 inp = sotoinpcb(so); 1203 #ifdef INET6 1204 if (isipv6) { 1205 inp->inp_vflag |= INP_IPV6; 1206 inp->in6p_hops = -1; /* use kernel default */ 1207 } 1208 else 1209 #endif 1210 inp->inp_vflag |= INP_IPV4; 1211 tp = tcp_newtcpcb(inp); 1212 if (tp == 0) { 1213 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 1214 1215 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 1216 1217 INP_LOCK(inp); 1218 #ifdef INET6 1219 if (isipv6) 1220 in6_pcbdetach(inp); 1221 else 1222 #endif 1223 in_pcbdetach(inp); 1224 so->so_state |= nofd; 1225 return (ENOBUFS); 1226 } 1227 tp->t_state = TCPS_CLOSED; 1228 return (0); 1229 } 1230 1231 /* 1232 * Initiate (or continue) disconnect. 1233 * If embryonic state, just send reset (once). 1234 * If in ``let data drain'' option and linger null, just drop. 1235 * Otherwise (hard), mark socket disconnecting and drop 1236 * current input data; switch states based on user close, and 1237 * send segment to peer (with FIN). 1238 */ 1239 static struct tcpcb * 1240 tcp_disconnect(tp) 1241 register struct tcpcb *tp; 1242 { 1243 struct inpcb *inp = tp->t_inpcb; 1244 struct socket *so = inp->inp_socket; 1245 1246 INP_INFO_WLOCK_ASSERT(&tcbinfo); 1247 INP_LOCK_ASSERT(inp); 1248 1249 if (tp->t_state < TCPS_ESTABLISHED) 1250 tp = tcp_close(tp); 1251 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 1252 tp = tcp_drop(tp, 0); 1253 else { 1254 soisdisconnecting(so); 1255 sbflush(&so->so_rcv); 1256 tp = tcp_usrclosed(tp); 1257 if (tp) 1258 (void) tcp_output(tp); 1259 } 1260 return (tp); 1261 } 1262 1263 /* 1264 * User issued close, and wish to trail through shutdown states: 1265 * if never received SYN, just forget it. If got a SYN from peer, 1266 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1267 * If already got a FIN from peer, then almost done; go to LAST_ACK 1268 * state. In all other cases, have already sent FIN to peer (e.g. 1269 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1270 * for peer to send FIN or not respond to keep-alives, etc. 1271 * We can let the user exit from the close as soon as the FIN is acked. 1272 */ 1273 static struct tcpcb * 1274 tcp_usrclosed(tp) 1275 register struct tcpcb *tp; 1276 { 1277 1278 INP_INFO_WLOCK_ASSERT(&tcbinfo); 1279 INP_LOCK_ASSERT(tp->t_inpcb); 1280 1281 switch (tp->t_state) { 1282 1283 case TCPS_CLOSED: 1284 case TCPS_LISTEN: 1285 tp->t_state = TCPS_CLOSED; 1286 tp = tcp_close(tp); 1287 break; 1288 1289 case TCPS_SYN_SENT: 1290 case TCPS_SYN_RECEIVED: 1291 tp->t_flags |= TF_NEEDFIN; 1292 break; 1293 1294 case TCPS_ESTABLISHED: 1295 tp->t_state = TCPS_FIN_WAIT_1; 1296 break; 1297 1298 case TCPS_CLOSE_WAIT: 1299 tp->t_state = TCPS_LAST_ACK; 1300 break; 1301 } 1302 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1303 soisdisconnected(tp->t_inpcb->inp_socket); 1304 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1305 if (tp->t_state == TCPS_FIN_WAIT_2) 1306 callout_reset(tp->tt_2msl, tcp_maxidle, 1307 tcp_timer_2msl, tp); 1308 } 1309 return (tp); 1310 } 1311