1 /* 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 34 * $Id: tcp_usrreq.c,v 1.19 1995/11/09 20:23:09 phk Exp $ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/sysctl.h> 41 #include <sys/malloc.h> 42 #include <sys/mbuf.h> 43 #include <sys/socket.h> 44 #include <sys/socketvar.h> 45 #include <sys/protosw.h> 46 #include <sys/errno.h> 47 #include <sys/stat.h> 48 #include <vm/vm.h> 49 50 #include <net/if.h> 51 #include <net/route.h> 52 53 #include <netinet/in.h> 54 #include <netinet/in_systm.h> 55 #include <netinet/ip.h> 56 #include <netinet/in_pcb.h> 57 #include <netinet/in_var.h> 58 #include <netinet/ip_var.h> 59 #include <netinet/tcp.h> 60 #include <netinet/tcp_fsm.h> 61 #include <netinet/tcp_seq.h> 62 #include <netinet/tcp_timer.h> 63 #include <netinet/tcp_var.h> 64 #include <netinet/tcpip.h> 65 #ifdef TCPDEBUG 66 #include <netinet/tcp_debug.h> 67 #endif 68 69 /* 70 * TCP protocol interface to socket abstraction. 71 */ 72 extern char *tcpstates[]; 73 74 static int tcp_attach __P((struct socket *)); 75 static int tcp_connect __P((struct tcpcb *, struct mbuf *)); 76 static struct tcpcb * 77 tcp_disconnect __P((struct tcpcb *)); 78 static struct tcpcb * 79 tcp_usrclosed __P((struct tcpcb *)); 80 /* 81 * Process a TCP user request for TCP tb. If this is a send request 82 * then m is the mbuf chain of send data. If this is a timer expiration 83 * (called from the software clock routine), then timertype tells which timer. 84 */ 85 /*ARGSUSED*/ 86 int 87 tcp_usrreq(so, req, m, nam, control) 88 struct socket *so; 89 int req; 90 struct mbuf *m, *nam, *control; 91 { 92 register struct inpcb *inp; 93 register struct tcpcb *tp = 0; 94 struct sockaddr_in *sinp; 95 int s; 96 int error = 0; 97 #ifdef TCPDEBUG 98 int ostate; 99 #endif 100 101 if (req == PRU_CONTROL) 102 return (in_control(so, (u_long)m, (caddr_t)nam, 103 (struct ifnet *)control)); 104 if (control && control->m_len) { 105 m_freem(control); 106 if (m) 107 m_freem(m); 108 return (EINVAL); 109 } 110 111 s = splnet(); 112 inp = sotoinpcb(so); 113 /* 114 * When a TCP is attached to a socket, then there will be 115 * a (struct inpcb) pointed at by the socket, and this 116 * structure will point at a subsidary (struct tcpcb). 117 */ 118 if (inp == 0 && req != PRU_ATTACH) { 119 splx(s); 120 #if 0 121 /* 122 * The following corrects an mbuf leak under rare 123 * circumstances, but has not been fully tested. 124 */ 125 if (m && req != PRU_SENSE) 126 m_freem(m); 127 #else 128 /* safer version of fix for mbuf leak */ 129 if (m && (req == PRU_SEND || req == PRU_SENDOOB)) 130 m_freem(m); 131 #endif 132 return (EINVAL); /* XXX */ 133 } 134 if (inp) { 135 tp = intotcpcb(inp); 136 /* WHAT IF TP IS 0? */ 137 #ifdef KPROF 138 tcp_acounts[tp->t_state][req]++; 139 #endif 140 #ifdef TCPDEBUG 141 ostate = tp->t_state; 142 } else 143 ostate = 0; 144 #else /* TCPDEBUG */ 145 } 146 #endif /* TCPDEBUG */ 147 148 switch (req) { 149 150 /* 151 * TCP attaches to socket via PRU_ATTACH, reserving space, 152 * and an internet control block. 153 */ 154 case PRU_ATTACH: 155 if (inp) { 156 error = EISCONN; 157 break; 158 } 159 error = tcp_attach(so); 160 if (error) 161 break; 162 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 163 so->so_linger = TCP_LINGERTIME * hz; 164 tp = sototcpcb(so); 165 break; 166 167 /* 168 * PRU_DETACH detaches the TCP protocol from the socket. 169 * If the protocol state is non-embryonic, then can't 170 * do this directly: have to initiate a PRU_DISCONNECT, 171 * which may finish later; embryonic TCB's can just 172 * be discarded here. 173 */ 174 case PRU_DETACH: 175 if (tp->t_state > TCPS_LISTEN) 176 tp = tcp_disconnect(tp); 177 else 178 tp = tcp_close(tp); 179 break; 180 181 /* 182 * Give the socket an address. 183 */ 184 case PRU_BIND: 185 /* 186 * Must check for multicast addresses and disallow binding 187 * to them. 188 */ 189 sinp = mtod(nam, struct sockaddr_in *); 190 if (sinp->sin_family == AF_INET && 191 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 192 error = EAFNOSUPPORT; 193 break; 194 } 195 error = in_pcbbind(inp, nam); 196 if (error) 197 break; 198 break; 199 200 /* 201 * Prepare to accept connections. 202 */ 203 case PRU_LISTEN: 204 if (inp->inp_lport == 0) 205 error = in_pcbbind(inp, NULL); 206 if (error == 0) 207 tp->t_state = TCPS_LISTEN; 208 break; 209 210 /* 211 * Initiate connection to peer. 212 * Create a template for use in transmissions on this connection. 213 * Enter SYN_SENT state, and mark socket as connecting. 214 * Start keep-alive timer, and seed output sequence space. 215 * Send initial segment on connection. 216 */ 217 case PRU_CONNECT: 218 /* 219 * Must disallow TCP ``connections'' to multicast addresses. 220 */ 221 sinp = mtod(nam, struct sockaddr_in *); 222 if (sinp->sin_family == AF_INET 223 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 224 error = EAFNOSUPPORT; 225 break; 226 } 227 228 if ((error = tcp_connect(tp, nam)) != 0) 229 break; 230 error = tcp_output(tp); 231 break; 232 233 /* 234 * Create a TCP connection between two sockets. 235 */ 236 case PRU_CONNECT2: 237 error = EOPNOTSUPP; 238 break; 239 240 /* 241 * Initiate disconnect from peer. 242 * If connection never passed embryonic stage, just drop; 243 * else if don't need to let data drain, then can just drop anyways, 244 * else have to begin TCP shutdown process: mark socket disconnecting, 245 * drain unread data, state switch to reflect user close, and 246 * send segment (e.g. FIN) to peer. Socket will be really disconnected 247 * when peer sends FIN and acks ours. 248 * 249 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 250 */ 251 case PRU_DISCONNECT: 252 tp = tcp_disconnect(tp); 253 break; 254 255 /* 256 * Accept a connection. Essentially all the work is 257 * done at higher levels; just return the address 258 * of the peer, storing through addr. 259 */ 260 case PRU_ACCEPT: 261 in_setpeeraddr(inp, nam); 262 break; 263 264 /* 265 * Mark the connection as being incapable of further output. 266 */ 267 case PRU_SHUTDOWN: 268 socantsendmore(so); 269 tp = tcp_usrclosed(tp); 270 if (tp) 271 error = tcp_output(tp); 272 break; 273 274 /* 275 * After a receive, possibly send window update to peer. 276 */ 277 case PRU_RCVD: 278 (void) tcp_output(tp); 279 break; 280 281 /* 282 * Do a send by putting data in output queue and updating urgent 283 * marker if URG set. Possibly send more data. 284 */ 285 case PRU_SEND_EOF: 286 case PRU_SEND: 287 sbappend(&so->so_snd, m); 288 if (nam && tp->t_state < TCPS_SYN_SENT) { 289 /* 290 * Do implied connect if not yet connected, 291 * initialize window to default value, and 292 * initialize maxseg/maxopd using peer's cached 293 * MSS. 294 */ 295 error = tcp_connect(tp, nam); 296 if (error) 297 break; 298 tp->snd_wnd = TTCP_CLIENT_SND_WND; 299 tcp_mss(tp, -1); 300 } 301 302 if (req == PRU_SEND_EOF) { 303 /* 304 * Close the send side of the connection after 305 * the data is sent. 306 */ 307 socantsendmore(so); 308 tp = tcp_usrclosed(tp); 309 } 310 if (tp != NULL) 311 error = tcp_output(tp); 312 break; 313 314 /* 315 * Abort the TCP. 316 */ 317 case PRU_ABORT: 318 tp = tcp_drop(tp, ECONNABORTED); 319 break; 320 321 case PRU_SENSE: 322 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 323 (void) splx(s); 324 return (0); 325 326 case PRU_RCVOOB: 327 if ((so->so_oobmark == 0 && 328 (so->so_state & SS_RCVATMARK) == 0) || 329 so->so_options & SO_OOBINLINE || 330 tp->t_oobflags & TCPOOB_HADDATA) { 331 error = EINVAL; 332 break; 333 } 334 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 335 error = EWOULDBLOCK; 336 break; 337 } 338 m->m_len = 1; 339 *mtod(m, caddr_t) = tp->t_iobc; 340 if (((int)nam & MSG_PEEK) == 0) 341 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 342 break; 343 344 case PRU_SENDOOB: 345 if (sbspace(&so->so_snd) < -512) { 346 m_freem(m); 347 error = ENOBUFS; 348 break; 349 } 350 /* 351 * According to RFC961 (Assigned Protocols), 352 * the urgent pointer points to the last octet 353 * of urgent data. We continue, however, 354 * to consider it to indicate the first octet 355 * of data past the urgent section. 356 * Otherwise, snd_up should be one lower. 357 */ 358 sbappend(&so->so_snd, m); 359 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 360 tp->t_force = 1; 361 error = tcp_output(tp); 362 tp->t_force = 0; 363 break; 364 365 case PRU_SOCKADDR: 366 in_setsockaddr(inp, nam); 367 break; 368 369 case PRU_PEERADDR: 370 in_setpeeraddr(inp, nam); 371 break; 372 373 /* 374 * TCP slow timer went off; going through this 375 * routine for tracing's sake. 376 */ 377 case PRU_SLOWTIMO: 378 tp = tcp_timers(tp, (int)nam); 379 #ifdef TCPDEBUG 380 req |= (int)nam << 8; /* for debug's sake */ 381 #endif 382 break; 383 384 default: 385 panic("tcp_usrreq"); 386 } 387 #ifdef TCPDEBUG 388 if (tp && (so->so_options & SO_DEBUG)) 389 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); 390 #endif 391 splx(s); 392 return (error); 393 } 394 395 /* 396 * Common subroutine to open a TCP connection to remote host specified 397 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 398 * port number if needed. Call in_pcbladdr to do the routing and to choose 399 * a local host address (interface). If there is an existing incarnation 400 * of the same connection in TIME-WAIT state and if the remote host was 401 * sending CC options and if the connection duration was < MSL, then 402 * truncate the previous TIME-WAIT state and proceed. 403 * Initialize connection parameters and enter SYN-SENT state. 404 */ 405 static int 406 tcp_connect(tp, nam) 407 register struct tcpcb *tp; 408 struct mbuf *nam; 409 { 410 struct inpcb *inp = tp->t_inpcb, *oinp; 411 struct socket *so = inp->inp_socket; 412 struct tcpcb *otp; 413 struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); 414 struct sockaddr_in *ifaddr; 415 int error; 416 struct rmxp_tao *taop; 417 struct rmxp_tao tao_noncached; 418 419 if (inp->inp_lport == 0) { 420 error = in_pcbbind(inp, NULL); 421 if (error) 422 return error; 423 } 424 425 /* 426 * Cannot simply call in_pcbconnect, because there might be an 427 * earlier incarnation of this same connection still in 428 * TIME_WAIT state, creating an ADDRINUSE error. 429 */ 430 error = in_pcbladdr(inp, nam, &ifaddr); 431 if (error) 432 return error; 433 oinp = in_pcblookup(inp->inp_pcbinfo->listhead, 434 sin->sin_addr, sin->sin_port, 435 inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr 436 : ifaddr->sin_addr, 437 inp->inp_lport, 0); 438 if (oinp) { 439 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && 440 otp->t_state == TCPS_TIME_WAIT && 441 otp->t_duration < TCPTV_MSL && 442 (otp->t_flags & TF_RCVD_CC)) 443 otp = tcp_close(otp); 444 else 445 return EADDRINUSE; 446 } 447 if (inp->inp_laddr.s_addr == INADDR_ANY) 448 inp->inp_laddr = ifaddr->sin_addr; 449 inp->inp_faddr = sin->sin_addr; 450 inp->inp_fport = sin->sin_port; 451 in_pcbrehash(inp); 452 453 tp->t_template = tcp_template(tp); 454 if (tp->t_template == 0) { 455 in_pcbdisconnect(inp); 456 return ENOBUFS; 457 } 458 459 /* Compute window scaling to request. */ 460 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 461 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 462 tp->request_r_scale++; 463 464 soisconnecting(so); 465 tcpstat.tcps_connattempt++; 466 tp->t_state = TCPS_SYN_SENT; 467 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 468 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; 469 tcp_sendseqinit(tp); 470 471 /* 472 * Generate a CC value for this connection and 473 * check whether CC or CCnew should be used. 474 */ 475 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { 476 taop = &tao_noncached; 477 bzero(taop, sizeof(*taop)); 478 } 479 480 tp->cc_send = CC_INC(tcp_ccgen); 481 if (taop->tao_ccsent != 0 && 482 CC_GEQ(tp->cc_send, taop->tao_ccsent)) { 483 taop->tao_ccsent = tp->cc_send; 484 } else { 485 taop->tao_ccsent = 0; 486 tp->t_flags |= TF_SENDCCNEW; 487 } 488 489 return 0; 490 } 491 492 int 493 tcp_ctloutput(op, so, level, optname, mp) 494 int op; 495 struct socket *so; 496 int level, optname; 497 struct mbuf **mp; 498 { 499 int error = 0, s; 500 struct inpcb *inp; 501 register struct tcpcb *tp; 502 register struct mbuf *m; 503 register int i; 504 505 s = splnet(); 506 inp = sotoinpcb(so); 507 if (inp == NULL) { 508 splx(s); 509 if (op == PRCO_SETOPT && *mp) 510 (void) m_free(*mp); 511 return (ECONNRESET); 512 } 513 if (level != IPPROTO_TCP) { 514 error = ip_ctloutput(op, so, level, optname, mp); 515 splx(s); 516 return (error); 517 } 518 tp = intotcpcb(inp); 519 520 switch (op) { 521 522 case PRCO_SETOPT: 523 m = *mp; 524 switch (optname) { 525 526 case TCP_NODELAY: 527 if (m == NULL || m->m_len < sizeof (int)) 528 error = EINVAL; 529 else if (*mtod(m, int *)) 530 tp->t_flags |= TF_NODELAY; 531 else 532 tp->t_flags &= ~TF_NODELAY; 533 break; 534 535 case TCP_MAXSEG: 536 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) 537 tp->t_maxseg = i; 538 else 539 error = EINVAL; 540 break; 541 542 case TCP_NOOPT: 543 if (m == NULL || m->m_len < sizeof (int)) 544 error = EINVAL; 545 else if (*mtod(m, int *)) 546 tp->t_flags |= TF_NOOPT; 547 else 548 tp->t_flags &= ~TF_NOOPT; 549 break; 550 551 case TCP_NOPUSH: 552 if (m == NULL || m->m_len < sizeof (int)) 553 error = EINVAL; 554 else if (*mtod(m, int *)) 555 tp->t_flags |= TF_NOPUSH; 556 else 557 tp->t_flags &= ~TF_NOPUSH; 558 break; 559 560 default: 561 error = ENOPROTOOPT; 562 break; 563 } 564 if (m) 565 (void) m_free(m); 566 break; 567 568 case PRCO_GETOPT: 569 *mp = m = m_get(M_WAIT, MT_SOOPTS); 570 m->m_len = sizeof(int); 571 572 switch (optname) { 573 case TCP_NODELAY: 574 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 575 break; 576 case TCP_MAXSEG: 577 *mtod(m, int *) = tp->t_maxseg; 578 break; 579 case TCP_NOOPT: 580 *mtod(m, int *) = tp->t_flags & TF_NOOPT; 581 break; 582 case TCP_NOPUSH: 583 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 584 break; 585 default: 586 error = ENOPROTOOPT; 587 break; 588 } 589 break; 590 } 591 splx(s); 592 return (error); 593 } 594 595 /* 596 * tcp_sendspace and tcp_recvspace are the default send and receive window 597 * sizes, respectively. These are obsolescent (this information should 598 * be set by the route). 599 */ 600 u_long tcp_sendspace = 1024*16; 601 SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, 602 CTLFLAG_RW, &tcp_sendspace , 0, ""); 603 u_long tcp_recvspace = 1024*16; 604 SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, 605 CTLFLAG_RW, &tcp_recvspace , 0, ""); 606 607 /* 608 * Attach TCP protocol to socket, allocating 609 * internet protocol control block, tcp control block, 610 * bufer space, and entering LISTEN state if to accept connections. 611 */ 612 static int 613 tcp_attach(so) 614 struct socket *so; 615 { 616 register struct tcpcb *tp; 617 struct inpcb *inp; 618 int error; 619 620 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 621 error = soreserve(so, tcp_sendspace, tcp_recvspace); 622 if (error) 623 return (error); 624 } 625 error = in_pcballoc(so, &tcbinfo); 626 if (error) 627 return (error); 628 inp = sotoinpcb(so); 629 tp = tcp_newtcpcb(inp); 630 if (tp == 0) { 631 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 632 633 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 634 in_pcbdetach(inp); 635 so->so_state |= nofd; 636 return (ENOBUFS); 637 } 638 tp->t_state = TCPS_CLOSED; 639 return (0); 640 } 641 642 /* 643 * Initiate (or continue) disconnect. 644 * If embryonic state, just send reset (once). 645 * If in ``let data drain'' option and linger null, just drop. 646 * Otherwise (hard), mark socket disconnecting and drop 647 * current input data; switch states based on user close, and 648 * send segment to peer (with FIN). 649 */ 650 static struct tcpcb * 651 tcp_disconnect(tp) 652 register struct tcpcb *tp; 653 { 654 struct socket *so = tp->t_inpcb->inp_socket; 655 656 if (tp->t_state < TCPS_ESTABLISHED) 657 tp = tcp_close(tp); 658 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 659 tp = tcp_drop(tp, 0); 660 else { 661 soisdisconnecting(so); 662 sbflush(&so->so_rcv); 663 tp = tcp_usrclosed(tp); 664 if (tp) 665 (void) tcp_output(tp); 666 } 667 return (tp); 668 } 669 670 /* 671 * User issued close, and wish to trail through shutdown states: 672 * if never received SYN, just forget it. If got a SYN from peer, 673 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 674 * If already got a FIN from peer, then almost done; go to LAST_ACK 675 * state. In all other cases, have already sent FIN to peer (e.g. 676 * after PRU_SHUTDOWN), and just have to play tedious game waiting 677 * for peer to send FIN or not respond to keep-alives, etc. 678 * We can let the user exit from the close as soon as the FIN is acked. 679 */ 680 static struct tcpcb * 681 tcp_usrclosed(tp) 682 register struct tcpcb *tp; 683 { 684 685 switch (tp->t_state) { 686 687 case TCPS_CLOSED: 688 case TCPS_LISTEN: 689 tp->t_state = TCPS_CLOSED; 690 tp = tcp_close(tp); 691 break; 692 693 case TCPS_SYN_SENT: 694 case TCPS_SYN_RECEIVED: 695 tp->t_flags |= TF_NEEDFIN; 696 break; 697 698 case TCPS_ESTABLISHED: 699 tp->t_state = TCPS_FIN_WAIT_1; 700 break; 701 702 case TCPS_CLOSE_WAIT: 703 tp->t_state = TCPS_LAST_ACK; 704 break; 705 } 706 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 707 soisdisconnected(tp->t_inpcb->inp_socket); 708 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 709 if (tp->t_state == TCPS_FIN_WAIT_2) 710 tp->t_timer[TCPT_2MSL] = tcp_maxidle; 711 } 712 return (tp); 713 } 714 715