/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <vm/vm_zone.h>

#include <machine/limits.h>

struct vm_zone *socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate the socket
 * and the PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(waitok)
	int waitok;
{
	struct socket *so;

	so = zalloci(socket_zone);
	if (so) {
		/* XXX race condition for reentrant kernel */
		bzero(so, sizeof *so);
		so->so_gencnt = ++so_gencnt;
		so->so_zone = socket_zone;
	}
	return so;
}

int
socreate(dom, aso, type, proto, p)
	int dom;
	struct socket **aso;
	register int type;
	int proto;
	struct proc *p;
{
	register struct protosw *prp;
	register struct socket *so;
	register int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);
	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(p != 0);
	if (so == 0)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = p->p_ucred;
	crhold(so->so_cred);
	so->so_proto = prp;
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * Mark the socket as having no file descriptor
		 * reference so that sofree() will actually tear it down.
		 */
		so->so_state |= SS_NOFDREF;
		sofree(so);
		return (error);
	}
	*aso = so;
	return (0);
}
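/*
 * Example (sketch, not from this file): an in-kernel consumer such as
 * an NFS client might create and later destroy a datagram socket
 * roughly as follows; error unwinding is elided.
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_DGRAM, IPPROTO_UDP, p);
 *	if (error)
 *		return (error);
 *	...
 *	(void) soclose(so);
 */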
int
sobind(so, nam, p)
	struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s = splnet();
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	splx(s);
	return (error);
}

void
sodealloc(so)
	struct socket *so;
{

	so->so_gencnt = ++so_gencnt;
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uid,
		    -(rlim_t)so->so_rcv.sb_hiwat);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uid,
		    -(rlim_t)so->so_snd.sb_hiwat);
	crfree(so->so_cred);
	zfreei(so->so_zone, so);
}

int
solisten(so, backlog, p)
	register struct socket *so;
	int backlog;
	struct proc *p;
{
	int s, error;

	s = splnet();
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	if (error) {
		splx(s);
		return (error);
	}
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}
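/*
 * Example (sketch, with a hypothetical "port"): a passive socket is
 * normally set up by binding a local address and then marking the
 * socket as accepting connections.
 *
 *	struct sockaddr_in sin;
 *
 *	bzero(&sin, sizeof sin);
 *	sin.sin_len = sizeof sin;
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(port);
 *	sin.sin_addr.s_addr = INADDR_ANY;
 *	error = sobind(so, (struct sockaddr *)&sin, p);
 *	if (error == 0)
 *		error = solisten(so, 5, p);
 */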
void
sofree(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
	}
	sbrelease(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose(so)
	register struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	funsetown(so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		sp = TAILQ_FIRST(&so->so_incomp);
		for (; sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			(void) soabort(sp);
		}
		for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_state &= ~SS_COMP;
			sp->so_head = NULL;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep((caddr_t)&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sofree(so);
	splx(s);
	return (error);
}
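/*
 * Note (informal): with SO_LINGER set on a blocking socket, the
 * tsleep() above bounds how long soclose() waits for the disconnect
 * to finish.  For example, a linger time of 5 seconds means the
 * caller sleeps at most 5 * hz ticks; on timeout tsleep() returns
 * EWOULDBLOCK and the socket is torn down anyway.
 */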
/*
 * Must be called at splnet...
 */
int
soabort(so)
	struct socket *so;
{

	return (*so->so_proto->pr_usrreqs->pru_abort)(so);
}

int
soaccept(so, nam)
	register struct socket *so;
	struct sockaddr **nam;
{
	int s = splnet();
	int error;

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	if ((so->so_state & SS_ISDISCONNECTED) == 0)
		error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	else {
		if (nam)
			*nam = 0;
		error = 0;
	}
	splx(s);
	return (error);
}

int
soconnect(so, nam, p)
	register struct socket *so;
	struct sockaddr *nam;
	struct proc *p;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	splx(s);
	return (error);
}
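/*
 * Example (sketch, with a hypothetical, fully initialized "sa"):
 * soconnect() only initiates the connection; an in-kernel caller that
 * needs a completed connection typically waits for SS_ISCONNECTING
 * to clear, in the style of the NFS client.
 *
 *	error = soconnect(so, sa, p);
 *	if (error == 0) {
 *		s = splnet();
 *		while ((so->so_state & SS_ISCONNECTING) &&
 *		    so->so_error == 0)
 *			(void) tsleep((caddr_t)&so->so_timeo,
 *			    PSOCK, "conwait", 0);
 *		splx(s);
 *	}
 */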
int
soconnect2(so1, so2)
	register struct socket *so1;
	struct socket *so2;
{
	int s = splnet();
	int error;

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	splx(s);
	return (error);
}

int
sodisconnect(so)
	register struct socket *so;
{
	int s = splnet();
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
bad:
	splx(s);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */
int
sosend(so, addr, uio, top, control, flags, p)
	register struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct proc *p;
{
	struct mbuf **mp;
	register struct mbuf *m;
	register long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (p)
		p->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
#define	snderr(errno)	{ error = errno; splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen && uio &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
				if (top == 0) {
					MGETHDR(m, M_WAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						goto release;
					}
					mlen = MHLEN;
					m->m_pkthdr.len = 0;
					m->m_pkthdr.rcvif = (struct ifnet *)0;
				} else {
					MGET(m, M_WAIT, MT_DATA);
					if (m == NULL) {
						error = ENOBUFS;
						goto release;
					}
					mlen = MLEN;
				}
				if (resid >= MINCLSIZE) {
					MCLGET(m, M_WAIT);
					if ((m->m_flags & M_EXT) == 0)
						goto nopages;
					mlen = MCLBYTES;
					len = min(min(mlen, resid), space);
				} else {
nopages:
					len = min(min(mlen, resid), space);
					/*
					 * For datagram protocols, leave room
					 * for protocol headers in first mbuf.
					 */
					if (atomic && top == 0 && len < mlen)
						MH_ALIGN(m, len);
				}
				space -= len;
				error = uiomove(mtod(m, caddr_t), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error)
					goto release;
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute)
				so->so_options |= SO_DONTROUTE;
			s = splnet();				/* XXX */
			/*
			 * XXX all the SS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, p);
			splx(s);
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;
			clen = 0;
			control = 0;
			top = 0;
			mp = &top;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}
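/*
 * Example (sketch): an in-kernel caller can send a kernel buffer by
 * describing it with a uio; "buf", "buflen" and "nam" are
 * hypothetical.  A fully assembled packet can instead be passed as
 * the mbuf chain "top" with a null uio.
 *
 *	struct uio auio;
 *	struct iovec aiov;
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = buflen;
 *	auio.uio_iov = &aiov;
 *	auio.uio_iovcnt = 1;
 *	auio.uio_offset = 0;
 *	auio.uio_resid = buflen;
 *	auio.uio_segflg = UIO_SYSSPACE;
 *	auio.uio_rw = UIO_WRITE;
 *	auio.uio_procp = p;
 *	error = sosend(so, nam, &auio, (struct mbuf *)0,
 *	    (struct mbuf *)0, 0, p);
 */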
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	register struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	register struct mbuf *m, **mp;
	register int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);	/* sockbuf is not locked here */
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
			error = uiomove(mtod(m, caddr_t),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), and
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_procp)
		uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
			    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			MFREE(m, so->so_rcv.sb_mb);
			m = so->so_rcv.sb_mb;
		}
	}
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			if (controlp) {
				if (pr->pr_domain->dom_externalize &&
				    mtod(m, struct cmsghdr *)->cmsg_type ==
				    SCM_RIGHTS)
					error = (*pr->pr_domain->dom_externalize)(m);
				*controlp = m;
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = 0;
				m = so->so_rcv.sb_mb;
			} else {
				MFREE(m, so->so_rcv.sb_mb);
				m = so->so_rcv.sb_mb;
			}
		}
		if (controlp) {
			orig_resid = 0;
			controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0)
			m->m_nextpkt = nextrecord;
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	}
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("receive 3"));
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			splx(s);
			error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					MFREE(m, so->so_rcv.sb_mb);
					m = so->so_rcv.sb_mb;
				}
				if (m)
					m->m_nextpkt = nextrecord;
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_WAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			error = sbwait(&so->so_rcv);
			if (error) {
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0)
			so->so_rcv.sb_mb = nextrecord;
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}
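/*
 * Example (sketch): receiving into a kernel buffer mirrors the
 * sosend() sketch above, with UIO_READ and null psa/mp0/controlp/
 * flagsp when only the data is wanted; "buf" and "buflen" are
 * hypothetical.
 *
 *	aiov.iov_base = buf;
 *	aiov.iov_len = buflen;
 *	auio.uio_resid = buflen;
 *	auio.uio_rw = UIO_READ;
 *	...
 *	error = soreceive(so, (struct sockaddr **)0, &auio,
 *	    (struct mbuf **)0, (struct mbuf **)0, (int *)0);
 */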
int
soshutdown(so, how)
	register struct socket *so;
	register int how;
{
	register struct protosw *pr = so->so_proto;

	/*
	 * Map shutdown(2)'s "how" of 0 (receive), 1 (send) or 2 (both)
	 * onto the FREAD and FWRITE bits by adding one:
	 * 0 -> FREAD, 1 -> FWRITE, 2 -> FREAD|FWRITE.
	 */
	how++;
	if (how & FREAD)
		sorflush(so);
	if (how & FWRITE)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

void
sorflush(so)
	register struct socket *so;
{
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
	register int s;
	struct sockbuf asb;

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	bzero((caddr_t)sb, sizeof (*sb));
	splx(s);
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}
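/*
 * Note (informal): sorflush() snapshots the receive buffer into "asb"
 * and zeroes the live sockbuf at splimp() before releasing any queued
 * data.  This lets dom_dispose() (e.g. the local domain discarding
 * in-flight SCM_RIGHTS file references) and sbrelease() run on the
 * copy without interrupt-level code seeing a half-torn-down buffer.
 */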
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != 0)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}
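/*
 * Example (sketch): a protocol's pr_ctloutput() handler typically
 * pulls a fixed-size option value like this, exactly as the
 * SOL_SOCKET cases in sosetopt() below do:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 *	if (error)
 *		return (error);
 */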
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curproc) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
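/*
 * Example (sketch): setting SO_RCVBUF from inside the kernel.  With
 * sopt_p left null, sooptcopyin() above uses bcopy() on the kernel
 * pointer rather than copyin().
 *
 *	struct sockopt sopt;
 *	int optval = 65536;
 *
 *	sopt.sopt_dir = SOPT_SET;
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_RCVBUF;
 *	sopt.sopt_val = &optval;
 *	sopt.sopt_valsize = sizeof optval;
 *	sopt.sopt_p = NULL;
 *	error = sosetopt(so, &sopt);
 */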
/* Helper routine for getsockopt */
int
sooptcopyout(sopt, buf, len)
	struct sockopt *sopt;
	void *buf;
	size_t len;
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != 0) {
		if (sopt->sopt_p != 0)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}
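/*
 * Example (sketch): the mirror image of the sosetopt() sketch above;
 * after the call, sopt.sopt_valsize reports how many bytes were
 * actually returned.
 *
 *	sopt.sopt_dir = SOPT_GET;
 *	sopt.sopt_level = SOL_SOCKET;
 *	sopt.sopt_name = SO_TYPE;
 *	sopt.sopt_val = &optval;
 *	sopt.sopt_valsize = sizeof optval;
 *	sopt.sopt_p = NULL;
 *	error = sogetopt(so, &sopt);
 */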
/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
	if (m == 0)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
		if (m == 0) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
				       m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		(caddr_t)sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	if (m != NULL)
		/* soopt_getm() should have allocated enough mbufs */
		panic("ip6_sooptmcopyin");
	return 0;
}

/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
					m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		(caddr_t)sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}
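/*
 * Example (sketch, with a hypothetical "old_ctloutput"): these shims
 * let a pre-FreeBSD-3 style option handler, which expects the option
 * value in an mbuf chain, sit behind the struct sockopt interface.
 *
 *	struct mbuf *m;
 *
 *	error = soopt_getm(sopt, &m);		size the chain
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	fill it from sopt
 *	if (error == 0)
 *		error = old_ctloutput(so, m);
 */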
void
sohasoutofband(so)
	register struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(so->so_sigio, SIGURG, 0);
	selwakeup(&so->so_rcv.sb_sel);
}

int
sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p)
{
	int revents = 0;
	int s = splnet();

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			selrecord(p, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	splx(s);
	return (revents);
}