1 /* 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


#ifdef INET
static int	 do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

/* kqueue filter attach/detach/event handlers for sockets. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

/*
 * Filter tables for listening, reading, and writing sockets.  A listening
 * socket shares the read-side detach routine with ordinary reads.
 */
static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

/* UMA zone from which all socket structures are allocated. */
uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

/* Upper bound for listen(2) backlogs; tunable via kern.ipc.somaxconn. */
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
/* Updated in soalloc()/sodealloc(); read-only via sysctl. */
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets,
    CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */


/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 *
 * waitok: 1 to allow the allocation to sleep (M_WAITOK), anything
 * else for a non-sleeping attempt (M_NOWAIT).  Returns NULL if the
 * zone allocation (or, with MAC, label initialization) fails.
 */
struct socket *
soalloc(waitok)
	int waitok;
{
	struct socket *so;
#ifdef MAC
	int error;
#endif
	int flag;

	if (waitok == 1)
		flag = M_WAITOK;
	else
		flag = M_NOWAIT;
	/* Zone memory is zeroed so all socket fields start out clear. */
	flag |= M_ZERO;
	so = uma_zalloc(socket_zone, flag);
	if (so) {
#ifdef MAC
		/* If the MAC label cannot be set up, undo the allocation. */
		error = mac_init_socket(so, flag);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		/* XXX race condition for reentrant kernel */
		so->so_gencnt = ++so_gencnt;
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		++numopensockets;
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
/*
 * Create a new socket of the given domain/type/protocol on behalf of
 * credential 'cred' and thread 'td', storing the result in *aso.
 * Returns 0 on success or an errno: EPROTONOSUPPORT if no matching
 * protocol (or the jail policy forbids the domain), EPROTOTYPE on a
 * type mismatch, ENOBUFS if allocation fails, or the protocol
 * attach routine's error.
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	/* A nonzero protocol selects an exact entry; 0 matches by type. */
	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
		return (EPROTONOSUPPORT);

	/* Jailed processes may be restricted to local/inet/route sockets. */
	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	soref(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		/*
		 * Attach failed: mark the socket as having no file
		 * descriptor reference so sorele() can free it.
		 */
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

/*
 * Bind the socket to the address 'nam' via the protocol's pru_bind,
 * at splnet.  Returns the protocol's error code.
 */
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int s = splnet();
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
	splx(s);
	return (error);
}

/*
 * Release the resources attached to a socket structure and return it
 * to the zone.  The caller must hold the last reference (so_count == 0).
 */
void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	so->so_gencnt = ++so_gencnt;
	/* Give back the per-uid socket-buffer accounting charges. */
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present.
 */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	--numopensockets;
}

/*
 * Mark the socket as accepting connections (listen(2)) and set its
 * backlog limit.  Fails with EINVAL if the socket is already connected
 * or in the middle of (dis)connecting.  A negative or over-limit
 * backlog is clamped to somaxconn.  Runs at splnet.
 */
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int s, error;

	s = splnet();
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING)) {
		splx(s);
		return (EINVAL);
	}
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error) {
		splx(s);
		return (error);
	}
	/* Only flag SO_ACCEPTCONN once the completed queue is empty. */
	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	splx(s);
	return (0);
}

/*
 * Free a socket whose reference count has reached zero.  Bails out if
 * the protocol control block still exists or a file descriptor still
 * references the socket.  Dequeues the socket from its listening
 * parent's incomplete queue if needed; sockets on the completed
 * (accept) queue are deliberately left alone.
 */
void
sofree(so)
	struct socket *so;
{
	struct socket *head = so->so_head;
	int s;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));

	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
		return;
	if (head != NULL) {
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			return;
		} else {
			panic("sofree: not queued");
		}
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
	}
	/* Shut down the send side without allowing signal interruption. */
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	s = splimp();
	socantsendmore(so);
	splx(s);
	sbunlock(&so->so_snd);
	sbrelease(&so->so_snd, so);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int s = splnet();		/* conservative */
	int error = 0;

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp, *sonext;

		/* Abort every connection still on the incomplete queue. */
		sp = TAILQ_FIRST(&so->so_incomp);
		for (; sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			(void) soabort(sp);
		}
		for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_state &= ~SS_COMP;
			sp->so_head = NULL;
			(void) soabort(sp);
		}
	}
	if (so->so_pcb == 0)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			/* Non-blocking close with linger: don't wait. */
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			/*
			 * Sleep (interruptibly) until the disconnect
			 * completes or the linger timeout expires.
			 */
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb) {
		/* Detach the PCB; keep the first error encountered. */
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	if (so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;
	sorele(so);
	splx(s);
	return (error);
}

/*
 * Must be called at splnet...
 */
/*
 * Abort a (typically embryonic) connection via the protocol's
 * pru_abort.  On failure, attempt to free the socket immediately.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

/*
 * Accept a connection: clear SS_NOFDREF (a file descriptor now refers
 * to this socket) and let the protocol fill in the peer address.
 * Panics if called on a socket that already has a descriptor reference.
 */
int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int s = splnet();
	int error;

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	splx(s);
	return (error);
}

/*
 * Initiate a connection to 'nam'.  Listening sockets cannot connect
 * (EOPNOTSUPP).  Runs at splnet.
 */
int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int s;
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	s = splnet();
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	splx(s);
	return (error);
}

/*
 * Connect a pair of sockets to each other (e.g. socketpair(2)) via
 * so1's pru_connect2.  Runs at splnet.
 */
int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{
	int s = splnet();
	int error;

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
	splx(s);
	return (error);
}

/*
 * Begin disconnecting a connected socket.  Returns ENOTCONN if not
 * connected, EALREADY if a disconnect is already in progress,
 * otherwise the protocol's pru_disconnect result.  Runs at splnet.
 */
int
sodisconnect(so)
	struct socket *so;
{
	int s = splnet();
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
bad:
	splx(s);
	return (error);
}

/* Map MSG_DONTWAIT onto a non-sleeping sockbuf lock attempt. */
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
/* Counters recording how often zero-copy send preconditions are met. */
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len, resid;
	int clen = 0, error, s, dontroute, mlen;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control)
		clen = control->m_len;
/* Common exit for send errors detected while at splnet. */
#define	snderr(errno)	{ error = (errno); splx(s); goto release; }

restart:
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;
	do {
		s = splnet();
		if (so->so_state & SS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			splx(s);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' is allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == 0)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		/* OOB data is allowed to slightly exceed the buffer limit. */
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if (so->so_state & SS_NBIO)
				snderr(EWOULDBLOCK);
			/* Wait for buffer space, then retake the lock. */
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			splx(s);
			if (error)
				goto out;
			goto restart;
		}
		splx(s);
		mp = &top;
		space -= clen;
		do {
		    if (uio == NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			resid = 0;
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		    } else do {
#ifdef ZERO_COPY_SOCKETS
			cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
			if (top == 0) {
				/* First mbuf of the chain gets a pkthdr. */
				MGETHDR(m, M_TRYWAIT, MT_DATA);
				if (m == NULL) {
					error = ENOBUFS;
					goto release;
				}
				mlen = MHLEN;
				m->m_pkthdr.len = 0;
				m->m_pkthdr.rcvif = (struct ifnet *)0;
			} else {
				MGET(m, M_TRYWAIT, MT_DATA);
				if (m == NULL) {
					error = ENOBUFS;
					goto release;
				}
				mlen = MLEN;
			}
			if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
				/*
				 * Try copy-on-write send when the data is
				 * page-sized and page-aligned.
				 */
				if (so_zero_copy_send &&
				    resid>=PAGE_SIZE &&
				    space>=PAGE_SIZE &&
				    uio->uio_iov->iov_len>=PAGE_SIZE) {
					so_zerocp_stats.size_ok++;
					if (!((vm_offset_t)
					    uio->uio_iov->iov_base & PAGE_MASK)){
						so_zerocp_stats.align_ok++;
						cow_send = socow_setup(m, uio);
					}
				}
				if (!cow_send){
#endif /* ZERO_COPY_SOCKETS */
				MCLGET(m, M_TRYWAIT);
				if ((m->m_flags & M_EXT) == 0)
					goto nopages;
				mlen = MCLBYTES;
				len = min(min(mlen, resid), space);
				} else {
#ifdef ZERO_COPY_SOCKETS
					len = PAGE_SIZE;
				}

			} else {
#endif /* ZERO_COPY_SOCKETS */
nopages:
				len = min(min(mlen, resid), space);
				/*
				 * For datagram protocols, leave room
				 * for protocol headers in first mbuf.
				 */
				if (atomic && top == 0 && len < mlen)
					MH_ALIGN(m, len);
			}
			space -= len;
#ifdef ZERO_COPY_SOCKETS
			if (cow_send)
				error = 0;
			else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *), (int)len, uio);
			resid = uio->uio_resid;
			m->m_len = len;
			*mp = m;
			top->m_pkthdr.len += len;
			if (error)
				goto release;
			mp = &m->m_next;
			if (resid <= 0) {
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
				break;
			}
		    } while (space > 0 && atomic);
		    if (dontroute)
			    so->so_options |= SO_DONTROUTE;
		    s = splnet();				/* XXX */
		    /*
		     * XXX all the SS_CANTSENDMORE checks previously
		     * done could be out of date.  We could have received
		     * a reset packet in an interrupt or maybe we slept
		     * while doing page faults in uiomove() etc. We could
		     * probably recheck again inside the splnet() protection
		     * here, but there are probably other places that this
		     * also happens.  We must rethink this.
		     */
		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			(flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			((flags & MSG_EOF) &&
			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			 (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			top, addr, control, td);
		    splx(s);
		    if (dontroute)
			    so->so_options &= ~SO_DONTROUTE;
		    /* The protocol now owns top/control; don't free them. */
		    clen = 0;
		    control = 0;
		    top = 0;
		    mp = &top;
		    if (error)
			    goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	return (error);
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.
 In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, s, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa)
		*psa = 0;
	if (controlp)
		*controlp = 0;
	if (flagsp)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		/* Out-of-band data bypasses the receive buffer entirely. */
		m = m_get(M_TRYWAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
				if (uio->uio_offset == -1)
					uio->uio_offset =IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, void *),
				    min(uio->uio_resid, m->m_len),
				    uio, pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m)
			m_freem(m);
		return (error);
	}
	if (mp)
		*mp = (struct mbuf *)0;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

restart:
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		return (error);
	s = splnet();

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != 0 || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		/* A record boundary or OOB mark must be delivered now. */
		for (; m; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		splx(s);
		if (error)
			return (error);
		goto restart;
	}
dontblock:
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		/* The record starts with the sender's address mbuf. */
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa)
			*psa = dup_sockaddr(mtod(m, struct sockaddr *),
			    mp0 == 0);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
		}
	}
	/* Then zero or more mbufs of ancillary (control) data. */
	while (m && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
			if (pr->pr_domain->dom_externalize)
				error =
				(*pr->pr_domain->dom_externalize)(m, controlp);
			else if (controlp)
				*controlp = m;
			else
				m_freem(m);
			m = so->so_rcv.sb_mb;
		}
		if (controlp) {
			orig_resid = 0;
			while (*controlp != NULL)
				controlp = &(*controlp)->m_next;
		}
	}
	if (m) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m
			 * was changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
					("receive tailq 1"));
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m,("receive tailq 2"));
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/* Main copy loop over the data mbufs of the current record. */
	moff = 0;
	offset = 0;
	while (m && uio->uio_resid > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("m->m_type == %d", m->m_type));
		so->so_state &= ~SS_RCVATMARK;
		len = uio->uio_resid;
		/* Never read across the out-of-band mark. */
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == 0) {
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			/* Drop splnet while faulting in user pages. */
			splx(s);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
					moff));

				if (uio->uio_offset == -1)
					uio->uio_offset =IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio,pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			s = splnet();
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			/* Consumed this whole mbuf. */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			/* Partial mbuf consumed. */
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp)
					*mp = m_copym(m, 0, len, M_TRYWAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error) {
				/*
				 * Per the comment above: short count,
				 * no error, on signal/timeout.
				 */
				sbunlock(&so->so_rcv);
				splx(s);
				return (0);
			}
			m = so->so_rcv.sb_mb;
			if (m)
				nextrecord = m->m_nextpkt;
		}
	}

	/* Atomic protocols: any unread remainder of the record is dropped. */
	if (m && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == 0) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}
	/* Nothing was transferred and nothing terminal happened: retry. */
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		splx(s);
		goto restart;
	}

	if (flagsp)
		*flagsp |= flags;
release:
	sbunlock(&so->so_rcv);
	splx(s);
	return (error);
}

/*
 * Shut down part or all of a full-duplex connection: SHUT_RD flushes
 * the receive side, SHUT_WR asks the protocol to stop sending,
 * SHUT_RDWR does both.  Returns EINVAL for any other 'how'.
 */
int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

/*
 * Flush the receive side of a socket: mark it unable to receive more,
 * snapshot the sockbuf, zero the live one, and release the snapshot's
 * mbufs (disposing of any in-flight rights first).
 */
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	int s;
	struct sockbuf asb;

	/* Take the sockbuf lock uninterruptibly. */
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	s = splimp();
	socantrcvmore(so);
	sbunlock(sb);
	asb = *sb;
	/*
	 * Invalidate/clear most of the sockbuf structure, but keep
	 * its selinfo structure valid.
	 */
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	splx(s);

	/* If the protocol can carry rights (fds), dispose of them first. */
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
}

#ifdef INET
/*
 * Install or remove an accept filter on a listening socket.  A NULL
 * "sopt" removes the current filter; otherwise the named filter is
 * looked up and instantiated.  Only valid on sockets with
 * SO_ACCEPTCONN set.
 */
static int
do_setopt_accept_filter(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non listen sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
				af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	/*
	 * NOTE(review): the terminators below are written before "error"
	 * is checked; harmless since afap is our own buffer, but the
	 * error check could come first.
	 */
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			/*
			 * NOTE(review): so_accept_filter_str appears to be
			 * non-NULL here only when af_name was non-empty;
			 * verify FREE() is never handed NULL on this path.
			 */
			FREE(af->so_accept_filter_str, M_ACCF);
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
/*
 * Copy a socket option value from the caller into "buf".  "len" is
 * the buffer capacity and "minlen" the smallest acceptable value
 * size.  Copies from user space when sopt_td is set; otherwise the
 * value already lives in the kernel and is bcopy()ed.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	/* A thread pointer means the value lives in user space. */
	if (sopt->sopt_td != 0)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Set a socket option.  Options at a level other than SOL_SOCKET are
 * handed straight to the protocol's pr_ctloutput(); SOL_SOCKET
 * options are handled here and, on success, also offered to the
 * protocol so it can track them.
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		/* Boolean options map directly onto so_options bits. */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				break;
			case SO_RCVLOWAT:
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			/*
			 * Convert the timeval to ticks; the range checks
			 * above keep tv_sec * hz within SHRT_MAX.
			 */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			/* Round a tiny but nonzero timeout up to one tick. */
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		/* Give the protocol a chance to observe the new setting. */
		if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return
	    (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != 0) {
		/* A thread pointer means the destination is user space. */
		if (sopt->sopt_td != 0)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

/*
 * Get a socket option.  Options at a level other than SOL_SOCKET are
 * handed to the protocol's pr_ctloutput(); SOL_SOCKET options are
 * answered here from the socket's own state.
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			/* Zeroed heap buffer; too large for the stack. */
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			       M_TEMP, M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
		/* Shared exit for every case that answers with an int. */
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			/* Reading the error clears it, per tradition. */
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			/* Convert the tick count back into a timeval. */
			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
/*
 * Allocate an mbuf chain big enough to hold sopt->sopt_valsize bytes,
 * using clusters for chunks larger than MLEN.  May sleep only when
 * called from a thread context (sopt_td != NULL).
 * NOTE(review): sopt_valsize (size_t) is assigned to an int here --
 * verify callers never pass values large enough to overflow.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == 0)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == 0) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td ?
M_TRYWAIT : M_DONTWAIT); 1677 if ((m->m_flags & M_EXT) == 0) { 1678 m_freem(m); 1679 m_freem(*mp); 1680 return ENOBUFS; 1681 } 1682 m->m_len = min(MCLBYTES, sopt_size); 1683 } else { 1684 m->m_len = min(MLEN, sopt_size); 1685 } 1686 sopt_size -= m->m_len; 1687 m_prev->m_next = m; 1688 m_prev = m; 1689 } 1690 return 0; 1691 } 1692 1693 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ 1694 int 1695 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) 1696 { 1697 struct mbuf *m0 = m; 1698 1699 if (sopt->sopt_val == NULL) 1700 return 0; 1701 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 1702 if (sopt->sopt_td != NULL) { 1703 int error; 1704 1705 error = copyin(sopt->sopt_val, mtod(m, char *), 1706 m->m_len); 1707 if (error != 0) { 1708 m_freem(m0); 1709 return(error); 1710 } 1711 } else 1712 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len); 1713 sopt->sopt_valsize -= m->m_len; 1714 (caddr_t)sopt->sopt_val += m->m_len; 1715 m = m->m_next; 1716 } 1717 if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */ 1718 panic("ip6_sooptmcopyin"); 1719 return 0; 1720 } 1721 1722 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. 
*/ 1723 int 1724 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) 1725 { 1726 struct mbuf *m0 = m; 1727 size_t valsize = 0; 1728 1729 if (sopt->sopt_val == NULL) 1730 return 0; 1731 while (m != NULL && sopt->sopt_valsize >= m->m_len) { 1732 if (sopt->sopt_td != NULL) { 1733 int error; 1734 1735 error = copyout(mtod(m, char *), sopt->sopt_val, 1736 m->m_len); 1737 if (error != 0) { 1738 m_freem(m0); 1739 return(error); 1740 } 1741 } else 1742 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len); 1743 sopt->sopt_valsize -= m->m_len; 1744 (caddr_t)sopt->sopt_val += m->m_len; 1745 valsize += m->m_len; 1746 m = m->m_next; 1747 } 1748 if (m != NULL) { 1749 /* enough soopt buffer should be given from user-land */ 1750 m_freem(m0); 1751 return(EINVAL); 1752 } 1753 sopt->sopt_valsize = valsize; 1754 return 0; 1755 } 1756 1757 void 1758 sohasoutofband(so) 1759 struct socket *so; 1760 { 1761 if (so->so_sigio != NULL) 1762 pgsigio(&so->so_sigio, SIGURG, 0); 1763 selwakeuppri(&so->so_rcv.sb_sel, PSOCK); 1764 } 1765 1766 int 1767 sopoll(struct socket *so, int events, struct ucred *active_cred, 1768 struct thread *td) 1769 { 1770 int revents = 0; 1771 int s = splnet(); 1772 1773 if (events & (POLLIN | POLLRDNORM)) 1774 if (soreadable(so)) 1775 revents |= events & (POLLIN | POLLRDNORM); 1776 1777 if (events & POLLINIGNEOF) 1778 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat || 1779 !TAILQ_EMPTY(&so->so_comp) || so->so_error) 1780 revents |= POLLINIGNEOF; 1781 1782 if (events & (POLLOUT | POLLWRNORM)) 1783 if (sowriteable(so)) 1784 revents |= events & (POLLOUT | POLLWRNORM); 1785 1786 if (events & (POLLPRI | POLLRDBAND)) 1787 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) 1788 revents |= events & (POLLPRI | POLLRDBAND); 1789 1790 if (revents == 0) { 1791 if (events & 1792 (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | 1793 POLLRDBAND)) { 1794 selrecord(td, &so->so_rcv.sb_sel); 1795 so->so_rcv.sb_flags |= SB_SEL; 1796 } 1797 1798 if (events & (POLLOUT | POLLWRNORM)) { 1799 
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
		}
	}

	splx(s);
	return (revents);
}

/*
 * kqueue attach routine for sockets: pick the listen/read/write
 * filter ops based on the requested filter and hook the knote onto
 * the matching socket buffer.  Returns 1 for unsupported filters.
 */
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;
	int s;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		/* Listening sockets report completed connections instead. */
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}

	s = splnet();
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	splx(s);
	return (0);
}

/* Detach a read knote; clear SB_KNOTE once the last one is gone. */
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	splx(s);
}

/*ARGSUSED*/
/*
 * kqueue read filter: kn_data is the readable byte count (excluding
 * control data); ready on EOF, pending error, or once the count
 * reaches the low-water mark (caller-supplied via NOTE_LOWAT, else
 * the sockbuf's).
 */
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}
	if (so->so_error)	/* temporary udp error */
		return (1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
}

/* Detach a write knote; clear SB_KNOTE once the last one is gone. */
static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	int s = splnet();

	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	splx(s);
}

/*ARGSUSED*/
static int 1883 filt_sowrite(struct knote *kn, long hint) 1884 { 1885 struct socket *so = kn->kn_fp->f_data; 1886 1887 kn->kn_data = sbspace(&so->so_snd); 1888 if (so->so_state & SS_CANTSENDMORE) { 1889 kn->kn_flags |= EV_EOF; 1890 kn->kn_fflags = so->so_error; 1891 return (1); 1892 } 1893 if (so->so_error) /* temporary udp error */ 1894 return (1); 1895 if (((so->so_state & SS_ISCONNECTED) == 0) && 1896 (so->so_proto->pr_flags & PR_CONNREQUIRED)) 1897 return (0); 1898 if (kn->kn_sfflags & NOTE_LOWAT) 1899 return (kn->kn_data >= kn->kn_sdata); 1900 return (kn->kn_data >= so->so_snd.sb_lowat); 1901 } 1902 1903 /*ARGSUSED*/ 1904 static int 1905 filt_solisten(struct knote *kn, long hint) 1906 { 1907 struct socket *so = kn->kn_fp->f_data; 1908 1909 kn->kn_data = so->so_qlen; 1910 return (! TAILQ_EMPTY(&so->so_comp)); 1911 } 1912 1913 int 1914 socheckuid(struct socket *so, uid_t uid) 1915 { 1916 1917 if (so == NULL) 1918 return (EPERM); 1919 if (so->so_cred->cr_uid == uid) 1920 return (0); 1921 return (EPERM); 1922 } 1923