/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate the socket and PCB at the same time,
 * but I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

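/*
 * solisten() marks a socket as willing to accept incoming connections and
 * sets the limit on the queue of pending connections.
 */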
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
		SOCK_UNLOCK(so);
		return;
	}

	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so we just
		 * return here if this socket is currently on the completed
		 * connection queue.  Otherwise, accept(2) may hang after
		 * select(2) has indicated that a listening socket was
		 * ready.  If it's an incomplete connection, we remove it
		 * from the queue and free it; otherwise, it won't be
		 * released until the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket structure
 * will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls into
 * the protocol, which will call back into the socket code causing it to
 * acquire additional socket locks that may cause recursion or lock order
 * reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

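/*
 * soconnect2() connects a pair of sockets to one another by invoking the
 * protocol's pru_connect2 method; socketpair(2) is the usual consumer.
 */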
int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.  If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null if uio is not).
 * Data provided in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are
 * freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a
			 * connection-based socket if it supports implied
			 * connect.  Return ENOTCONN if not connected and no
			 * address is supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
#ifdef ZERO_COPY_SOCKETS
				cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
				if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
					if (top == NULL) {
						MGETHDR(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else {
						MGET(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
					}
					if (so_zero_copy_send &&
					    resid >= PAGE_SIZE &&
					    space >= PAGE_SIZE &&
					    uio->uio_iov->iov_len >= PAGE_SIZE) {
						so_zerocp_stats.size_ok++;
						if (!((vm_offset_t)
						    uio->uio_iov->iov_base & PAGE_MASK)) {
							so_zerocp_stats.align_ok++;
							cow_send = socow_setup(m, uio);
						}
					}
					if (!cow_send) {
						MCLGET(m, M_TRYWAIT);
						if ((m->m_flags & M_EXT) == 0) {
							m_free(m);
							m = NULL;
						} else {
							len = min(min(MCLBYTES, resid), space);
						}
					} else
						len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
					if (top == NULL) {
						m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else
						m = m_getcl(M_TRYWAIT, MT_DATA, 0);
					len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
				} else {
					if (top == NULL) {
						m = m_gethdr(M_TRYWAIT, MT_DATA);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;

						len = min(min(MHLEN, resid), space);
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && m && len < MHLEN)
							MH_ALIGN(m, len);
					} else {
						m = m_get(M_TRYWAIT, MT_DATA);
						len = min(min(MLEN, resid), space);
					}
				}
				if (m == NULL) {
					error = ENOBUFS;
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}

				space -= len;
#ifdef ZERO_COPY_SOCKETS
				if (cow_send)
					error = 0;
				else
#endif /* ZERO_COPY_SOCKETS */
				error = uiomove(mtod(m, void *), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the splnet()
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag and nothing is left to
			     * send, then use PRU_SEND_EOF instead of
			     * PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * Implement receive operations on a socket.  We depend on the way that
 * records are added to the sockbuf by sbappend*.  In particular, each
 * record (mbufs linked through m_next) must begin with an address if the
 * protocol so specifies, followed by an optional mbuf or mbufs containing
 * ancillary data, and then zero or more mbufs of data.  In order to avoid
 * blocking network interrupts for the entire time here, we splx() while
 * doing the actual copy to user space.  Although the sockbuf is locked, new
 * data may still be appended, and thus we must maintain consistency of the
 * sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = 0;
	if (controlp != NULL)
		*controlp = 0;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB) {
		m = m_get(M_TRYWAIT, MT_DATA);
		if (m == NULL)
			return (ENOBUFS);
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		do {
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				    && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
				if (uio->uio_offset == -1)
					uio->uio_offset = IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, void *),
				    min(uio->uio_resid, m->m_len),
				    uio, pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, void *),
			    (int) min(uio->uio_resid, m->m_len), uio);
			m = m_free(m);
		} while (uio->uio_resid && error == 0 && m);
bad:
		if (m != NULL)
			m_freem(m);
		return (error);
	}
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning a
	 * short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
		}
	}
	while (m != NULL && m->m_type == MT_CONTROL && error == 0) {
		if (flags & MSG_PEEK) {
			if (controlp != NULL)
				*controlp = m_copy(m, 0, m->m_len);
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m->m_next;
			m->m_next = NULL;
			if (pr->pr_domain->dom_externalize) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (m, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = m;
			else
				m_freem(m);
			m = so->so_rcv.sb_mb;
		}
		if (controlp != NULL) {
			orig_resid = 0;
			while (*controlp != NULL)
				controlp = &(*controlp)->m_next;
		}
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			m->m_nextpkt = nextrecord;
			/*
			 * If nextrecord == NULL (this is a single chain),
			 * then sb_lastrecord may not be valid here if m was
			 * changed earlier.
			 */
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("receive tailq 1"));
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == m, ("receive tailq 2"));
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, and nextrecord
		 * to the next record) when we drop priority; we must note
		 * any additions to the sockbuf when we block interrupts
		 * again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				    && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
				    moff));

				if (uio->uio_offset == -1)
					uio->uio_offset = IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio, pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL)
					*mp = m_copym(m, 0, len, M_TRYWAIT);
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					SOCKBUF_LOCK_ASSERT(&so->so_rcv);
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
		 * must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			(void) sbdroprecord_locked(&so->so_rcv);
		}
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}

int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

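/*
 * sorflush() flushes and releases a socket's receive buffer, discarding any
 * queued data (and disposing of in-flight rights, such as file descriptors,
 * for protocols with the PR_RIGHTS flag).
 */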
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
	 * and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
static int
do_setopt_accept_filter(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non-listening sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
	    M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	if (error)
		goto out;
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF,
			    M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			FREE(af->so_accept_filter_str, M_ACCF);
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level pr_ctloutput()
 * routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it, but if we
	 * don't get the minimum length the caller wants, we return EINVAL.
	 * On success, sopt->sopt_valsize is set to however much we actually
	 * retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Set a socket option.  SOL_SOCKET-level options are handled here; all
 * others are passed down to the protocol via pr_ctloutput().
 */
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these options,
			 * so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than the
			 * high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value, possibly
	 * truncated to fit in the user's buffer.  Traditional behavior is
	 * that we always tell the user precisely how much we copied, rather
	 * than something useful like the total amount we had available for
	 * her.  Note that this interface is not idempotent; the entire
	 * answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

/*
 * Get a socket option.  SOL_SOCKET-level options are handled here; all
 * others are passed down to the protocol via pr_ctloutput().
 */
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			    M_TEMP, M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name,
				    so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg,
					    so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really be
			 * necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
				       m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	/* the chain should have been allocated large enough by soopt_getm() */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
	return 0;
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
					m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* the user should have supplied a large enough buffer */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			SOCKBUF_LOCK(&so->so_rcv);
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			SOCKBUF_LOCK(&so->so_snd);
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_snd);
		}
	}

	return (revents);
}

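/*
 * soo_kqfilter() attaches a knote to the appropriate socket buffer: listen
 * sockets and regular reads use the receive buffer, writes the send buffer.
 */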
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}

	SOCKBUF_LOCK(sb);
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_soread() can be called
	 * either from KNOTE() in the socket context where the socket buffer
	 * lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_rcv);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_rcv);
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (result);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_sowrite() can be called
	 * either from KNOTE() in the socket context where the socket buffer
	 * lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_snd);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		result = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (kn->kn_data >= so->so_snd.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_snd);
	return (result);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

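/*
 * socheckuid() reports whether the socket's credential matches the given
 * uid: 0 if so, EPERM otherwise (or if the socket is NULL).
 */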
int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}