/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t	so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}
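
/*
 * Example (editor's sketch, not part of the original file): how an
 * in-kernel consumer might use socreate()/sobind()/solisten() to set up
 * a listening TCP socket.  The port number and error handling are
 * illustrative assumptions; userland reaches these routines through the
 * system calls in uipc_syscalls.c instead.
 */
#if 0
static int
example_listen(struct thread *td, struct socket **sop)
{
	struct socket *so;
	struct sockaddr_in sin;
	int error;

	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error)
		return (error);
	bzero(&sin, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(8080);		/* arbitrary example port */
	error = sobind(so, (struct sockaddr *)&sin, td);
	if (error == 0)
		error = solisten(so, -1, td);	/* backlog < 0 => somaxconn */
	if (error) {
		soclose(so);
		return (error);
	}
	*sop = so;
	return (0);
}
#endif
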
void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
		SOCK_UNLOCK(so);
		return;
	}

	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so
		 * we just return here if this socket is currently
		 * on the completed connection queue.  Otherwise,
		 * accept(2) may hang after select(2) has indicated
		 * that a listening socket was ready.  If it's an
		 * incomplete connection, we remove it from the queue
		 * and free it; otherwise, it won't be released until
		 * the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
#ifdef ZERO_COPY_SOCKETS
				cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
				if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
					if (top == NULL) {
						MGETHDR(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else {
						MGET(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
					}
					if (so_zero_copy_send &&
					    resid>=PAGE_SIZE &&
					    space>=PAGE_SIZE &&
					    uio->uio_iov->iov_len>=PAGE_SIZE) {
						so_zerocp_stats.size_ok++;
						if (!((vm_offset_t)
						    uio->uio_iov->iov_base & PAGE_MASK)){
							so_zerocp_stats.align_ok++;
							cow_send = socow_setup(m, uio);
						}
					}
					if (!cow_send) {
						MCLGET(m, M_TRYWAIT);
						if ((m->m_flags & M_EXT) == 0) {
							m_free(m);
							m = NULL;
						} else {
							len = min(min(MCLBYTES, resid), space);
						}
					} else
						len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
					if (top == NULL) {
						m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else
						m = m_getcl(M_TRYWAIT, MT_DATA, 0);
					len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
				} else {
					if (top == NULL) {
						m = m_gethdr(M_TRYWAIT, MT_DATA);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;

						len = min(min(MHLEN, resid), space);
						/*
						 * For datagram protocols, leave room
						 * for protocol headers in first mbuf.
						 */
						if (atomic && m && len < MHLEN)
							MH_ALIGN(m, len);
					} else {
						m = m_get(M_TRYWAIT, MT_DATA);
						len = min(min(MLEN, resid), space);
					}
				}
				if (m == NULL) {
					error = ENOBUFS;
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}

				space -= len;
#ifdef ZERO_COPY_SOCKETS
				if (cow_send)
					error = 0;
				else
#endif /* ZERO_COPY_SOCKETS */
				error = uiomove(mtod(m, void *), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We could
			 * probably recheck again inside the splnet() protection
			 * here, but there are probably other places that this
			 * also happens.  We must rethink this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol
			 * understands this flag, and there is nothing left to
			 * send, then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ?
				PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
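
/*
 * Example (editor's sketch, not part of the original file): sending a
 * small prepackaged mbuf chain on a connected socket via sosend().
 * When "top" is supplied, uio must be NULL, and per the comment above,
 * the chain and any control mbufs are freed by sosend() on return, so
 * the caller must not touch them afterwards.
 */
#if 0
static int
example_send(struct socket *so, struct thread *td, const char *buf, int len)
{
	struct mbuf *m;

	if (len > MHLEN)
		return (EMSGSIZE);	/* keep the sketch to a single mbuf */
	MGETHDR(m, M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	m->m_pkthdr.rcvif = (struct ifnet *)0;
	bcopy(buf, mtod(m, caddr_t), len);
	m->m_len = m->m_pkthdr.len = len;
	return (sosend(so, NULL, NULL, m, NULL, 0, td));
}
#endif
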
/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			vm_page_t pg;
			int disposable;

			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
			if (uio->uio_offset == -1)
				uio->uio_offset = IDX_TO_OFF(pg->pindex);

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, pg->object,
			    disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}


/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark,
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), or
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				if (controlp) {
					/*
					 * Collect mbufs for processing below.
					 */
					*cme = m;
					cme = &(*cme)->m_next;
				} else
					m_free(m);
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		if (cm != NULL) {
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else
				m_freem(cm);
		}
		nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				    && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
					moff));

				if (uio->uio_offset == -1)
					uio->uio_offset = IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio, pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, M_TRYWAIT);
					SOCKBUF_LOCK(&so->so_rcv);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}
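
/*
 * Example (editor's sketch, not part of the original file): draining up
 * to "len" bytes from a socket into a kernel buffer with soreceive(),
 * using a locally built uio.  MSG_DONTWAIT makes the call non-blocking;
 * on return, len - auio.uio_resid bytes were copied into buf.
 */
#if 0
static int
example_receive(struct socket *so, struct thread *td, void *buf, int len)
{
	struct uio auio;
	struct iovec aiov;
	int flags, error;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	flags = MSG_DONTWAIT;
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	return (error);
}
#endif
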
int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
static int
do_setopt_accept_filter(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	struct accept_filter_arg *afap = NULL;
	struct accept_filter *afp;
	struct so_accf *af = so->so_accf;
	int error = 0;

	/* do not set/remove accept filters on non-listening sockets */
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		error = EINVAL;
		goto out;
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (af != NULL) {
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		return (0);
	}
	/* adding a filter */
	/* must remove previous filter first */
	if (af != NULL) {
		error = EINVAL;
		goto out;
	}
	/* don't put large objects on the kernel stack */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error)
		goto out;
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		error = ENOENT;
		goto out;
	}
	MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
	if (afp->accf_create != NULL) {
		if (afap->af_name[0] != '\0') {
			int len = strlen(afap->af_name) + 1;

			MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
			strcpy(af->so_accept_filter_str, afap->af_name);
		}
		af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
		if (af->so_accept_filter_arg == NULL) {
			FREE(af->so_accept_filter_str, M_ACCF);
			FREE(af, M_ACCF);
			so->so_accf = NULL;
			error = EINVAL;
			goto out;
		}
	}
	af->so_accept_filter = afp;
	so->so_accf = af;
	so->so_options |= SO_ACCEPTFILTER;
out:
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */
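
/*
 * Example (editor's sketch, not part of the original file): how userland
 * attaches an accept filter to a listening socket, which ends up in
 * do_setopt_accept_filter() above.  "httpready" assumes the accf_http(9)
 * module is loaded; "s" is a descriptor already passed to listen(2).
 */
#if 0
	struct accept_filter_arg afa;

	bzero(&afa, sizeof(afa));
	strcpy(afa.af_name, "httpready");
	if (setsockopt(s, SOL_SOCKET, SO_ACCEPTFILTER, &afa,
	    sizeof(afa)) < 0)
		warn("SO_ACCEPTFILTER");
#endif
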
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void	*buf;
	size_t	len;
	size_t	minlen;
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Kernel version of setsockopt(2).
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}
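
/*
 * Example (editor's sketch, not part of the original file): an in-kernel
 * consumer enabling SO_REUSEADDR through so_setsockopt().  Setting
 * sopt_td to NULL, as so_setsockopt() does, makes sooptcopyin() use
 * bcopy() on the kernel buffer instead of copyin().  "so" is assumed to
 * be a socket the caller already owns.
 */
#if 0
	int one = 1;
	int error;

	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &one,
	    sizeof(one));
#endif
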
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}
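
/*
 * Example (editor's sketch, not part of the original file): the typical
 * shape of a protocol-level pr_ctloutput() handler built on
 * sooptcopyin() and sooptcopyout(), as the comment above sooptcopyin()
 * describes.  EXAMPLE_OPT and the pcb accesses are hypothetical.
 */
#if 0
static int
example_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = 0;
	switch (sopt->sopt_dir) {
	case SOPT_SET:
		error = sooptcopyin(sopt, &optval, sizeof(optval),
		    sizeof(optval));
		if (error)
			break;
		/* ... store optval in the protocol pcb ... */
		break;
	case SOPT_GET:
		optval = 0;	/* ... load the value from the pcb ... */
		error = sooptcopyout(sopt, &optval, sizeof(optval));
		break;
	}
	return (error);
}
#endif
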
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
				M_TEMP, M_WAITOK | M_ZERO);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
				  so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
				       m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	/* the chain should have been allocated with enough space */
	if (m != NULL)
		panic("ip6_sooptmcopyin");
	return 0;
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
					m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return(EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		     POLLRDBAND)) {
			SOCKBUF_LOCK(&so->so_rcv);
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			SOCKBUF_LOCK(&so->so_snd);
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_snd);
		}
	}

	return (revents);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}

	SOCKBUF_LOCK(sb);
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}
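
/*
 * Example (editor's sketch, not part of the original file): userland
 * registration of a read filter on a socket descriptor "s", which
 * reaches soo_kqfilter() above.  On a listening socket the same
 * EVFILT_READ registration is backed by solisten_filtops, so the
 * returned kn_data is the completed-connection queue length.
 */
#if 0
	struct kevent kev;
	int kq = kqueue();

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
		err(1, "kevent");
#endif
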
static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_soread() can be called
	 * either from KNOTE() in the socket context where the socket buffer
	 * lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_rcv);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_rcv);
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (result);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_sowrite() can be called
	 * either from KNOTE() in the socket context where the socket buffer
	 * lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_snd);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		result = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (kn->kn_data >= so->so_snd.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_snd);
	return (result);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}