/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>

static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);
static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.  These routines are called by the routines in
 * sys_socket.c or from a system process, and implement the semantics of
 * socket operations by switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.  Note that it
 * would probably be better to allocate socket and PCB at the same time, but
 * I'm not convinced that all the protocols can be easily modified to do
 * this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}

/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}
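/*
 * Example (sketch only, not compiled): a kernel consumer might pair
 * socreate() with soclose() as below.  The helper name
 * example_tcp_socket() is hypothetical; the calling convention matches
 * the functions in this file.
 */
#if 0
static int
example_tcp_socket(struct thread *td)
{
	struct socket *so;
	int error;

	error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP,
	    td->td_ucred, td);
	if (error != 0)
		return (error);
	/* ... use the socket via sosend()/soreceive() ... */
	return (soclose(so));
}
#endif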
int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}

int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
		SOCK_UNLOCK(so);
		return;
	}

	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so we just
		 * return here if this socket is currently on the completed
		 * connection queue.  Otherwise, accept(2) may hang after
		 * select(2) has indicated that a listening socket was
		 * ready.  If it's an incomplete connection, we remove it
		 * from the queue and free it; otherwise, it won't be
		 * released until the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	sodealloc(so);
}
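/*
 * Example (sketch only, not compiled): the reference count discipline used
 * by socreate() and soclose() above; sorele() is called with the socket
 * lock held and may free the socket once the count drops to zero.
 */
#if 0
	SOCK_LOCK(so);
	soref(so);			/* take a reference */
	SOCK_UNLOCK(so);
	/* ... hand "so" to another subsystem ... */
	SOCK_LOCK(so);
	sorele(so);			/* drop it; last release frees the socket */
#endif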
/*
 * Close a socket on last file table reference removal.  Initiate disconnect
 * if connected.  Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket structure
 * will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos",
				    so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls into
 * the protocol, which will call back into the socket code causing it to
 * acquire additional socket locks that may cause recursion or lock order
 * reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}

int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.  This allows
	 * user to disconnect by connecting to, e.g., a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.  If send must go all at once and message is larger
 * than send buffering, then hard error.  Lock against other senders.  If
 * must go all at once and not enough room now, then inform user that this
 * would block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null if uio is not).
 * Data provided in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for
 * short counts if EINTR/ERESTART are returned.  Data and control buffers
 * are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats {
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0, 0, 0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /* ZERO_COPY_SOCKETS */

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a
			 * connection-based socket if it supports implied
			 * connect.  Return ENOTCONN if not connected and no
			 * address is supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
#ifdef ZERO_COPY_SOCKETS
				cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
				if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
					if (top == NULL) {
						MGETHDR(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else {
						MGET(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
					}
					if (so_zero_copy_send &&
					    resid >= PAGE_SIZE &&
					    space >= PAGE_SIZE &&
					    uio->uio_iov->iov_len >= PAGE_SIZE) {
						so_zerocp_stats.size_ok++;
						if (!((vm_offset_t)
						    uio->uio_iov->iov_base & PAGE_MASK)) {
							so_zerocp_stats.align_ok++;
							cow_send = socow_setup(m, uio);
						}
					}
					if (!cow_send) {
						MCLGET(m, M_TRYWAIT);
						if ((m->m_flags & M_EXT) == 0) {
							m_free(m);
							m = NULL;
						} else {
							len = min(min(MCLBYTES, resid), space);
						}
					} else
						len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
					if (top == NULL) {
						m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else
						m = m_getcl(M_TRYWAIT, MT_DATA, 0);
					len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
				} else {
					if (top == NULL) {
						m = m_gethdr(M_TRYWAIT, MT_DATA);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;

						len = min(min(MHLEN, resid), space);
						/*
						 * For datagram protocols, leave room
						 * for protocol headers in first mbuf.
						 */
						if (atomic && m && len < MHLEN)
							MH_ALIGN(m, len);
					} else {
						m = m_get(M_TRYWAIT, MT_DATA);
						len = min(min(MLEN, resid), space);
					}
				}
				if (m == NULL) {
					error = ENOBUFS;
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}

				space -= len;
#ifdef ZERO_COPY_SOCKETS
				if (cow_send)
					error = 0;
				else
#endif /* ZERO_COPY_SOCKETS */
				error = uiomove(mtod(m, void *), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the splnet()
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}
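/*
 * Example (sketch only, not compiled): two common ways to call sosend(),
 * per the header comment above.  "auio" is a caller-prepared struct uio
 * describing the data; "m" is a caller-built mbuf chain; both names are
 * assumptions of this sketch.
 */
#if 0
	/* Send from a struct uio (e.g., on behalf of a user thread). */
	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);

	/* Send a pre-built mbuf chain; sosend() frees it on return. */
	error = sosend(so, NULL, NULL, m, NULL, 0, td);
#endif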
/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			vm_page_t pg;
			int disposable;

			if ((m->m_flags & M_EXT)
			    && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
			if (uio->uio_offset == -1)
				uio->uio_offset = IDX_TO_OFF(pg->pindex);

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, pg->object,
			    disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf
 * chain of a socket buffer, push necessary state changes back into the
 * socket buffer so that other consumers see the values consistently.
 * 'nextrecord' is the caller's locally stored value of the original value
 * of sb->sb_mb->m_nextpkt which must be restored when the lead mbuf
 * changes.  NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary,
	 * make it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}

/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf by sbappend*.
 * In particular, each record (mbufs linked through m_next) must begin with
 * an address if the protocol so specifies, followed by an optional mbuf or
 * mbufs containing ancillary data, and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.  Although the
 * sockbuf is locked, new data may still be appended, and thus we must
 * maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only
 * for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more (subject
	 * to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat), and
	 *   3. MSG_DONTWAIT is not set.
	 * If MSG_WAITALL is set but resid is larger than the receive
	 * buffer, we have to do the receive in sections, and thus risk
	 * returning a short count if a timeout or signal occurs after we
	 * start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of
	 * the pointer to the next record in the socket buffer.  We must
	 * keep the various socket buffer pointers and local stack versions
	 * of the pointers in sync, pushing out modifications before
	 * dropping the socket buffer mutex, and re-reading them when
	 * picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data
	 * mbufs in the first mbuf chain on the socket buffer.  If MSG_PEEK,
	 * we just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so
	 * far such that soreceive() can stop reading if the type changes,
	 * which causes soreceive() to return only one of regular data and
	 * inline out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.  Otherwise copy
		 * them out via the uio, then free.  Sockbuf must be
		 * consistent here (points to current mbuf, it points to
		 * next record) when we drop priority; we must note any
		 * additions to the sockbuf when we block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				    && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
				    moff));

				if (uio->uio_offset == -1)
					uio->uio_offset = IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio, pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, M_TRYWAIT);
					SOCKBUF_LOCK(&so->so_rcv);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return with a
		 * short count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 &&
	    (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}
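/*
 * Example (sketch only, not compiled): a typical soreceive() call that
 * copies data out through a caller-prepared uio ("auio", an assumption of
 * this sketch); passing a non-NULL mp0 instead returns the data as an
 * mbuf chain, with the uio used only for its resid count.
 */
#if 0
	flags = 0;
	error = soreceive(so, NULL, &auio, NULL, NULL, &flags);
	if (error == 0 && (flags & MSG_TRUNC)) {
		/* record was truncated (PR_ATOMIC protocols) */
	}
#endif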
int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}

void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
static int
do_setopt_accept_filter(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	struct accept_filter_arg *afap;
	struct accept_filter *afp;
	struct so_accf *newaf;
	int error = 0;

	newaf = NULL;
	afap = NULL;

	/*
	 * XXXRW: Configuring accept filters should be an atomic test-and-set
	 * operation to prevent races during setup and attach.  There may be
	 * more general issues of racing and ordering here that are not yet
	 * addressed by locking.
	 */
	/* do not set/remove accept filters on non listen sockets */
	SOCK_LOCK(so);
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		SOCK_UNLOCK(so);
		return (EINVAL);
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (so->so_accf != NULL) {
			struct so_accf *af = so->so_accf;
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		SOCK_UNLOCK(so);
		return (0);
	}
	SOCK_UNLOCK(so);

	/*-
	 * Adding a filter.
	 *
	 * Do memory allocation, copyin, and filter lookup now while we're
	 * not holding any locks.  Avoids sleeping with a mutex, as well as
	 * introducing a lock order between accept filter locks and socket
	 * locks here.
	 */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
	    M_WAITOK);
	/* don't put large objects on the kernel stack */
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error) {
		FREE(afap, M_TEMP);
		return (error);
	}
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		FREE(afap, M_TEMP);
		return (ENOENT);
	}

	/*
	 * Allocate the new accept filter instance storage.  We may have to
	 * free it again later if we fail to attach it.  If attached
	 * properly, 'newaf' is NULLed to avoid a free() while in use.
	 */
	MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
	    M_ZERO);
	if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
		int len = strlen(afap->af_name) + 1;
		MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
		    M_WAITOK);
		strcpy(newaf->so_accept_filter_str, afap->af_name);
	}

	SOCK_LOCK(so);
	/* must remove previous filter first */
	if (so->so_accf != NULL) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Invoke the accf_create() method of the filter if required.
	 * XXXRW: the socket mutex is held over this call, so the create
	 * method cannot block.
	 * This may be something we have to change, but it would require
	 * addressing possible races.
	 */
	if (afp->accf_create != NULL) {
		newaf->so_accept_filter_arg =
		    afp->accf_create(so, afap->af_arg);
		if (newaf->so_accept_filter_arg == NULL) {
			error = EINVAL;
			goto out;
		}
	}
	newaf->so_accept_filter = afp;
	so->so_accf = newaf;
	so->so_options |= SO_ACCEPTFILTER;
	newaf = NULL;
out:
	SOCK_UNLOCK(so);
	if (newaf != NULL) {
		if (newaf->so_accept_filter_str != NULL)
			FREE(newaf->so_accept_filter_str, M_ACCF);
		FREE(newaf, M_ACCF);
	}
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in an
 * additional variant to handle the case where the option value needs to be
 * some kind of integer, but not a specific size.  In addition to their use
 * here, these functions are also called by the protocol-level pr_ctloutput()
 * routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it, but if we
	 * don't get the minimum length the caller wants, we return EINVAL.
	 * On success, sopt->sopt_valsize is set to however much we actually
	 * retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Kernel version of setsockopt(2).
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}
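/*
 * Example (sketch only, not compiled): enabling a boolean option from
 * within the kernel via so_setsockopt().  sopt_td is left NULL by the
 * wrapper, so the option value is copied with bcopy() rather than
 * copyin().
 */
#if 0
	int on = 1;

	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
#endif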
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
	u_long val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)(so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
			    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
			    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)(so, sopt));
		}
	}
bad:
	return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.  Traditional
	 * behavior is that we always tell the user precisely how much we
	 * copied, rather than something useful like the total amount we
	 * had available for her.  Note that this interface is not
	 * idempotent; the entire answer must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}
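/*
 * Example (sketch only, not compiled): a protocol pr_ctloutput() routine
 * would typically answer a SOPT_GET request with sooptcopyout(), as the
 * comment above describes.  "sc->sc_someflag" stands in for hypothetical
 * per-protocol state.
 */
#if 0
	int optval = sc->sc_someflag;	/* hypothetical per-protocol state */

	return (sooptcopyout(sopt, &optval, sizeof optval));
#endif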
int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)(so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			/* Unlocked read. */
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			    M_TEMP, M_WAITOK | M_ZERO);
			SOCK_LOCK(so);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name,
				    so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg,
					    so->so_accf->so_accept_filter_str);
			}
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX: prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/* XXX: copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)
		/* the chain should have been allocated large enough by soopt_getm() */
		panic("ip6_sooptmcopyin");
	return 0;
}

/* XXX: copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return 0;
}

void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		    POLLRDBAND)) {
			SOCKBUF_LOCK(&so->so_rcv);
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			SOCKBUF_LOCK(&so->so_snd);
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_snd);
		}
	}

	return (revents);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (1);
	}

	SOCKBUF_LOCK(sb);
	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}
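
/*
 * The knote filters below implement the socket side of EVFILT_READ and
 * EVFILT_WRITE, registered via soo_kqfilter() above.  A minimal userland
 * sketch of what they service (illustrative only; kq and fd are assumed
 * to be an existing kqueue and socket descriptor):
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	register the filter
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	wait; ev.data reports the
 *						readable byte count that
 *						filt_soread() computes
 *
 * Passing NOTE_LOWAT in fflags (seen here as kn_sfflags) substitutes a
 * caller-supplied threshold for the socket's low water mark.
 */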

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_soread() can be called
	 * either from KNOTE() in the socket context, where the socket
	 * buffer lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_rcv);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_rcv);
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (result);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_sowrite() can be called
	 * either from KNOTE() in the socket context, where the socket
	 * buffer lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_snd);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		result = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (kn->kn_data >= so->so_snd.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_snd);
	return (result);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}
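
/*
 * Usage note (illustrative): socheckuid() collapses a credential check
 * into a single call -- it returns 0 only when the socket exists and its
 * cached credential matches the given uid, and EPERM otherwise, so a
 * caller can gate an operation with:
 *
 *	if (socheckuid(so, uid) != 0)
 *		return (EPERM);
 */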