/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
	struct socket *so;
#ifdef MAC
	int error;
#endif

	so = uma_zalloc(socket_zone, mflags | M_ZERO);
	if (so != NULL) {
#ifdef MAC
		error = mac_init_socket(so, mflags);
		if (error != 0) {
			uma_zfree(socket_zone, so);
			so = NULL;
			return so;
		}
#endif
		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
		/* sx_init(&so->so_sxlock, "socket sxlock"); */
		TAILQ_INIT(&so->so_aiojobq);
		mtx_lock(&so_global_mtx);
		so->so_gencnt = ++so_gencnt;
		++numopensockets;
		mtx_unlock(&so_global_mtx);
	}
	return so;
}
/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
	int dom;
	struct socket **aso;
	int type;
	int proto;
	struct ucred *cred;
	struct thread *td;
{
	struct protosw *prp;
	struct socket *so;
	int error;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
		return (EPROTONOSUPPORT);

	if (jailed(cred) && jail_socket_unixiproute_only &&
	    prp->pr_domain->dom_family != PF_LOCAL &&
	    prp->pr_domain->dom_family != PF_INET &&
	    prp->pr_domain->dom_family != PF_ROUTE) {
		return (EPROTONOSUPPORT);
	}

	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(M_WAITOK);
	if (so == NULL)
		return (ENOBUFS);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->so_cred = crhold(cred);
	so->so_proto = prp;
#ifdef MAC
	mac_create_socket(cred, so);
#endif
	SOCK_LOCK(so);
	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
	soref(so);
	SOCK_UNLOCK(so);
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
	if (error) {
		SOCK_LOCK(so);
		so->so_state |= SS_NOFDREF;
		sorele(so);
		return (error);
	}
	*aso = so;
	return (0);
}

int
sobind(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{

	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
	mtx_lock(&so_global_mtx);
	so->so_gencnt = ++so_gencnt;
	mtx_unlock(&so_global_mtx);
	if (so->so_rcv.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
	if (so->so_snd.sb_hiwat)
		(void)chgsbsize(so->so_cred->cr_uidinfo,
		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
	/* remove accept filter if one is present. */
	if (so->so_accf != NULL)
		do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
	mac_destroy_socket(so);
#endif
	crfree(so->so_cred);
	SOCKBUF_LOCK_DESTROY(&so->so_snd);
	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
	/* sx_destroy(&so->so_sxlock); */
	uma_zfree(socket_zone, so);
	/*
	 * XXXRW: Seems like a shame to grab the mutex again down here, but
	 * we don't want to decrement the socket count until after we free
	 * the socket, and we can't increment the gencnt on the socket after
	 * we free it, so...
	 */
	mtx_lock(&so_global_mtx);
	--numopensockets;
	mtx_unlock(&so_global_mtx);
}
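
/*
 * Usage sketch (illustrative, not from this file): an in-kernel consumer
 * would create and bind a socket roughly as follows, where 'sin' is a
 * hypothetical sockaddr_in the caller has filled in and error handling
 * is elided:
 *
 *	struct socket *so;
 *	int error;
 *
 *	error = socreate(AF_INET, &so, SOCK_STREAM, 0, td->td_ucred, td);
 *	if (error == 0)
 *		error = sobind(so, (struct sockaddr *)&sin, td);
 *	...
 *	(void) soclose(so);
 */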
int
solisten(so, backlog, td)
	struct socket *so;
	int backlog;
	struct thread *td;
{
	int error;

	/*
	 * XXXRW: Ordering issue here -- perhaps we need to set
	 * SO_ACCEPTCONN before the call to pru_listen()?
	 * XXXRW: General atomic test-and-set concerns here also.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
	    SS_ISDISCONNECTING))
		return (EINVAL);
	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
	if (error)
		return (error);
	ACCEPT_LOCK();
	if (TAILQ_EMPTY(&so->so_comp)) {
		SOCK_LOCK(so);
		so->so_options |= SO_ACCEPTCONN;
		SOCK_UNLOCK(so);
	}
	if (backlog < 0 || backlog > somaxconn)
		backlog = somaxconn;
	so->so_qlimit = backlog;
	ACCEPT_UNLOCK();
	return (0);
}

void
sofree(so)
	struct socket *so;
{
	struct socket *head;

	KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
	SOCK_LOCK_ASSERT(so);

	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
		SOCK_UNLOCK(so);
		return;
	}

	SOCK_UNLOCK(so);
	ACCEPT_LOCK();
	head = so->so_head;
	if (head != NULL) {
		KASSERT((so->so_qstate & SQ_COMP) != 0 ||
		    (so->so_qstate & SQ_INCOMP) != 0,
		    ("sofree: so_head != NULL, but neither SQ_COMP nor "
		    "SQ_INCOMP"));
		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
		    (so->so_qstate & SQ_INCOMP) == 0,
		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
		/*
		 * accept(2) is responsible for draining the completed
		 * connection queue and freeing those sockets, so we just
		 * return here if this socket is currently on the completed
		 * connection queue.  Otherwise, accept(2) may hang after
		 * select(2) has indicated that a listening socket was
		 * ready.  If it's an incomplete connection, we remove it
		 * from the queue and free it; otherwise, it won't be
		 * released until the listening socket is closed.
		 */
		if ((so->so_qstate & SQ_COMP) != 0) {
			ACCEPT_UNLOCK();
			return;
		}
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		so->so_qstate &= ~SQ_INCOMP;
		so->so_head = NULL;
	}
	KASSERT((so->so_qstate & SQ_COMP) == 0 &&
	    (so->so_qstate & SQ_INCOMP) == 0,
	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
	ACCEPT_UNLOCK();
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_NOINTR;
	(void)sblock(&so->so_snd, M_WAITOK);
	/*
	 * socantsendmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantsendmore_locked(so);
	SOCKBUF_LOCK(&so->so_snd);
	sbunlock(&so->so_snd);
	sbrelease_locked(&so->so_snd, so);
	SOCKBUF_UNLOCK(&so->so_snd);
	sorflush(so);
	knlist_destroy(&so->so_rcv.sb_sel.si_note);
	knlist_destroy(&so->so_snd.sb_sel.si_note);
	sodealloc(so);
}
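
/*
 * Reference model sketch (an assumption drawn from the soref()/sorele()
 * calls in this file, not a verbatim excerpt): a caller taking a temporary
 * reference does so with the socket lock held, and sorele() both drops the
 * reference and, via sofree(), may destroy the socket:
 *
 *	SOCK_LOCK(so);
 *	soref(so);
 *	SOCK_UNLOCK(so);
 *	...use so...
 *	SOCK_LOCK(so);
 *	sorele(so);		(may free 'so'; do not touch it afterward)
 */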
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
	struct socket *so;
{
	int error = 0;

	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

	funsetown(&so->so_sigio);
	if (so->so_options & SO_ACCEPTCONN) {
		struct socket *sp;
		ACCEPT_LOCK();
		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;
			sp->so_qstate &= ~SQ_INCOMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;
			sp->so_qstate &= ~SQ_COMP;
			sp->so_head = NULL;
			ACCEPT_UNLOCK();
			(void) soabort(sp);
			ACCEPT_LOCK();
		}
		ACCEPT_UNLOCK();
	}
	if (so->so_pcb == NULL)
		goto discard;
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnect(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			while (so->so_state & SS_ISCONNECTED) {
				error = tsleep(&so->so_timeo,
				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
				if (error)
					break;
			}
		}
	}
drop:
	if (so->so_pcb != NULL) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
discard:
	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
	so->so_state |= SS_NOFDREF;
	sorele(so);
	return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
	struct socket *so;
{
	int error;

	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	if (error) {
		SOCK_LOCK(so);
		sotryfree(so);	/* note: does not decrement the ref count */
		return error;
	}
	return (0);
}

int
soaccept(so, nam)
	struct socket *so;
	struct sockaddr **nam;
{
	int error;

	SOCK_LOCK(so);
	KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
	so->so_state &= ~SS_NOFDREF;
	SOCK_UNLOCK(so);
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
	return (error);
}
int
soconnect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	int error;

	if (so->so_options & SO_ACCEPTCONN)
		return (EOPNOTSUPP);
	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnect(so))))
		error = EISCONN;
	else
		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
	return (error);
}

int
soconnect2(so1, so2)
	struct socket *so1;
	struct socket *so2;
{

	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
	struct socket *so;
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ENOTCONN);
	if (so->so_state & SS_ISDISCONNECTING)
		return (EALREADY);
	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
	int size_ok;
	int align_ok;
	int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
	struct socket *so;
	struct sockaddr *addr;
	struct uio *uio;
	struct mbuf *top;
	struct mbuf *control;
	int flags;
	struct thread *td;
{
	struct mbuf **mp;
	struct mbuf *m;
	long space, len = 0, resid;
	int clen = 0, error, dontroute;
	int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
	int cow_send;
#endif /* ZERO_COPY_SOCKETS */

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	if (td != NULL)
		td->td_proc->p_stats->p_ru.ru_msgsnd++;
	if (control != NULL)
		clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

	SOCKBUF_LOCK(&so->so_snd);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out_locked;
	do {
		SOCKBUF_LOCK_ASSERT(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
			snderr(EPIPE);
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0) {
			/*
			 * `sendto' and `sendmsg' are allowed on a connection-
			 * based socket if it supports implied connect.
			 * Return ENOTCONN if not connected and no address is
			 * supplied.
			 */
			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
				    !(resid == 0 && clen != 0))
					snderr(ENOTCONN);
			} else if (addr == NULL)
				snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
				    ENOTCONN : EDESTADDRREQ);
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		if ((atomic && resid > so->so_snd.sb_hiwat) ||
		    clen > so->so_snd.sb_hiwat)
			snderr(EMSGSIZE);
		if (space < resid + clen &&
		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
				snderr(EWOULDBLOCK);
			sbunlock(&so->so_snd);
			error = sbwait(&so->so_snd);
			if (error)
				goto out_locked;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		mp = &top;
		space -= clen;
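		/*
		 * (Descriptive note, added for clarity.)  Each pass of the
		 * loop below builds one mbuf chain from the uio (or consumes
		 * the caller-supplied "top" chain) and hands it to the
		 * protocol's pru_send method; mbuf clusters are used when at
		 * least MINCLSIZE bytes remain to be copied.
		 */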
		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else do {
#ifdef ZERO_COPY_SOCKETS
				cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
				if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
					if (top == NULL) {
						MGETHDR(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else {
						MGET(m, M_TRYWAIT, MT_DATA);
						if (m == NULL) {
							error = ENOBUFS;
							SOCKBUF_LOCK(&so->so_snd);
							goto release;
						}
					}
					if (so_zero_copy_send &&
					    resid >= PAGE_SIZE &&
					    space >= PAGE_SIZE &&
					    uio->uio_iov->iov_len >= PAGE_SIZE) {
						so_zerocp_stats.size_ok++;
						if (!((vm_offset_t)
						    uio->uio_iov->iov_base & PAGE_MASK)) {
							so_zerocp_stats.align_ok++;
							cow_send = socow_setup(m, uio);
						}
					}
					if (!cow_send) {
						MCLGET(m, M_TRYWAIT);
						if ((m->m_flags & M_EXT) == 0) {
							m_free(m);
							m = NULL;
						} else {
							len = min(min(MCLBYTES, resid), space);
						}
					} else
						len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
					if (top == NULL) {
						m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;
					} else
						m = m_getcl(M_TRYWAIT, MT_DATA, 0);
					len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
				} else {
					if (top == NULL) {
						m = m_gethdr(M_TRYWAIT, MT_DATA);
						m->m_pkthdr.len = 0;
						m->m_pkthdr.rcvif = (struct ifnet *)0;

						len = min(min(MHLEN, resid), space);
						/*
						 * For datagram protocols, leave room
						 * for protocol headers in first mbuf.
						 */
						if (atomic && m && len < MHLEN)
							MH_ALIGN(m, len);
					} else {
						m = m_get(M_TRYWAIT, MT_DATA);
						len = min(min(MLEN, resid), space);
					}
				}
				if (m == NULL) {
					error = ENOBUFS;
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}

				space -= len;
#ifdef ZERO_COPY_SOCKETS
				if (cow_send)
					error = 0;
				else
#endif /* ZERO_COPY_SOCKETS */
				error = uiomove(mtod(m, void *), (int)len, uio);
				resid = uio->uio_resid;
				m->m_len = len;
				*mp = m;
				top->m_pkthdr.len += len;
				if (error) {
					SOCKBUF_LOCK(&so->so_snd);
					goto release;
				}
				mp = &m->m_next;
				if (resid <= 0) {
					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;
					break;
				}
			} while (space > 0 && atomic);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options |= SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date.  We could have received
			 * a reset packet in an interrupt or maybe we slept
			 * while doing page faults in uiomove() etc.  We
			 * could probably recheck again inside the locking
			 * protection here, but there are probably other
			 * places that this also happens.  We must rethink
			 * this.
			 */
			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
			    (flags & MSG_OOB) ? PRUS_OOB :
			/*
			 * If the user set MSG_EOF, the protocol understands
			 * this flag, and there is nothing left to send, then
			 * use PRU_SEND_EOF instead of PRU_SEND.
			 */
			    ((flags & MSG_EOF) &&
			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			     (resid <= 0)) ?
				PRUS_EOF :
			/* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ?
				PRUS_MORETOCOME : 0,
			    top, addr, control, td);
			if (dontroute) {
				SOCK_LOCK(so);
				so->so_options &= ~SO_DONTROUTE;
				SOCK_UNLOCK(so);
			}
			clen = 0;
			control = NULL;
			top = NULL;
			mp = &top;
			if (error) {
				SOCKBUF_LOCK(&so->so_snd);
				goto release;
			}
		} while (resid && space > 0);
		SOCKBUF_LOCK(&so->so_snd);
	} while (resid);

release:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	sbunlock(&so->so_snd);
out_locked:
	SOCKBUF_LOCK_ASSERT(&so->so_snd);
	SOCKBUF_UNLOCK(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
	struct socket *so;
	struct uio *uio;
	int flags;
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_TRYWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
#ifdef ZERO_COPY_SOCKETS
		if (so_zero_copy_receive) {
			vm_page_t pg;
			int disposable;

			if ((m->m_flags & M_EXT)
			 && (m->m_ext.ext_type == EXT_DISPOSABLE))
				disposable = 1;
			else
				disposable = 0;

			pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
			if (uio->uio_offset == -1)
				uio->uio_offset = IDX_TO_OFF(pg->pindex);

			error = uiomoveco(mtod(m, void *),
			    min(uio->uio_resid, m->m_len),
			    uio, pg->object,
			    disposable);
		} else
#endif /* ZERO_COPY_SOCKETS */
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf
 * chain of a socket buffer, push necessary state changes back into the
 * socket buffer so that other consumers see the values consistently.
 * 'nextrecord' is the caller's locally stored value of the original value
 * of sb->sb_mb->m_nextpkt which must be restored when the lead mbuf
 * changes.  NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

	SOCKBUF_LOCK_ASSERT(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
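
/*
 * Illustration (sketch, not from the original file): with records R1 and
 * R2 queued, a caller that frees R1's lead mbuf 'm' first caches
 * nextrecord = m->m_nextpkt (R2), replaces sb_mb, then calls
 * sockbuf_pushsync(sb, nextrecord) so that sb_mb and sb_lastrecord again
 * describe the chain R1' -> R2 (or just R2 if R1 is now empty).
 */
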
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
	struct socket *so;
	struct sockaddr **psa;
	struct uio *uio;
	struct mbuf **mp0;
	struct mbuf **controlp;
	int *flagsp;
{
	struct mbuf *m, **mp;
	int flags, len, error, offset;
	struct protosw *pr = so->so_proto;
	struct mbuf *nextrecord;
	int moff, type = 0;
	int orig_resid = uio->uio_resid;

	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp != NULL)
		*mp = NULL;
	if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);

	SOCKBUF_LOCK(&so->so_rcv);
restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error)
		goto out;

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio->uio_resid) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		KASSERT(m != NULL || !so->so_rcv.sb_cc,
		    ("receive: m == %p so->so_rcv.sb_cc == %u",
		    m, so->so_rcv.sb_cc));
		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			if (m)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio->uio_resid == 0)
			goto release;
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		sbunlock(&so->so_rcv);
		error = sbwait(&so->so_rcv);
		if (error)
			goto out;
		goto restart;
	}
dontblock:
	/*
	 * From this point onward, we maintain 'nextrecord' as a cache of the
	 * pointer to the next record in the socket buffer.  We must keep the
	 * various socket buffer pointers and local stack versions of the
	 * pointers in sync, pushing out modifications before dropping the
	 * socket buffer mutex, and re-reading them when picking it up.
	 *
	 * Otherwise, we will race with the network stack appending new data
	 * or records onto the socket buffer by using inconsistent/stale
	 * versions of the field, possibly resulting in socket buffer
	 * corruption.
	 *
	 * By holding the high-level sblock(), we prevent simultaneous
	 * readers from pulling off the front of the socket buffer.
	 */
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (uio->uio_td)
		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);
	nextrecord = m->m_nextpkt;
	if (pr->pr_flags & PR_ADDR) {
		KASSERT(m->m_type == MT_SONAME,
		    ("m->m_type == %d", m->m_type));
		orig_resid = 0;
		if (psa != NULL)
			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
			    M_NOWAIT);
		if (flags & MSG_PEEK) {
			m = m->m_next;
		} else {
			sbfree(&so->so_rcv, m);
			so->so_rcv.sb_mb = m_free(m);
			m = so->so_rcv.sb_mb;
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		}
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization (or freeing if controlp == NULL).
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		struct mbuf *cm = NULL, *cmn;
		struct mbuf **cme = &cm;

		do {
			if (flags & MSG_PEEK) {
				if (controlp != NULL) {
					*controlp = m_copy(m, 0, m->m_len);
					controlp = &(*controlp)->m_next;
				}
				m = m->m_next;
			} else {
				sbfree(&so->so_rcv, m);
				so->so_rcv.sb_mb = m->m_next;
				m->m_next = NULL;
				*cme = m;
				cme = &(*cme)->m_next;
				m = so->so_rcv.sb_mb;
			}
		} while (m != NULL && m->m_type == MT_CONTROL);
		if ((flags & MSG_PEEK) == 0)
			sockbuf_pushsync(&so->so_rcv, nextrecord);
		while (cm != NULL) {
			cmn = cm->m_next;
			cm->m_next = NULL;
			if (pr->pr_domain->dom_externalize != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				error = (*pr->pr_domain->dom_externalize)
				    (cm, controlp);
				SOCKBUF_LOCK(&so->so_rcv);
			} else if (controlp != NULL)
				*controlp = cm;
			else
				m_freem(cm);
			if (controlp != NULL) {
				orig_resid = 0;
				while (*controlp != NULL)
					controlp = &(*controlp)->m_next;
			}
			cm = cmn;
		}
		nextrecord = so->so_rcv.sb_mb->m_nextpkt;
		orig_resid = 0;
	}
	if (m != NULL) {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(m->m_nextpkt == nextrecord,
			    ("soreceive: post-control, nextrecord !sync"));
			if (nextrecord == NULL) {
				KASSERT(so->so_rcv.sb_mb == m,
				    ("soreceive: post-control, sb_mb!=m"));
				KASSERT(so->so_rcv.sb_lastrecord == m,
				    ("soreceive: post-control, lastrecord!=m"));
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if ((flags & MSG_PEEK) == 0) {
			KASSERT(so->so_rcv.sb_mb == nextrecord,
			    ("soreceive: sb_mb != nextrecord"));
			if (so->so_rcv.sb_mb == NULL) {
				KASSERT(so->so_rcv.sb_lastrecord == NULL,
				    ("soreceive: sb_lastrecord != NULL"));
			}
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SBLASTRECORDCHK(&so->so_rcv);
	SBLASTMBUFCHK(&so->so_rcv);

	/*
	 * Now continue to read any data mbufs off of the head of the socket
	 * buffer until the read request is satisfied.  Note that 'type' is
	 * used to store the type of any mbuf reads that have happened so far
	 * such that soreceive() can stop reading if the type changes, which
	 * causes soreceive() to return only one of regular data and inline
	 * out-of-band data in a single socket receive operation.
	 */
	moff = 0;
	offset = 0;
	while (m != NULL && uio->uio_resid > 0 && error == 0) {
		/*
		 * If the type of mbuf has changed since the last mbuf
		 * examined ('type'), end the receive operation.
		 */
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA)
			break;
		else
			KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
			    ("m->m_type == %d", m->m_type));
		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
		len = uio->uio_resid;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
			if (so_zero_copy_receive) {
				vm_page_t pg;
				int disposable;

				if ((m->m_flags & M_EXT)
				 && (m->m_ext.ext_type == EXT_DISPOSABLE))
					disposable = 1;
				else
					disposable = 0;

				pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
				    moff));

				if (uio->uio_offset == -1)
					uio->uio_offset = IDX_TO_OFF(pg->pindex);

				error = uiomoveco(mtod(m, char *) + moff,
				    (int)len, uio, pg->object,
				    disposable);
			} else
#endif /* ZERO_COPY_SOCKETS */
			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
			SOCKBUF_LOCK(&so->so_rcv);
			if (error)
				goto release;
		} else
			uio->uio_resid -= len;
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					so->so_rcv.sb_mb = m_free(m);
					m = so->so_rcv.sb_mb;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv);
				SBLASTMBUFCHK(&so->so_rcv);
			}
		} else {
			if (flags & MSG_PEEK)
				moff += len;
			else {
				if (mp != NULL) {
					SOCKBUF_UNLOCK(&so->so_rcv);
					*mp = m_copym(m, 0, len, M_TRYWAIT);
					SOCKBUF_LOCK(&so->so_rcv);
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_rcv.sb_state |= SBS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL flag is set (for non-atomic socket),
		 * we must not quit until "uio->uio_resid == 0" or an error
		 * termination.  If a signal/timeout occurs, return
		 * with a short count but without error.
		 * Keep sockbuf locked against other readers.
		 */
		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
		    !sosendallatonce(so) && nextrecord == NULL) {
			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
			if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
				break;
			/*
			 * Notify the protocol that some data has been
			 * drained before blocking.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
				SOCKBUF_UNLOCK(&so->so_rcv);
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
				SOCKBUF_LOCK(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv);
			SBLASTMBUFCHK(&so->so_rcv);
			error = sbwait(&so->so_rcv);
			if (error)
				goto release;
			m = so->so_rcv.sb_mb;
			if (m != NULL)
				nextrecord = m->m_nextpkt;
		}
	}

	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		flags |= MSG_TRUNC;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord_locked(&so->so_rcv);
	}
	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			so->so_rcv.sb_mb = nextrecord;
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL)
				so->so_rcv.sb_lastrecord = nextrecord;
		}
		SBLASTRECORDCHK(&so->so_rcv);
		SBLASTMBUFCHK(&so->so_rcv);
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			SOCKBUF_UNLOCK(&so->so_rcv);
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(&so->so_rcv);
		}
	}
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	if (orig_resid == uio->uio_resid && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv);
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	sbunlock(&so->so_rcv);
out:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (error);
}

int
soshutdown(so, how)
	struct socket *so;
	int how;
{
	struct protosw *pr = so->so_proto;

	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
		return (EINVAL);

	if (how != SHUT_WR)
		sorflush(so);
	if (how != SHUT_RD)
		return ((*pr->pr_usrreqs->pru_shutdown)(so));
	return (0);
}
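
/*
 * Behavior summary (restating the code above, added for clarity):
 * shutdown(2) lands here, so SHUT_RD flushes the receive side via
 * sorflush() only, SHUT_WR calls only the protocol's pru_shutdown
 * method, and SHUT_RDWR does both.
 */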
void
sorflush(so)
	struct socket *so;
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;

	/*
	 * XXXRW: This is quite ugly.  The existing code made a copy of the
	 * socket buffer, then zero'd the original to clear the buffer
	 * fields.  However, with mutexes in the socket buffer, this causes
	 * problems.  We only clear the zeroable bits of the original;
	 * however, we have to initialize and destroy the mutex in the copy
	 * so that dom_dispose() and sbrelease() can lock it as needed.
	 */
	SOCKBUF_LOCK(sb);
	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAITOK);
	/*
	 * socantrcvmore_locked() drops the socket buffer mutex so that it
	 * can safely perform wakeups.  Re-acquire the mutex before
	 * continuing.
	 */
	socantrcvmore_locked(so);
	SOCKBUF_LOCK(sb);
	sbunlock(sb);
	/*
	 * Invalidate/clear most of the sockbuf structure, but leave
	 * selinfo and mutex data unchanged.
	 */
	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
	bcopy(&sb->sb_startzero, &asb.sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	bzero(&sb->sb_startzero,
	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
	SOCKBUF_UNLOCK(sb);

	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	sbrelease(&asb, so);
	SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
static int
do_setopt_accept_filter(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	struct accept_filter_arg *afap;
	struct accept_filter *afp;
	struct so_accf *newaf;
	int error = 0;

	newaf = NULL;
	afap = NULL;

	/*
	 * XXXRW: Configuring accept filters should be an atomic test-and-set
	 * operation to prevent races during setup and attach.  There may be
	 * more general issues of racing and ordering here that are not yet
	 * addressed by locking.
	 */
	/* do not set/remove accept filters on non listen sockets */
	SOCK_LOCK(so);
	if ((so->so_options & SO_ACCEPTCONN) == 0) {
		SOCK_UNLOCK(so);
		return (EINVAL);
	}

	/* removing the filter */
	if (sopt == NULL) {
		if (so->so_accf != NULL) {
			struct so_accf *af = so->so_accf;
			if (af->so_accept_filter != NULL &&
			    af->so_accept_filter->accf_destroy != NULL) {
				af->so_accept_filter->accf_destroy(so);
			}
			if (af->so_accept_filter_str != NULL) {
				FREE(af->so_accept_filter_str, M_ACCF);
			}
			FREE(af, M_ACCF);
			so->so_accf = NULL;
		}
		so->so_options &= ~SO_ACCEPTFILTER;
		SOCK_UNLOCK(so);
		return (0);
	}
	SOCK_UNLOCK(so);

	/*-
	 * Adding a filter.
	 *
	 * Do memory allocation, copyin, and filter lookup now while we're
	 * not holding any locks.  Avoids sleeping with a mutex, as well as
	 * introducing a lock order between accept filter locks and socket
	 * locks here.
	 */
	MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP,
	    M_WAITOK);
	/* don't put large objects on the kernel stack */
	error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
	afap->af_name[sizeof(afap->af_name)-1] = '\0';
	afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
	if (error) {
		FREE(afap, M_TEMP);
		return (error);
	}
	afp = accept_filt_get(afap->af_name);
	if (afp == NULL) {
		FREE(afap, M_TEMP);
		return (ENOENT);
	}

	/*
	 * Allocate the new accept filter instance storage.  We may have to
	 * free it again later if we fail to attach it.  If attached
	 * properly, 'newaf' is NULLed to avoid a free() while in use.
	 */
	MALLOC(newaf, struct so_accf *, sizeof(*newaf), M_ACCF, M_WAITOK |
	    M_ZERO);
	if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
		int len = strlen(afap->af_name) + 1;
		MALLOC(newaf->so_accept_filter_str, char *, len, M_ACCF,
		    M_WAITOK);
		strcpy(newaf->so_accept_filter_str, afap->af_name);
	}

	SOCK_LOCK(so);
	/* must remove previous filter first */
	if (so->so_accf != NULL) {
		error = EINVAL;
		goto out;
	}
	/*
	 * Invoke the accf_create() method of the filter if required.
	 * XXXRW: the socket mutex is held over this call, so the create
	 * method cannot block.
	 * This may be something we have to change, but it would require
	 * addressing possible races.
	 */
	if (afp->accf_create != NULL) {
		newaf->so_accept_filter_arg =
		    afp->accf_create(so, afap->af_arg);
		if (newaf->so_accept_filter_arg == NULL) {
			error = EINVAL;
			goto out;
		}
	}
	newaf->so_accept_filter = afp;
	so->so_accf = newaf;
	so->so_options |= SO_ACCEPTFILTER;
	newaf = NULL;
out:
	SOCK_UNLOCK(so);
	if (newaf != NULL) {
		if (newaf->so_accept_filter_str != NULL)
			FREE(newaf->so_accept_filter_str, M_ACCF);
		FREE(newaf, M_ACCF);
	}
	if (afap != NULL)
		FREE(afap, M_TEMP);
	return (error);
}
#endif /* INET */

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
	struct sockopt *sopt;
	void *buf;
	size_t len;
	size_t minlen;
{
	size_t	valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return EINVAL;
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_td != NULL)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(sopt->sopt_val, buf, valsize);
	return 0;
}

/*
 * Kernel version of setsockopt(2).
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
	struct sockopt sopt;

	sopt.sopt_level = level;
	sopt.sopt_name = optname;
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_val = optval;
	sopt.sopt_valsize = optlen;
	sopt.sopt_td = NULL;
	return (sosetopt(so, &sopt));
}
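
/*
 * Example (sketch): in-kernel callers can use so_setsockopt() above
 * without constructing a struct sockopt by hand; e.g., to enable
 * SO_REUSEADDR on an existing socket 'so':
 *
 *	int on = 1;
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
 */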
int
sosetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
	u_long	val;
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput)
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		error = ENOPROTOOPT;
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			error = do_setopt_accept_filter(so, sopt);
			if (error)
				goto bad;
			break;
#endif
		case SO_LINGER:
			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
			if (error)
				goto bad;

			SOCK_LOCK(so);
			so->so_linger = l.l_linger;
			if (l.l_onoff)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			SOCK_UNLOCK(so);
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;
			SOCK_LOCK(so);
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			SOCK_UNLOCK(so);
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof optval,
					    sizeof optval);
			if (error)
				goto bad;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto bad;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF:
				if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
				    &so->so_snd : &so->so_rcv, (u_long)optval,
				    so, curthread) == 0) {
					error = ENOBUFS;
					goto bad;
				}
				break;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT:
				SOCKBUF_LOCK(&so->so_snd);
				so->so_snd.sb_lowat =
				    (optval > so->so_snd.sb_hiwat) ?
				    so->so_snd.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_snd);
				break;
			case SO_RCVLOWAT:
				SOCKBUF_LOCK(&so->so_rcv);
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				SOCKBUF_UNLOCK(&so->so_rcv);
				break;
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin(sopt, &tv, sizeof tv,
					    sizeof tv);
			if (error)
				goto bad;

			/* assert(hz > 0); */
			if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
			    tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
				error = EDOM;
				goto bad;
			}
			/* assert(tick > 0); */
			/* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
			val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
			if (val > SHRT_MAX) {
				error = EDOM;
				goto bad;
			}
			if (val == 0 && tv.tv_usec != 0)
				val = 1;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = val;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = val;
				break;
			}
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof extmac,
			    sizeof extmac);
			if (error)
				goto bad;
			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		}
	}
bad:
	return (error);
}
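
/*
 * Worked example for the SO_SNDTIMEO/SO_RCVTIMEO conversion above
 * (assuming hz = 1000, so tick = 1000000 / hz = 1000): a timeout of
 * { tv_sec = 2, tv_usec = 500000 } is stored as
 * val = 2 * 1000 + 500000 / 1000 = 2500 ticks, which passes the
 * SHRT_MAX bound check.
 */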
/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != NULL) {
		if (sopt->sopt_td != NULL)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, sopt->sopt_val, valsize);
	}
	return error;
}

int
sogetopt(so, sopt)
	struct socket *so;
	struct sockopt *sopt;
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#ifdef INET
	struct accept_filter_arg *afap;
#endif
#ifdef MAC
	struct mac extmac;
#endif

	error = 0;
	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			return ((*so->so_proto->pr_ctloutput)
				  (so, sopt));
		} else
			return (ENOPROTOOPT);
	} else {
		switch (sopt->sopt_name) {
#ifdef INET
		case SO_ACCEPTFILTER:
			/* Unlocked read. */
			if ((so->so_options & SO_ACCEPTCONN) == 0)
				return (EINVAL);
			MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
			    M_TEMP, M_WAITOK | M_ZERO);
			SOCK_LOCK(so);
			if ((so->so_options & SO_ACCEPTFILTER) != 0) {
				strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
				if (so->so_accf->so_accept_filter_str != NULL)
					strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
			}
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, afap, sizeof(*afap));
			FREE(afap, M_TEMP);
			break;
#endif

		case SO_LINGER:
			/*
			 * XXXRW: We grab the lock here to get a consistent
			 * snapshot of both fields.  This may not really
			 * be necessary.
			 */
			SOCK_LOCK(so);
			l.l_onoff = so->so_options & SO_LINGER;
			l.l_linger = so->so_linger;
			SOCK_UNLOCK(so);
			error = sooptcopyout(sopt, &l, sizeof l);
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_BINTIME:
		case SO_NOSIGPIPE:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof optval);
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF:
			optval = so->so_snd.sb_hiwat;
			goto integer;

		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			optval = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			tv.tv_sec = optval / hz;
			tv.tv_usec = (optval % hz) * tick;
			error = sooptcopyout(sopt, &tv, sizeof tv);
			break;
		case SO_LABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
			    so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		case SO_PEERLABEL:
#ifdef MAC
			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
			    sizeof(extmac));
			if (error)
				return (error);
			error = mac_getsockopt_peerlabel(
			    sopt->sopt_td->td_ucred, so, &extmac);
			if (error)
				return (error);
			error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
			error = EOPNOTSUPP;
#endif
			break;
		default:
			error = ENOPROTOOPT;
			break;
		}
		return (error);
	}
}

/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return ENOBUFS;
	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size) {
		MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
			    M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(m);
				m_freem(*mp);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == NULL)
		return 0;
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return(error);
			}
		} else
			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		m = m->m_next;
	}
	if (m != NULL)	/* should be allocated enough at ip6_sooptmcopyin() */
		panic("ip6_sooptmcopyin");
	return 0;
}
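
/*
 * Usage pattern (sketch, not from the original file): the
 * (__FreeBSD__ < 3) compatibility helpers above and below are used
 * together by pr_ctloutput implementations, roughly:
 *
 *	struct mbuf *m = NULL;
 *
 *	error = soopt_getm(sopt, &m);	   (size an mbuf chain to the option)
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	   (fill it from sopt_val)
 *	...protocol consumes or produces m...
 *	error = soopt_mcopyout(sopt, m);   (copy results back to the caller)
 */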

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_td != NULL) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else
			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* a large enough soopt buffer must be supplied by user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}

void
sohasoutofband(so)
	struct socket *so;
{
	if (so->so_sigio != NULL)
		pgsigio(&so->so_sigio, SIGURG, 0);
	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
	int revents = 0;

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & POLLINIGNEOF)
		if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
		    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
			revents |= POLLINIGNEOF;

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events &
		    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
		    POLLRDBAND)) {
			SOCKBUF_LOCK(&so->so_rcv);
			selrecord(td, &so->so_rcv.sb_sel);
			so->so_rcv.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_rcv);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			SOCKBUF_LOCK(&so->so_snd);
			selrecord(td, &so->so_snd.sb_sel);
			so->so_snd.sb_flags |= SB_SEL;
			SOCKBUF_UNLOCK(&so->so_snd);
		}
	}

	return (revents);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;
	struct sockbuf *sb;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;
		else
			kn->kn_fop = &soread_filtops;
		sb = &so->so_rcv;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &sowrite_filtops;
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	knlist_add(&sb->sb_sel.si_note, kn, 1);
	sb->sb_flags |= SB_KNOTE;
	SOCKBUF_UNLOCK(sb);
	return (0);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_rcv);
	knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_rcv.sb_sel.si_note))
		so->so_rcv.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_rcv);
}
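
/*
 * The knote filters below implement the EVFILT_READ and EVFILT_WRITE
 * semantics registered in soo_kqfilter() above.  For reference, an
 * illustrative (user-land, not kernel) way to arm the read filter on
 * a socket s would be:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */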

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_soread() can be called
	 * either from KNOTE() in the socket context where the socket
	 * buffer lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_rcv);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_rcv);
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_rcv);
	return (result);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = kn->kn_fp->f_data;

	SOCKBUF_LOCK(&so->so_snd);
	knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1);
	if (knlist_empty(&so->so_snd.sb_sel.si_note))
		so->so_snd.sb_flags &= ~SB_KNOTE;
	SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;
	int need_lock, result;

	/*
	 * XXXRW: Conditional locking because filt_sowrite() can be called
	 * either from KNOTE() in the socket context where the socket
	 * buffer lock is already held, or from kqueue() itself.
	 */
	need_lock = !SOCKBUF_OWNED(&so->so_snd);
	if (need_lock)
		SOCKBUF_LOCK(&so->so_snd);
	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		result = 1;
	} else if (so->so_error)	/* temporary udp error */
		result = 1;
	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
		result = 0;
	else if (kn->kn_sfflags & NOTE_LOWAT)
		result = (kn->kn_data >= kn->kn_sdata);
	else
		result = (kn->kn_data >= so->so_snd.sb_lowat);
	if (need_lock)
		SOCKBUF_UNLOCK(&so->so_snd);
	return (result);
}

/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
	struct socket *so = kn->kn_fp->f_data;

	kn->kn_data = so->so_qlen;
	return (!TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

	if (so == NULL)
		return (EPERM);
	if (so->so_cred->cr_uid == uid)
		return (0);
	return (EPERM);
}
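
/*
 * Illustrative use of socheckuid() (the caller sketched here is
 * hypothetical): a credential-aware consumer such as a uid-based
 * packet filter rule could test socket ownership with
 *
 *	if (socheckuid(so, rule_uid) == 0)
 *		-- so exists and is owned by rule_uid
 *
 * Note that EPERM is returned both when no socket is present and when
 * the uid does not match, so callers cannot distinguish the two cases.
 */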