/*
 * Copyright (c) 2004 The FreeBSD Foundation
 * Copyright (c) 2004 Robert Watson
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_mac.h"
#include "opt_zero.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/domain.h>
#include <sys/file.h>			/* for struct knote */
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/jail.h>

#include <vm/uma.h>


static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
		    int flags);

#ifdef INET
static int	do_setopt_accept_filter(struct socket *so, struct sockopt *sopt);
#endif

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };

uma_zone_t socket_zone;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

SYSCTL_DECL(_kern_ipc);

static int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW,
    &somaxconn, 0, "Maximum pending socket connection queue size");
static int numopensockets;
SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
    &numopensockets, 0, "Number of open sockets");
#ifdef ZERO_COPY_SOCKETS
/* These aren't static because they're used in other files. */
int so_zero_copy_send = 1;
int so_zero_copy_receive = 1;
SYSCTL_NODE(_kern_ipc, OID_AUTO, zero_copy, CTLFLAG_RD, 0,
    "Zero copy controls");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, receive, CTLFLAG_RW,
    &so_zero_copy_receive, 0, "Enable zero copy receive");
SYSCTL_INT(_kern_ipc_zero_copy, OID_AUTO, send, CTLFLAG_RW,
    &so_zero_copy_send, 0, "Enable zero copy send");
#endif /* ZERO_COPY_SOCKETS */

/*
 * accept_mtx locks down per-socket fields relating to accept queues.  See
 * socketvar.h for an annotation of the protected fields of struct socket.
 */
struct mtx accept_mtx;
MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);

/*
 * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
 * so_gencnt field.
 *
 * XXXRW: These variables might be better manipulated using atomic operations
 * for improved efficiency.
 */
static struct mtx so_global_mtx;
MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_global", MTX_DEF);

/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/*
 * Get a socket structure from our zone, and initialize it.
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * soalloc() returns a socket with a ref count of 0.
 */
struct socket *
soalloc(int mflags)
{
        struct socket *so;
#ifdef MAC
        int error;
#endif

        so = uma_zalloc(socket_zone, mflags | M_ZERO);
        if (so != NULL) {
#ifdef MAC
                error = mac_init_socket(so, mflags);
                if (error != 0) {
                        uma_zfree(socket_zone, so);
                        so = NULL;
                        return so;
                }
#endif
                SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
                SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
                /* sx_init(&so->so_sxlock, "socket sxlock"); */
                TAILQ_INIT(&so->so_aiojobq);
                mtx_lock(&so_global_mtx);
                so->so_gencnt = ++so_gencnt;
                ++numopensockets;
                mtx_unlock(&so_global_mtx);
        }
        return so;
}
/*
 * socreate returns a socket with a ref count of 1.  The socket should be
 * closed with soclose().
 */
int
socreate(dom, aso, type, proto, cred, td)
        int dom;
        struct socket **aso;
        int type;
        int proto;
        struct ucred *cred;
        struct thread *td;
{
        struct protosw *prp;
        struct socket *so;
        int error;

        if (proto)
                prp = pffindproto(dom, proto, type);
        else
                prp = pffindtype(dom, type);

        if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL)
                return (EPROTONOSUPPORT);

        if (jailed(cred) && jail_socket_unixiproute_only &&
            prp->pr_domain->dom_family != PF_LOCAL &&
            prp->pr_domain->dom_family != PF_INET &&
            prp->pr_domain->dom_family != PF_ROUTE) {
                return (EPROTONOSUPPORT);
        }

        if (prp->pr_type != type)
                return (EPROTOTYPE);
        so = soalloc(M_WAITOK);
        if (so == NULL)
                return (ENOBUFS);

        TAILQ_INIT(&so->so_incomp);
        TAILQ_INIT(&so->so_comp);
        so->so_type = type;
        so->so_cred = crhold(cred);
        so->so_proto = prp;
#ifdef MAC
        mac_create_socket(cred, so);
#endif
        SOCK_LOCK(so);
        soref(so);
        SOCK_UNLOCK(so);
        error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
        if (error) {
                SOCK_LOCK(so);
                so->so_state |= SS_NOFDREF;
                sorele(so);
                return (error);
        }
        *aso = so;
        return (0);
}

int
sobind(so, nam, td)
        struct socket *so;
        struct sockaddr *nam;
        struct thread *td;
{

        return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
}

void
sodealloc(struct socket *so)
{

        KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
        mtx_lock(&so_global_mtx);
        so->so_gencnt = ++so_gencnt;
        mtx_unlock(&so_global_mtx);
        if (so->so_rcv.sb_hiwat)
                (void)chgsbsize(so->so_cred->cr_uidinfo,
                    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
        if (so->so_snd.sb_hiwat)
                (void)chgsbsize(so->so_cred->cr_uidinfo,
                    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
#ifdef INET
        /* remove accept filter if one is present. */
        if (so->so_accf != NULL)
                do_setopt_accept_filter(so, NULL);
#endif
#ifdef MAC
        mac_destroy_socket(so);
#endif
        crfree(so->so_cred);
        SOCKBUF_LOCK_DESTROY(&so->so_snd);
        SOCKBUF_LOCK_DESTROY(&so->so_rcv);
        /* sx_destroy(&so->so_sxlock); */
        uma_zfree(socket_zone, so);
        /*
         * XXXRW: Seems like a shame to grab the mutex again down here, but
         * we don't want to decrement the socket count until after we free
         * the socket, and we can't increment the gencnt on the socket after
         * we free it, so...
         */
        mtx_lock(&so_global_mtx);
        --numopensockets;
        mtx_unlock(&so_global_mtx);
}

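/*
 * Illustrative sketch (editorial addition, not part of the original code):
 * a typical in-kernel consumer pairs socreate() with soclose().  The
 * address setup below is an assumption made for the example; "td" stands
 * for the calling thread.
 *
 *	struct socket *so;
 *	struct sockaddr_in sin;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_DGRAM, IPPROTO_UDP,
 *	    td->td_ucred, td);
 *	if (error)
 *		return (error);
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	error = sobind(so, (struct sockaddr *)&sin, td);
 *	if (error)
 *		soclose(so);
 */
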
int
solisten(so, backlog, td)
        struct socket *so;
        int backlog;
        struct thread *td;
{
        int error;

        /*
         * XXXRW: Ordering issue here -- perhaps we need to set
         * SO_ACCEPTCONN before the call to pru_listen()?
         * XXXRW: General atomic test-and-set concerns here also.
         */
        if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
            SS_ISDISCONNECTING))
                return (EINVAL);
        error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
        if (error)
                return (error);
        ACCEPT_LOCK();
        if (TAILQ_EMPTY(&so->so_comp)) {
                SOCK_LOCK(so);
                so->so_options |= SO_ACCEPTCONN;
                SOCK_UNLOCK(so);
        }
        if (backlog < 0 || backlog > somaxconn)
                backlog = somaxconn;
        so->so_qlimit = backlog;
        ACCEPT_UNLOCK();
        return (0);
}

void
sofree(so)
        struct socket *so;
{
        struct socket *head;

        KASSERT(so->so_count == 0, ("socket %p so_count not 0", so));
        SOCK_LOCK_ASSERT(so);

        if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0) {
                SOCK_UNLOCK(so);
                return;
        }

        SOCK_UNLOCK(so);
        ACCEPT_LOCK();
        head = so->so_head;
        if (head != NULL) {
                KASSERT((so->so_qstate & SQ_COMP) != 0 ||
                    (so->so_qstate & SQ_INCOMP) != 0,
                    ("sofree: so_head != NULL, but neither SQ_COMP nor "
                    "SQ_INCOMP"));
                KASSERT((so->so_qstate & SQ_COMP) == 0 ||
                    (so->so_qstate & SQ_INCOMP) == 0,
                    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
                /*
                 * accept(2) is responsible for draining the completed
                 * connection queue and freeing those sockets, so
                 * we just return here if this socket is currently
                 * on the completed connection queue.  Otherwise,
                 * accept(2) may hang after select(2) has indicated
                 * that a listening socket was ready.  If it's an
                 * incomplete connection, we remove it from the queue
                 * and free it; otherwise, it won't be released until
                 * the listening socket is closed.
                 */
                if ((so->so_qstate & SQ_COMP) != 0) {
                        ACCEPT_UNLOCK();
                        return;
                }
                TAILQ_REMOVE(&head->so_incomp, so, so_list);
                head->so_incqlen--;
                so->so_qstate &= ~SQ_INCOMP;
                so->so_head = NULL;
        }
        KASSERT((so->so_qstate & SQ_COMP) == 0 &&
            (so->so_qstate & SQ_INCOMP) == 0,
            ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
            so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
        ACCEPT_UNLOCK();
        SOCKBUF_LOCK(&so->so_snd);
        so->so_snd.sb_flags |= SB_NOINTR;
        (void)sblock(&so->so_snd, M_WAITOK);
        /*
         * socantsendmore_locked() drops the socket buffer mutex so that it
         * can safely perform wakeups.  Re-acquire the mutex before
         * continuing.
         */
        socantsendmore_locked(so);
        SOCKBUF_LOCK(&so->so_snd);
        sbunlock(&so->so_snd);
        sbrelease_locked(&so->so_snd, so);
        SOCKBUF_UNLOCK(&so->so_snd);
        sorflush(so);
        sodealloc(so);
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 *
 * This function will sorele() the socket.  Note that soclose() may be
 * called prior to the ref count reaching zero.  The actual socket
 * structure will not be freed until the ref count reaches zero.
 */
int
soclose(so)
        struct socket *so;
{
        int error = 0;

        KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));

        funsetown(&so->so_sigio);
        if (so->so_options & SO_ACCEPTCONN) {
                struct socket *sp;
                ACCEPT_LOCK();
                while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
                        TAILQ_REMOVE(&so->so_incomp, sp, so_list);
                        so->so_incqlen--;
                        sp->so_qstate &= ~SQ_INCOMP;
                        sp->so_head = NULL;
                        ACCEPT_UNLOCK();
                        (void) soabort(sp);
                        ACCEPT_LOCK();
                }
                while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
                        TAILQ_REMOVE(&so->so_comp, sp, so_list);
                        so->so_qlen--;
                        sp->so_qstate &= ~SQ_COMP;
                        sp->so_head = NULL;
                        ACCEPT_UNLOCK();
                        (void) soabort(sp);
                        ACCEPT_LOCK();
                }
                ACCEPT_UNLOCK();
        }
        if (so->so_pcb == NULL)
                goto discard;
        if (so->so_state & SS_ISCONNECTED) {
                if ((so->so_state & SS_ISDISCONNECTING) == 0) {
                        error = sodisconnect(so);
                        if (error)
                                goto drop;
                }
                if (so->so_options & SO_LINGER) {
                        if ((so->so_state & SS_ISDISCONNECTING) &&
                            (so->so_state & SS_NBIO))
                                goto drop;
                        while (so->so_state & SS_ISCONNECTED) {
                                error = tsleep(&so->so_timeo,
                                    PSOCK | PCATCH, "soclos", so->so_linger * hz);
                                if (error)
                                        break;
                        }
                }
        }
drop:
        if (so->so_pcb != NULL) {
                int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
                if (error == 0)
                        error = error2;
        }
discard:
        SOCK_LOCK(so);
        KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
        so->so_state |= SS_NOFDREF;
        sorele(so);
        return (error);
}

/*
 * soabort() must not be called with any socket locks held, as it calls
 * into the protocol, which will call back into the socket code causing
 * it to acquire additional socket locks that may cause recursion or lock
 * order reversals.
 */
int
soabort(so)
        struct socket *so;
{
        int error;

        error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
        if (error) {
                SOCK_LOCK(so);
                sotryfree(so);	/* note: does not decrement the ref count */
                return error;
        }
        return (0);
}

int
soaccept(so, nam)
        struct socket *so;
        struct sockaddr **nam;
{
        int error;

        SOCK_LOCK(so);
        KASSERT((so->so_state & SS_NOFDREF) != 0, ("soaccept: !NOFDREF"));
        so->so_state &= ~SS_NOFDREF;
        SOCK_UNLOCK(so);
        error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
        return (error);
}
int
soconnect(so, nam, td)
        struct socket *so;
        struct sockaddr *nam;
        struct thread *td;
{
        int error;

        if (so->so_options & SO_ACCEPTCONN)
                return (EOPNOTSUPP);
        /*
         * If protocol is connection-based, can only connect once.
         * Otherwise, if connected, try to disconnect first.
         * This allows user to disconnect by connecting to, e.g.,
         * a null address.
         */
        if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
            ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
            (error = sodisconnect(so))))
                error = EISCONN;
        else
                error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
        return (error);
}

int
soconnect2(so1, so2)
        struct socket *so1;
        struct socket *so2;
{

        return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
}

int
sodisconnect(so)
        struct socket *so;
{
        int error;

        if ((so->so_state & SS_ISCONNECTED) == 0)
                return (ENOTCONN);
        if (so->so_state & SS_ISDISCONNECTING)
                return (EALREADY);
        error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
        return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 */

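/*
 * Illustrative sketch (editorial addition): sending a preconstructed
 * mbuf chain on a connected socket.  Per the contract above, uio is NULL
 * when "top" is supplied, the chain's pkthdr length describes the data,
 * and sosend() consumes "top" (and "control") even on error, so the
 * caller must not free them again.
 *
 *	error = sosend(so, NULL, NULL, top, NULL, 0, td);
 */
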
#ifdef ZERO_COPY_SOCKETS
struct so_zerocopy_stats{
        int size_ok;
        int align_ok;
        int found_ifp;
};
struct so_zerocopy_stats so_zerocp_stats = {0,0,0};
#include <netinet/in.h>
#include <net/route.h>
#include <netinet/in_pcb.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#endif /*ZERO_COPY_SOCKETS*/

int
sosend(so, addr, uio, top, control, flags, td)
        struct socket *so;
        struct sockaddr *addr;
        struct uio *uio;
        struct mbuf *top;
        struct mbuf *control;
        int flags;
        struct thread *td;
{
        struct mbuf **mp;
        struct mbuf *m;
        long space, len = 0, resid;
        int clen = 0, error, dontroute;
        int atomic = sosendallatonce(so) || top;
#ifdef ZERO_COPY_SOCKETS
        int cow_send;
#endif /* ZERO_COPY_SOCKETS */

        if (uio != NULL)
                resid = uio->uio_resid;
        else
                resid = top->m_pkthdr.len;
        /*
         * In theory resid should be unsigned.
         * However, space must be signed, as it might be less than 0
         * if we over-committed, and we must use a signed comparison
         * of space and resid.  On the other hand, a negative resid
         * causes us to loop sending 0-length segments to the protocol.
         *
         * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
         * type sockets since that's an error.
         */
        if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
                error = EINVAL;
                goto out;
        }

        dontroute =
            (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
            (so->so_proto->pr_flags & PR_ATOMIC);
        if (td != NULL)
                td->td_proc->p_stats->p_ru.ru_msgsnd++;
        if (control != NULL)
                clen = control->m_len;
#define	snderr(errno)	{ error = (errno); goto release; }

        SOCKBUF_LOCK(&so->so_snd);
restart:
        SOCKBUF_LOCK_ASSERT(&so->so_snd);
        error = sblock(&so->so_snd, SBLOCKWAIT(flags));
        if (error)
                goto out_locked;
        do {
                SOCKBUF_LOCK_ASSERT(&so->so_snd);
                if (so->so_snd.sb_state & SBS_CANTSENDMORE)
                        snderr(EPIPE);
                if (so->so_error) {
                        error = so->so_error;
                        so->so_error = 0;
                        goto release;
                }
                if ((so->so_state & SS_ISCONNECTED) == 0) {
                        /*
                         * `sendto' and `sendmsg' are allowed on a connection-
                         * based socket if it supports implied connect.
                         * Return ENOTCONN if not connected and no address is
                         * supplied.
                         */
                        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
                            (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
                                if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                                    !(resid == 0 && clen != 0))
                                        snderr(ENOTCONN);
                        } else if (addr == NULL)
                                snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
                                    ENOTCONN : EDESTADDRREQ);
                }
                space = sbspace(&so->so_snd);
                if (flags & MSG_OOB)
                        space += 1024;
                if ((atomic && resid > so->so_snd.sb_hiwat) ||
                    clen > so->so_snd.sb_hiwat)
                        snderr(EMSGSIZE);
                if (space < resid + clen &&
                    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
                        if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
                                snderr(EWOULDBLOCK);
                        sbunlock(&so->so_snd);
                        error = sbwait(&so->so_snd);
                        if (error)
                                goto out_locked;
                        goto restart;
                }
                SOCKBUF_UNLOCK(&so->so_snd);
                mp = &top;
                space -= clen;
                do {
                        if (uio == NULL) {
                                /*
                                 * Data is prepackaged in "top".
                                 */
                                resid = 0;
                                if (flags & MSG_EOR)
                                        top->m_flags |= M_EOR;
                        } else do {
#ifdef ZERO_COPY_SOCKETS
                                cow_send = 0;
#endif /* ZERO_COPY_SOCKETS */
                                if (resid >= MINCLSIZE) {
#ifdef ZERO_COPY_SOCKETS
                                        if (top == NULL) {
                                                MGETHDR(m, M_TRYWAIT, MT_DATA);
                                                if (m == NULL) {
                                                        error = ENOBUFS;
                                                        SOCKBUF_LOCK(&so->so_snd);
                                                        goto release;
                                                }
                                                m->m_pkthdr.len = 0;
                                                m->m_pkthdr.rcvif = (struct ifnet *)0;
                                        } else {
                                                MGET(m, M_TRYWAIT, MT_DATA);
                                                if (m == NULL) {
                                                        error = ENOBUFS;
                                                        SOCKBUF_LOCK(&so->so_snd);
                                                        goto release;
                                                }
                                        }
                                        if (so_zero_copy_send &&
                                            resid>=PAGE_SIZE &&
                                            space>=PAGE_SIZE &&
                                            uio->uio_iov->iov_len>=PAGE_SIZE) {
                                                so_zerocp_stats.size_ok++;
                                                if (!((vm_offset_t)
                                                    uio->uio_iov->iov_base & PAGE_MASK)){
                                                        so_zerocp_stats.align_ok++;
                                                        cow_send = socow_setup(m, uio);
                                                }
                                        }
                                        if (!cow_send) {
                                                MCLGET(m, M_TRYWAIT);
                                                if ((m->m_flags & M_EXT) == 0) {
                                                        m_free(m);
                                                        m = NULL;
                                                } else {
                                                        len = min(min(MCLBYTES, resid), space);
                                                }
                                        } else
                                                len = PAGE_SIZE;
#else /* ZERO_COPY_SOCKETS */
                                        if (top == NULL) {
                                                m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
                                                m->m_pkthdr.len = 0;
                                                m->m_pkthdr.rcvif = (struct ifnet *)0;
                                        } else
                                                m = m_getcl(M_TRYWAIT, MT_DATA, 0);
                                        len = min(min(MCLBYTES, resid), space);
#endif /* ZERO_COPY_SOCKETS */
                                } else {
                                        if (top == NULL) {
                                                m = m_gethdr(M_TRYWAIT, MT_DATA);
                                                m->m_pkthdr.len = 0;
                                                m->m_pkthdr.rcvif = (struct ifnet *)0;

                                                len = min(min(MHLEN, resid), space);
                                                /*
                                                 * For datagram protocols, leave room
                                                 * for protocol headers in first mbuf.
                                                 */
                                                if (atomic && m && len < MHLEN)
                                                        MH_ALIGN(m, len);
                                        } else {
                                                m = m_get(M_TRYWAIT, MT_DATA);
                                                len = min(min(MLEN, resid), space);
                                        }
                                }
                                if (m == NULL) {
                                        error = ENOBUFS;
                                        SOCKBUF_LOCK(&so->so_snd);
                                        goto release;
                                }

                                space -= len;
#ifdef ZERO_COPY_SOCKETS
                                if (cow_send)
                                        error = 0;
                                else
#endif /* ZERO_COPY_SOCKETS */
                                error = uiomove(mtod(m, void *), (int)len, uio);
                                resid = uio->uio_resid;
                                m->m_len = len;
                                *mp = m;
                                top->m_pkthdr.len += len;
                                if (error) {
                                        SOCKBUF_LOCK(&so->so_snd);
                                        goto release;
                                }
                                mp = &m->m_next;
                                if (resid <= 0) {
                                        if (flags & MSG_EOR)
                                                top->m_flags |= M_EOR;
                                        break;
                                }
                        } while (space > 0 && atomic);
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options |= SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        /*
                         * XXX all the SBS_CANTSENDMORE checks previously
                         * done could be out of date.  We could have received
                         * a reset packet in an interrupt or maybe we slept
                         * while doing page faults in uiomove() etc.  We could
                         * probably recheck again inside the splnet() protection
                         * here, but there are probably other places that this
                         * also happens.  We must rethink this.
                         */
                        error = (*so->so_proto->pr_usrreqs->pru_send)(so,
                            (flags & MSG_OOB) ? PRUS_OOB :
                        /*
                         * If the user set MSG_EOF, the protocol understands
                         * this flag, and there is nothing left to send, then
                         * use PRU_SEND_EOF instead of PRU_SEND.
                         */
                            ((flags & MSG_EOF) &&
                             (so->so_proto->pr_flags & PR_IMPLOPCL) &&
                             (resid <= 0)) ?
                                PRUS_EOF :
                        /* If there is more to send set PRUS_MORETOCOME */
                            (resid > 0 && space > 0) ?
                                PRUS_MORETOCOME : 0,
                            top, addr, control, td);
                        if (dontroute) {
                                SOCK_LOCK(so);
                                so->so_options &= ~SO_DONTROUTE;
                                SOCK_UNLOCK(so);
                        }
                        clen = 0;
                        control = NULL;
                        top = NULL;
                        mp = &top;
                        if (error) {
                                SOCKBUF_LOCK(&so->so_snd);
                                goto release;
                        }
                } while (resid && space > 0);
                SOCKBUF_LOCK(&so->so_snd);
        } while (resid);

release:
        SOCKBUF_LOCK_ASSERT(&so->so_snd);
        sbunlock(&so->so_snd);
out_locked:
        SOCKBUF_LOCK_ASSERT(&so->so_snd);
        SOCKBUF_UNLOCK(&so->so_snd);
out:
        if (top != NULL)
                m_freem(top);
        if (control != NULL)
                m_freem(control);
        return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceive(),
 * is unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(so, uio, flags)
        struct socket *so;
        struct uio *uio;
        int flags;
{
        struct protosw *pr = so->so_proto;
        struct mbuf *m;
        int error;

        KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

        m = m_get(M_TRYWAIT, MT_DATA);
        if (m == NULL)
                return (ENOBUFS);
        error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
        if (error)
                goto bad;
        do {
#ifdef ZERO_COPY_SOCKETS
                if (so_zero_copy_receive) {
                        vm_page_t pg;
                        int disposable;

                        if ((m->m_flags & M_EXT)
                            && (m->m_ext.ext_type == EXT_DISPOSABLE))
                                disposable = 1;
                        else
                                disposable = 0;

                        pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t)));
                        if (uio->uio_offset == -1)
                                uio->uio_offset = IDX_TO_OFF(pg->pindex);

                        error = uiomoveco(mtod(m, void *),
                            min(uio->uio_resid, m->m_len),
                            uio, pg->object,
                            disposable);
                } else
#endif /* ZERO_COPY_SOCKETS */
                error = uiomove(mtod(m, void *),
                    (int) min(uio->uio_resid, m->m_len), uio);
                m = m_free(m);
        } while (uio->uio_resid && error == 0 && m);
bad:
        if (m != NULL)
                m_freem(m);
        return (error);
}

/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

        SOCKBUF_LOCK_ASSERT(sb);
        /*
         * First, update for the new value of nextrecord.  If necessary, make
         * it the first record.
         */
        if (sb->sb_mb != NULL)
                sb->sb_mb->m_nextpkt = nextrecord;
        else
                sb->sb_mb = nextrecord;

        /*
         * Now update any dependent socket buffer fields to reflect the new
         * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
         * addition of a second clause that takes care of the case where
         * sb_mb has been updated, but remains the last record.
         */
        if (sb->sb_mb == NULL) {
                sb->sb_mbtail = NULL;
                sb->sb_lastrecord = NULL;
        } else if (sb->sb_mb->m_nextpkt == NULL)
                sb->sb_lastrecord = sb->sb_mb;
}

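/*
 * Worked illustration (editorial addition) of sockbuf_pushsync(): suppose
 * the buffer held two records, r1 = (m1 -> m2) and r2, so sb_mb == m1 and
 * m1->m_nextpkt == r2.  A caller that frees m1 sets sb_mb = m2 and calls
 * sockbuf_pushsync(sb, r2), which restores m2->m_nextpkt = r2.  Had m1
 * been the only mbuf of its record, sb_mb would instead become r2 (or
 * NULL), and the clauses above patch sb_mbtail/sb_lastrecord to match.
 */
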
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 */
int
soreceive(so, psa, uio, mp0, controlp, flagsp)
        struct socket *so;
        struct sockaddr **psa;
        struct uio *uio;
        struct mbuf **mp0;
        struct mbuf **controlp;
        int *flagsp;
{
        struct mbuf *m, **mp;
        int flags, len, error, offset;
        struct protosw *pr = so->so_proto;
        struct mbuf *nextrecord;
        int moff, type = 0;
        int orig_resid = uio->uio_resid;

        mp = mp0;
        if (psa != NULL)
                *psa = NULL;
        if (controlp != NULL)
                *controlp = NULL;
        if (flagsp != NULL)
                flags = *flagsp &~ MSG_EOR;
        else
                flags = 0;
        if (flags & MSG_OOB)
                return (soreceive_rcvoob(so, uio, flags));
        if (mp != NULL)
                *mp = NULL;
        if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
                (*pr->pr_usrreqs->pru_rcvd)(so, 0);

        SOCKBUF_LOCK(&so->so_rcv);
restart:
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
        if (error)
                goto out;

        m = so->so_rcv.sb_mb;
        /*
         * If we have less data than requested, block awaiting more
         * (subject to any timeout) if:
         *   1. the current count is less than the low water mark, or
         *   2. MSG_WAITALL is set, and it is possible to do the entire
         *	receive operation at once if we block (resid <= hiwat).
         *   3. MSG_DONTWAIT is not set
         * If MSG_WAITALL is set but resid is larger than the receive buffer,
         * we have to do the receive in sections, and thus risk returning
         * a short count if a timeout or signal occurs after we start.
         */
        if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
            so->so_rcv.sb_cc < uio->uio_resid) &&
            (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
            ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
            m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
                KASSERT(m != NULL || !so->so_rcv.sb_cc,
                    ("receive: m == %p so->so_rcv.sb_cc == %u",
                    m, so->so_rcv.sb_cc));
                if (so->so_error) {
                        if (m != NULL)
                                goto dontblock;
                        error = so->so_error;
                        if ((flags & MSG_PEEK) == 0)
                                so->so_error = 0;
                        goto release;
                }
                SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
                        if (m)
                                goto dontblock;
                        else
                                goto release;
                }
                for (; m != NULL; m = m->m_next)
                        if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
                                m = so->so_rcv.sb_mb;
                                goto dontblock;
                        }
                if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
                    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
                        error = ENOTCONN;
                        goto release;
                }
                if (uio->uio_resid == 0)
                        goto release;
                if ((so->so_state & SS_NBIO) ||
                    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
                        error = EWOULDBLOCK;
                        goto release;
                }
                SBLASTRECORDCHK(&so->so_rcv);
                SBLASTMBUFCHK(&so->so_rcv);
                sbunlock(&so->so_rcv);
                error = sbwait(&so->so_rcv);
                if (error)
                        goto out;
                goto restart;
        }
dontblock:
        /*
         * From this point onward, we maintain 'nextrecord' as a cache of the
         * pointer to the next record in the socket buffer.  We must keep the
         * various socket buffer pointers and local stack versions of the
         * pointers in sync, pushing out modifications before dropping the
         * socket buffer mutex, and re-reading them when picking it up.
         *
         * Otherwise, we will race with the network stack appending new data
         * or records onto the socket buffer by using inconsistent/stale
         * versions of the field, possibly resulting in socket buffer
         * corruption.
         *
         * By holding the high-level sblock(), we prevent simultaneous
         * readers from pulling off the front of the socket buffer.
         */
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        if (uio->uio_td)
                uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
        KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
        SBLASTRECORDCHK(&so->so_rcv);
        SBLASTMBUFCHK(&so->so_rcv);
        nextrecord = m->m_nextpkt;
        if (pr->pr_flags & PR_ADDR) {
                KASSERT(m->m_type == MT_SONAME,
                    ("m->m_type == %d", m->m_type));
                orig_resid = 0;
                if (psa != NULL)
                        *psa = sodupsockaddr(mtod(m, struct sockaddr *),
                            M_NOWAIT);
                if (flags & MSG_PEEK) {
                        m = m->m_next;
                } else {
                        sbfree(&so->so_rcv, m);
                        so->so_rcv.sb_mb = m_free(m);
                        m = so->so_rcv.sb_mb;
                        sockbuf_pushsync(&so->so_rcv, nextrecord);
                }
        }

        /*
         * Process one or more MT_CONTROL mbufs present before any data mbufs
         * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
         * just copy the data; if !MSG_PEEK, we call into the protocol to
         * perform externalization (or freeing if controlp == NULL).
         */
        if (m != NULL && m->m_type == MT_CONTROL) {
                struct mbuf *cm = NULL, *cmn;
                struct mbuf **cme = &cm;

                do {
                        if (flags & MSG_PEEK) {
                                if (controlp != NULL) {
                                        *controlp = m_copy(m, 0, m->m_len);
                                        controlp = &(*controlp)->m_next;
                                }
                                m = m->m_next;
                        } else {
                                sbfree(&so->so_rcv, m);
                                so->so_rcv.sb_mb = m->m_next;
                                m->m_next = NULL;
                                *cme = m;
                                cme = &(*cme)->m_next;
                                m = so->so_rcv.sb_mb;
                        }
                } while (m != NULL && m->m_type == MT_CONTROL);
                if ((flags & MSG_PEEK) == 0)
                        sockbuf_pushsync(&so->so_rcv, nextrecord);
                while (cm != NULL) {
                        cmn = cm->m_next;
                        cm->m_next = NULL;
                        if (pr->pr_domain->dom_externalize != NULL) {
                                SOCKBUF_UNLOCK(&so->so_rcv);
                                error = (*pr->pr_domain->dom_externalize)
                                    (cm, controlp);
                                SOCKBUF_LOCK(&so->so_rcv);
                        } else if (controlp != NULL)
                                *controlp = cm;
                        else
                                m_freem(cm);
                        if (controlp != NULL) {
                                orig_resid = 0;
                                while (*controlp != NULL)
                                        controlp = &(*controlp)->m_next;
                        }
                        cm = cmn;
                }
                nextrecord = so->so_rcv.sb_mb->m_nextpkt;
                orig_resid = 0;
        }
        if (m != NULL) {
                if ((flags & MSG_PEEK) == 0) {
                        KASSERT(m->m_nextpkt == nextrecord,
                            ("soreceive: post-control, nextrecord !sync"));
                        if (nextrecord == NULL) {
                                KASSERT(so->so_rcv.sb_mb == m,
                                    ("soreceive: post-control, sb_mb!=m"));
                                KASSERT(so->so_rcv.sb_lastrecord == m,
                                    ("soreceive: post-control, lastrecord!=m"));
                        }
                }
                type = m->m_type;
                if (type == MT_OOBDATA)
                        flags |= MSG_OOB;
        } else {
                if ((flags & MSG_PEEK) == 0) {
                        KASSERT(so->so_rcv.sb_mb == nextrecord,
                            ("soreceive: sb_mb != nextrecord"));
                        if (so->so_rcv.sb_mb == NULL) {
                                KASSERT(so->so_rcv.sb_lastrecord == NULL,
                                    ("soreceive: sb_lastrecord != NULL"));
                        }
                }
        }
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv);
        SBLASTMBUFCHK(&so->so_rcv);

        /*
         * Now continue to read any data mbufs off of the head of the socket
         * buffer until the read request is satisfied.  Note that 'type' is
         * used to store the type of any mbuf reads that have happened so far
         * such that soreceive() can stop reading if the type changes, which
         * causes soreceive() to return only one of regular data and inline
         * out-of-band data in a single socket receive operation.
         */
        moff = 0;
        offset = 0;
        while (m != NULL && uio->uio_resid > 0 && error == 0) {
                /*
                 * If the type of mbuf has changed since the last mbuf
                 * examined ('type'), end the receive operation.
                 */
                SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                if (m->m_type == MT_OOBDATA) {
                        if (type != MT_OOBDATA)
                                break;
                } else if (type == MT_OOBDATA)
                        break;
                else
                        KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
                            ("m->m_type == %d", m->m_type));
                so->so_rcv.sb_state &= ~SBS_RCVATMARK;
                len = uio->uio_resid;
                if (so->so_oobmark && len > so->so_oobmark - offset)
                        len = so->so_oobmark - offset;
                if (len > m->m_len - moff)
                        len = m->m_len - moff;
                /*
                 * If mp is set, just pass back the mbufs.
                 * Otherwise copy them out via the uio, then free.
                 * Sockbuf must be consistent here (points to current mbuf,
                 * it points to next record) when we drop priority;
                 * we must note any additions to the sockbuf when we
                 * block interrupts again.
                 */
                if (mp == NULL) {
                        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                        SBLASTRECORDCHK(&so->so_rcv);
                        SBLASTMBUFCHK(&so->so_rcv);
                        SOCKBUF_UNLOCK(&so->so_rcv);
#ifdef ZERO_COPY_SOCKETS
                        if (so_zero_copy_receive) {
                                vm_page_t pg;
                                int disposable;

                                if ((m->m_flags & M_EXT)
                                    && (m->m_ext.ext_type == EXT_DISPOSABLE))
                                        disposable = 1;
                                else
                                        disposable = 0;

                                pg = PHYS_TO_VM_PAGE(vtophys(mtod(m, caddr_t) +
                                    moff));

                                if (uio->uio_offset == -1)
                                        uio->uio_offset = IDX_TO_OFF(pg->pindex);

                                error = uiomoveco(mtod(m, char *) + moff,
                                    (int)len, uio, pg->object,
                                    disposable);
                        } else
#endif /* ZERO_COPY_SOCKETS */
                        error = uiomove(mtod(m, char *) + moff, (int)len, uio);
                        SOCKBUF_LOCK(&so->so_rcv);
                        if (error)
                                goto release;
                } else
                        uio->uio_resid -= len;
                SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                if (len == m->m_len - moff) {
                        if (m->m_flags & M_EOR)
                                flags |= MSG_EOR;
                        if (flags & MSG_PEEK) {
                                m = m->m_next;
                                moff = 0;
                        } else {
                                nextrecord = m->m_nextpkt;
                                sbfree(&so->so_rcv, m);
                                if (mp != NULL) {
                                        *mp = m;
                                        mp = &m->m_next;
                                        so->so_rcv.sb_mb = m = m->m_next;
                                        *mp = NULL;
                                } else {
                                        so->so_rcv.sb_mb = m_free(m);
                                        m = so->so_rcv.sb_mb;
                                }
                                if (m != NULL) {
                                        m->m_nextpkt = nextrecord;
                                        if (nextrecord == NULL)
                                                so->so_rcv.sb_lastrecord = m;
                                } else {
                                        so->so_rcv.sb_mb = nextrecord;
                                        SB_EMPTY_FIXUP(&so->so_rcv);
                                }
                                SBLASTRECORDCHK(&so->so_rcv);
                                SBLASTMBUFCHK(&so->so_rcv);
                        }
                } else {
                        if (flags & MSG_PEEK)
                                moff += len;
                        else {
                                if (mp != NULL) {
                                        SOCKBUF_UNLOCK(&so->so_rcv);
                                        *mp = m_copym(m, 0, len, M_TRYWAIT);
                                        SOCKBUF_LOCK(&so->so_rcv);
                                }
                                m->m_data += len;
                                m->m_len -= len;
                                so->so_rcv.sb_cc -= len;
                        }
                }
                SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                if (so->so_oobmark) {
                        if ((flags & MSG_PEEK) == 0) {
                                so->so_oobmark -= len;
                                if (so->so_oobmark == 0) {
                                        so->so_rcv.sb_state |= SBS_RCVATMARK;
                                        break;
                                }
                        } else {
                                offset += len;
                                if (offset == so->so_oobmark)
                                        break;
                        }
                }
                if (flags & MSG_EOR)
                        break;
                /*
                 * If the MSG_WAITALL flag is set (for non-atomic socket),
                 * we must not quit until "uio->uio_resid == 0" or an error
                 * termination.  If a signal/timeout occurs, return
                 * with a short count but without error.
                 * Keep sockbuf locked against other readers.
                 */
                while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
                    !sosendallatonce(so) && nextrecord == NULL) {
                        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
                        if (so->so_error || so->so_rcv.sb_state & SBS_CANTRCVMORE)
                                break;
                        /*
                         * Notify the protocol that some data has been
                         * drained before blocking.
                         */
                        if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
                                SOCKBUF_UNLOCK(&so->so_rcv);
                                (*pr->pr_usrreqs->pru_rcvd)(so, flags);
                                SOCKBUF_LOCK(&so->so_rcv);
                        }
                        SBLASTRECORDCHK(&so->so_rcv);
                        SBLASTMBUFCHK(&so->so_rcv);
                        error = sbwait(&so->so_rcv);
                        if (error)
                                goto release;
                        m = so->so_rcv.sb_mb;
                        if (m != NULL)
                                nextrecord = m->m_nextpkt;
                }
        }

        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        if (m != NULL && pr->pr_flags & PR_ATOMIC) {
                flags |= MSG_TRUNC;
                if ((flags & MSG_PEEK) == 0)
                        (void) sbdroprecord_locked(&so->so_rcv);
        }
        if ((flags & MSG_PEEK) == 0) {
                if (m == NULL) {
                        /*
                         * First part is an inline SB_EMPTY_FIXUP().  Second
                         * part makes sure sb_lastrecord is up-to-date if
                         * there is still data in the socket buffer.
                         */
                        so->so_rcv.sb_mb = nextrecord;
                        if (so->so_rcv.sb_mb == NULL) {
                                so->so_rcv.sb_mbtail = NULL;
                                so->so_rcv.sb_lastrecord = NULL;
                        } else if (nextrecord->m_nextpkt == NULL)
                                so->so_rcv.sb_lastrecord = nextrecord;
                }
                SBLASTRECORDCHK(&so->so_rcv);
                SBLASTMBUFCHK(&so->so_rcv);
                if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
                        SOCKBUF_UNLOCK(&so->so_rcv);
                        (*pr->pr_usrreqs->pru_rcvd)(so, flags);
                        SOCKBUF_LOCK(&so->so_rcv);
                }
        }
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        if (orig_resid == uio->uio_resid && orig_resid &&
            (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
                sbunlock(&so->so_rcv);
                goto restart;
        }

        if (flagsp != NULL)
                *flagsp |= flags;
release:
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        sbunlock(&so->so_rcv);
out:
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        SOCKBUF_UNLOCK(&so->so_rcv);
        return (error);
}

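/*
 * Illustrative sketch (editorial addition): a non-blocking kernel-side
 * read that takes the data as an mbuf chain instead of copying through
 * the uio.  As noted in the comment above soreceive(), only uio_resid is
 * consulted when mp0 is non-NULL; the byte count chosen here is an
 * arbitrary example value.
 *
 *	struct uio auio;
 *	struct mbuf *m = NULL;
 *	int flags = MSG_DONTWAIT;
 *	int error;
 *
 *	bzero(&auio, sizeof(auio));
 *	auio.uio_resid = 4096;
 *	error = soreceive(so, NULL, &auio, &m, NULL, &flags);
 */
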
int
soshutdown(so, how)
        struct socket *so;
        int how;
{
        struct protosw *pr = so->so_proto;

        if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
                return (EINVAL);

        if (how != SHUT_WR)
                sorflush(so);
        if (how != SHUT_RD)
                return ((*pr->pr_usrreqs->pru_shutdown)(so));
        return (0);
}

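/*
 * Editorial note: "how" must be SHUT_RD, SHUT_WR or SHUT_RDWR; anything
 * else is rejected with EINVAL before the protocol is consulted, e.g.
 *
 *	error = soshutdown(so, SHUT_RDWR);
 */
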
void
sorflush(so)
        struct socket *so;
{
        struct sockbuf *sb = &so->so_rcv;
        struct protosw *pr = so->so_proto;
        struct sockbuf asb;

        /*
         * XXXRW: This is quite ugly.  The existing code made a copy of the
         * socket buffer, then zero'd the original to clear the buffer
         * fields.  However, with mutexes in the socket buffer, this causes
         * problems.  We only clear the zeroable bits of the original;
         * however, we have to initialize and destroy the mutex in the copy
         * so that dom_dispose() and sbrelease() can lock it as needed.
         */
        SOCKBUF_LOCK(sb);
        sb->sb_flags |= SB_NOINTR;
        (void) sblock(sb, M_WAITOK);
        /*
         * socantrcvmore_locked() drops the socket buffer mutex so that it
         * can safely perform wakeups.  Re-acquire the mutex before
         * continuing.
         */
        socantrcvmore_locked(so);
        SOCKBUF_LOCK(sb);
        sbunlock(sb);
        /*
         * Invalidate/clear most of the sockbuf structure, but leave
         * selinfo and mutex data unchanged.
         */
        bzero(&asb, offsetof(struct sockbuf, sb_startzero));
        bcopy(&sb->sb_startzero, &asb.sb_startzero,
            sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
        bzero(&sb->sb_startzero,
            sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
        SOCKBUF_UNLOCK(sb);

        SOCKBUF_LOCK_INIT(&asb, "so_rcv");
        if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
                (*pr->pr_domain->dom_dispose)(asb.sb_mb);
        sbrelease(&asb, so);
        SOCKBUF_LOCK_DESTROY(&asb);
}

#ifdef INET
static int
do_setopt_accept_filter(so, sopt)
        struct socket *so;
        struct sockopt *sopt;
{
        struct accept_filter_arg *afap = NULL;
        struct accept_filter *afp;
        struct so_accf *af = so->so_accf;
        int error = 0;

        /* do not set/remove accept filters on non-listening sockets */
        if ((so->so_options & SO_ACCEPTCONN) == 0) {
                error = EINVAL;
                goto out;
        }

        /* removing the filter */
        if (sopt == NULL) {
                if (af != NULL) {
                        if (af->so_accept_filter != NULL &&
                            af->so_accept_filter->accf_destroy != NULL) {
                                af->so_accept_filter->accf_destroy(so);
                        }
                        if (af->so_accept_filter_str != NULL) {
                                FREE(af->so_accept_filter_str, M_ACCF);
                        }
                        FREE(af, M_ACCF);
                        so->so_accf = NULL;
                }
                so->so_options &= ~SO_ACCEPTFILTER;
                return (0);
        }
        /* adding a filter */
        /* must remove previous filter first */
        if (af != NULL) {
                error = EINVAL;
                goto out;
        }
        /* don't put large objects on the kernel stack */
        MALLOC(afap, struct accept_filter_arg *, sizeof(*afap), M_TEMP, M_WAITOK);
        error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
        afap->af_name[sizeof(afap->af_name)-1] = '\0';
        afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
        if (error)
                goto out;
        afp = accept_filt_get(afap->af_name);
        if (afp == NULL) {
                error = ENOENT;
                goto out;
        }
        MALLOC(af, struct so_accf *, sizeof(*af), M_ACCF, M_WAITOK | M_ZERO);
        if (afp->accf_create != NULL) {
                if (afap->af_name[0] != '\0') {
                        int len = strlen(afap->af_name) + 1;

                        MALLOC(af->so_accept_filter_str, char *, len, M_ACCF, M_WAITOK);
                        strcpy(af->so_accept_filter_str, afap->af_name);
                }
                af->so_accept_filter_arg = afp->accf_create(so, afap->af_arg);
                if (af->so_accept_filter_arg == NULL) {
                        FREE(af->so_accept_filter_str, M_ACCF);
                        FREE(af, M_ACCF);
                        so->so_accf = NULL;
                        error = EINVAL;
                        goto out;
                }
        }
        af->so_accept_filter = afp;
        so->so_accf = af;
        so->so_options |= SO_ACCEPTFILTER;
out:
        if (afap != NULL)
                FREE(afap, M_TEMP);
        return (error);
}
#endif /* INET */
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 */
int
sooptcopyin(sopt, buf, len, minlen)
        struct sockopt *sopt;
        void *buf;
        size_t len;
        size_t minlen;
{
        size_t valsize;

        /*
         * If the user gives us more than we wanted, we ignore it,
         * but if we don't get the minimum length the caller
         * wants, we return EINVAL.  On success, sopt->sopt_valsize
         * is set to however much we actually retrieved.
         */
        if ((valsize = sopt->sopt_valsize) < minlen)
                return EINVAL;
        if (valsize > len)
                sopt->sopt_valsize = valsize = len;

        if (sopt->sopt_td != NULL)
                return (copyin(sopt->sopt_val, buf, valsize));

        bcopy(sopt->sopt_val, buf, valsize);
        return 0;
}

/*
 * Kernel version of setsockopt(2).
 * XXX: optlen is size_t, not socklen_t
 */
int
so_setsockopt(struct socket *so, int level, int optname, void *optval,
    size_t optlen)
{
        struct sockopt sopt;

        sopt.sopt_level = level;
        sopt.sopt_name = optname;
        sopt.sopt_dir = SOPT_SET;
        sopt.sopt_val = optval;
        sopt.sopt_valsize = optlen;
        sopt.sopt_td = NULL;
        return (sosetopt(so, &sopt));
}

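/*
 * Illustrative sketch (editorial addition): because sopt_td is NULL,
 * so_setsockopt() moves the option value with bcopy() rather than
 * copyin(), so a kernel caller can pass a stack variable directly.
 *
 *	int one = 1;
 *	int error;
 *
 *	error = so_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &one,
 *	    sizeof(one));
 */
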
int
sosetopt(so, sopt)
        struct socket *so;
        struct sockopt *sopt;
{
        int error, optval;
        struct linger l;
        struct timeval tv;
        u_long val;
#ifdef MAC
        struct mac extmac;
#endif

        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto && so->so_proto->pr_ctloutput)
                        return ((*so->so_proto->pr_ctloutput)
                            (so, sopt));
                error = ENOPROTOOPT;
        } else {
                switch (sopt->sopt_name) {
#ifdef INET
                case SO_ACCEPTFILTER:
                        error = do_setopt_accept_filter(so, sopt);
                        if (error)
                                goto bad;
                        break;
#endif
                case SO_LINGER:
                        error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
                        if (error)
                                goto bad;

                        SOCK_LOCK(so);
                        so->so_linger = l.l_linger;
                        if (l.l_onoff)
                                so->so_options |= SO_LINGER;
                        else
                                so->so_options &= ~SO_LINGER;
                        SOCK_UNLOCK(so);
                        break;

                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_DONTROUTE:
                case SO_USELOOPBACK:
                case SO_BROADCAST:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_BINTIME:
                case SO_NOSIGPIPE:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;
                        SOCK_LOCK(so);
                        if (optval)
                                so->so_options |= sopt->sopt_name;
                        else
                                so->so_options &= ~sopt->sopt_name;
                        SOCK_UNLOCK(so);
                        break;

                case SO_SNDBUF:
                case SO_RCVBUF:
                case SO_SNDLOWAT:
                case SO_RCVLOWAT:
                        error = sooptcopyin(sopt, &optval, sizeof optval,
                            sizeof optval);
                        if (error)
                                goto bad;

                        /*
                         * Values < 1 make no sense for any of these
                         * options, so disallow them.
                         */
                        if (optval < 1) {
                                error = EINVAL;
                                goto bad;
                        }

                        switch (sopt->sopt_name) {
                        case SO_SNDBUF:
                        case SO_RCVBUF:
                                if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
                                    &so->so_snd : &so->so_rcv, (u_long)optval,
                                    so, curthread) == 0) {
                                        error = ENOBUFS;
                                        goto bad;
                                }
                                break;

                        /*
                         * Make sure the low-water is never greater than
                         * the high-water.
                         */
                        case SO_SNDLOWAT:
                                SOCKBUF_LOCK(&so->so_snd);
                                so->so_snd.sb_lowat =
                                    (optval > so->so_snd.sb_hiwat) ?
                                    so->so_snd.sb_hiwat : optval;
                                SOCKBUF_UNLOCK(&so->so_snd);
                                break;
                        case SO_RCVLOWAT:
                                SOCKBUF_LOCK(&so->so_rcv);
                                so->so_rcv.sb_lowat =
                                    (optval > so->so_rcv.sb_hiwat) ?
                                    so->so_rcv.sb_hiwat : optval;
                                SOCKBUF_UNLOCK(&so->so_rcv);
                                break;
                        }
                        break;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        error = sooptcopyin(sopt, &tv, sizeof tv,
                            sizeof tv);
                        if (error)
                                goto bad;

                        /* assert(hz > 0); */
                        if (tv.tv_sec < 0 || tv.tv_sec > SHRT_MAX / hz ||
                            tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
                                error = EDOM;
                                goto bad;
                        }
                        /* assert(tick > 0); */
                        /* assert(ULONG_MAX - SHRT_MAX >= 1000000); */
                        val = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;
                        if (val > SHRT_MAX) {
                                error = EDOM;
                                goto bad;
                        }
                        if (val == 0 && tv.tv_usec != 0)
                                val = 1;

                        switch (sopt->sopt_name) {
                        case SO_SNDTIMEO:
                                so->so_snd.sb_timeo = val;
                                break;
                        case SO_RCVTIMEO:
                                so->so_rcv.sb_timeo = val;
                                break;
                        }
                        break;
                case SO_LABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof extmac,
                            sizeof extmac);
                        if (error)
                                goto bad;
                        error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
                            so, &extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;
                default:
                        error = ENOPROTOOPT;
                        break;
                }
                if (error == 0 && so->so_proto != NULL &&
                    so->so_proto->pr_ctloutput != NULL) {
                        (void) ((*so->so_proto->pr_ctloutput)
                            (so, sopt));
                }
        }
bad:
        return (error);
}

/* Helper routine for getsockopt */
int
sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
{
        int error;
        size_t valsize;

        error = 0;

        /*
         * Documented get behavior is that we always return a value,
         * possibly truncated to fit in the user's buffer.
         * Traditional behavior is that we always tell the user
         * precisely how much we copied, rather than something useful
         * like the total amount we had available for her.
         * Note that this interface is not idempotent; the entire answer must
         * be generated ahead of time.
         */
        valsize = min(len, sopt->sopt_valsize);
        sopt->sopt_valsize = valsize;
        if (sopt->sopt_val != NULL) {
                if (sopt->sopt_td != NULL)
                        error = copyout(buf, sopt->sopt_val, valsize);
                else
                        bcopy(buf, sopt->sopt_val, valsize);
        }
        return error;
}
int
sogetopt(so, sopt)
        struct socket *so;
        struct sockopt *sopt;
{
        int error, optval;
        struct linger l;
        struct timeval tv;
#ifdef INET
        struct accept_filter_arg *afap;
#endif
#ifdef MAC
        struct mac extmac;
#endif

        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto && so->so_proto->pr_ctloutput) {
                        return ((*so->so_proto->pr_ctloutput)
                            (so, sopt));
                } else
                        return (ENOPROTOOPT);
        } else {
                switch (sopt->sopt_name) {
#ifdef INET
                case SO_ACCEPTFILTER:
                        if ((so->so_options & SO_ACCEPTCONN) == 0)
                                return (EINVAL);
                        MALLOC(afap, struct accept_filter_arg *, sizeof(*afap),
                            M_TEMP, M_WAITOK | M_ZERO);
                        if ((so->so_options & SO_ACCEPTFILTER) != 0) {
                                strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
                                if (so->so_accf->so_accept_filter_str != NULL)
                                        strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
                        }
                        error = sooptcopyout(sopt, afap, sizeof(*afap));
                        FREE(afap, M_TEMP);
                        break;
#endif

                case SO_LINGER:
                        /*
                         * XXXRW: We grab the lock here to get a consistent
                         * snapshot of both fields.  This may not really
                         * be necessary.
                         */
                        SOCK_LOCK(so);
                        l.l_onoff = so->so_options & SO_LINGER;
                        l.l_linger = so->so_linger;
                        SOCK_UNLOCK(so);
                        error = sooptcopyout(sopt, &l, sizeof l);
                        break;

                case SO_USELOOPBACK:
                case SO_DONTROUTE:
                case SO_DEBUG:
                case SO_KEEPALIVE:
                case SO_REUSEADDR:
                case SO_REUSEPORT:
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_BINTIME:
                case SO_NOSIGPIPE:
                        optval = so->so_options & sopt->sopt_name;
integer:
                        error = sooptcopyout(sopt, &optval, sizeof optval);
                        break;

                case SO_TYPE:
                        optval = so->so_type;
                        goto integer;

                case SO_ERROR:
                        optval = so->so_error;
                        so->so_error = 0;
                        goto integer;

                case SO_SNDBUF:
                        optval = so->so_snd.sb_hiwat;
                        goto integer;

                case SO_RCVBUF:
                        optval = so->so_rcv.sb_hiwat;
                        goto integer;

                case SO_SNDLOWAT:
                        optval = so->so_snd.sb_lowat;
                        goto integer;

                case SO_RCVLOWAT:
                        optval = so->so_rcv.sb_lowat;
                        goto integer;

                case SO_SNDTIMEO:
                case SO_RCVTIMEO:
                        optval = (sopt->sopt_name == SO_SNDTIMEO ?
                            so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

                        tv.tv_sec = optval / hz;
                        tv.tv_usec = (optval % hz) * tick;
                        error = sooptcopyout(sopt, &tv, sizeof tv);
                        break;
                case SO_LABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                            sizeof(extmac));
                        if (error)
                                return (error);
                        error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
                            so, &extmac);
                        if (error)
                                return (error);
                        error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;
                case SO_PEERLABEL:
#ifdef MAC
                        error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                            sizeof(extmac));
                        if (error)
                                return (error);
                        error = mac_getsockopt_peerlabel(
                            sopt->sopt_td->td_ucred, so, &extmac);
                        if (error)
                                return (error);
                        error = sooptcopyout(sopt, &extmac, sizeof extmac);
#else
                        error = EOPNOTSUPP;
#endif
                        break;
                default:
                        error = ENOPROTOOPT;
                        break;
                }
                return (error);
        }
}
/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
        struct mbuf *m, *m_prev;
        int sopt_size = sopt->sopt_valsize;

        MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
        if (m == NULL)
                return ENOBUFS;
        if (sopt_size > MLEN) {
                MCLGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT);
                if ((m->m_flags & M_EXT) == 0) {
                        m_free(m);
                        return ENOBUFS;
                }
                m->m_len = min(MCLBYTES, sopt_size);
        } else {
                m->m_len = min(MLEN, sopt_size);
        }
        sopt_size -= m->m_len;
        *mp = m;
        m_prev = m;

        while (sopt_size) {
                MGET(m, sopt->sopt_td ? M_TRYWAIT : M_DONTWAIT, MT_DATA);
                if (m == NULL) {
                        m_freem(*mp);
                        return ENOBUFS;
                }
                if (sopt_size > MLEN) {
                        MCLGET(m, sopt->sopt_td != NULL ? M_TRYWAIT :
                            M_DONTWAIT);
                        if ((m->m_flags & M_EXT) == 0) {
                                m_freem(m);
                                m_freem(*mp);
                                return ENOBUFS;
                        }
                        m->m_len = min(MCLBYTES, sopt_size);
                } else {
                        m->m_len = min(MLEN, sopt_size);
                }
                sopt_size -= m->m_len;
                m_prev->m_next = m;
                m_prev = m;
        }
        return 0;
}

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
        struct mbuf *m0 = m;

        if (sopt->sopt_val == NULL)
                return 0;
        while (m != NULL && sopt->sopt_valsize >= m->m_len) {
                if (sopt->sopt_td != NULL) {
                        int error;

                        error = copyin(sopt->sopt_val, mtod(m, char *),
                            m->m_len);
                        if (error != 0) {
                                m_freem(m0);
                                return(error);
                        }
                } else
                        bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
                sopt->sopt_valsize -= m->m_len;
                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
                m = m->m_next;
        }
        /* the chain should have been allocated large enough at ip6_sooptmcopyin() */
        if (m != NULL)
                panic("ip6_sooptmcopyin");
        return 0;
}

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
        struct mbuf *m0 = m;
        size_t valsize = 0;

        if (sopt->sopt_val == NULL)
                return 0;
        while (m != NULL && sopt->sopt_valsize >= m->m_len) {
                if (sopt->sopt_td != NULL) {
                        int error;

                        error = copyout(mtod(m, char *), sopt->sopt_val,
                            m->m_len);
                        if (error != 0) {
                                m_freem(m0);
                                return(error);
                        }
                } else
                        bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
                sopt->sopt_valsize -= m->m_len;
                sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
                valsize += m->m_len;
                m = m->m_next;
        }
        if (m != NULL) {
                /* a large enough sockopt buffer should be supplied from user-land */
                m_freem(m0);
                return(EINVAL);
        }
        sopt->sopt_valsize = valsize;
        return 0;
}

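/*
 * Illustrative sketch (editorial addition): compatibility code pairs the
 * helpers above to convert a sockopt into an mbuf chain: soopt_getm()
 * sizes the chain from sopt_valsize, and soopt_mcopyin() fills it from
 * sopt_val, freeing the chain itself if the copyin fails.
 *
 *	struct mbuf *m = NULL;
 *	int error;
 *
 *	error = soopt_getm(sopt, &m);
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);
 */
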
void
sohasoutofband(so)
        struct socket *so;
{
        if (so->so_sigio != NULL)
                pgsigio(&so->so_sigio, SIGURG, 0);
        selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
}

int
sopoll(struct socket *so, int events, struct ucred *active_cred,
    struct thread *td)
{
        int revents = 0;

        if (events & (POLLIN | POLLRDNORM))
                if (soreadable(so))
                        revents |= events & (POLLIN | POLLRDNORM);

        if (events & POLLINIGNEOF)
                if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
                    !TAILQ_EMPTY(&so->so_comp) || so->so_error)
                        revents |= POLLINIGNEOF;

        if (events & (POLLOUT | POLLWRNORM))
                if (sowriteable(so))
                        revents |= events & (POLLOUT | POLLWRNORM);

        if (events & (POLLPRI | POLLRDBAND))
                if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK))
                        revents |= events & (POLLPRI | POLLRDBAND);

        if (revents == 0) {
                if (events &
                    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM |
                    POLLRDBAND)) {
                        SOCKBUF_LOCK(&so->so_rcv);
                        selrecord(td, &so->so_rcv.sb_sel);
                        so->so_rcv.sb_flags |= SB_SEL;
                        SOCKBUF_UNLOCK(&so->so_rcv);
                }

                if (events & (POLLOUT | POLLWRNORM)) {
                        SOCKBUF_LOCK(&so->so_snd);
                        selrecord(td, &so->so_snd.sb_sel);
                        so->so_snd.sb_flags |= SB_SEL;
                        SOCKBUF_UNLOCK(&so->so_snd);
                }
        }

        return (revents);
}

int
soo_kqfilter(struct file *fp, struct knote *kn)
{
        struct socket *so = kn->kn_fp->f_data;
        struct sockbuf *sb;

        switch (kn->kn_filter) {
        case EVFILT_READ:
                if (so->so_options & SO_ACCEPTCONN)
                        kn->kn_fop = &solisten_filtops;
                else
                        kn->kn_fop = &soread_filtops;
                sb = &so->so_rcv;
                break;
        case EVFILT_WRITE:
                kn->kn_fop = &sowrite_filtops;
                sb = &so->so_snd;
                break;
        default:
                return (1);
        }

        SOCKBUF_LOCK(sb);
        SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
        sb->sb_flags |= SB_KNOTE;
        SOCKBUF_UNLOCK(sb);
        return (0);
}

static void
filt_sordetach(struct knote *kn)
{
        struct socket *so = kn->kn_fp->f_data;

        SOCKBUF_LOCK(&so->so_rcv);
        SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
        if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
                so->so_rcv.sb_flags &= ~SB_KNOTE;
        SOCKBUF_UNLOCK(&so->so_rcv);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
        struct socket *so = kn->kn_fp->f_data;
        int need_lock, result;

        /*
         * XXXRW: Conditional locking because filt_soread() can be called
         * either from KNOTE() in the socket context where the socket buffer
         * lock is already held, or from kqueue() itself.
         */
        need_lock = !SOCKBUF_OWNED(&so->so_rcv);
        if (need_lock)
                SOCKBUF_LOCK(&so->so_rcv);
        kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
        if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
                result = 1;
        } else if (so->so_error)	/* temporary udp error */
                result = 1;
        else if (kn->kn_sfflags & NOTE_LOWAT)
                result = (kn->kn_data >= kn->kn_sdata);
        else
                result = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
        if (need_lock)
                SOCKBUF_UNLOCK(&so->so_rcv);
        return (result);
}

static void
filt_sowdetach(struct knote *kn)
{
        struct socket *so = kn->kn_fp->f_data;

        SOCKBUF_LOCK(&so->so_snd);
        SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
        if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
                so->so_snd.sb_flags &= ~SB_KNOTE;
        SOCKBUF_UNLOCK(&so->so_snd);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
        struct socket *so = kn->kn_fp->f_data;
        int need_lock, result;

        /*
         * XXXRW: Conditional locking because filt_sowrite() can be called
         * either from KNOTE() in the socket context where the socket buffer
         * lock is already held, or from kqueue() itself.
         */
        need_lock = !SOCKBUF_OWNED(&so->so_snd);
        if (need_lock)
                SOCKBUF_LOCK(&so->so_snd);
        kn->kn_data = sbspace(&so->so_snd);
        if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
                result = 1;
        } else if (so->so_error)	/* temporary udp error */
                result = 1;
        else if (((so->so_state & SS_ISCONNECTED) == 0) &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED))
                result = 0;
        else if (kn->kn_sfflags & NOTE_LOWAT)
                result = (kn->kn_data >= kn->kn_sdata);
        else
                result = (kn->kn_data >= so->so_snd.sb_lowat);
        if (need_lock)
                SOCKBUF_UNLOCK(&so->so_snd);
        return (result);
}
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
        struct socket *so = kn->kn_fp->f_data;

        kn->kn_data = so->so_qlen;
        return (!TAILQ_EMPTY(&so->so_comp));
}

int
socheckuid(struct socket *so, uid_t uid)
{

        if (so == NULL)
                return (EPERM);
        if (so->so_cred->cr_uid == uid)
                return (0);
        return (EPERM);
}